/*-------------------------------------------------------------------------
 *
 * nodeAgg.c
 *    Routines to handle aggregate nodes.
 *
 *    ExecAgg normally evaluates each aggregate in the following steps:
 *
 *       transvalue = initcond
 *       foreach input_tuple do
 *          transvalue = transfunc(transvalue, input_value(s))
 *       result = finalfunc(transvalue, direct_argument(s))
 *
 *    If a finalfunc is not supplied then the result is just the ending
 *    value of transvalue.
 *
 *    Other behaviors can be selected by the "aggsplit" mode, which exists
 *    to support partial aggregation.  It is possible to:
 *    * Skip running the finalfunc, so that the output is always the
 *    final transvalue state.
 *    * Substitute the combinefunc for the transfunc, so that transvalue
 *    states (propagated up from a child partial-aggregation step) are merged
 *    rather than processing raw input rows.  (The statements below about
 *    the transfunc apply equally to the combinefunc, when it's selected.)
 *    * Apply the serializefunc to the output values (this only makes sense
 *    when skipping the finalfunc, since the serializefunc works on the
 *    transvalue data type).
 *    * Apply the deserializefunc to the input values (this only makes sense
 *    when using the combinefunc, for similar reasons).
 *    It is the planner's responsibility to connect up Agg nodes using these
 *    alternate behaviors in a way that makes sense, with partial aggregation
 *    results being fed to nodes that expect them.
 *
 *    If a normal aggregate call specifies DISTINCT or ORDER BY, we sort the
 *    input tuples and eliminate duplicates (if required) before performing
 *    the above-depicted process.  (However, we don't do that for ordered-set
 *    aggregates; their "ORDER BY" inputs are ordinary aggregate arguments
 *    so far as this module is concerned.)  Note that partial aggregation
 *    is not supported in these cases, since we couldn't ensure global
 *    ordering or distinctness of the inputs.
 *
 *    If transfunc is marked "strict" in pg_proc and initcond is NULL,
 *    then the first non-NULL input_value is assigned directly to transvalue,
 *    and transfunc isn't applied until the second non-NULL input_value.
 *    The agg's first input type and transtype must be the same in this case!
 *
 *    If transfunc is marked "strict" then NULL input_values are skipped,
 *    keeping the previous transvalue.  If transfunc is not strict then it
 *    is called for every input tuple and must deal with NULL initcond
 *    or NULL input_values for itself.
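 *
 *    Concretely (illustrative pseudo-code only, restating the rules above),
 *    the per-row logic for a strict transfunc with NULL initcond is:
 *
 *       if transvalue is not yet set then
 *          if input_value is not NULL then
 *             transvalue = input_value          (no transfunc call)
 *       else if input_value is not NULL then
 *          transvalue = transfunc(transvalue, input_value)
 *       (NULL input_values are skipped, keeping the previous transvalue)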
 *
 *    If finalfunc is marked "strict" then it is not called when the
 *    ending transvalue is NULL, instead a NULL result is created
 *    automatically (this is just the usual handling of strict functions,
 *    of course).  A non-strict finalfunc can make its own choice of
 *    what to return for a NULL ending transvalue.
 *
 *    Ordered-set aggregates are treated specially in one other way: we
 *    evaluate any "direct" arguments and pass them to the finalfunc along
 *    with the transition value.
 *
 *    A finalfunc can have additional arguments beyond the transvalue and
 *    any "direct" arguments, corresponding to the input arguments of the
 *    aggregate.  These are always just passed as NULL.  Such arguments may be
 *    needed to allow resolution of a polymorphic aggregate's result type.
 *
 *    We compute aggregate input expressions and run the transition functions
 *    in a temporary econtext (aggstate->tmpcontext).  This is reset at least
 *    once per input tuple, so when the transvalue datatype is
 *    pass-by-reference, we have to be careful to copy it into a longer-lived
 *    memory context, and free the prior value to avoid memory leakage.  We
 *    store transvalues in another set of econtexts, aggstate->aggcontexts
 *    (one per grouping set, see below), which are also used for the hashtable
 *    structures in AGG_HASHED mode.  These econtexts are rescanned, not just
 *    reset, at group boundaries so that aggregate transition functions can
 *    register shutdown callbacks via AggRegisterCallback.
 *
 *    The node's regular econtext (aggstate->ss.ps.ps_ExprContext) is used to
 *    run finalize functions and compute the output tuple; this context can be
 *    reset once per output tuple.
 *
 *    The executor's AggState node is passed as the fmgr "context" value in
 *    all transfunc and finalfunc calls.  It is not recommended that the
 *    transition functions look at the AggState node directly, but they can
 *    use AggCheckCallContext() to verify that they are being called by
 *    nodeAgg.c (and not as ordinary SQL functions).  The main reason a
 *    transition function might want to know this is so that it can avoid
 *    palloc'ing a fixed-size pass-by-ref transition value on every call:
 *    it can instead just scribble on and return its left input.  Ordinarily
 *    it is completely forbidden for functions to modify pass-by-ref inputs,
 *    but in the aggregate case we know the left input is either the initial
 *    transition value or a previous function result, and in either case its
 *    value need not be preserved.  See int8inc() for an example.  Notice that
 *    the EEOP_AGG_PLAIN_TRANS step is coded to avoid a data copy step when
 *    the previous transition value pointer is returned.  It is also possible
 *    to avoid repeated data copying when the transition value is an expanded
 *    object: to do that, the transition function must take care to return
 *    an expanded object that is in a child context of the memory context
 *    returned by AggCheckCallContext().  Also, some transition functions want
 *    to store working state in addition to the nominal transition value; they
 *    can use the memory context returned by AggCheckCallContext() to do that.
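 *
 *    A minimal sketch of that pattern (hypothetical names, not code from
 *    this file), for a transition function that scribbles on its left input
 *    once it knows it is being called as an aggregate:
 *
 *       typedef struct MyAvgState { int64 count; int64 sum; } MyAvgState;
 *
 *       Datum
 *       my_avg_trans(PG_FUNCTION_ARGS)
 *       {
 *           if (AggCheckCallContext(fcinfo, NULL))
 *           {
 *               MyAvgState *state = (MyAvgState *) PG_GETARG_POINTER(0);
 *
 *               state->count += 1;
 *               state->sum += PG_GETARG_INT64(1);
 *               PG_RETURN_POINTER(state);      (update in place, no palloc)
 *           }
 *           elog(ERROR, "my_avg_trans called in non-aggregate context");
 *       }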
 *
 *    Note: AggCheckCallContext() is available as of PostgreSQL 9.0.  The
 *    AggState is available as context in earlier releases (back to 8.1),
 *    but direct examination of the node is needed to use it before 9.0.
 *
 *    As of 9.4, aggregate transition functions can also use AggGetAggref()
 *    to get hold of the Aggref expression node for their aggregate call.
 *    This is mainly intended for ordered-set aggregates, which are not
 *    supported as window functions.  (A regular aggregate function would
 *    need some fallback logic to use this, since there's no Aggref node
 *    for a window function.)
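 *
 *    For illustration (a hedged sketch, not code from this file), a support
 *    function can examine its call site like so:
 *
 *       Aggref *aggref = AggGetAggref(fcinfo);
 *
 *       if (aggref != NULL)
 *          ... inspect aggref->aggorder, aggref->args, etc. ...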
 *
 *    Grouping sets:
 *
 *    A list of grouping sets which is structurally equivalent to a ROLLUP
 *    clause (e.g. (a,b,c), (a,b), (a)) can be processed in a single pass over
 *    ordered data.  We do this by keeping a separate set of transition values
 *    for each grouping set being concurrently processed; for each input tuple
 *    we update them all, and on group boundaries we reset those states
 *    (starting at the front of the list) whose grouping values have changed
 *    (the list of grouping sets is ordered from most specific to least
 *    specific).
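 *
 *    For example, with the rollup (a,b,c), (a,b), (a): when b changes at a
 *    group boundary, the (a,b,c) and (a,b) transition states are finalized
 *    and reset, while the (a) state keeps accumulating.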
 *
 *    Where more complex grouping sets are used, we break them down into
 *    "phases", where each phase has a different sort order (except phase 0
 *    which is reserved for hashing).  During each phase but the last, the
 *    input tuples are additionally stored in a tuplesort which is keyed to the
 *    next phase's sort order; during each phase but the first, the input
 *    tuples are drawn from the previously sorted data.  (The sorting of the
 *    data for the first phase is handled by the planner, as it might be
 *    satisfied by underlying nodes.)
 *
 *    Hashing can be mixed with sorted grouping.  To do this, we have an
 *    AGG_MIXED strategy that populates the hashtables during the first sorted
 *    phase, and switches to reading them out after completing all sort phases.
 *    We can also support AGG_HASHED with multiple hash tables and no sorting
 *    at all.
 *
 *    From the perspective of aggregate transition and final functions, the
 *    only issue regarding grouping sets is this: a single call site (flinfo)
 *    of an aggregate function may be used for updating several different
 *    transition values in turn.  So the function must not cache in the flinfo
 *    anything which logically belongs as part of the transition value (most
 *    importantly, the memory context in which the transition value exists).
 *    The support API functions (AggCheckCallContext, AggRegisterCallback) are
 *    sensitive to the grouping set for which the aggregate function is
 *    currently being called.
 *
 *    Plan structure:
 *
 *    What we get from the planner is actually one "real" Agg node which is
 *    part of the plan tree proper, but which optionally has an additional list
 *    of Agg nodes hung off the side via the "chain" field.  This is because an
 *    Agg node happens to be a convenient representation of all the data we
 *    need for grouping sets.
 *
 *    For many purposes, we treat the "real" node as if it were just the first
 *    node in the chain.  The chain must be ordered such that hashed entries
 *    come before sorted/plain entries; the real node is marked AGG_MIXED if
 *    there are both types present (in which case the real node describes one
 *    of the hashed groupings; other AGG_HASHED nodes may optionally follow in
 *    the chain, followed in turn by AGG_SORTED or (one) AGG_PLAIN node).  If
 *    the real node is marked AGG_HASHED or AGG_SORTED, then all the chained
 *    nodes must be of the same type; if it is AGG_PLAIN, there can be no
 *    chained nodes.
 *
 *    We collect all hashed nodes into a single "phase", numbered 0, and create
 *    a sorted phase (numbered 1..n) for each AGG_SORTED or AGG_PLAIN node.
 *    Phase 0 is allocated even if there are no hashes, but remains unused in
 *    that case.
 *
 *    AGG_HASHED nodes actually refer to only a single grouping set each,
 *    because for each hashed grouping we need a separate grpColIdx and
 *    numGroups estimate.  AGG_SORTED nodes represent a "rollup", a list of
 *    grouping sets that share a sort order.  Each AGG_SORTED node other than
 *    the first one has an associated Sort node which describes the sort order
 *    to be used; the first sorted node takes its input from the outer subtree,
 *    which the planner has already arranged to provide ordered data.
 *
 *    Memory and ExprContext usage:
 *
 *    Because we're accumulating aggregate values across input rows, we need to
 *    use more memory contexts than just simple input/output tuple contexts.
 *    In fact, for a rollup, we need a separate context for each grouping set
 *    so that we can reset the inner (finer-grained) aggregates on their group
 *    boundaries while continuing to accumulate values for outer
 *    (coarser-grained) groupings.  On top of this, we might be simultaneously
 *    populating hashtables; however, we only need one context for all the
 *    hashtables.
 *
 *    So we create an array, aggcontexts, with an ExprContext for each grouping
 *    set in the largest rollup that we're going to process, and use the
 *    per-tuple memory context of those ExprContexts to store the aggregate
 *    transition values.  hashcontext is the single context created to support
 *    all hash tables.
 *
 *    Spilling To Disk:
 *
 *    When performing hash aggregation, if the hash table memory exceeds the
 *    limit (see hash_agg_check_limits()), we enter "spill mode".  In spill
 *    mode, we advance the transition states only for groups already in the
 *    hash table.  For tuples that would need to create new hash table
 *    entries (and initialize new transition states), we instead spill them to
 *    disk to be processed later.  The tuples are spilled in a partitioned
 *    manner, so that subsequent batches are smaller and less likely to exceed
 *    hash_mem (if a batch does exceed hash_mem, it must be spilled
 *    recursively).
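 *
 *    Per input tuple, the decision is roughly (illustrative pseudo-code):
 *
 *       if group already has a hash table entry then
 *          advance its transition state
 *       else if not yet in spill mode then
 *          create a new entry and initialize its transition state
 *       else
 *          write the tuple to the spill partition selected by its hash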
 *
 *    Spilled data is written to logical tapes.  These provide better control
 *    over memory usage, disk space, and the number of files than if we were
 *    to use a BufFile for each spill.  We don't know the number of tapes needed
 *    at the start of the algorithm (because it can recurse), so a tape set is
 *    allocated at the beginning, and individual tapes are created as needed.
 *    As a particular tape is read, logtape.c recycles its disk space.  When a
 *    tape is read to completion, it is destroyed entirely.
 *
 *    Tapes' buffers can take up substantial memory when many tapes are open at
 *    once.  We only need one tape open at a time in read mode (using a buffer
 *    that's a multiple of BLCKSZ); but we need one tape open in write mode
 *    (each requiring a buffer of size BLCKSZ) for each partition.
 *
 *    Note that it's possible for transition states to start small but then
 *    grow very large; for instance in the case of ARRAY_AGG.  In such cases,
 *    it's still possible to significantly exceed hash_mem.  We try to avoid
 *    this situation by estimating what will fit in the available memory, and
 *    imposing a limit on the number of groups separately from the amount of
 *    memory consumed.
 *
 *    Transition / Combine function invocation:
 *
 *    For performance reasons transition functions, including combine
 *    functions, aren't invoked one-by-one from nodeAgg.c after computing
 *    arguments using the expression evaluation engine.  Instead
 *    ExecBuildAggTrans() builds one large expression that does both argument
 *    evaluation and transition function invocation.  That avoids performance
 *    issues due to repeated uses of expression evaluation, complications due
 *    to filter expressions having to be evaluated early, and allows the
 *    entire expression to be JIT compiled into one native function.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/executor/nodeAgg.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/htup_details.h"
#include "access/parallel.h"
#include "catalog/objectaccess.h"
#include "catalog/pg_aggregate.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "executor/execExpr.h"
#include "executor/executor.h"
#include "executor/nodeAgg.h"
#include "lib/hyperloglog.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/optimizer.h"
#include "parser/parse_agg.h"
#include "parser/parse_coerce.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/dynahash.h"
#include "utils/expandeddatum.h"
#include "utils/logtape.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/syscache.h"
#include "utils/tuplesort.h"

/*
 * Control how many partitions are created when spilling HashAgg to
 * disk.
 *
 * HASHAGG_PARTITION_FACTOR is multiplied by the estimated number of
 * partitions needed such that each partition will fit in memory.  The factor
 * is set higher than one because there's not a high cost to having a few too
 * many partitions, and it makes it less likely that a partition will need to
 * be spilled recursively.  Another benefit of having more, smaller partitions
 * is that small hash tables may perform better than large ones due to memory
 * caching effects.
 *
 * We also specify a min and max number of partitions per spill.  Too few might
 * mean a lot of wasted I/O from repeated spilling of the same tuples.  Too
 * many will result in lots of memory wasted buffering the spill files (which
 * could instead be spent on a larger hash table).
 */
#define HASHAGG_PARTITION_FACTOR 1.50
#define HASHAGG_MIN_PARTITIONS 4
#define HASHAGG_MAX_PARTITIONS 1024
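
/*
 * A sketch (hypothetical helper, not this file's actual spill logic) of how
 * these constants are intended to combine: scale the estimated number of
 * memory-sized partitions by the factor, then clamp to the allowed range.
 */
static int
hashagg_num_partitions_sketch(double est_input_size, double mem_limit)
{
	int			npartitions;

	/* enough partitions that each one is expected to fit in memory */
	npartitions = (int) (est_input_size * HASHAGG_PARTITION_FACTOR /
						 mem_limit) + 1;

	/* too few wastes I/O on re-spills; too many wastes buffer memory */
	if (npartitions < HASHAGG_MIN_PARTITIONS)
		npartitions = HASHAGG_MIN_PARTITIONS;
	if (npartitions > HASHAGG_MAX_PARTITIONS)
		npartitions = HASHAGG_MAX_PARTITIONS;

	return npartitions;
}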

/*
 * For reading from tapes, the buffer size must be a multiple of
 * BLCKSZ.  Larger values help when reading from multiple tapes concurrently,
 * but that doesn't happen in HashAgg, so we simply use BLCKSZ.  Writing to a
 * tape always uses a buffer of size BLCKSZ.
 */
#define HASHAGG_READ_BUFFER_SIZE BLCKSZ
#define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ

/*
 * HyperLogLog is used for estimating the cardinality of the spilled tuples in
 * a given partition.  5 bits corresponds to a size of about 32 bytes and a
 * worst-case error of around 18%.  That's effective enough to choose a
 * reasonable number of partitions when recursing.
 */
#define HASHAGG_HLL_BIT_WIDTH 5
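
/*
 * (For the arithmetic behind the figures above: with a register-index width
 * of b bits, HyperLogLog uses m = 2^b registers, and its standard relative
 * error is about 1.04/sqrt(m); for b = 5, m = 32, giving 1.04/sqrt(32),
 * roughly 18.4%.)
 */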

/*
 * Estimate chunk overhead as a constant 16 bytes.  XXX: should this be
 * improved?
 */
#define CHUNKHDRSZ 16

/*
 * Represents partitioned spill data for a single hashtable.  Contains the
 * necessary information to route tuples to the correct partition, and to
 * transform the spilled data into new batches.
 *
 * The high bits are used for partition selection (when recursing, we ignore
 * the bits that have already been used for partition selection at an earlier
 * level).
 */
typedef struct HashAggSpill
{
	int			npartitions;	/* number of partitions */
	LogicalTape **partitions;	/* spill partition tapes */
	int64	   *ntuples;		/* number of tuples in each partition */
	uint32		mask;			/* mask to find partition from hash value */
	int			shift;			/* after masking, shift by this amount */
	hyperLogLogState *hll_card; /* cardinality estimate for contents */
} HashAggSpill;
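
/*
 * Illustrative sketch (hypothetical helper, not this file's actual spill
 * code): the mask selects the high hash bits assigned to this recursion
 * level, and the shift brings them down to a partition index.
 */
static inline int
hashagg_spill_partition_sketch(HashAggSpill *spill, uint32 hash)
{
	return (int) ((hash & spill->mask) >> spill->shift);
}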

/*
 * Represents work to be done for one pass of hash aggregation (with only one
 * grouping set).
 *
 * Also tracks the bits of the hash already used for partition selection by
 * earlier iterations, so that this batch can use new bits. If all bits have
 * already been used, no partitioning will be done (any spilled data will go
 * to a single output tape).
 */
typedef struct HashAggBatch
{
	int			setno;			/* grouping set */
	int			used_bits;		/* number of bits of hash already used */
	LogicalTape *input_tape;	/* input partition tape */
	int64		input_tuples;	/* number of tuples in this batch */
	double		input_card;		/* estimated group cardinality */
} HashAggBatch;
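
/*
 * Illustrative sketch, not part of the original file: assuming 32-bit
 * hash values, this is how many hash bits a batch still has available
 * for further partitioning.  When none remain, respilled data all goes
 * to a single output tape, as the comment above describes.  The function
 * name is hypothetical; the #ifdef guard keeps it out of real builds.
 */
#ifdef NODEAGG_SKETCHES
static inline int
hashagg_batch_free_bits_sketch(const HashAggBatch *batch)
{
	return 32 - batch->used_bits;
}
#endif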

/* used to find referenced colnos */
typedef struct FindColsContext
{
	bool		is_aggref;		/* is under an aggref */
	Bitmapset  *aggregated;		/* column references under an aggref */
	Bitmapset  *unaggregated;	/* other column references */
} FindColsContext;
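
/*
 * Illustrative sketch, not part of the original file: how a walker might
 * record one column reference in the context above.  "attno" is assumed
 * to be a Var's attribute number; which bitmapset it lands in depends on
 * whether the walker is currently underneath an Aggref.  The function
 * name is hypothetical; the #ifdef guard keeps it out of real builds.
 */
#ifdef NODEAGG_SKETCHES
static inline void
find_cols_record_sketch(FindColsContext *context, int attno)
{
	if (context->is_aggref)
		context->aggregated = bms_add_member(context->aggregated, attno);
	else
		context->unaggregated = bms_add_member(context->unaggregated, attno);
}
#endif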

static void select_current_set(AggState *aggstate, int setno, bool is_hash);
static void initialize_phase(AggState *aggstate, int newphase);
static TupleTableSlot *fetch_input_tuple(AggState *aggstate);
static void initialize_aggregates(AggState *aggstate,
								  AggStatePerGroup *pergroups,
								  int numReset);
static void advance_transition_function(AggState *aggstate,
										AggStatePerTrans pertrans,
										AggStatePerGroup pergroupstate);
static void advance_aggregates(AggState *aggstate);
static void process_ordered_aggregate_single(AggState *aggstate,
											 AggStatePerTrans pertrans,
											 AggStatePerGroup pergroupstate);
static void process_ordered_aggregate_multi(AggState *aggstate,
											AggStatePerTrans pertrans,
											AggStatePerGroup pergroupstate);
static void finalize_aggregate(AggState *aggstate,
							   AggStatePerAgg peragg,
							   AggStatePerGroup pergroupstate,
							   Datum *resultVal, bool *resultIsNull);
static void finalize_partialaggregate(AggState *aggstate,
									  AggStatePerAgg peragg,
									  AggStatePerGroup pergroupstate,
									  Datum *resultVal, bool *resultIsNull);
static inline void prepare_hash_slot(AggStatePerHash perhash,
									 TupleTableSlot *inputslot,
									 TupleTableSlot *hashslot);
static void prepare_projection_slot(AggState *aggstate,
									TupleTableSlot *slot,
									int currentSet);
static void finalize_aggregates(AggState *aggstate,
								AggStatePerAgg peragg,
								AggStatePerGroup pergroup);
static TupleTableSlot *project_aggregates(AggState *aggstate);
static void find_cols(AggState *aggstate, Bitmapset **aggregated,
					  Bitmapset **unaggregated);
static bool find_cols_walker(Node *node, FindColsContext *context);
static void build_hash_tables(AggState *aggstate);
static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
										  bool nullcheck);
static long hash_choose_num_buckets(double hashentrysize,
									long estimated_nbuckets,
									Size memory);
static int	hash_choose_num_partitions(double input_groups,
									   double hashentrysize,
									   int used_bits,
									   int *log2_npartitions);
static void initialize_hash_entry(AggState *aggstate,
								  TupleHashTable hashtable,
								  TupleHashEntry entry);
static void lookup_hash_entries(AggState *aggstate);
static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
static void agg_fill_hash_table(AggState *aggstate);
static bool agg_refill_hash_table(AggState *aggstate);
static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate);
static TupleTableSlot *agg_retrieve_hash_table_in_memory(AggState *aggstate);
static void hash_agg_check_limits(AggState *aggstate);
static void hash_agg_enter_spill_mode(AggState *aggstate);
static void hash_agg_update_metrics(AggState *aggstate, bool from_tape,
									int npartitions);
static void hashagg_finish_initial_spills(AggState *aggstate);
static void hashagg_reset_spill_state(AggState *aggstate);
static HashAggBatch *hashagg_batch_new(LogicalTape *input_tape, int setno,
									   int64 input_tuples, double input_card,
									   int used_bits);
static MinimalTuple hashagg_batch_read(HashAggBatch *batch, uint32 *hashp);
static void hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *lts,
							   int used_bits, double input_groups,
							   double hashentrysize);
static Size hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
								TupleTableSlot *slot, uint32 hash);
static void hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill,
								 int setno);
static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
static void build_pertrans_for_aggref(AggStatePerTrans pertrans,
									  AggState *aggstate, EState *estate,
									  Aggref *aggref, Oid transfn_oid,
									  Oid aggtranstype, Oid aggserialfn,
									  Oid aggdeserialfn, Datum initValue,
									  bool initValueIsNull, Oid *inputTypes,
									  int numArguments);


/*
 * Select the current grouping set; affects current_set and
 * curaggcontext.
 */
static void
select_current_set(AggState *aggstate, int setno, bool is_hash)
{
	/*
	 * When changing this, also adapt ExecAggPlainTransByVal() and
	 * ExecAggPlainTransByRef().
	 */
	if (is_hash)
		aggstate->curaggcontext = aggstate->hashcontext;
	else
		aggstate->curaggcontext = aggstate->aggcontexts[setno];

	aggstate->current_set = setno;
}
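
/*
 * Illustrative sketch, not part of the original file: a typical caller
 * selects each sorted grouping set before touching its transition values,
 * so that curaggcontext points at the right per-set memory context.
 * "numsets" is assumed to be the number of sets in the current phase; the
 * function name is hypothetical and the #ifdef guard keeps it out of
 * real builds.
 */
#ifdef NODEAGG_SKETCHES
static void
select_each_sorted_set_sketch(AggState *aggstate, int numsets)
{
	for (int setno = 0; setno < numsets; setno++)
	{
		select_current_set(aggstate, setno, false);
		/* ... advance transition state in aggstate->curaggcontext ... */
	}
}
#endif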
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Switch to phase "newphase", which must either be 0 or 1 (to reset) or
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
* current_phase + 1. Juggle the tuplesorts accordingly.
|
2017-03-27 05:20:54 +02:00
|
|
|
*
|
|
|
|
* Phase 0 is for hashing, which we currently handle last in the AGG_MIXED
|
|
|
|
* case, so when entering phase 0, all we need to do is drop open sorts.
|
1999-09-26 23:21:15 +02:00
|
|
|
*/
|
1999-12-13 02:27:21 +01:00
|
|
|
static void
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
initialize_phase(AggState *aggstate, int newphase)
|
1999-09-26 23:21:15 +02:00
|
|
|
{
|
2017-03-27 05:20:54 +02:00
|
|
|
Assert(newphase <= 1 || newphase == aggstate->current_phase + 1);
|
1999-12-13 02:27:21 +01:00
|
|
|
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
/*
|
|
|
|
* Whatever the previous state, we're now done with whatever input
|
|
|
|
* tuplesort was in use.
|
|
|
|
*/
|
|
|
|
if (aggstate->sort_in)
|
1999-12-13 02:27:21 +01:00
|
|
|
{
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
tuplesort_end(aggstate->sort_in);
|
|
|
|
aggstate->sort_in = NULL;
|
|
|
|
}
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
if (newphase <= 1)
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
{
|
1999-12-13 02:27:21 +01:00
|
|
|
/*
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
* Discard any existing output tuplesort.
|
1999-12-13 02:27:21 +01:00
|
|
|
*/
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
if (aggstate->sort_out)
|
2002-11-06 23:31:24 +01:00
|
|
|
{
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
tuplesort_end(aggstate->sort_out);
|
|
|
|
aggstate->sort_out = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The old output tuplesort becomes the new input one, and this is the
|
|
|
|
* right time to actually sort it.
|
|
|
|
*/
|
|
|
|
aggstate->sort_in = aggstate->sort_out;
|
|
|
|
aggstate->sort_out = NULL;
|
|
|
|
Assert(aggstate->sort_in);
|
|
|
|
tuplesort_performsort(aggstate->sort_in);
|
|
|
|
}
|
1999-12-13 02:27:21 +01:00
|
|
|
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
/*
|
|
|
|
* If this isn't the last phase, we need to sort appropriately for the
|
|
|
|
* next phase in sequence.
|
|
|
|
*/
|
2017-03-27 05:20:54 +02:00
|
|
|
if (newphase > 0 && newphase < aggstate->numphases - 1)
|
    {
        Sort       *sortnode = aggstate->phases[newphase + 1].sortnode;
        PlanState  *outerNode = outerPlanState(aggstate);
        TupleDesc   tupDesc = ExecGetResultType(outerNode);

        aggstate->sort_out = tuplesort_begin_heap(tupDesc,
                                                  sortnode->numCols,
                                                  sortnode->sortColIdx,
                                                  sortnode->sortOperators,
                                                  sortnode->collations,
                                                  sortnode->nullsFirst,
                                                  work_mem,
                                                  NULL, false);
    }

    aggstate->current_phase = newphase;
    aggstate->phase = &aggstate->phases[newphase];
}
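
/*
 * Editorial sketch, not part of nodeAgg.c: how the phase hand-off above is
 * meant to be driven.  While one sort phase is being aggregated,
 * fetch_input_tuple() below tees each input row into the sort_out opened
 * above; entering the next phase then makes that teed-up data the input of
 * the following sort phase.  drain_current_phase() is a hypothetical
 * stand-in for the real per-phase work in agg_retrieve_direct(), so treat
 * this as an illustration of the contract, not a quotation of the executor.
 */
static void
advance_phase_sketch(AggState *aggstate)
{
    /* hypothetical: aggregate the current phase's input to exhaustion */
    drain_current_phase(aggstate);

    /* hand the teed-up sort_out to the next phase, if there is one */
    if (aggstate->current_phase + 1 < aggstate->numphases)
        initialize_phase(aggstate, aggstate->current_phase + 1);
}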

/*
 * Fetch a tuple from either the outer plan (for phase 1) or from the sorter
 * populated by the previous phase. Copy it to the sorter for the next phase
 * if any.
 *
 * Callers cannot rely on the memory for the tuple in the returned slot
 * remaining valid past any subsequently fetched tuple.
 */
static TupleTableSlot *
fetch_input_tuple(AggState *aggstate)
{
    TupleTableSlot *slot;

    if (aggstate->sort_in)
    {
        /* make sure we check for interrupts in either path through here */
        CHECK_FOR_INTERRUPTS();

        if (!tuplesort_gettupleslot(aggstate->sort_in, true, false,
                                    aggstate->sort_slot, NULL))
            return NULL;
        slot = aggstate->sort_slot;
    }
    else
        slot = ExecProcNode(outerPlanState(aggstate));

    if (!TupIsNull(slot) && aggstate->sort_out)
        tuplesort_puttupleslot(aggstate->sort_out, slot);

    return slot;
}
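
/*
 * Editorial sketch, not part of nodeAgg.c: from a caller's point of view,
 * fetch_input_tuple() is just a tuple source; the copy into sort_out is a
 * side effect it never sees.  advance_group() is a hypothetical stand-in
 * for the real per-tuple work.
 */
static void
drain_phase_sketch(AggState *aggstate)
{
    for (;;)
    {
        TupleTableSlot *slot = fetch_input_tuple(aggstate);

        if (TupIsNull(slot))
            break;              /* this phase's input is exhausted */
        advance_group(aggstate, slot);  /* hypothetical */
    }
}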

/*
 * (Re)Initialize an individual aggregate.
 *
 * This function handles only one grouping set, already set in
 * aggstate->current_set.
 *
 * When called, CurrentMemoryContext should be the per-query context.
 */
static void
initialize_aggregate(AggState *aggstate, AggStatePerTrans pertrans,
                     AggStatePerGroup pergroupstate)
{
    /*
     * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate.
     */
    if (pertrans->numSortCols > 0)
    {
        /*
         * On rescan, there may be an uncompleted sort operation; clean it
         * up if so.
         */
        if (pertrans->sortstates[aggstate->current_set])
            tuplesort_end(pertrans->sortstates[aggstate->current_set]);

        /*
         * We use a plain Datum sorter when there's a single input column;
         * otherwise sort the full tuple.  (See comments for
         * process_ordered_aggregate_single.)
         */
        if (pertrans->numInputs == 1)
        {
            Form_pg_attribute attr = TupleDescAttr(pertrans->sortdesc, 0);

            pertrans->sortstates[aggstate->current_set] =
                tuplesort_begin_datum(attr->atttypid,
                                      pertrans->sortOperators[0],
                                      pertrans->sortCollations[0],
                                      pertrans->sortNullsFirst[0],
                                      work_mem, NULL, false);
        }
        else
            pertrans->sortstates[aggstate->current_set] =
                tuplesort_begin_heap(pertrans->sortdesc,
                                     pertrans->numSortCols,
                                     pertrans->sortColIdx,
                                     pertrans->sortOperators,
                                     pertrans->sortCollations,
                                     pertrans->sortNullsFirst,
                                     work_mem, NULL, false);
    }

    /*
     * (Re)set transValue to the initial value.
     *
     * Note that when the initial value is pass-by-ref, we must copy it (into
     * the aggcontext) since we will pfree the transValue later.
     */
    if (pertrans->initValueIsNull)
        pergroupstate->transValue = pertrans->initValue;
    else
    {
        MemoryContext oldContext;

        oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory);
        pergroupstate->transValue = datumCopy(pertrans->initValue,
                                              pertrans->transtypeByVal,
                                              pertrans->transtypeLen);
        MemoryContextSwitchTo(oldContext);
    }
    pergroupstate->transValueIsNull = pertrans->initValueIsNull;

    /*
     * If the initial value for the transition state doesn't exist in the
     * pg_aggregate table then we will let the first non-NULL value returned
     * from the outer procNode become the initial value. (This is useful for
     * aggregates like max() and min().)  The noTransValue flag signals that
     * we still need to do this.
     */
    pergroupstate->noTransValue = pertrans->initValueIsNull;
}
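
/*
 * Editorial sketch, not part of nodeAgg.c: the consumer side of the
 * noTransValue flag set above.  This paraphrases the strict-transition
 * path (cf. advance_transition_function); newVal stands in for the
 * incoming argument datum, so treat the details as an assumption rather
 * than a quotation.
 */
static void
adopt_first_input_sketch(AggState *aggstate, AggStatePerTrans pertrans,
                         AggStatePerGroup pergroupstate, Datum newVal)
{
    if (pertrans->transfn.fn_strict && pergroupstate->noTransValue)
    {
        /* copy into the aggregate context, as in initialize_aggregate() */
        MemoryContext oldContext =
            MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory);

        pergroupstate->transValue = datumCopy(newVal,
                                              pertrans->transtypeByVal,
                                              pertrans->transtypeLen);
        pergroupstate->transValueIsNull = false;
        pergroupstate->noTransValue = false;    /* first value adopted */
        MemoryContextSwitchTo(oldContext);
    }
}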

/*
 * Initialize all aggregate transition states for a new group of input values.
 *
 * If there are multiple grouping sets, we initialize only the first numReset
 * of them (the grouping sets are ordered so that the most specific one, which
 * is reset most often, is first). As a convenience, if numReset is 0, we
 * reinitialize all sets.
 *
 * NB: This cannot be used for hash aggregates, as for those the grouping set
 * number has to be specified from further up.
|
|
|
*
|
|
|
|
* When called, CurrentMemoryContext should be the per-query context.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
initialize_aggregates(AggState *aggstate,
|
2018-01-03 03:02:37 +01:00
|
|
|
AggStatePerGroup *pergroups,
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
int numReset)
|
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
int transno;
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
int numGroupingSets = Max(aggstate->phase->numsets, 1);
|
|
|
|
int setno = 0;
|
2017-03-27 05:20:54 +02:00
|
|
|
int numTrans = aggstate->numtrans;
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerTrans transstates = aggstate->pertrans;
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
if (numReset == 0)
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
numReset = numGroupingSets;
|
|
|
|
|
2018-01-03 03:02:37 +01:00
|
|
|
for (setno = 0; setno < numReset; setno++)
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
{
|
2018-01-03 03:02:37 +01:00
|
|
|
AggStatePerGroup pergroup = pergroups[setno];
|
1999-12-13 02:27:21 +01:00
|
|
|
|
2018-01-03 03:02:37 +01:00
|
|
|
select_current_set(aggstate, setno, false);
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
|
2018-01-03 03:02:37 +01:00
|
|
|
for (transno = 0; transno < numTrans; transno++)
|
2017-03-27 05:20:54 +02:00
|
|
|
{
|
2018-01-03 03:02:37 +01:00
|
|
|
AggStatePerTrans pertrans = &transstates[transno];
|
|
|
|
AggStatePerGroup pergroupstate = &pergroup[transno];
|
2017-03-27 05:20:54 +02:00
|
|
|
|
2018-01-03 03:02:37 +01:00
|
|
|
initialize_aggregate(aggstate, pertrans, pergroupstate);
|
2017-03-27 05:20:54 +02:00
|
|
|
}
|
2002-11-06 23:31:24 +01:00
|
|
|
}
|
1999-12-13 02:27:21 +01:00
|
|
|
}
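/*
 * Illustrative usage sketch (hypothetical, guarded out of any build; not
 * a call site from the original file): how a caller might use the
 * numReset convention documented above at a group boundary.
 */
#ifdef NODEAGG_USAGE_SKETCH
static void
example_group_boundary(AggState *aggstate, AggStatePerGroup *pergroups)
{
	/* reset only the first, most-specific (most often reset) set */
	initialize_aggregates(aggstate, pergroups, 1);

	/* convenience form: numReset == 0 reinitializes every grouping set */
	initialize_aggregates(aggstate, pergroups, 0);
}
#endif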
|
|
|
|
|
|
|
|
/*
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
* Given new input value(s), advance the transition function of one aggregate
|
2015-08-04 16:53:10 +02:00
|
|
|
* state within one grouping set only (already set in aggstate->current_set)
|
2006-07-27 21:52:07 +02:00
|
|
|
*
|
|
|
|
* The new values (and null flags) have been preloaded into argument positions
|
2015-08-04 16:53:10 +02:00
|
|
|
* 1 and up in pertrans->transfn_fcinfo, so that we needn't copy them again to
|
|
|
|
* pass to the transition function. We also expect that the static fields of
|
|
|
|
* the fcinfo are already initialized; that was done by ExecInitAgg().
|
1999-12-13 02:27:21 +01:00
|
|
|
*
|
2002-11-06 23:31:24 +01:00
|
|
|
* It doesn't matter which memory context this is called in.
|
1999-12-13 02:27:21 +01:00
|
|
|
*/
|
|
|
|
static void
|
2002-11-06 23:31:24 +01:00
|
|
|
advance_transition_function(AggState *aggstate,
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerTrans pertrans,
|
2014-01-08 19:58:15 +01:00
|
|
|
AggStatePerGroup pergroupstate)
|
1999-12-13 02:27:21 +01:00
|
|
|
{
|
Change function call information to be variable length.
Before this change, FunctionCallInfoData (the struct in which arguments
etc. for V1 function calls are stored) always had space for
FUNC_MAX_ARGS (100) arguments, storing datums and their nullness in two
separate arrays. For nearly every function call, 100 arguments is far
more than needed, so this wasted memory. Arg and argnull being two
separate arrays also guarantees that accessing a single argument touches
two cachelines.
Change the layout so there's a single variable-length array with pairs
of value / isnull. That drastically reduces memory consumption for most
function calls (on x86-64 a two-argument function now uses 64 bytes,
previously 936 bytes), and makes it very likely that an argument's value
and its nullness are on the same cacheline.
Arguments are stored in a new NullableDatum struct, which, due to
padding, needs more memory per argument than before. But since usually
far fewer arguments are stored, and individual arguments are cheaper to
access, that's still a clear win. It's likely that there are other
places where conversion to NullableDatum arrays would make sense,
e.g. TupleTableSlots, but that's for another commit.
Because the function call information is now variable-length,
allocations have to take the number of arguments into account. Heap
allocations can be sized with SizeForFunctionCallInfoData(); for
on-stack allocations there's a new LOCAL_FCINFO(name, nargs) macro that
allocates an appropriately sized and aligned variable.
Some places with stack-allocated function call information don't know
the number of arguments at compile time, and variably sized stack
allocations currently aren't allowed in postgres. Therefore allow for
FUNC_MAX_ARGS space in those cases. They're not that common, so for now
that seems acceptable.
Because of the need to allocate FunctionCallInfo of the appropriate
size, older extensions may need to update their code. To avoid subtle
breakages, the FunctionCallInfoData struct has been renamed to
FunctionCallInfoBaseData. Most code only references FunctionCallInfo,
so that shouldn't cause much collateral damage.
This change is also a prerequisite for more efficient expression JIT
compilation (by allocating the function call information on the stack,
allowing LLVM to optimize it away); previously the size of the call
information caused problems inside LLVM's optimizer.
Author: Andres Freund
Reviewed-By: Tom Lane
Discussion: https://postgr.es/m/20180605172952.x34m5uz6ju6enaem@alap3.anarazel.de
2019-01-26 23:17:52 +01:00
|
|
|
FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
|
2002-11-06 23:31:24 +01:00
|
|
|
MemoryContext oldContext;
|
2006-07-27 21:52:07 +02:00
|
|
|
Datum newVal;
|
1999-12-13 02:27:21 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
if (pertrans->transfn.fn_strict)
|
1999-09-26 23:21:15 +02:00
|
|
|
{
|
2002-11-06 23:31:24 +01:00
|
|
|
/*
|
2006-07-27 21:52:07 +02:00
|
|
|
* For a strict transfn, nothing happens when there's a NULL input; we
|
|
|
|
* just keep the prior transValue.
|
2002-11-06 23:31:24 +01:00
|
|
|
*/
|
2015-08-04 16:53:10 +02:00
|
|
|
int numTransInputs = pertrans->numTransInputs;
|
2014-01-08 19:58:15 +01:00
|
|
|
int i;
|
|
|
|
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
for (i = 1; i <= numTransInputs; i++)
|
2006-07-27 21:52:07 +02:00
|
|
|
{
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
if (fcinfo->args[i].isnull)
|
2006-07-27 21:52:07 +02:00
|
|
|
return;
|
|
|
|
}
|
2002-11-06 23:31:24 +01:00
|
|
|
if (pergroupstate->noTransValue)
|
2000-07-17 05:05:41 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* transValue has not been initialized. This is the first non-NULL
|
|
|
|
* input value. We use it as the initial value for transValue. (We
|
|
|
|
* already checked that the agg's input type is binary-compatible
|
2002-04-11 22:00:18 +02:00
|
|
|
* with its transtype, so straight copy here is OK.)
|
1999-12-13 02:27:21 +01:00
|
|
|
*
|
2002-11-06 23:31:24 +01:00
|
|
|
* We must copy the datum into aggcontext if it is pass-by-ref. We
|
|
|
|
* do not need to pfree the old transValue, since it's NULL.
|
1999-12-13 02:27:21 +01:00
|
|
|
*/
|
2017-03-27 05:20:54 +02:00
|
|
|
oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory);
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
pergroupstate->transValue = datumCopy(fcinfo->args[1].value,
|
2015-08-04 16:53:10 +02:00
|
|
|
pertrans->transtypeByVal,
|
|
|
|
pertrans->transtypeLen);
|
2002-11-06 23:31:24 +01:00
|
|
|
pergroupstate->transValueIsNull = false;
|
|
|
|
pergroupstate->noTransValue = false;
|
|
|
|
MemoryContextSwitchTo(oldContext);
|
2000-07-17 05:05:41 +02:00
|
|
|
return;
|
1999-12-13 02:27:21 +01:00
|
|
|
}
|
2002-11-06 23:31:24 +01:00
|
|
|
if (pergroupstate->transValueIsNull)
|
1999-12-13 02:27:21 +01:00
|
|
|
{
|
2000-07-12 04:37:39 +02:00
|
|
|
/*
|
2000-07-17 05:05:41 +02:00
|
|
|
* Don't call a strict function with NULL inputs. Note it is
|
|
|
|
* possible to get here despite the above tests, if the transfn is
|
|
|
|
* strict *and* returned a NULL on a prior cycle. If that happens
|
|
|
|
* we will propagate the NULL all the way to the end.
|
2000-07-12 04:37:39 +02:00
|
|
|
*/
|
2000-07-17 05:05:41 +02:00
|
|
|
return;
|
1999-12-13 02:27:21 +01:00
|
|
|
}
|
|
|
|
}
|
1999-09-26 23:21:15 +02:00
|
|
|
|
2002-11-06 23:31:24 +01:00
|
|
|
/* We run the transition functions in per-input-tuple memory context */
|
|
|
|
oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory);
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
/* set up aggstate->curpertrans for AggGetAggref() */
|
|
|
|
aggstate->curpertrans = pertrans;
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
|
2002-10-04 19:19:55 +02:00
|
|
|
/*
|
|
|
|
* OK to call the transition function
|
|
|
|
*/
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
fcinfo->args[0].value = pergroupstate->transValue;
|
|
|
|
fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
|
2014-01-08 19:58:15 +01:00
|
|
|
fcinfo->isnull = false; /* just in case transfn doesn't set it */
|
2003-06-25 23:30:34 +02:00
|
|
|
|
2006-07-27 21:52:07 +02:00
|
|
|
newVal = FunctionCallInvoke(fcinfo);
|
2000-07-17 05:05:41 +02:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
aggstate->curpertrans = NULL;
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
|
2000-07-17 05:05:41 +02:00
|
|
|
/*
|
2002-11-06 23:31:24 +01:00
|
|
|
* If pass-by-ref datatype, must copy the new value into aggcontext and
|
Improve speed of aggregates that use array_append as transition function.
In the previous coding, if an aggregate's transition function returned an
expanded array, nodeAgg.c and nodeWindowAgg.c would always copy it and thus
force it into the flat representation. This led to ping-ponging between
flat and expanded formats, which costs a lot. For an aggregate using
array_append as transition function, I measured about a 15X slowdown
compared to the pre-9.5 code, when working on simple int[] arrays.
Of course, the old code was already O(N^2) in this usage due to copying
flat arrays all the time, but it wasn't quite this inefficient.
To fix, teach nodeAgg.c and nodeWindowAgg.c to allow expanded transition
values without copying, so long as the transition function takes care to
return the transition value already properly parented under the aggcontext.
That puts a bit of extra responsibility on the transition function, but
doing it this way allows us to not need any extra logic in the fast path
of advance_transition_function (ie, with a pass-by-value transition value,
or with a modified-in-place pass-by-reference value). We already know
that that's a hot spot so I'm loath to add any cycles at all there. Also,
while only array_append currently knows how to follow this convention,
this solution allows other transition functions to opt-in without needing
to have a whitelist in the core aggregation code.
(The reason we would need a whitelist is that currently, if you pass a
R/W expanded-object pointer to an arbitrary function, it's allowed to do
anything with it including deleting it; that breaks the core agg code's
assumption that it should free discarded values. Returning a value under
aggcontext is the transition function's signal that it knows it is an
aggregate transition function and will play nice. Possibly the API rules
for expanded objects should be refined, but that would not be a
back-patchable change.)
With this fix, an aggregate using array_append is no longer O(N^2), so it's
much faster than pre-9.5 code rather than much slower. It's still a bit
slower than the bespoke infrastructure for array_agg, but the differential
seems to be only about 10%-20% rather than orders of magnitude.
Discussion: <6315.1477677885@sss.pgh.pa.us>
2016-10-30 17:27:41 +01:00
|
|
|
* free the prior transValue. But if transfn returned a pointer to its
|
|
|
|
* first input, we don't need to do anything. Also, if transfn returned a
|
|
|
|
* pointer to a R/W expanded object that is already a child of the
|
|
|
|
* aggcontext, assume we can adopt that value without copying it.
|
Fix edge case leading to agg transitions skipping ExecAggTransReparent() calls.
The code checking whether an aggregate transition value needs to be
reparented into the current context has always only compared the
transition return value with the previous transition value by datum,
i.e. without regard for NULLness. This normally works, because when
the transition function returns NULL (via fcinfo->isnull), it'll
return a value that won't be the same as its input value.
But there's no hard requirement that that's the case. And it turns
out, it's possible to hit this case (see discussion or reproducers),
leading to a non-null transition value not being reparented, followed
by a crash caused by that.
Instead of adding another comparison of NULLness, have
ExecAggTransReparent() ensure that pergroup->transValue ends up as 0
when the new transition value is NULL. That avoids having to add an
additional branch to the much more common cases of the transition
function returning the old transition value (which is a pointer in
this case), and when the new value is different, but not NULL.
In branches since 69c3936a149, also deduplicate the reparenting code
between the expression evaluation based transitions, and the path for
ordered aggregates.
Reported-By: Teodor Sigaev, Nikita Glukhov
Author: Andres Freund
Discussion: https://postgr.es/m/bd34e930-cfec-ea9b-3827-a8bc50891393@sigaev.ru
Backpatch: 9.4-, this issue has existed since at least 7.4
2020-01-21 08:26:51 +01:00
|
|
|
*
|
|
|
|
* It's safe to compare newVal with pergroupstate->transValue without regard
|
|
|
|
* for either being NULL, because ExecAggTransReparent() takes care to set
|
|
|
|
* transValue to 0 when NULL. Otherwise we could end up accidentally not
|
|
|
|
* reparenting, when the transValue has the same numerical value as
|
|
|
|
* newVal, despite being NULL. This is a somewhat hot path, making it
|
|
|
|
* undesirable to instead solve this with another branch for the common
|
|
|
|
* case of the transition function returning its (modified) input
|
|
|
|
* argument.
|
2000-07-17 05:05:41 +02:00
|
|
|
*/
|
2015-08-04 16:53:10 +02:00
|
|
|
if (!pertrans->transtypeByVal &&
|
2002-11-06 23:31:24 +01:00
|
|
|
DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue))
|
Fix edge case leading to agg transitions skipping ExecAggTransReparent() calls.
2020-01-21 08:26:51 +01:00
|
|
|
newVal = ExecAggTransReparent(aggstate, pertrans,
|
|
|
|
newVal, fcinfo->isnull,
|
|
|
|
pergroupstate->transValue,
|
|
|
|
pergroupstate->transValueIsNull);
|
2002-11-06 23:31:24 +01:00
|
|
|
|
|
|
|
pergroupstate->transValue = newVal;
|
2006-07-27 21:52:07 +02:00
|
|
|
pergroupstate->transValueIsNull = fcinfo->isnull;
|
2000-07-17 05:05:41 +02:00
|
|
|
|
2002-11-06 23:31:24 +01:00
|
|
|
MemoryContextSwitchTo(oldContext);
|
1999-09-26 23:21:15 +02:00
|
|
|
}
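/*
 * Illustrative sketch (hypothetical names, guarded out of any build; not
 * part of the original file): a strict V1 transition function, and how a
 * two-argument function can be invoked through the variable-length
 * FunctionCallInfo layout described in the annotations above.  Because
 * the function is strict, advance_transition_function() never calls it
 * with a NULL argument; it keeps the prior transValue instead.
 */
#ifdef NODEAGG_USAGE_SKETCH
static Datum
example_int8_add_trans(PG_FUNCTION_ARGS)
{
	int64		trans = PG_GETARG_INT64(0);
	int64		newval = PG_GETARG_INT64(1);

	PG_RETURN_INT64(trans + newval);
}

static Datum
example_invoke_two_arg_fn(FmgrInfo *flinfo, Datum arg0, Datum arg1)
{
	/* LOCAL_FCINFO declares correctly sized and aligned on-stack storage */
	LOCAL_FCINFO(fcinfo, 2);

	InitFunctionCallInfoData(*fcinfo, flinfo, 2, InvalidOid, NULL, NULL);
	fcinfo->args[0].value = arg0;
	fcinfo->args[0].isnull = false;
	fcinfo->args[1].value = arg1;
	fcinfo->args[1].isnull = false;

	return FunctionCallInvoke(fcinfo);
}
#endif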
|
1996-07-09 08:22:35 +02:00
|
|
|
|
2002-11-06 01:00:45 +01:00
|
|
|
/*
|
2015-08-04 16:53:10 +02:00
|
|
|
* Advance each aggregate transition state for one input tuple. The input
|
|
|
|
* tuple has been stored in tmpcontext->ecxt_outertuple, so that it is
|
2017-03-27 05:20:54 +02:00
|
|
|
* accessible to ExecEvalExpr.
|
|
|
|
*
|
|
|
|
* We have two sets of transition states to handle: one for sorted aggregation
|
|
|
|
* and one for hashed; we do them both here, to avoid multiple evaluation of
|
|
|
|
* the inputs.
|
2002-11-06 01:00:45 +01:00
|
|
|
*
|
|
|
|
* When called, CurrentMemoryContext should be the per-query context.
|
|
|
|
*/
|
|
|
|
static void
|
2018-01-09 22:25:38 +01:00
|
|
|
advance_aggregates(AggState *aggstate)
|
2002-11-06 01:00:45 +01:00
|
|
|
{
|
2018-01-09 22:25:38 +01:00
|
|
|
bool dummynull;
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2018-01-09 22:25:38 +01:00
|
|
|
ExecEvalExprSwitchContext(aggstate->phase->evaltrans,
|
|
|
|
aggstate->tmpcontext,
|
|
|
|
&dummynull);
|
2002-11-06 01:00:45 +01:00
|
|
|
}
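/*
 * Illustrative per-row pattern (hypothetical, guarded out of any build;
 * not a call site from the original file): store the input tuple where
 * the transition expression expects it, advance all aggregates, then
 * reset the per-tuple memory context to discard pass-by-ref garbage.
 */
#ifdef NODEAGG_USAGE_SKETCH
static void
example_advance_one_row(AggState *aggstate, TupleTableSlot *slot)
{
	/* make the row visible to the transition expression */
	aggstate->tmpcontext->ecxt_outertuple = slot;

	advance_aggregates(aggstate);

	/* free any per-tuple allocations made by the transition functions */
	ResetExprContext(aggstate->tmpcontext);
}
#endif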
|
|
|
|
|
1999-12-13 02:27:21 +01:00
|
|
|
/*
|
2009-12-15 18:57:48 +01:00
|
|
|
* Run the transition function for a DISTINCT or ORDER BY aggregate
|
|
|
|
* with only one input. This is called after we have completed
|
|
|
|
* entering all the input values into the sort object. We complete the
|
|
|
|
* sort, read out the values in sorted order, and run the transition
|
|
|
|
* function on each value (applying DISTINCT if appropriate).
|
|
|
|
*
|
|
|
|
* Note that the strictness of the transition function was checked when
|
|
|
|
* entering the values into the sort, so we don't check it again here;
|
|
|
|
* we just apply standard SQL DISTINCT logic.
|
|
|
|
*
|
|
|
|
* The one-input case is handled separately from the multi-input case
|
|
|
|
* for performance reasons: for single by-value inputs, such as the
|
|
|
|
* common case of count(distinct id), the tuplesort_getdatum code path
|
|
|
|
* is around 300% faster. (The speedup for by-reference types is less
|
|
|
|
* but still noticeable.)
|
2000-07-12 04:37:39 +02:00
|
|
|
*
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
* This function handles only one grouping set (already set in
|
|
|
|
* aggstate->current_set).
|
|
|
|
*
|
2000-07-12 04:37:39 +02:00
|
|
|
* When called, CurrentMemoryContext should be the per-query context.
|
1999-12-13 02:27:21 +01:00
|
|
|
*/
|
|
|
|
static void
|
2009-12-15 18:57:48 +01:00
|
|
|
process_ordered_aggregate_single(AggState *aggstate,
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerTrans pertrans,
|
2009-12-15 18:57:48 +01:00
|
|
|
AggStatePerGroup pergroupstate)
|
1999-12-13 02:27:21 +01:00
|
|
|
{
|
2000-07-12 04:37:39 +02:00
|
|
|
Datum oldVal = (Datum) 0;
|
2009-12-15 18:57:48 +01:00
|
|
|
bool oldIsNull = true;
|
2000-07-12 04:37:39 +02:00
|
|
|
bool haveOldVal = false;
|
2002-11-06 23:31:24 +01:00
|
|
|
MemoryContext workcontext = aggstate->tmpcontext->ecxt_per_tuple_memory;
|
2000-07-12 04:37:39 +02:00
|
|
|
MemoryContext oldContext;
|
2015-08-04 16:53:10 +02:00
|
|
|
bool isDistinct = (pertrans->numDistinctCols > 0);
|
2016-02-17 11:10:00 +01:00
|
|
|
Datum newAbbrevVal = (Datum) 0;
|
|
|
|
Datum oldAbbrevVal = (Datum) 0;
|
Change function call information to be variable length.
Before this change, FunctionCallInfoData (the struct in which arguments
etc. for V1 function calls are stored) always had space for
FUNC_MAX_ARGS/100 arguments, storing datums and their nullness in two
separate arrays. For nearly every function call, 100 arguments is far more
than needed, so memory was wasted. Keeping arg and argnull as two separate
arrays also guarantees that accessing a single argument touches two
cache lines.
Change the layout so there's a single variable-length array holding pairs
of value / isnull. That drastically reduces memory consumption for
most function calls (on x86-64 a two-argument function now uses
64 bytes, previously 936 bytes), and makes it very likely that an
argument's value and its nullness are on the same cache line.
Arguments are stored in a new NullableDatum struct, which, due to
padding, needs more memory per argument than before. But as usually
far fewer arguments are stored, and individual arguments are cheaper
to access, that's still a clear win. It's likely that there are other
places where conversion to NullableDatum arrays would make sense,
e.g. TupleTableSlots, but that's for another commit.
Because the function call information is now variable-length,
allocations have to take the number of arguments into account. For
heap allocations that can be done with SizeForFunctionCallInfoData();
for on-stack allocations there's a new LOCAL_FCINFO(name, nargs) macro
that helps to allocate an appropriately sized and aligned variable.
Some places that stack-allocate function call information don't know
the number of arguments at compile time, and variably sized stack
allocations currently aren't allowed in Postgres; therefore those cases
simply allow for FUNC_MAX_ARGS space. They're not that common, so for
now that seems acceptable.
Because of the need to allocate FunctionCallInfo of the appropriate
size, older extensions may need to update their code. To avoid subtle
breakages, the FunctionCallInfoData struct has been renamed to
FunctionCallInfoBaseData. Most code only references FunctionCallInfo,
so that shouldn't cause much collateral damage.
This change is also a prerequisite for more efficient expression JIT
compilation (by allocating the function call information on the stack,
allowing LLVM to optimize it away); previously the size of the call
information caused problems inside LLVM's optimizer.
Author: Andres Freund
Reviewed-By: Tom Lane
Discussion: https://postgr.es/m/20180605172952.x34m5uz6ju6enaem@alap3.anarazel.de
2019-01-26 23:17:52 +01:00
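/*
 * Illustrative sketch, added for clarity (flinfo, collation and the
 * argument datums arg0/arg1 are assumed to be provided by the caller):
 * with the variable-length layout, a call site that knows its argument
 * count at compile time can do
 *
 *		LOCAL_FCINFO(fcinfo, 2);
 *
 *		InitFunctionCallInfoData(*fcinfo, flinfo, 2, collation, NULL, NULL);
 *		fcinfo->args[0].value = arg0;
 *		fcinfo->args[0].isnull = false;
 *		fcinfo->args[1].value = arg1;
 *		fcinfo->args[1].isnull = false;
 *		result = FunctionCallInvoke(fcinfo);
 *
 * rather than declaring a fixed-size FunctionCallInfoData with room for
 * FUNC_MAX_ARGS arguments.
 */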
|
|
|
FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
|
2006-07-27 21:52:07 +02:00
|
|
|
Datum *newVal;
|
|
|
|
bool *isNull;
|
2000-05-28 19:56:29 +02:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
Assert(pertrans->numDistinctCols < 2);
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);
|
1999-12-13 02:27:21 +01:00
|
|
|
|
2009-12-15 18:57:48 +01:00
|
|
|
/* Load the column into argument 1 (arg 0 will be transition value) */
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
newVal = &fcinfo->args[1].value;
|
|
|
|
isNull = &fcinfo->args[1].isnull;
|
2006-07-27 21:52:07 +02:00
|
|
|
|
1999-12-13 02:27:21 +01:00
|
|
|
/*
|
2000-07-12 04:37:39 +02:00
|
|
|
* Note: if input type is pass-by-ref, the datums returned by the sort are
|
|
|
|
* freshly palloc'd in the per-query context, so we must be careful to
|
|
|
|
* pfree them when they are no longer needed.
|
1999-12-13 02:27:21 +01:00
|
|
|
*/
|
2000-07-12 04:37:39 +02:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
while (tuplesort_getdatum(pertrans->sortstates[aggstate->current_set],
|
2016-02-17 11:10:00 +01:00
|
|
|
true, newVal, isNull, &newAbbrevVal))
|
1999-12-13 02:27:21 +01:00
|
|
|
{
|
2000-07-12 04:37:39 +02:00
|
|
|
/*
|
2002-11-06 23:31:24 +01:00
|
|
|
* Clear and select the working context for evaluation of the equality
|
2000-07-17 05:05:41 +02:00
|
|
|
* function and transition function.
|
2000-07-12 04:37:39 +02:00
|
|
|
*/
|
2002-11-06 23:31:24 +01:00
|
|
|
MemoryContextReset(workcontext);
|
|
|
|
oldContext = MemoryContextSwitchTo(workcontext);
|
2000-07-12 04:37:39 +02:00
|
|
|
|
2009-12-15 18:57:48 +01:00
|
|
|
/*
|
|
|
|
* If DISTINCT mode, and not distinct from prior, skip it.
|
|
|
|
*/
|
|
|
|
if (isDistinct &&
|
|
|
|
haveOldVal &&
|
|
|
|
((oldIsNull && *isNull) ||
|
|
|
|
(!oldIsNull && !*isNull &&
|
2016-02-17 11:10:00 +01:00
|
|
|
oldAbbrevVal == newAbbrevVal &&
|
2019-03-22 12:09:32 +01:00
|
|
|
DatumGetBool(FunctionCall2Coll(&pertrans->equalfnOne,
|
|
|
|
pertrans->aggCollation,
|
2009-12-15 18:57:48 +01:00
|
|
|
oldVal, *newVal)))))
|
2000-07-12 04:37:39 +02:00
|
|
|
{
|
|
|
|
/* equal to prior, so forget this one */
|
2015-08-04 16:53:10 +02:00
|
|
|
if (!pertrans->inputtypeByVal && !*isNull)
|
2006-07-27 21:52:07 +02:00
|
|
|
pfree(DatumGetPointer(*newVal));
|
2000-07-12 04:37:39 +02:00
|
|
|
}
|
|
|
|
else
|
1999-12-13 02:27:21 +01:00
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
advance_transition_function(aggstate, pertrans, pergroupstate);
|
2000-07-12 04:37:39 +02:00
|
|
|
/* forget the old value, if any */
|
2015-08-04 16:53:10 +02:00
|
|
|
if (!oldIsNull && !pertrans->inputtypeByVal)
|
1999-12-13 02:27:21 +01:00
|
|
|
pfree(DatumGetPointer(oldVal));
|
2002-11-06 23:31:24 +01:00
|
|
|
/* and remember the new one for subsequent equality checks */
|
2006-07-27 21:52:07 +02:00
|
|
|
oldVal = *newVal;
|
2016-02-17 11:10:00 +01:00
|
|
|
oldAbbrevVal = newAbbrevVal;
|
2009-12-15 18:57:48 +01:00
|
|
|
oldIsNull = *isNull;
|
1999-12-13 02:27:21 +01:00
|
|
|
haveOldVal = true;
|
|
|
|
}
|
2000-07-12 04:37:39 +02:00
|
|
|
|
|
|
|
MemoryContextSwitchTo(oldContext);
|
1999-12-13 02:27:21 +01:00
|
|
|
}
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
if (!oldIsNull && !pertrans->inputtypeByVal)
|
2000-07-12 04:37:39 +02:00
|
|
|
pfree(DatumGetPointer(oldVal));
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
tuplesort_end(pertrans->sortstates[aggstate->current_set]);
|
|
|
|
pertrans->sortstates[aggstate->current_set] = NULL;
|
2000-07-12 04:37:39 +02:00
|
|
|
}
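/*
 * Example, added for clarity: aggregate calls whose sole sort key is also
 * the sole aggregated input, such as count(DISTINCT x) or
 * array_agg(x ORDER BY x), can take the single-datum path above.  Calls
 * that must sort whole tuples, e.g. string_agg(x, ',' ORDER BY y), go
 * through process_ordered_aggregate_multi() below.
 */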
|
|
|
|
|
2009-12-15 18:57:48 +01:00
|
|
|
/*
|
|
|
|
* Run the transition function for a DISTINCT or ORDER BY aggregate
|
|
|
|
* with more than one input. This is called after we have completed
|
|
|
|
* entering all the input values into the sort object. We complete the
|
|
|
|
* sort, read out the values in sorted order, and run the transition
|
|
|
|
* function on each value (applying DISTINCT if appropriate).
|
|
|
|
*
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
* This function handles only one grouping set (already set in
|
|
|
|
* aggstate->current_set).
|
|
|
|
*
|
2009-12-15 18:57:48 +01:00
|
|
|
* When called, CurrentMemoryContext should be the per-query context.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
process_ordered_aggregate_multi(AggState *aggstate,
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerTrans pertrans,
|
2009-12-15 18:57:48 +01:00
|
|
|
AggStatePerGroup pergroupstate)
|
|
|
|
{
|
2018-02-16 06:55:31 +01:00
|
|
|
ExprContext *tmpcontext = aggstate->tmpcontext;
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
FunctionCallInfo fcinfo = pertrans->transfn_fcinfo;
|
2016-12-01 01:08:11 +01:00
|
|
|
TupleTableSlot *slot1 = pertrans->sortslot;
|
2015-08-04 16:53:10 +02:00
|
|
|
TupleTableSlot *slot2 = pertrans->uniqslot;
|
|
|
|
int numTransInputs = pertrans->numTransInputs;
|
|
|
|
int numDistinctCols = pertrans->numDistinctCols;
|
2016-02-17 11:10:00 +01:00
|
|
|
Datum newAbbrevVal = (Datum) 0;
|
|
|
|
Datum oldAbbrevVal = (Datum) 0;
|
2009-12-15 18:57:48 +01:00
|
|
|
bool haveOldValue = false;
|
2018-02-16 06:55:31 +01:00
|
|
|
TupleTableSlot *save = aggstate->tmpcontext->ecxt_outertuple;
|
2009-12-15 18:57:48 +01:00
|
|
|
int i;
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
tuplesort_performsort(pertrans->sortstates[aggstate->current_set]);
|
2009-12-15 18:57:48 +01:00
|
|
|
|
|
|
|
ExecClearTuple(slot1);
|
|
|
|
if (slot2)
|
|
|
|
ExecClearTuple(slot2);
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
while (tuplesort_gettupleslot(pertrans->sortstates[aggstate->current_set],
|
2017-04-06 23:48:59 +02:00
|
|
|
true, true, slot1, &newAbbrevVal))
|
2009-12-15 18:57:48 +01:00
|
|
|
{
|
2017-07-26 02:37:17 +02:00
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
|
2018-02-16 06:55:31 +01:00
|
|
|
tmpcontext->ecxt_outertuple = slot1;
|
|
|
|
tmpcontext->ecxt_innertuple = slot2;
|
2009-12-15 18:57:48 +01:00
|
|
|
|
|
|
|
if (numDistinctCols == 0 ||
|
|
|
|
!haveOldValue ||
|
2016-02-17 11:10:00 +01:00
|
|
|
newAbbrevVal != oldAbbrevVal ||
|
2018-02-16 06:55:31 +01:00
|
|
|
!ExecQual(pertrans->equalfnMulti, tmpcontext))
|
2009-12-15 18:57:48 +01:00
|
|
|
{
|
2018-02-16 06:55:31 +01:00
|
|
|
/*
|
|
|
|
* Extract the first numTransInputs columns as datums to pass to
|
|
|
|
* the transfn.
|
|
|
|
*/
|
|
|
|
slot_getsomeattrs(slot1, numTransInputs);
|
|
|
|
|
2009-12-15 18:57:48 +01:00
|
|
|
/* Load values into fcinfo */
|
|
|
|
/* Start from 1, since the 0th arg will be the transition value */
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
for (i = 0; i < numTransInputs; i++)
|
2009-12-15 18:57:48 +01:00
|
|
|
{
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
fcinfo->args[i + 1].value = slot1->tts_values[i];
|
|
|
|
fcinfo->args[i + 1].isnull = slot1->tts_isnull[i];
|
2009-12-15 18:57:48 +01:00
|
|
|
}
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
advance_transition_function(aggstate, pertrans, pergroupstate);
|
2009-12-15 18:57:48 +01:00
|
|
|
|
|
|
|
if (numDistinctCols > 0)
|
|
|
|
{
|
|
|
|
/* swap the slot pointers to retain the current tuple */
|
|
|
|
TupleTableSlot *tmpslot = slot2;
|
|
|
|
|
|
|
|
slot2 = slot1;
|
|
|
|
slot1 = tmpslot;
|
2018-02-16 06:55:31 +01:00
|
|
|
/* avoid ExecQual() calls by reusing abbreviated keys */
|
2016-02-17 11:10:00 +01:00
|
|
|
oldAbbrevVal = newAbbrevVal;
|
2009-12-15 18:57:48 +01:00
|
|
|
haveOldValue = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-02-16 06:55:31 +01:00
|
|
|
/* Reset context each time */
|
|
|
|
ResetExprContext(tmpcontext);
|
2009-12-15 18:57:48 +01:00
|
|
|
|
|
|
|
ExecClearTuple(slot1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (slot2)
|
|
|
|
ExecClearTuple(slot2);
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
tuplesort_end(pertrans->sortstates[aggstate->current_set]);
|
|
|
|
pertrans->sortstates[aggstate->current_set] = NULL;
|
2018-02-16 06:55:31 +01:00
|
|
|
|
|
|
|
/* restore previous slot, potentially in use for grouping sets */
|
|
|
|
tmpcontext->ecxt_outertuple = save;
|
2009-12-15 18:57:48 +01:00
|
|
|
}
|
|
|
|
|
2000-07-12 04:37:39 +02:00
|
|
|
/*
|
|
|
|
* Compute the final value of one aggregate.
|
|
|
|
*
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
* This function handles only one grouping set (already set in
|
|
|
|
* aggstate->current_set).
|
|
|
|
*
|
2019-07-16 06:23:53 +02:00
|
|
|
* The finalfn will be run, and the result delivered, in the
|
2002-11-06 23:31:24 +01:00
|
|
|
* output-tuple context; caller's CurrentMemoryContext does not matter.
|
2015-08-04 16:53:10 +02:00
|
|
|
*
|
|
|
|
* The finalfn uses the state as set in the transno. This state might also
|
|
|
|
* be in use by another aggregate function, so it's important that we do
|
|
|
|
* nothing destructive here.
|
2000-07-12 04:37:39 +02:00
|
|
|
*/
|
|
|
|
static void
|
2002-11-06 23:31:24 +01:00
|
|
|
finalize_aggregate(AggState *aggstate,
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerAgg peragg,
|
2002-11-06 23:31:24 +01:00
|
|
|
AggStatePerGroup pergroupstate,
|
2000-07-12 04:37:39 +02:00
|
|
|
Datum *resultVal, bool *resultIsNull)
|
|
|
|
{
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS);
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
bool anynull = false;
|
2002-11-06 23:31:24 +01:00
|
|
|
MemoryContext oldContext;
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
int i;
|
|
|
|
ListCell *lc;
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);
|
2002-11-06 23:31:24 +01:00
|
|
|
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
/*
|
|
|
|
* Evaluate any direct arguments. We do this even if there's no finalfn
|
|
|
|
* (which is unlikely anyway), so that side-effects happen as expected.
|
Allow polymorphic aggregates to have non-polymorphic state data types.
Before 9.4, such an aggregate couldn't be declared, because its final
function would have to have polymorphic result type but no polymorphic
argument, which CREATE FUNCTION would quite properly reject. The
ordered-set-aggregate patch found a workaround: allow the final function
to be declared as accepting additional dummy arguments that have types
matching the aggregate's regular input arguments. However, we failed
to notice that this problem applies just as much to regular aggregates,
despite the fact that we had a built-in regular aggregate array_agg()
that was known to be undeclarable in SQL because its final function
had an illegal signature. So what we should have done, and what this
patch does, is to decouple the extra-dummy-arguments behavior from
ordered-set aggregates and make it generally available for all aggregate
declarations. We have to put this into 9.4 rather than waiting till
later because it slightly alters the rules for declaring ordered-set
aggregates.
The patch turned out a bit bigger than I'd hoped because it proved
necessary to record the extra-arguments option in a new pg_aggregate
column. I'd thought we could just look at the final function's pronargs
at runtime, but that didn't work well for variadic final functions.
It's probably just as well though, because it simplifies life for pg_dump
to record the option explicitly.
While at it, fix array_agg() to have a valid final-function signature,
and add an opr_sanity test to notice future deviations from polymorphic
consistency. I also marked the percentile_cont() aggregates as not
needing extra arguments, since they don't.
2014-04-24 01:17:31 +02:00
|
|
|
* The direct arguments go into arg positions 1 and up, leaving position 0
|
|
|
|
* for the transition state value.
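*
* (Illustration added for clarity; the names myagg, myagg_sfunc and
* myagg_ffunc are hypothetical: such extra dummy arguments are requested
* with the FINALFUNC_EXTRA option of CREATE AGGREGATE, e.g.
*
*		CREATE AGGREGATE myagg(anyelement) (
*			sfunc = myagg_sfunc, stype = internal,
*			finalfunc = myagg_ffunc, finalfunc_extra
*		);
*
* which gives the final function the signature
* myagg_ffunc(internal, anyelement).)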
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
*/
|
|
|
|
i = 1;
|
2017-10-16 22:02:51 +02:00
|
|
|
foreach(lc, peragg->aggdirectargs)
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
{
|
|
|
|
ExprState *expr = (ExprState *) lfirst(lc);
|
|
|
|
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
fcinfo->args[i].value = ExecEvalExpr(expr,
|
|
|
|
aggstate->ss.ps.ps_ExprContext,
|
|
|
|
&fcinfo->args[i].isnull);
|
|
|
|
anynull |= fcinfo->args[i].isnull;
|
Support ordered-set (WITHIN GROUP) aggregates.
This patch introduces generic support for ordered-set and hypothetical-set
aggregate functions, as well as implementations of the instances defined in
SQL:2008 (percentile_cont(), percentile_disc(), rank(), dense_rank(),
percent_rank(), cume_dist()). We also added mode() though it is not in the
spec, as well as versions of percentile_cont() and percentile_disc() that
can compute multiple percentile values in one pass over the data.
Unlike the original submission, this patch puts full control of the sorting
process in the hands of the aggregate's support functions. To allow the
support functions to find out how they're supposed to sort, a new API
function AggGetAggref() is added to nodeAgg.c. This allows retrieval of
the aggregate call's Aggref node, which may have other uses beyond the
immediate need. There is also support for ordered-set aggregates to
install cleanup callback functions, so that they can be sure that
infrastructure such as tuplesort objects gets cleaned up.
In passing, make some fixes in the recently-added support for variadic
aggregates, and make some editorial adjustments in the recent FILTER
additions for aggregates. Also, simplify use of IsBinaryCoercible() by
allowing it to succeed whenever the target type is ANY or ANYELEMENT.
It was inconsistent that it dealt with other polymorphic target types
but not these.
Atri Sharma and Andrew Gierth; reviewed by Pavel Stehule and Vik Fearing,
and rather heavily editorialized upon by Tom Lane
2013-12-23 22:11:35 +01:00
|
|
|
i++;
|
|
|
|
}
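/*
 * Note, added for clarity: direct arguments arise from ordered-set
 * aggregate syntax, e.g. the 0.5 in
 *
 *		percentile_cont(0.5) WITHIN GROUP (ORDER BY x)
 *
 * and are evaluated once per group, here, rather than once per input row.
 */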
|
|
|
|
|
1999-12-13 02:27:21 +01:00
|
|
|
/*
|
2000-07-17 05:05:41 +02:00
|
|
|
* Apply the agg's finalfn if one is provided, else return transValue.
|
1999-12-13 02:27:21 +01:00
|
|
|
*/
|
2015-08-04 16:53:10 +02:00
|
|
|
if (OidIsValid(peragg->finalfn_oid))
|
1999-12-13 02:27:21 +01:00
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
int numFinalArgs = peragg->numFinalArgs;
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
|
2017-10-12 21:20:04 +02:00
|
|
|
/* set up aggstate->curperagg for AggGetAggref() */
|
|
|
|
aggstate->curperagg = peragg;
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
InitFunctionCallInfoData(*fcinfo, &peragg->finalfn,
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
numFinalArgs,
|
2015-08-04 16:53:10 +02:00
|
|
|
pertrans->aggCollation,
|
2005-03-22 21:13:09 +01:00
|
|
|
(void *) aggstate, NULL);
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
|
|
|
|
/* Fill in the transition state value */
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
fcinfo->args[0].value =
|
|
|
|
MakeExpandedObjectReadOnly(pergroupstate->transValue,
|
|
|
|
pergroupstate->transValueIsNull,
|
|
|
|
pertrans->transtypeLen);
|
|
|
|
fcinfo->args[0].isnull = pergroupstate->transValueIsNull;
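/*
 * (Note added for clarity: MakeExpandedObjectReadOnly above hands the
 * finalfn a read-only view of any expanded-object transition value, so
 * a misbehaving finalfn cannot modify the shared state in place; see
 * the header comment's warning about doing nothing destructive.)
 */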
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
anynull |= pergroupstate->transValueIsNull;
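
        /*
         * Editor's note: the transition value is wrapped in
         * MakeExpandedObjectReadOnly so that the finalfn cannot modify or
         * delete an expanded object the executor still owns; a function
         * handed a read-write expanded-object pointer is allowed to do
         * anything with it, including freeing it.
         */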

        /* Fill any remaining argument positions with nulls */
        for (; i < numFinalArgs; i++)
        {
            fcinfo->args[i].value = (Datum) 0;
            fcinfo->args[i].isnull = true;
            anynull = true;
        }

        if (fcinfo->flinfo->fn_strict && anynull)
        {
            /* don't call a strict function with NULL inputs */
            *resultVal = (Datum) 0;
            *resultIsNull = true;
        }
        else
        {
            *resultVal = FunctionCallInvoke(fcinfo);
            *resultIsNull = fcinfo->isnull;
        }

        /* curperagg was set for AggGetAggref()'s benefit; clear it again */
        aggstate->curperagg = NULL;
    }
    else
    {
        /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
        *resultVal = pergroupstate->transValue;
        *resultIsNull = pergroupstate->transValueIsNull;
    }

    /*
     * If result is pass-by-ref, make sure it is in the right context.
     */
    if (!peragg->resulttypeByVal && !*resultIsNull &&
        !MemoryContextContains(CurrentMemoryContext,
                               DatumGetPointer(*resultVal)))
        *resultVal = datumCopy(*resultVal,
                               peragg->resulttypeByVal,
                               peragg->resulttypeLen);

    MemoryContextSwitchTo(oldContext);
}
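
/*
 * Editor's sketch (hypothetical; not part of nodeAgg.c): the shape of a V1
 * final function of the kind invoked above.  Argument 0 is the read-only
 * transition value; any further positions are direct or dummy arguments,
 * possibly nulled as above.  The state layout and names are invented, and
 * utils/array.h is assumed to be included.
 */
#ifdef EDITOR_EXAMPLE
static Datum
example_avg_final(PG_FUNCTION_ARGS)
{
    /* hypothetical transition state: {sum, count} as an int64[2] array */
    ArrayType  *state = PG_GETARG_ARRAYTYPE_P(0);
    int64      *vals = (int64 *) ARR_DATA_PTR(state);

    /* no input rows: the conventional aggregate answer is NULL */
    if (vals[1] == 0)
        PG_RETURN_NULL();
    PG_RETURN_INT64(vals[0] / vals[1]);
}
#endif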

/*
 * Compute the output value of one partial aggregate.
 *
 * The serialization function will be run, and the result delivered, in the
 * output-tuple context; caller's CurrentMemoryContext does not matter.
 */
static void
finalize_partialaggregate(AggState *aggstate,
                          AggStatePerAgg peragg,
                          AggStatePerGroup pergroupstate,
                          Datum *resultVal, bool *resultIsNull)
{
    AggStatePerTrans pertrans = &aggstate->pertrans[peragg->transno];
    MemoryContext oldContext;

    oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);

    /*
     * serialfn_oid will be set if we must serialize the transvalue before
     * returning it
     */
    if (OidIsValid(pertrans->serialfn_oid))
    {
        /* Don't call a strict serialization function with NULL input. */
        if (pertrans->serialfn.fn_strict && pergroupstate->transValueIsNull)
        {
            *resultVal = (Datum) 0;
            *resultIsNull = true;
        }
        else
        {
            FunctionCallInfo fcinfo = pertrans->serialfn_fcinfo;

            fcinfo->args[0].value =
                MakeExpandedObjectReadOnly(pergroupstate->transValue,
                                           pergroupstate->transValueIsNull,
                                           pertrans->transtypeLen);
            fcinfo->args[0].isnull = pergroupstate->transValueIsNull;

            /*
             * The fcinfo is reused from group to group, so reset isnull
             * before invoking the function: a non-strict serialfn may
             * legitimately return NULL.
             */
            fcinfo->isnull = false;

            *resultVal = FunctionCallInvoke(fcinfo);
            *resultIsNull = fcinfo->isnull;
        }
    }
    else
    {
        /* Don't need MakeExpandedObjectReadOnly; datumCopy will copy it */
        *resultVal = pergroupstate->transValue;
        *resultIsNull = pergroupstate->transValueIsNull;
    }

    /* If result is pass-by-ref, make sure it is in the right context. */
    if (!peragg->resulttypeByVal && !*resultIsNull &&
        !MemoryContextContains(CurrentMemoryContext,
                               DatumGetPointer(*resultVal)))
        *resultVal = datumCopy(*resultVal,
                               peragg->resulttypeByVal,
                               peragg->resulttypeLen);

    MemoryContextSwitchTo(oldContext);
}
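
/*
 * Editor's sketch (hypothetical; not part of nodeAgg.c): a serialization
 * function of the kind invoked above.  A serialfn flattens an "internal"
 * transition state into a bytea so a partial-aggregate result can be
 * shipped to another process.  The state struct and names are invented;
 * the pq_ buffer routines come from libpq/pqformat.h.
 */
#ifdef EDITOR_EXAMPLE
typedef struct ExampleAvgState
{
    int64       sum;
    int64       count;
} ExampleAvgState;

static Datum
example_avg_serialize(PG_FUNCTION_ARGS)
{
    ExampleAvgState *state = (ExampleAvgState *) PG_GETARG_POINTER(0);
    StringInfoData buf;

    pq_begintypsend(&buf);
    pq_sendint64(&buf, state->sum);
    pq_sendint64(&buf, state->count);
    PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
#endif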

/*
 * Extract the attributes that make up the grouping key into the
 * hashslot. This is necessary to compute the hash or perform a lookup.
 */
static inline void
prepare_hash_slot(AggStatePerHash perhash,
                  TupleTableSlot *inputslot,
                  TupleTableSlot *hashslot)
{
    int         i;

    /* transfer just the needed columns into hashslot */
    slot_getsomeattrs(inputslot, perhash->largestGrpColIdx);
    ExecClearTuple(hashslot);

    for (i = 0; i < perhash->numhashGrpCols; i++)
    {
        int         varNumber = perhash->hashGrpColIdxInput[i] - 1;

        hashslot->tts_values[i] = inputslot->tts_values[varNumber];
        hashslot->tts_isnull[i] = inputslot->tts_isnull[varNumber];
    }
    ExecStoreVirtualTuple(hashslot);
}
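
/*
 * Editor's note (worked example, columns invented): for GROUP BY b, d over
 * an input slot (a, b, c, d), hashGrpColIdxInput holds {2, 4} and
 * largestGrpColIdx is 4; the loop copies input columns 2 and 4 into
 * hashslot positions 0 and 1, the "- 1" converting 1-based attribute
 * numbers into 0-based slot-array indexes.
 */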

/*
 * Prepare to finalize and project based on the specified representative tuple
 * slot and grouping set.
 *
 * In the specified tuple slot, force to null all attributes that should be
 * read as null in the context of the current grouping set.  Also stash the
 * current group bitmap where GroupingExpr can get at it.
 *
 * This relies on three conditions:
 *
 * 1) Nothing is ever going to try and extract the whole tuple from this slot,
 * only reference it in evaluations, which will only access individual
 * attributes.
 *
 * 2) No system columns are going to need to be nulled. (If a system column is
 * referenced in a group clause, it is actually projected in the outer plan
 * tlist.)
 *
 * 3) Within a given phase, we never need to recover the value of an attribute
 * once it has been set to null.
 *
 * Poking into the slot this way is a bit ugly, but the consensus is that the
 * alternative was worse.
 */
static void
prepare_projection_slot(AggState *aggstate, TupleTableSlot *slot, int currentSet)
{
    if (aggstate->phase->grouped_cols)
    {
        Bitmapset  *grouped_cols = aggstate->phase->grouped_cols[currentSet];

        aggstate->grouped_cols = grouped_cols;

        if (TTS_EMPTY(slot))
        {
            /*
             * Force all values to be NULL if working on an empty input tuple
             * (i.e. an empty grouping set for which no input rows were
             * supplied).
             */
            ExecStoreAllNullTuple(slot);
        }
        else if (aggstate->all_grouped_cols)
        {
            ListCell   *lc;

            /* all_grouped_cols is arranged in desc order */
            slot_getsomeattrs(slot, linitial_int(aggstate->all_grouped_cols));

            foreach(lc, aggstate->all_grouped_cols)
            {
                int         attnum = lfirst_int(lc);

                if (!bms_is_member(attnum, grouped_cols))
                    slot->tts_isnull[attnum - 1] = true;
            }
        }
    }
}
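
/*
 * Editor's note (illustration): for GROUP BY GROUPING SETS ((a, b), (a), ())
 * each set carries its own grouped_cols bitmap.  While rows for the (a) set
 * are emitted, column b is absent from the bitmap, so the loop above forces
 * its slot entry to null; for the empty set () an all-null tuple is stored
 * outright.  This yields the NULL-extended rows that grouping sets are
 * defined to produce.
 */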

/*
 * Compute the final value of all aggregates for one group.
 *
 * This function handles only one grouping set at a time, which the caller
 * must have selected.  It's also the caller's responsibility to adjust the
 * supplied pergroup parameter to point to the current set's transvalues.
 *
 * Results are stored in the output econtext aggvalues/aggnulls.
 */
static void
finalize_aggregates(AggState *aggstate,
                    AggStatePerAgg peraggs,
                    AggStatePerGroup pergroup)
{
    ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;
    Datum      *aggvalues = econtext->ecxt_aggvalues;
    bool       *aggnulls = econtext->ecxt_aggnulls;
    int         aggno;
    int         transno;

    /*
     * If there were any DISTINCT and/or ORDER BY aggregates, sort their
     * inputs and run the transition functions.
     */
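    /*
     * Editor's note (illustration): an aggregate such as
     * array_agg(x ORDER BY y) or count(DISTINCT x) cannot run its transfn
     * as rows arrive; its inputs were spooled into a sort, and that sort is
     * drained through the transition function here, before any finalfn is
     * applied.
     */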
|
|
|
|
for (transno = 0; transno < aggstate->numtrans; transno++)
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerTrans pertrans = &aggstate->pertrans[transno];
|
        AggStatePerGroup pergroupstate;

        pergroupstate = &pergroup[transno];
        if (pertrans->numSortCols > 0)
        {
            Assert(aggstate->aggstrategy != AGG_HASHED &&
                   aggstate->aggstrategy != AGG_MIXED);
            if (pertrans->numInputs == 1)
                process_ordered_aggregate_single(aggstate,
                                                 pertrans,
                                                 pergroupstate);
            else
                process_ordered_aggregate_multi(aggstate,
                                                pertrans,
                                                pergroupstate);
        }
    }
    /*
     * Run the final functions.
     */
    for (aggno = 0; aggno < aggstate->numaggs; aggno++)
    {
        AggStatePerAgg peragg = &peraggs[aggno];
        int         transno = peragg->transno;
        AggStatePerGroup pergroupstate;

        pergroupstate = &pergroup[transno];
        if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
            finalize_partialaggregate(aggstate, peragg, pergroupstate,
                                      &aggvalues[aggno], &aggnulls[aggno]);
        else
            finalize_aggregate(aggstate, peragg, pergroupstate,
                               &aggvalues[aggno], &aggnulls[aggno]);
    }
}
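The two loops above first sort and feed any DISTINCT/ORDER BY inputs through the
transition function, then run each aggregate's final function, or skip it under
partial aggregation. As a minimal standalone sketch of that flow, not PostgreSQL
code, here is avg(DISTINCT x) reduced to its essentials; every name below
(AvgState, avg_transfn, avg_finalfn) is invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

typedef struct AvgState
{
    long        sum;
    long        count;
} AvgState;

static int
cmp_int(const void *a, const void *b)
{
    int         va = *(const int *) a;
    int         vb = *(const int *) b;

    return (va > vb) - (va < vb);
}

/* transition function: absorb one input value into the state */
static void
avg_transfn(AvgState *state, int input)
{
    state->sum += input;
    state->count++;
}

/* final function: turn the transition state into the result */
static double
avg_finalfn(const AvgState *state)
{
    return state->count > 0 ? (double) state->sum / state->count : 0.0;
}

int
main(void)
{
    int         inputs[] = {30, 10, 30, 20};
    AvgState    state = {0, 0};     /* initcond */
    bool        skip_final = false; /* analogous to DO_AGGSPLIT_SKIPFINAL */
    int         i;

    /* sort the inputs, as for avg(DISTINCT x), then skip duplicates */
    qsort(inputs, 4, sizeof(int), cmp_int);
    for (i = 0; i < 4; i++)
    {
        if (i > 0 && inputs[i] == inputs[i - 1])
            continue;
        avg_transfn(&state, inputs[i]);
    }

    if (skip_final)             /* partial aggregation: hand up raw state */
        printf("partial: sum=%ld count=%ld\n", state.sum, state.count);
    else
        printf("avg(DISTINCT x) = %g\n", avg_finalfn(&state));  /* 20 */
    return 0;
}

Sorting first makes duplicate elimination a comparison against the previous
value, which is why a single pass over the sorted input suffices for DISTINCT.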
/*
 * Project the result of a group (whose aggs have already been calculated by
 * finalize_aggregates).  Returns the result slot, or NULL if no row is
 * projected (suppressed by qual).
 */
static TupleTableSlot *
project_aggregates(AggState *aggstate)
{
    ExprContext *econtext = aggstate->ss.ps.ps_ExprContext;

    /*
     * Check the qual (HAVING clause); if the group does not match, ignore it.
     */
Faster expression evaluation and targetlist projection.
This replaces the old, recursive tree-walk based evaluation with
non-recursive, opcode-dispatch based expression evaluation.
Projection is now implemented as part of expression evaluation.
This both leads to significant performance improvements, and makes
future just-in-time compilation of expressions easier.
The speed gains primarily come from:
- non-recursive implementation reduces stack usage / overhead
- simple sub-expressions are implemented with a single jump, without
  function calls
- sharing some state between different sub-expressions
- reduced amount of indirect/hard to predict memory accesses by laying
  out operation metadata sequentially; including the avoidance of
  nearly all of the previously used linked lists
- more code has been moved to expression initialization, avoiding
  constant re-checks at evaluation time
Future just-in-time compilation (JIT) has become easier, as
demonstrated by released patches intended to be merged in a later
release, for primarily two reasons: Firstly, due to a stricter split
between expression initialization and evaluation, less code has to be
handled by the JIT. Secondly, due to the non-recursive nature of the
generated "instructions", less performance-critical code-paths can
easily be shared between interpreted and compiled evaluation.
The new framework allows for significant future optimizations. E.g.:
- basic infrastructure to later reduce the per-executor-startup
  overhead of expression evaluation, by caching state in prepared
  statements. That'd be helpful in OLTPish scenarios where
  initialization overhead is measurable.
- optimizing the generated "code". A number of proposals for potential
  work have already been made.
- optimizing the interpreter. Similarly, a number of proposals have
  been made here too.
The move of logic into the expression initialization step leads to some
backward-incompatible changes:
- Function permission checks are now done during expression
  initialization, whereas previously they were done during
  execution. In edge cases this can lead to errors being raised that
  previously wouldn't have been, e.g. a NULL array being coerced to a
  different array type previously didn't perform checks.
- The set of domain constraints to be checked is now evaluated once
  during expression initialization; previously it was re-built
  every time a domain check was evaluated. For normal queries this
  doesn't change much, but e.g. for plpgsql functions, which cache
  ExprStates, the old set could stick around longer. This behavior
  might still change.
Author: Andres Freund, with significant changes by Tom Lane,
changes by Heikki Linnakangas
Reviewed-By: Tom Lane, Heikki Linnakangas
Discussion: https://postgr.es/m/20161206034955.bh33paeralxbtluv@alap3.anarazel.de
2017-03-14 23:45:36 +01:00
    if (ExecQual(aggstate->ss.ps.qual, econtext))
    {
        /*
         * Form and return projection tuple using the aggregate results and
         * the representative input tuple.
         */
        return ExecProject(aggstate->ss.ps.ps_ProjInfo);
    }
    else
        InstrCountFiltered1(aggstate, 1);

    return NULL;
}
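The ExecQual() and ExecProject() calls above both run through the flat,
opcode-dispatched expression machinery described in the "Faster expression
evaluation" commit message. A minimal standalone sketch of that evaluation
style for a made-up qual (a + b) > 10 follows; the step set and layout here
are invented, and the real ExprState program is far richer:

#include <stdio.h>

typedef enum StepOp
{
    OP_FETCH_A,                 /* push column a */
    OP_FETCH_B,                 /* push column b */
    OP_ADD,                     /* pop two values, push their sum */
    OP_GT_CONST,                /* pop one value, push (value > constant) */
    OP_DONE                     /* return the top of the stack */
} StepOp;

typedef struct Step
{
    StepOp      op;
    long        constval;       /* used by OP_GT_CONST */
} Step;

/* interpret a linear array of steps, with no recursion */
static long
exec_steps(const Step *steps, long a, long b)
{
    long        stack[8];
    int         sp = 0;
    int         i;

    for (i = 0;; i++)
    {
        switch (steps[i].op)
        {
            case OP_FETCH_A:
                stack[sp++] = a;
                break;
            case OP_FETCH_B:
                stack[sp++] = b;
                break;
            case OP_ADD:
                sp--;
                stack[sp - 1] += stack[sp];
                break;
            case OP_GT_CONST:
                stack[sp - 1] = stack[sp - 1] > steps[i].constval;
                break;
            case OP_DONE:
                return stack[sp - 1];
        }
    }
}

int
main(void)
{
    /* "compiled" once at initialization: (a + b) > 10 */
    Step        qual[] = {
        {OP_FETCH_A, 0}, {OP_FETCH_B, 0}, {OP_ADD, 0},
        {OP_GT_CONST, 10}, {OP_DONE, 0}
    };

    printf("%ld\n", exec_steps(qual, 7, 5));    /* 12 > 10 -> 1 */
    printf("%ld\n", exec_steps(qual, 2, 3));    /* 5 > 10 -> 0 */
    return 0;
}

The expression is "compiled" into the step array once; evaluation then walks
the array sequentially, which is the source of the cache and stack savings
the commit message describes.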
/*
Fix bug in HashAgg's selective-column-spilling logic.
Commit 230230223 taught nodeAgg.c that, when spilling tuples from
memory in an oversized hash aggregation, it only needed to spill
input columns referenced in the node's tlist and quals. Unfortunately,
that's wrong: we also have to save the grouping columns. The error
is masked in common cases because the grouping columns also appear
in the tlist, but that's not necessarily true. The main category
of plans where it's not true seems to come from semijoins ("WHERE
outercol IN (SELECT innercol FROM innertable)") where the innercol
needs an implicit promotion to make it comparable to the outercol.
The grouping column will be "innercol::promotedtype", but that
expression appears nowhere in the Agg node's own tlist and quals;
only the bare "innercol" is found in the tlist.
I spent quite a bit of time looking for a suitable regression test
case for this, without much success. If the number of distinct
values of the innercol is large enough to make spilling happen,
the planner tends to prefer a non-HashAgg plan, at least for
problem sizes that are reasonable to use in the regression tests.
So, no new regression test. However, this patch does demonstrably
fix the originally-reported test case.
Per report from s.p.e (at) gmx-topmail.de. Backpatch to v13
where the troublesome code came in.
Discussion: https://postgr.es/m/trinity-1c565d44-159f-488b-a518-caf13883134f-1611835701633@3c-app-gmx-bap78
2021-02-05 05:01:33 +01:00
 * Find input-tuple columns that are needed, dividing them into
 * aggregated and unaggregated sets.
 */
static void
find_cols(AggState *aggstate, Bitmapset **aggregated, Bitmapset **unaggregated)
{
    Agg        *agg = (Agg *) aggstate->ss.ps.plan;
    FindColsContext context;

    context.is_aggref = false;
    context.aggregated = NULL;
    context.unaggregated = NULL;

    /* Examine tlist and quals */
    (void) find_cols_walker((Node *) agg->plan.targetlist, &context);
    (void) find_cols_walker((Node *) agg->plan.qual, &context);

    /* In some cases, grouping columns will not appear in the tlist */
    for (int i = 0; i < agg->numCols; i++)
        context.unaggregated = bms_add_member(context.unaggregated,
                                              agg->grpColIdx[i]);

    *aggregated = context.aggregated;
    *unaggregated = context.unaggregated;
}
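As the spill-fix commit message above explains, the point of these two sets is
that a spilled tuple only needs to carry columns that something will read
later, and grouping columns must be force-added because they may be missing
from the tlist. A standalone sketch of that consumption, with a plain unsigned
bitmask standing in for Bitmapset (column numbers and values invented):

#include <stdio.h>

#define NCOLS 4

int
main(void)
{
    /* as find_cols() might report: col 2 inside an Aggref, cols 0 and 3 not */
    unsigned    aggregated = 1u << 2;
    unsigned    unaggregated = (1u << 0) | (1u << 3);
    unsigned    needed = aggregated | unaggregated;
    int         tuple[NCOLS] = {7, 8, 9, 10};
    int         col;

    /* write out only the needed columns, as selective spilling would */
    for (col = 0; col < NCOLS; col++)
    {
        if (needed & (1u << col))
            printf("spill col %d = %d\n", col, tuple[col]);
        else
            printf("spill col %d skipped\n", col);
    }
    return 0;
}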
static bool
find_cols_walker(Node *node, FindColsContext *context)
{
    if (node == NULL)
        return false;
    if (IsA(node, Var))
    {
        Var        *var = (Var *) node;

        /* setrefs.c should have set the varno to OUTER_VAR */
        Assert(var->varno == OUTER_VAR);
        Assert(var->varlevelsup == 0);
        if (context->is_aggref)
            context->aggregated = bms_add_member(context->aggregated,
                                                 var->varattno);
        else
            context->unaggregated = bms_add_member(context->unaggregated,
                                                   var->varattno);
        return false;
    }
    if (IsA(node, Aggref))
    {
        Assert(!context->is_aggref);
        context->is_aggref = true;
        expression_tree_walker(node, find_cols_walker, (void *) context);
        context->is_aggref = false;
        return false;
    }
    return expression_tree_walker(node, find_cols_walker,
                                  (void *) context);
}
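The hashed strategy set up below complements the sort-based one: per the
"Support GROUPING SETS, CUBE and ROLLUP" commit message, grouping sets that
share a sort order are computed in a single pass over sorted input. A
standalone sketch of that one-pass idea for ROLLUP(a, b) over input sorted on
(a, b), simplified to plain counts with invented names; "-" marks a NULLed-out
grouping column:

#include <stdio.h>

typedef struct Row
{
    int         a;
    int         b;
} Row;

int
main(void)
{
    /* input already sorted on (a, b) */
    Row         rows[] = {{1, 1}, {1, 1}, {1, 2}, {2, 3}};
    int         nrows = 4;
    long        count_ab = 0;   /* grouping set (a, b) */
    long        count_a = 0;    /* grouping set (a) */
    long        count_total = 0;    /* grouping set () */
    int         i;

    for (i = 0; i < nrows; i++)
    {
        /* emit and reset finished groups before absorbing the new row */
        if (i > 0 && (rows[i].a != rows[i - 1].a || rows[i].b != rows[i - 1].b))
        {
            printf("a=%d b=%d count=%ld\n", rows[i - 1].a, rows[i - 1].b, count_ab);
            count_ab = 0;
        }
        if (i > 0 && rows[i].a != rows[i - 1].a)
        {
            printf("a=%d b=-  count=%ld\n", rows[i - 1].a, count_a);
            count_a = 0;
        }
        count_ab++;
        count_a++;
        count_total++;
    }
    /* flush the last group of each grouping set */
    printf("a=%d b=%d count=%ld\n", rows[nrows - 1].a, rows[nrows - 1].b, count_ab);
    printf("a=%d b=-  count=%ld\n", rows[nrows - 1].a, count_a);
    printf("a=-  b=-  count=%ld\n", count_total);
    return 0;
}

Because every set's key is a prefix of the sort order, all three running
counts can be maintained at once; sets that need a different sort order go to
an additional sort-and-aggregate pass, or to the hash tables built below.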
/*
 * (Re-)initialize the hash table(s) to empty.
 *
 * To implement hashed aggregation, we need a hashtable that stores a
 * representative tuple and an array of AggStatePerGroup structs for each
 * distinct set of GROUP BY column values.  We compute the hash key from the
 * GROUP BY columns.  The per-group data is allocated in lookup_hash_entry(),
 * for each entry.
 *
 * We have a separate hashtable and associated perhash data structure for each
 * grouping set for which we're doing hashing.
 *
 * The contents of the hash tables always live in the hashcontext's per-tuple
 * memory context (there is only one of these for all tables together, since
 * they are all reset at the same time).
 */
static void
build_hash_tables(AggState *aggstate)
{
    int         setno;

    for (setno = 0; setno < aggstate->num_hashes; ++setno)
    {
        AggStatePerHash perhash = &aggstate->perhash[setno];
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
        long        nbuckets;
        Size        memory;

        if (perhash->hashtable != NULL)
        {
            ResetTupleHashTable(perhash->hashtable);
            continue;
        }

        Assert(perhash->aggnode->numGroups > 0);

        memory = aggstate->hash_mem_limit / aggstate->num_hashes;

        /* choose reasonable number of buckets per hashtable */
        nbuckets = hash_choose_num_buckets(aggstate->hashentrysize,
                                           perhash->aggnode->numGroups,
                                           memory);

        build_hash_table(aggstate, setno, nbuckets);
    }
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
|
|
|
|
aggstate->hash_ngroups_current = 0;
|
2008-10-16 21:25:55 +02:00
|
|
|
}
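
/*
 * Illustrative sketch (assumed numbers): with hash_mem_limit = 4MB and
 * num_hashes = 2, each grouping set's table is sized against a 2MB budget,
 * and hash_choose_num_buckets() caps the bucket count so the initial
 * allocation fits within that share.
 */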

/*
 * Build a single hashtable for this grouping set.
 */
static void
build_hash_table(AggState *aggstate, int setno, long nbuckets)
{
	AggStatePerHash perhash = &aggstate->perhash[setno];
	MemoryContext metacxt = aggstate->hash_metacxt;
	MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
	MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
	Size		additionalsize;

	Assert(aggstate->aggstrategy == AGG_HASHED ||
		   aggstate->aggstrategy == AGG_MIXED);

	/*
	 * Used to make sure initial hash table allocation does not exceed
	 * hash_mem.  Note that the estimate does not include space for
	 * pass-by-reference transition data values, nor for the representative
	 * tuple of each group.
	 */
	additionalsize = aggstate->numtrans * sizeof(AggStatePerGroupData);

	perhash->hashtable = BuildTupleHashTableExt(&aggstate->ss.ps,
												perhash->hashslot->tts_tupleDescriptor,
												perhash->numCols,
												perhash->hashGrpColIdxHash,
												perhash->eqfuncoids,
												perhash->hashfunctions,
												perhash->aggnode->grpCollations,
												nbuckets,
												additionalsize,
												metacxt,
												hashcxt,
												tmpcxt,
												DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
}
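
/*
 * Example (hypothetical figures): for a query with three aggregates,
 * numtrans == 3 and additionalsize == 3 * sizeof(AggStatePerGroupData), so
 * each hash entry reserves room for three per-group transition-state structs
 * alongside the representative tuple; the actual byte count depends on the
 * platform's struct layout.
 */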

/*
 * Compute columns that actually need to be stored in hashtable entries.  The
 * incoming tuples from the child plan node will contain grouping columns,
 * other columns referenced in our targetlist and qual, columns used to
 * compute the aggregate functions, and perhaps just junk columns we don't use
 * at all.  Only columns of the first two types need to be stored in the
 * hashtable, and getting rid of the others can make the table entries
 * significantly smaller.  The hashtable only contains the relevant columns,
 * and is packed/unpacked in lookup_hash_entry() / agg_retrieve_hash_table()
 * into the format of the normal input descriptor.
 *
 * Additional columns, beyond the columns grouped by, come from two sources:
 * firstly, functionally dependent columns that we don't need to group by
 * themselves, and secondly, ctids for row-marks.
 *
 * To eliminate duplicates, we build a bitmapset of the needed columns, and
 * then build an array of the columns included in the hashtable.  We might
 * still have duplicates if the passed-in grpColIdx has them, which can happen
 * in edge cases from semijoins/distinct; these can't always be removed,
 * because it's not certain that the duplicate cols will be using the same
 * hash function.
 *
 * Note that the array is preserved over ExecReScanAgg, so we allocate it in
 * the per-query context (unlike the hash table itself).
 */
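
/*
 * Worked example (assumed schema, for illustration only): if the input
 * descriptor is (a, b, c, junk) and we group by b, the packed hash
 * descriptor stores b first (hashGrpColIdxHash[0] = 1), followed by any
 * other needed columns, while hashGrpColIdxInput records where each stored
 * column lives in the input tuple (here hashGrpColIdxInput[0] = 2 for b).
 * The junk column is simply not stored.
 */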

static void
find_hash_columns(AggState *aggstate)
{
	Bitmapset  *base_colnos;
	Bitmapset  *aggregated_colnos;
	TupleDesc	scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
	List	   *outerTlist = outerPlanState(aggstate)->plan->targetlist;
	int			numHashes = aggstate->num_hashes;
	EState	   *estate = aggstate->ss.ps.state;
	int			j;

	/* Find Vars that will be needed in tlist and qual */
	find_cols(aggstate, &aggregated_colnos, &base_colnos);
	aggstate->colnos_needed = bms_union(base_colnos, aggregated_colnos);
	aggstate->max_colno_needed = 0;
	aggstate->all_cols_needed = true;

	for (int i = 0; i < scanDesc->natts; i++)
	{
		int			colno = i + 1;

		if (bms_is_member(colno, aggstate->colnos_needed))
			aggstate->max_colno_needed = colno;
		else
			aggstate->all_cols_needed = false;
	}

	for (j = 0; j < numHashes; ++j)
	{
		AggStatePerHash perhash = &aggstate->perhash[j];
		Bitmapset  *colnos = bms_copy(base_colnos);
		AttrNumber *grpColIdx = perhash->aggnode->grpColIdx;
		List	   *hashTlist = NIL;
		TupleDesc	hashDesc;
		int			maxCols;
		int			i;

		perhash->largestGrpColIdx = 0;

		/*
		 * If we're doing grouping sets, then some Vars might be referenced in
		 * tlist/qual for the benefit of other grouping sets, but not needed
		 * when hashing; i.e. prepare_projection_slot will null them out, so
		 * there'd be no point storing them.  Use prepare_projection_slot's
		 * logic to determine which.
		 */
		if (aggstate->phases[0].grouped_cols)
		{
			Bitmapset  *grouped_cols = aggstate->phases[0].grouped_cols[j];
			ListCell   *lc;

			foreach(lc, aggstate->all_grouped_cols)
			{
				int			attnum = lfirst_int(lc);

				if (!bms_is_member(attnum, grouped_cols))
					colnos = bms_del_member(colnos, attnum);
			}
		}

		/*
		 * Compute maximum number of input columns accounting for possible
		 * duplications in the grpColIdx array, which can happen in some edge
		 * cases where HashAggregate was generated as part of a semijoin or a
		 * DISTINCT.
		 */
		maxCols = bms_num_members(colnos) + perhash->numCols;

		perhash->hashGrpColIdxInput =
			palloc(maxCols * sizeof(AttrNumber));
		perhash->hashGrpColIdxHash =
			palloc(perhash->numCols * sizeof(AttrNumber));

		/* Add all the grouping columns to colnos */
		for (i = 0; i < perhash->numCols; i++)
			colnos = bms_add_member(colnos, grpColIdx[i]);

		/*
		 * First build the mapping for columns directly hashed.  These come
		 * first, because they'll be accessed when computing hash values and
		 * comparing tuples for exact matches.  We also build a simple mapping
		 * for execGrouping, so it knows where to find the to-be-hashed /
		 * compared columns in the input.
		 */
		for (i = 0; i < perhash->numCols; i++)
		{
			perhash->hashGrpColIdxInput[i] = grpColIdx[i];
			perhash->hashGrpColIdxHash[i] = i + 1;
			perhash->numhashGrpCols++;
			/* delete already mapped columns */
			bms_del_member(colnos, grpColIdx[i]);
		}

		/* and add the remaining columns */
		while ((i = bms_first_member(colnos)) >= 0)
		{
			perhash->hashGrpColIdxInput[perhash->numhashGrpCols] = i;
			perhash->numhashGrpCols++;
		}

		/* and build a tuple descriptor for the hashtable */
		for (i = 0; i < perhash->numhashGrpCols; i++)
		{
			int			varNumber = perhash->hashGrpColIdxInput[i] - 1;

			hashTlist = lappend(hashTlist, list_nth(outerTlist, varNumber));
			perhash->largestGrpColIdx =
				Max(varNumber + 1, perhash->largestGrpColIdx);
		}

		hashDesc = ExecTypeFromTL(hashTlist);

		execTuplesHashPrepare(perhash->numCols,
							  perhash->aggnode->grpOperators,
							  &perhash->eqfuncoids,
							  &perhash->hashfunctions);
		perhash->hashslot =
			ExecAllocTableSlot(&estate->es_tupleTable, hashDesc,
							   &TTSOpsMinimalTuple);

		list_free(hashTlist);
		bms_free(colnos);
	}

	bms_free(base_colnos);
}

/*
 * Estimate per-hash-table-entry overhead.
 */
Size
hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
{
	Size		tupleChunkSize;
	Size		pergroupChunkSize;
	Size		transitionChunkSize;
	Size		tupleSize = (MAXALIGN(SizeofMinimalTupleHeader) +
							 tupleWidth);
	Size		pergroupSize = numTrans * sizeof(AggStatePerGroupData);

	tupleChunkSize = CHUNKHDRSZ + tupleSize;

	if (pergroupSize > 0)
		pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
	else
		pergroupChunkSize = 0;

	if (transitionSpace > 0)
		transitionChunkSize = CHUNKHDRSZ + transitionSpace;
	else
		transitionChunkSize = 0;

	return
		sizeof(TupleHashEntryData) +
		tupleChunkSize +
		pergroupChunkSize +
		transitionChunkSize;
}
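
/*
 * Worked example (illustrative only; the constants are platform-dependent):
 * with numTrans = 1, tupleWidth = 16 and transitionSpace = 0, the estimate is
 *
 *		sizeof(TupleHashEntryData)
 *	  + CHUNKHDRSZ + MAXALIGN(SizeofMinimalTupleHeader) + 16
 *	  + CHUNKHDRSZ + sizeof(AggStatePerGroupData)
 *
 * i.e. one chunk for the representative tuple and one for the per-group
 * transition-state array, each paying the memory-chunk header once.
 */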

/*
 * hashagg_recompile_expressions()
 *
 * Identifies the right phase, compiles the right expression given the
 * arguments, and then sets phase->evalfunc to that expression.
 *
 * Different versions of the compiled expression are needed depending on
 * whether hash aggregation has spilled or not, and whether it's reading from
 * the outer plan or a tape.  Before spilling to disk, the expression reads
 * from the outer plan and does not need to perform a NULL check.  After
 * HashAgg begins to spill, new groups will not be created in the hash table,
 * and the AggStatePerGroup array may be NULL; therefore we need to add a null
 * pointer check to the expression.  Then, when reading spilled data from a
 * tape, we change the outer slot type to be a fixed minimal tuple slot.
 *
 * It would be wasteful to recompile every time, so cache the compiled
 * expressions in the AggStatePerPhase, and reuse when appropriate.
 */
static void
hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
{
	AggStatePerPhase phase;
	int			i = minslot ? 1 : 0;
	int			j = nullcheck ? 1 : 0;

	Assert(aggstate->aggstrategy == AGG_HASHED ||
		   aggstate->aggstrategy == AGG_MIXED);

	if (aggstate->aggstrategy == AGG_HASHED)
		phase = &aggstate->phases[0];
	else						/* AGG_MIXED */
		phase = &aggstate->phases[1];

	if (phase->evaltrans_cache[i][j] == NULL)
	{
		const TupleTableSlotOps *outerops = aggstate->ss.ps.outerops;
		bool		outerfixed = aggstate->ss.ps.outeropsfixed;
		bool		dohash = true;
		bool		dosort = false;

		/*
		 * If minslot is true, that means we are processing a spilled batch
		 * (inside agg_refill_hash_table()), and we must not advance the
		 * sorted grouping sets.
		 */
		if (aggstate->aggstrategy == AGG_MIXED && !minslot)
			dosort = true;

		/* temporarily change the outerops while compiling the expression */
		if (minslot)
		{
			aggstate->ss.ps.outerops = &TTSOpsMinimalTuple;
			aggstate->ss.ps.outeropsfixed = true;
		}

		phase->evaltrans_cache[i][j] = ExecBuildAggTrans(aggstate, phase,
														 dosort, dohash,
														 nullcheck);

		/* change back */
		aggstate->ss.ps.outerops = outerops;
		aggstate->ss.ps.outeropsfixed = outerfixed;
	}

	phase->evaltrans = phase->evaltrans_cache[i][j];
}
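
/*
 * Summary of the cache indexing above (restated here for illustration):
 *
 *		evaltrans_cache[0][0]	reading the outer plan, no NULL check
 *		evaltrans_cache[0][1]	reading the outer plan, with NULL check
 *		evaltrans_cache[1][0]	reading a spill tape, no NULL check
 *		evaltrans_cache[1][1]	reading a spill tape, with NULL check
 */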

/*
 * Set limits that trigger spilling to avoid exceeding hash_mem.  Consider the
 * number of partitions we expect to create (if we do spill).
 *
 * There are two limits: a memory limit, and also an ngroups limit.  The
 * ngroups limit becomes important when we expect transition values to grow
 * substantially larger than the initial value.
 */
void
hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
					Size *mem_limit, uint64 *ngroups_limit,
					int *num_partitions)
{
	int			npartitions;
	Size		partition_mem;
	Size		hash_mem_limit = get_hash_memory_limit();

	/* if not expected to spill, use all of hash_mem */
	if (input_groups * hashentrysize <= hash_mem_limit)
	{
		if (num_partitions != NULL)
			*num_partitions = 0;
		*mem_limit = hash_mem_limit;
		*ngroups_limit = hash_mem_limit / hashentrysize;
		return;
	}

	/*
	 * Calculate expected memory requirements for spilling, which is the size
	 * of the buffers needed for all the tapes that need to be open at once.
	 * Then, subtract that from the memory available for holding hash tables.
	 */
	npartitions = hash_choose_num_partitions(input_groups,
											 hashentrysize,
											 used_bits,
											 NULL);
	if (num_partitions != NULL)
		*num_partitions = npartitions;

	partition_mem =
		HASHAGG_READ_BUFFER_SIZE +
		HASHAGG_WRITE_BUFFER_SIZE * npartitions;

	/*
	 * Don't set the limit below 3/4 of hash_mem.  In that case, we are at the
	 * minimum number of partitions, so we aren't going to dramatically exceed
	 * work mem anyway.
	 */
	if (hash_mem_limit > 4 * partition_mem)
		*mem_limit = hash_mem_limit - partition_mem;
	else
		*mem_limit = hash_mem_limit * 0.75;

	if (*mem_limit > hashentrysize)
		*ngroups_limit = *mem_limit / hashentrysize;
	else
		*ngroups_limit = 1;
}
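
/*
 * Illustrative sketch (assumed numbers): with hash_mem_limit = 4MB,
 * hashentrysize = 200 bytes and input_groups = 1 million, the projected
 * 200MB of groups exceeds hash_mem, so we take the spill path.  If
 * hash_choose_num_partitions() picks, say, 32 partitions, then
 * partition_mem = HASHAGG_READ_BUFFER_SIZE + 32 * HASHAGG_WRITE_BUFFER_SIZE,
 * and mem_limit becomes hash_mem_limit - partition_mem (or 3/4 of hash_mem
 * when the buffers would eat more than a quarter of it).
 */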

/*
 * hash_agg_check_limits
 *
 * After adding a new group to the hash table, check whether we need to enter
 * spill mode.  Allocations may happen without adding new groups (for
 * instance, if the transition state size grows), so this check is imperfect.
 */
static void
hash_agg_check_limits(AggState *aggstate)
{
	uint64		ngroups = aggstate->hash_ngroups_current;
	Size		meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
													 true);
	Size		hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
														true);

	/*
	 * Don't spill unless there's at least one group in the hash table so we
	 * can be sure to make progress even in edge cases.
	 */
	if (aggstate->hash_ngroups_current > 0 &&
		(meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
		 ngroups > aggstate->hash_ngroups_limit))
	{
		hash_agg_enter_spill_mode(aggstate);
	}
}
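
/*
 * Example (hypothetical numbers): with hash_ngroups_limit = 20000, adding
 * the 20001st group triggers spill mode even if actual memory use is still
 * below hash_mem_limit; the ngroups limit guards against transition values
 * that are expected to grow well past their initial size.
 */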

/*
 * Enter "spill mode", meaning that no new groups are added to any of the hash
 * tables.  Tuples that would create a new group are instead spilled, and
 * processed later.
 */
static void
hash_agg_enter_spill_mode(AggState *aggstate)
{
	aggstate->hash_spill_mode = true;
	hashagg_recompile_expressions(aggstate, aggstate->table_filled, true);

	if (!aggstate->hash_ever_spilled)
	{
		Assert(aggstate->hash_tapeset == NULL);
		Assert(aggstate->hash_spills == NULL);

		aggstate->hash_ever_spilled = true;

		aggstate->hash_tapeset = LogicalTapeSetCreate(true, NULL, -1);

		aggstate->hash_spills = palloc(sizeof(HashAggSpill) * aggstate->num_hashes);

		for (int setno = 0; setno < aggstate->num_hashes; setno++)
		{
			AggStatePerHash perhash = &aggstate->perhash[setno];
			HashAggSpill *spill = &aggstate->hash_spills[setno];

			hashagg_spill_init(spill, aggstate->hash_tapeset, 0,
							   perhash->aggnode->numGroups,
							   aggstate->hashentrysize);
		}
	}
}
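
/*
 * Note on the recompile call above (an interpretation, not stated here in
 * the sources): nullcheck is passed as true because, once in spill mode,
 * tuples whose group was never created carry a NULL AggStatePerGroup
 * pointer; minslot follows table_filled, matching whether input at this
 * point comes from the outer plan or from a spill tape.
 */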
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update metrics after filling the hash table.
|
|
|
|
*
|
|
|
|
* If reading from the outer plan, from_tape should be false; if reading from
|
|
|
|
* another tape, from_tape should be true.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
|
|
|
|
{
|
|
|
|
Size meta_mem;
|
2020-07-18 03:24:23 +02:00
|
|
|
Size hashkey_mem;
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
Size buffer_mem;
|
|
|
|
Size total_mem;
|
|
|
|
|
|
|
|
if (aggstate->aggstrategy != AGG_MIXED &&
|
|
|
|
aggstate->aggstrategy != AGG_HASHED)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* memory for the hash table itself */
|
|
|
|
meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
|
|
|
|
|
|
|
|
/* memory for the group keys and transition states */
|
2020-07-18 03:24:23 +02:00
|
|
|
hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
|
|
|
|
/* memory for read/write tape buffers, if spilled */
|
|
|
|
buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE;
|
|
|
|
if (from_tape)
|
|
|
|
buffer_mem += HASHAGG_READ_BUFFER_SIZE;
|
|
|
|
|
|
|
|
/* update peak mem */
|
2020-07-18 03:24:23 +02:00
|
|
|
total_mem = meta_mem + hashkey_mem + buffer_mem;
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
if (total_mem > aggstate->hash_mem_peak)
|
|
|
|
aggstate->hash_mem_peak = total_mem;
|
|
|
|
|
|
|
|
/* update disk usage */
|
2021-10-18 13:30:00 +02:00
|
|
|
if (aggstate->hash_tapeset != NULL)
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
	{
		uint64		disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeset) * (BLCKSZ / 1024);

		if (aggstate->hash_disk_used < disk_used)
			aggstate->hash_disk_used = disk_used;
	}

	/* update hashentrysize estimate based on contents */
	if (aggstate->hash_ngroups_current > 0)
	{
		aggstate->hashentrysize =
			sizeof(TupleHashEntryData) +
			(hashkey_mem / (double) aggstate->hash_ngroups_current);
	}
}
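
/*
 * Illustrative example (the numbers are assumptions, not values from a
 * real run): with hashkey_mem = 1MB and hash_ngroups_current = 10000, the
 * estimate above works out to sizeof(TupleHashEntryData) plus roughly 105
 * bytes of group key and by-ref transition data per entry.
 */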

/*
 * Choose a reasonable number of buckets for the initial hash table size.
 */
static long
hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory)
{
	long		max_nbuckets;
	long		nbuckets = ngroups;

	max_nbuckets = memory / hashentrysize;

	/*
	 * Underestimating is better than overestimating. Too many buckets crowd
	 * out space for group keys and transition state values.
	 */
	max_nbuckets >>= 1;

	if (nbuckets > max_nbuckets)
		nbuckets = max_nbuckets;

	return Max(nbuckets, 1);
}
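
/*
 * Illustrative example (assumed numbers): with hashentrysize = 64 bytes
 * and memory = 4MB, max_nbuckets starts at 65536 and is halved to 32768,
 * so an ngroups estimate of 100000 is clamped to 32768, while an estimate
 * of 1000 is used as-is (subject to the Max(..., 1) floor).
 */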

/*
 * Determine the number of partitions to create when spilling, which will
 * always be a power of two. If log2_npartitions is non-NULL, set
 * *log2_npartitions to the log2() of the number of partitions.
 */
static int
hash_choose_num_partitions(double input_groups, double hashentrysize,
						   int used_bits, int *log2_npartitions)
{
Get rid of artificial restriction on hash table sizes on Windows.
The point of introducing the hash_mem_multiplier GUC was to let users
reproduce the old behavior of hash aggregation, i.e. that it could use
more than work_mem at need. However, the implementation failed to get
the job done on Win64, where work_mem is clamped to 2GB to protect
various places that calculate memory sizes using "long int". As
written, the same clamp was applied to hash_mem. This resulted in
severe performance regressions for queries requiring a bit more than
2GB for hash aggregation, as they now spill to disk and there's no
way to stop that.
Getting rid of the work_mem restriction seems like a good idea, but
it's a big job and could not conceivably be back-patched. However,
there's only a fairly small number of places that are concerned with
the hash_mem value, and it turns out to be possible to remove the
restriction there without too much code churn or any ABI breaks.
So, let's do that for now to fix the regression, and leave the
larger task for another day.
This patch does introduce a bit more infrastructure that should help
with the larger task, namely pg_bitutils.h support for working with
size_t values.
Per gripe from Laurent Hasson. Back-patch to v13 where the
behavior change came in.
Discussion: https://postgr.es/m/997817.1627074924@sss.pgh.pa.us
Discussion: https://postgr.es/m/MN2PR15MB25601E80A9B6D1BA6F592B1985E39@MN2PR15MB2560.namprd15.prod.outlook.com
2021-07-25 20:02:27 +02:00
	Size		hash_mem_limit = get_hash_memory_limit();
	double		partition_limit;
	double		mem_wanted;
	double		dpartitions;
	int			npartitions;
	int			partition_bits;

	/*
	 * Avoid creating so many partitions that the memory requirements of the
	 * open partition files are greater than 1/4 of hash_mem.
	 */
	partition_limit =
		(hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
		HASHAGG_WRITE_BUFFER_SIZE;

	mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;

	/* make enough partitions so that each one is likely to fit in memory */
	dpartitions = 1 + (mem_wanted / hash_mem_limit);

	if (dpartitions > partition_limit)
		dpartitions = partition_limit;

	if (dpartitions < HASHAGG_MIN_PARTITIONS)
		dpartitions = HASHAGG_MIN_PARTITIONS;
	if (dpartitions > HASHAGG_MAX_PARTITIONS)
		dpartitions = HASHAGG_MAX_PARTITIONS;

	/* HASHAGG_MAX_PARTITIONS limit makes this safe */
	npartitions = (int) dpartitions;

	/* ceil(log2(npartitions)) */
	partition_bits = my_log2(npartitions);

	/* make sure that we don't exhaust the hash bits */
	if (partition_bits + used_bits >= 32)
		partition_bits = 32 - used_bits;

	if (log2_npartitions != NULL)
		*log2_npartitions = partition_bits;

	/* number of partitions will be a power of two */
	npartitions = 1 << partition_bits;

	return npartitions;
}
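
/*
 * Illustrative example (every number here is an assumption): with a 4MB
 * hash_mem_limit, 64kB read and write buffers, a partition factor of 1.5,
 * input_groups = 1000000, and hashentrysize = 64, mem_wanted is 96MB, so
 * 1 + mem_wanted / hash_mem_limit suggests about 24 partitions; the
 * open-file limit (4MB * 0.25 - 64kB) / 64kB = 15 binds instead, and
 * my_log2(15) = 4 gives a final power-of-two answer of 16 partitions
 * (assuming used_bits leaves at least 4 hash bits free).
 */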

/*
 * Initialize a freshly-created TupleHashEntry.
 */
static void
initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable,
					  TupleHashEntry entry)
{
	AggStatePerGroup pergroup;
	int			transno;

	aggstate->hash_ngroups_current++;
	hash_agg_check_limits(aggstate);

	/* no need to allocate or initialize per-group state */
	if (aggstate->numtrans == 0)
		return;

	pergroup = (AggStatePerGroup)
		MemoryContextAlloc(hashtable->tablecxt,
						   sizeof(AggStatePerGroupData) * aggstate->numtrans);

	entry->additional = pergroup;

	/*
	 * Initialize aggregates for new tuple group, lookup_hash_entries()
	 * already has selected the relevant grouping set.
	 */
	for (transno = 0; transno < aggstate->numtrans; transno++)
	{
		AggStatePerTrans pertrans = &aggstate->pertrans[transno];
		AggStatePerGroup pergroupstate = &pergroup[transno];

		initialize_aggregate(aggstate, pertrans, pergroupstate);
	}
}

/*
 * Look up hash entries for the current tuple in all hashed grouping sets.
 *
 * Be aware that lookup_hash_entry can reset the tmpcontext.
 *
 * Some entries may be left NULL if we are in "spill mode". The same tuple
 * will belong to different groups for each grouping set, so may match a group
 * already in memory for one set and match a group not in memory for another
 * set. When in "spill mode", the tuple will be spilled for each grouping set
 * where it doesn't match a group in memory.
 *
 * NB: It's possible to spill the same tuple for several different grouping
 * sets. This may seem wasteful, but it's actually a trade-off: if we spill
 * the tuple multiple times for multiple grouping sets, it can be partitioned
 * for each grouping set, making the refilling of the hash table very
 * efficient.
 */
static void
lookup_hash_entries(AggState *aggstate)
{
	AggStatePerGroup *pergroup = aggstate->hash_pergroup;
	TupleTableSlot *outerslot = aggstate->tmpcontext->ecxt_outertuple;
	int			setno;

	for (setno = 0; setno < aggstate->num_hashes; setno++)
	{
		AggStatePerHash perhash = &aggstate->perhash[setno];
		TupleHashTable hashtable = perhash->hashtable;
		TupleTableSlot *hashslot = perhash->hashslot;
		TupleHashEntry entry;
		uint32		hash;
		bool		isnew = false;
		bool	   *p_isnew;

		/* if hash table already spilled, don't create new entries */
		p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;

		select_current_set(aggstate, setno, true);
		prepare_hash_slot(perhash,
						  outerslot,
						  hashslot);

		entry = LookupTupleHashEntry(hashtable, hashslot,
									 p_isnew, &hash);

		if (entry != NULL)
		{
			if (isnew)
				initialize_hash_entry(aggstate, hashtable, entry);
			pergroup[setno] = entry->additional;
		}
		else
		{
			HashAggSpill *spill = &aggstate->hash_spills[setno];
			TupleTableSlot *slot = aggstate->tmpcontext->ecxt_outertuple;

			if (spill->partitions == NULL)
				hashagg_spill_init(spill, aggstate->hash_tapeset, 0,
								   perhash->aggnode->numGroups,
								   aggstate->hashentrysize);

			hashagg_spill_tuple(aggstate, spill, slot, hash);
			pergroup[setno] = NULL;
		}
	}
}
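
/*
 * Example of the trade-off noted above (an assumed scenario, not code in
 * this file): with two hashed grouping sets (a) and (b), a tuple in spill
 * mode can advance an in-memory group for set (a) while being spilled for
 * set (b).  Because it is written to set (b)'s own spill partitions,
 * hashed on set (b)'s grouping columns, the later refill pass for a batch
 * of set (b) reads back only tuples belonging to that batch.
 */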

/*
 * ExecAgg -
 *
 *	  ExecAgg receives tuples from its outer subplan and aggregates over
 *	  the appropriate attribute for each aggregate function use (Aggref
 *	  node) appearing in the targetlist or qual of the node.  The number
 *	  of tuples to aggregate over depends on whether grouped or plain
 *	  aggregation is selected.  In grouped aggregation, we produce a result
 *	  row for each group; in plain aggregation there's a single result row
 *	  for the whole query.  In either case, the value of each aggregate is
 *	  stored in the expression context to be used when ExecProject evaluates
 *	  the result tuple.
 */
static TupleTableSlot *
ExecAgg(PlanState *pstate)
{
	AggState   *node = castNode(AggState, pstate);
	TupleTableSlot *result = NULL;
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00

	CHECK_FOR_INTERRUPTS();

	if (!node->agg_done)
	{
		/* Dispatch based on strategy */
		switch (node->phase->aggstrategy)
		{
			case AGG_HASHED:
				if (!node->table_filled)
					agg_fill_hash_table(node);
				/* FALLTHROUGH */
			case AGG_MIXED:
				result = agg_retrieve_hash_table(node);
				break;
			case AGG_PLAIN:
			case AGG_SORTED:
				result = agg_retrieve_direct(node);
				break;
		}

		if (!TupIsNull(result))
			return result;
	}

	return NULL;
}

/*
 * ExecAgg for non-hashed case
 */
static TupleTableSlot *
agg_retrieve_direct(AggState *aggstate)
{
	Agg		   *node = aggstate->phase->aggnode;
	ExprContext *econtext;
	ExprContext *tmpcontext;
	AggStatePerAgg peragg;
	AggStatePerGroup *pergroups;
	TupleTableSlot *outerslot;
	TupleTableSlot *firstSlot;
	TupleTableSlot *result;
	bool		hasGroupingSets = aggstate->phase->numsets > 0;
	int			numGroupingSets = Max(aggstate->phase->numsets, 1);
	int			currentSet;
	int			nextSetSize;
	int			numReset;
	int			i;

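	/*
	 * Note: a plain GROUP BY (or ungrouped aggregation) has numsets == 0;
	 * the Max() above treats it as a single grouping set, so the code below
	 * handles both cases uniformly.
	 */
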
	/*
	 * get state info from node
	 *
	 * econtext is the per-output-tuple expression context
	 *
	 * tmpcontext is the per-input-tuple expression context
	 */
	econtext = aggstate->ss.ps.ps_ExprContext;
	tmpcontext = aggstate->tmpcontext;
	peragg = aggstate->peragg;
	pergroups = aggstate->pergroups;
	firstSlot = aggstate->ss.ss_ScanTupleSlot;

	/*
	 * We loop retrieving groups until we find one matching
	 * aggstate->ss.ps.qual
	 *
	 * For grouping sets, we have the invariant that aggstate->projected_set
	 * is either -1 (initial call) or the index (starting from 0) in
	 * gset_lengths for the group we just completed (either by projecting a
	 * row or by discarding it in the qual).
	 */
	while (!aggstate->agg_done)
	{
		/*
		 * Clear the per-output-tuple context for each group, as well as
		 * aggcontext (which contains any pass-by-ref transvalues of the old
		 * group). Some aggregate functions store working state in child
		 * contexts; those now get reset automatically without us needing to
		 * do anything special.
		 *
		 * We use ReScanExprContext not just ResetExprContext because we want
		 * any registered shutdown callbacks to be called. That allows
		 * aggregate functions to ensure they've cleaned up any non-memory
		 * resources.
		 */
		ReScanExprContext(econtext);

		/*
		 * Determine how many grouping sets need to be reset at this boundary.
		 */
		if (aggstate->projected_set >= 0 &&
			aggstate->projected_set < numGroupingSets)
			numReset = aggstate->projected_set + 1;
		else
			numReset = numGroupingSets;

		/*
		 * numReset can change on a phase boundary, but that's OK; we want to
		 * reset the contexts used in _this_ phase, and later, after possibly
		 * changing phase, initialize the right number of aggregates for the
		 * _new_ phase.
		 */
		for (i = 0; i < numReset; i++)
		{
			ReScanExprContext(aggstate->aggcontexts[i]);
		}

		/*
		 * Check if input is complete and there are no more groups to project
		 * in this phase; move to next phase or mark as done.
		 */
		if (aggstate->input_done &&
			aggstate->projected_set >= (numGroupingSets - 1))
		{
			if (aggstate->current_phase < aggstate->numphases - 1)
			{
				initialize_phase(aggstate, aggstate->current_phase + 1);
				aggstate->input_done = false;
				aggstate->projected_set = -1;
				numGroupingSets = Max(aggstate->phase->numsets, 1);
				node = aggstate->phase->aggnode;
				numReset = numGroupingSets;
			}
|
2017-03-27 05:20:54 +02:00
|
|
|
else if (aggstate->aggstrategy == AGG_MIXED)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Mixed mode; we've output all the grouped stuff and have
|
|
|
|
* full hashtables, so switch to outputting those.
|
|
|
|
*/
|
|
|
|
initialize_phase(aggstate, 0);
|
|
|
|
aggstate->table_filled = true;
|
|
|
|
ResetTupleHashIterator(aggstate->perhash[0].hashtable,
|
|
|
|
&aggstate->perhash[0].hashiter);
|
|
|
|
select_current_set(aggstate, 0, true);
|
|
|
|
return agg_retrieve_hash_table(aggstate);
|
|
|
|
}
|
2002-11-06 01:00:45 +01:00
|
|
|
else
|
|
|
|
{
|
|
|
|
aggstate->agg_done = true;
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
break;
|
2002-11-06 01:00:45 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
		/*
		 * Get the number of columns in the next grouping set after the last
		 * projected one (if any). This is the number of columns to compare
		 * to see if we reached the boundary of that set too.
		 */
		if (aggstate->projected_set >= 0 &&
			aggstate->projected_set < (numGroupingSets - 1))
			nextSetSize = aggstate->phase->gset_lengths[aggstate->projected_set + 1];
		else
			nextSetSize = 0;

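		/*
		 * For example, for ROLLUP(a, b) the phase's grouping sets are
		 * (a, b), (a) and (), so gset_lengths is {2, 1, 0}. After
		 * projecting the (a, b) group, nextSetSize is 1, meaning only
		 * column a is compared below to detect the boundary of the (a)
		 * grouping set.
		 */
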
		/*----------
		 * If a subgroup for the current grouping set is present, project it.
		 *
		 * We have a new group if:
		 *	- we're out of input but haven't projected all grouping sets
		 *	  (checked above)
		 * OR
		 *	  - we already projected a row that wasn't from the last grouping
		 *		set
		 *	  AND
		 *	  - the next grouping set has at least one grouping column (since
		 *		empty grouping sets project only once input is exhausted)
		 *	  AND
		 *	  - the previous and pending rows differ on the grouping columns
		 *		of the next grouping set
		 *----------
		 */
		tmpcontext->ecxt_innertuple = econtext->ecxt_outertuple;
		if (aggstate->input_done ||
			(node->aggstrategy != AGG_PLAIN &&
			 aggstate->projected_set != -1 &&
			 aggstate->projected_set < (numGroupingSets - 1) &&
			 nextSetSize > 0 &&
			 !ExecQualAndReset(aggstate->phase->eqfunctions[nextSetSize - 1],
							   tmpcontext)))
		{
			aggstate->projected_set += 1;

			Assert(aggstate->projected_set < numGroupingSets);
			Assert(nextSetSize > 0 || aggstate->input_done);
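
			/*
			 * (Example, added for illustration: with GROUP BY ROLLUP (a, b),
			 * this phase's grouping sets are (a, b), (a) and (), i.e.
			 * gset_lengths of {2, 1, 0}.  After an (a, b) group is projected,
			 * a pending tuple that differs on "a" also fails the size-1
			 * equality check, so the (a) subtotal row is projected next; the
			 * () grand total is only reached once input_done is set.)
			 */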
		}
		else
		{
			/*
			 * We no longer care what group we just projected, the next
			 * projection will always be the first (or only) grouping set
			 * (unless the input proves to be empty).
			 */
			aggstate->projected_set = 0;

			/*
			 * If we don't already have the first tuple of the new group,
			 * fetch it from the outer plan.
			 */
			if (aggstate->grp_firstTuple == NULL)
			{
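				/*
				 * (Added note: fetch_input_tuple returns rows from the
				 * tuplesort filled by an earlier phase when one is active,
				 * and otherwise pulls directly from the outer plan node.)
				 */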
				outerslot = fetch_input_tuple(aggstate);
				if (!TupIsNull(outerslot))
				{
					/*
					 * Make a copy of the first input tuple; we will use this
					 * for comparisons (in group mode) and for projection.
					 */
					aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
				}
				else
				{
					/* outer plan produced no tuples at all */
					if (hasGroupingSets)
					{
						/*
						 * If there was no input at all, we need to project
						 * rows only if there are grouping sets of size 0.
						 * Note that this implies that there can't be any
						 * references to ungrouped Vars, which would otherwise
						 * cause issues with the empty output slot.
						 *
						 * XXX: This is no longer true, we currently deal with
						 * this in finalize_aggregates().
						 */
						aggstate->input_done = true;
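
						/*
						 * (Example, added for illustration: with GROUP BY
						 * GROUPING SETS ((a), ()) and no input at all, only
						 * the empty set () produces output, a single row
						 * whose aggregates reflect zero input rows.  The loop
						 * below skips past the nonempty sets to find it.)
						 */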
						while (aggstate->phase->gset_lengths[aggstate->projected_set] > 0)
						{
							aggstate->projected_set += 1;
							if (aggstate->projected_set >= numGroupingSets)
							{
								/*
								 * We can't set agg_done here because we might
								 * have more phases to do, even though the
								 * input is empty. So we need to restart the
								 * whole outer loop.
								 */
								break;
							}
						}

						if (aggstate->projected_set >= numGroupingSets)
							continue;
					}
					else
					{
						aggstate->agg_done = true;
						/* If we are grouping, we should produce no tuples too */
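						/*
						 * (Added note: a plain aggregate still emits one row
						 * over empty input, e.g. "SELECT count(*) FROM t"
						 * returns 0, so for AGG_PLAIN we fall through and
						 * project that row instead of returning NULL.)
						 */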
						if (node->aggstrategy != AGG_PLAIN)
							return NULL;
					}
				}
			}

			/*
			 * Initialize working state for a new input tuple group.
			 */
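			/*
			 * (Added note: numReset limits the reset to the transition
			 * states of the first numReset grouping sets; in a rollup the
			 * coarser sets, such as a running grand total, keep accumulating
			 * across group boundaries of the finer sets.)
			 */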
			initialize_aggregates(aggstate, pergroups, numReset);

			if (aggstate->grp_firstTuple != NULL)
			{
				/*
				 * Store the copied first input tuple in the tuple table slot
				 * reserved for it. The tuple will be deleted when it is
				 * cleared from the slot.
				 */
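				/*
				 * (Added note: the "true" below is the shouldFree flag,
				 * which passes ownership of the heap tuple to the slot, so
				 * the slot will free it when cleared.)
				 */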
				ExecForceStoreHeapTuple(aggstate->grp_firstTuple,
										firstSlot, true);
				aggstate->grp_firstTuple = NULL;	/* don't keep two pointers */

				/* set up for first advance_aggregates call */
				tmpcontext->ecxt_outertuple = firstSlot;
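				/*
				 * (Added note: advance_aggregates evaluates each aggregate's
				 * transition arguments against the tuple in
				 * tmpcontext->ecxt_outertuple, so this feeds the saved first
				 * tuple of the group through the transition functions.)
				 */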

				/*
				 * Process each outer-plan tuple, and then fetch the next one,
				 * until we exhaust the outer plan or cross a group boundary.
				 */
				for (;;)
				{
					/*
					 * During phase 1 only of a mixed agg, we need to update
					 * hashtables as well in advance_aggregates.
					 */
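					/*
					 * (Added note: in AGG_MIXED the input is scanned only
					 * once, during the initial sorted phase, so grouping
					 * sets that will be answered by hashing must have their
					 * hash-table entries looked up or created for every
					 * input row as it streams past.)
					 */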
					if (aggstate->aggstrategy == AGG_MIXED &&
						aggstate->current_phase == 1)
					{
						lookup_hash_entries(aggstate);
					}

					/* Advance the aggregates (or combine functions) */
					advance_aggregates(aggstate);

					/* Reset per-input-tuple context after each tuple */
					ResetExprContext(tmpcontext);
|
2008-09-08 02:22:56 +02:00
|
|
|
|
2015-05-16 03:40:59 +02:00
|
|
|
outerslot = fetch_input_tuple(aggstate);
|
|
|
|
if (TupIsNull(outerslot))
|
|
|
|
{
|
|
|
|
/* no more outer-plan tuples available */
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
|
|
|
|
/* if we built hash tables, finalize any spills */
|
|
|
|
if (aggstate->aggstrategy == AGG_MIXED &&
|
|
|
|
aggstate->current_phase == 1)
|
|
|
|
hashagg_finish_initial_spills(aggstate);
|
|
|
|
|
2015-05-16 03:40:59 +02:00
|
|
|
if (hasGroupingSets)
|
|
|
|
{
|
|
|
|
aggstate->input_done = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
aggstate->agg_done = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* set up for next advance_aggregates call */
|
|
|
|
tmpcontext->ecxt_outertuple = outerslot;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are grouping, check whether we've crossed a group
|
|
|
|
* boundary.
|
|
|
|
*/
|
2017-03-27 05:20:54 +02:00
|
|
|
if (node->aggstrategy != AGG_PLAIN)
|
2015-05-16 03:40:59 +02:00
|
|
|
{
|
2018-02-16 06:55:31 +01:00
|
|
|
tmpcontext->ecxt_innertuple = firstSlot;
|
|
|
|
if (!ExecQual(aggstate->phase->eqfunctions[node->numCols - 1],
|
|
|
|
tmpcontext))
|
2015-05-16 03:40:59 +02:00
|
|
|
{
|
Make TupleTableSlots extensible, finish split of existing slot type.
This commit completes the work prepared in 1a0586de36, splitting the
old TupleTableSlot implementation (which could store buffer, heap,
minimal and virtual slots) into four different slot types. As
described in the aforementioned commit, this is done with the goal of
making tuple table slots extensible, to allow for pluggable table
access methods.
To achieve runtime extensibility for TupleTableSlots, operations on
slots that can differ between types of slots are performed using the
TupleTableSlotOps struct provided at slot creation time. That
includes everything from the size of the TupleTableSlot struct to be
allocated, to initialization, deforming, etc. See the definition of
TupleTableSlotOps for more detailed information about the callbacks.
I decided to rename TTSOpsBufferTuple to TTSOpsBufferHeapTuple and
ExecCopySlotTuple to ExecCopySlotHeapTuple, as that seems more
consistent with other naming introduced in recent patches.
There's plenty of optimization potential in the slot implementation, but
according to benchmarking the state after this commit has similar
performance characteristics to before this set of changes, which seems
sufficient.
There's a few changes in execReplication.c that currently need to poke
through the slot abstraction, that'll be repaired once the pluggable
storage patchset provides the necessary infrastructure.
Author: Andres Freund and Ashutosh Bapat, with changes by Amit Khandekar
Discussion: https://postgr.es/m/20181105210039.hh4vvi4vwoq5ba2q@alap3.anarazel.de
2018-11-17 01:35:11 +01:00
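/*
 * Aside: the callback-struct dispatch described in the commit message
 * above can be sketched in standalone C.  This is a toy model, not the
 * real TupleTableSlotOps API; ToySlot, ToySlotOps and MakeToySlot are
 * invented names.  A slot stores a pointer to a per-type ops table
 * chosen at creation time, and generic code dispatches through it, so
 * new slot types can be added without changing callers.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct ToySlot ToySlot;

typedef struct ToySlotOps
{
	size_t		base_size;		/* bytes to allocate for this slot type */
	void		(*init) (ToySlot *slot);	/* type-specific setup */
	void		(*getsomeattrs) (ToySlot *slot);	/* stand-in for deforming */
} ToySlotOps;

struct ToySlot
{
	const ToySlotOps *ops;		/* fixed at slot creation time */
};

static void
toy_virtual_init(ToySlot *slot)
{
	(void) slot;
	printf("virtual slot: init\n");
}

static void
toy_virtual_getsomeattrs(ToySlot *slot)
{
	(void) slot;
	printf("virtual slot: deform\n");
}

static const ToySlotOps TTSOpsToyVirtual = {
	sizeof(ToySlot), toy_virtual_init, toy_virtual_getsomeattrs
};

/* Generic constructor: both the size and the init step come from the ops. */
static ToySlot *
MakeToySlot(const ToySlotOps *ops)
{
	ToySlot    *slot = calloc(1, ops->base_size);

	slot->ops = ops;
	slot->ops->init(slot);
	return slot;
}

int
main(void)
{
	ToySlot    *slot = MakeToySlot(&TTSOpsToyVirtual);

	slot->ops->getsomeattrs(slot);	/* callers never test the slot type */
	free(slot);
	return 0;
}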
|
|
|
aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
|
2015-05-16 03:40:59 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2008-09-08 02:22:56 +02:00
|
|
|
}
|
2015-05-16 03:40:59 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Use the representative input tuple for any references to
|
|
|
|
* non-aggregated input columns in aggregate direct args, the node
|
|
|
|
* qual, and the tlist. (If we are not grouping, and there are no
|
|
|
|
* input rows at all, we will come here with an empty firstSlot
|
|
|
|
* ... but if not grouping, there can't be any references to
|
|
|
|
* non-aggregated input columns, so no problem.)
|
|
|
|
*/
|
|
|
|
econtext->ecxt_outertuple = firstSlot;
|
2004-07-10 20:39:23 +02:00
|
|
|
}
|
2015-05-16 03:40:59 +02:00
|
|
|
|
|
|
|
Assert(aggstate->projected_set >= 0);
|
|
|
|
|
|
|
|
currentSet = aggstate->projected_set;
|
|
|
|
|
|
|
|
prepare_projection_slot(aggstate, econtext->ecxt_outertuple, currentSet);
|
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
select_current_set(aggstate, currentSet, false);
|
|
|
|
|
|
|
|
finalize_aggregates(aggstate,
|
|
|
|
peragg,
|
2018-01-03 03:02:37 +01:00
|
|
|
pergroups[currentSet]);
|
2015-05-16 03:40:59 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If there's no row to project right now, we must continue rather
|
|
|
|
* than returning a null since there might be more groups.
|
|
|
|
*/
|
|
|
|
result = project_aggregates(aggstate);
|
|
|
|
if (result)
|
|
|
|
return result;
|
2002-11-06 23:31:24 +01:00
|
|
|
}
|
|
|
|
|
2004-07-10 20:39:23 +02:00
|
|
|
/* No more groups */
|
|
|
|
return NULL;
|
2002-11-06 23:31:24 +01:00
|
|
|
}
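The sorted-mode control flow above is easier to see in miniature. Below is
a minimal sketch under toy assumptions (invented names, plain arrays in
place of the executor machinery): consume input sorted by key, advance a
transition value until the key changes, then finalize and emit the group;
the boundary tuple simply becomes the first tuple of the next group.

/* Toy model of sorted-mode grouping; not executor code. */
#include <stdio.h>

int
main(void)
{
	int			keys[] = {1, 1, 2, 2, 2, 5};	/* input sorted by key */
	int			vals[] = {10, 5, 1, 2, 3, 7};
	int			n = 6,
				i = 0;

	while (i < n)
	{
		int			group_key = keys[i];	/* first tuple of the group */
		long		transvalue = 0; /* initcond */

		/* advance until we exhaust the input or cross a group boundary */
		while (i < n && keys[i] == group_key)
			transvalue += vals[i++];	/* transfunc */

		/* finalize and project the completed group */
		printf("key=%d sum=%ld\n", group_key, transvalue);
	}
	return 0;
}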
|
|
|
|
|
|
|
|
/*
|
2017-03-27 05:20:54 +02:00
|
|
|
* ExecAgg for hashed case: read input and build hash table
|
2002-11-06 23:31:24 +01:00
|
|
|
*/
|
|
|
|
static void
|
2002-12-05 16:50:39 +01:00
|
|
|
agg_fill_hash_table(AggState *aggstate)
|
2002-11-06 23:31:24 +01:00
|
|
|
{
|
|
|
|
TupleTableSlot *outerslot;
|
2017-03-27 05:20:54 +02:00
|
|
|
ExprContext *tmpcontext = aggstate->tmpcontext;
|
2002-11-06 23:31:24 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Process each outer-plan tuple, and then fetch the next one, until we
|
|
|
|
* exhaust the outer plan.
|
|
|
|
*/
|
|
|
|
for (;;)
|
|
|
|
{
|
2015-05-16 03:40:59 +02:00
|
|
|
outerslot = fetch_input_tuple(aggstate);
|
2002-11-06 23:31:24 +01:00
|
|
|
if (TupIsNull(outerslot))
|
|
|
|
break;
|
2017-03-27 05:20:54 +02:00
|
|
|
|
|
|
|
/* set up for lookup_hash_entries and advance_aggregates */
|
2007-02-23 00:44:25 +01:00
|
|
|
tmpcontext->ecxt_outertuple = outerslot;
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
/* Find or build hashtable entries */
|
2018-01-09 22:25:38 +01:00
|
|
|
lookup_hash_entries(aggstate);
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2018-01-09 22:25:38 +01:00
|
|
|
/* Advance the aggregates (or combine functions) */
|
|
|
|
advance_aggregates(aggstate);
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
/*
|
|
|
|
* Reset per-input-tuple context after each tuple, but note that the
|
|
|
|
* hash lookups do this too
|
|
|
|
*/
|
|
|
|
ResetExprContext(aggstate->tmpcontext);
|
2002-11-06 23:31:24 +01:00
|
|
|
}
|
|
|
|
|
2020-03-18 23:42:02 +01:00
|
|
|
/* finalize spills, if any */
|
|
|
|
hashagg_finish_initial_spills(aggstate);
|
|
|
|
|
2002-11-06 23:31:24 +01:00
|
|
|
aggstate->table_filled = true;
|
2017-03-27 05:20:54 +02:00
|
|
|
/* Initialize to walk the first hash table */
|
|
|
|
select_current_set(aggstate, 0, true);
|
|
|
|
ResetTupleHashIterator(aggstate->perhash[0].hashtable,
|
|
|
|
&aggstate->perhash[0].hashiter);
|
2002-11-06 23:31:24 +01:00
|
|
|
}
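For contrast with the sorted case, the fill-then-walk pattern of
agg_fill_hash_table can be modeled in a few lines. This is a hedged toy
sketch, not executor code: fill(), next_group() and the direct-indexed
counts[] "hash table" are invented for illustration. One pass over the
input builds every group; results are then pulled from the table one
group at a time, mirroring table_filled and the hash-iterator setup above.

/* Toy model of build-then-iterate hash aggregation; not executor code. */
#include <stdio.h>

#define NKEYS 8

static long counts[NKEYS];		/* trivial "hash table": key is the index */
static int	walk_pos;			/* stand-in for the hash iterator */

static void
fill(const int *keys, int n)
{
	for (int i = 0; i < n; i++)
		counts[keys[i] % NKEYS]++;	/* find-or-create group, then advance */
	walk_pos = 0;				/* initialize to walk the table */
}

/* Return the next group key, or -1 when the walk is done. */
static int
next_group(long *count)
{
	while (walk_pos < NKEYS)
	{
		int			k = walk_pos++;

		if (counts[k] > 0)
		{
			*count = counts[k];
			return k;
		}
	}
	return -1;
}

int
main(void)
{
	int			keys[] = {1, 2, 1, 3, 2, 1};
	long		c;

	fill(keys, 6);
	for (int k; (k = next_group(&c)) != -1;)
		printf("key=%d count=%ld\n", k, c);
	return 0;
}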
|
|
|
|
|
2020-03-18 23:42:02 +01:00
|
|
|
/*
|
|
|
|
* If any data was spilled during hash aggregation, reset the hash table and
|
|
|
|
* reprocess one batch of spilled data. After reprocessing a batch, the hash
|
|
|
|
* table will again contain data, ready to be consumed by
|
|
|
|
* agg_retrieve_hash_table_in_memory().
|
|
|
|
*
|
|
|
|
* Should only be called after all in memory hash table entries have been
|
|
|
|
* finalized and emitted.
|
|
|
|
*
|
|
|
|
* Return false when input is exhausted and there's no more work to be done;
|
|
|
|
* otherwise return true.
|
|
|
|
*/
|
|
|
|
static bool
|
|
|
|
agg_refill_hash_table(AggState *aggstate)
|
|
|
|
{
|
|
|
|
HashAggBatch *batch;
|
2020-07-26 23:55:52 +02:00
|
|
|
AggStatePerHash perhash;
|
2020-03-18 23:42:02 +01:00
|
|
|
HashAggSpill spill;
|
2021-10-18 13:30:00 +02:00
|
|
|
LogicalTapeSet *tapeset = aggstate->hash_tapeset;
|
2020-03-18 23:42:02 +01:00
|
|
|
bool spill_initialized = false;
|
|
|
|
|
|
|
|
if (aggstate->hash_batches == NIL)
|
|
|
|
return false;
|
|
|
|
|
2021-11-01 21:24:39 +01:00
|
|
|
/* hash_batches is a stack, with the top item at the end of the list */
|
|
|
|
batch = llast(aggstate->hash_batches);
|
|
|
|
aggstate->hash_batches = list_delete_last(aggstate->hash_batches);
|
2020-03-18 23:42:02 +01:00
|
|
|
|
2020-07-29 08:15:47 +02:00
|
|
|
hash_agg_set_limits(aggstate->hashentrysize, batch->input_card,
|
2020-03-18 23:42:02 +01:00
|
|
|
batch->used_bits, &aggstate->hash_mem_limit,
|
|
|
|
&aggstate->hash_ngroups_limit, NULL);
|
|
|
|
|
2020-12-27 02:25:30 +01:00
|
|
|
/*
|
|
|
|
* Each batch only processes one grouping set; set the rest to NULL so
|
|
|
|
* that advance_aggregates() knows to ignore them. We don't touch
|
|
|
|
* pergroups for sorted grouping sets here, because they will be needed if
|
|
|
|
* we rescan later. The expressions for sorted grouping sets will not be
|
|
|
|
* evaluated after we recompile anyway.
|
|
|
|
*/
|
|
|
|
MemSet(aggstate->hash_pergroup, 0,
|
|
|
|
sizeof(AggStatePerGroup) * aggstate->num_hashes);
|
2020-03-18 23:42:02 +01:00
|
|
|
|
|
|
|
/* free memory and reset hash tables */
|
|
|
|
ReScanExprContext(aggstate->hashcontext);
|
|
|
|
for (int setno = 0; setno < aggstate->num_hashes; setno++)
|
|
|
|
ResetTupleHashTable(aggstate->perhash[setno].hashtable);
|
|
|
|
|
|
|
|
aggstate->hash_ngroups_current = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In AGG_MIXED mode, hash aggregation happens in phase 1 and the output
|
|
|
|
* happens in phase 0. So, we switch to phase 1 when processing a batch,
|
|
|
|
* and back to phase 0 after the batch is done.
|
|
|
|
*/
|
|
|
|
Assert(aggstate->current_phase == 0);
|
|
|
|
if (aggstate->phase->aggstrategy == AGG_MIXED)
|
|
|
|
{
|
|
|
|
aggstate->current_phase = 1;
|
|
|
|
aggstate->phase = &aggstate->phases[aggstate->current_phase];
|
|
|
|
}
|
|
|
|
|
|
|
|
select_current_set(aggstate, batch->setno, true);
|
|
|
|
|
2020-07-26 23:55:52 +02:00
|
|
|
perhash = &aggstate->perhash[aggstate->current_set];
|
|
|
|
|
2020-03-18 23:42:02 +01:00
|
|
|
/*
|
|
|
|
* Spilled tuples are always read back as MinimalTuples, which may be
|
|
|
|
* different from the outer plan, so recompile the aggregate expressions.
|
|
|
|
*
|
|
|
|
* We still need the NULL check, because we are only processing one
|
|
|
|
* grouping set at a time and the rest will be NULL.
|
|
|
|
*/
|
|
|
|
hashagg_recompile_expressions(aggstate, true, true);
|
|
|
|
|
|
|
|
for (;;)
|
|
|
|
{
|
2020-07-26 23:55:52 +02:00
|
|
|
TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
|
|
|
|
TupleTableSlot *hashslot = perhash->hashslot;
|
|
|
|
TupleHashEntry entry;
|
2020-03-18 23:42:02 +01:00
|
|
|
MinimalTuple tuple;
|
|
|
|
uint32 hash;
|
2020-07-26 23:55:52 +02:00
|
|
|
bool isnew = false;
|
|
|
|
bool *p_isnew = aggstate->hash_spill_mode ? NULL : &isnew;
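/*
 * With a NULL "isnew" pointer, the lookup will not create a new entry;
 * in spill mode a miss therefore returns NULL and the tuple falls
 * through to the spill path below.
 */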
|
2020-03-18 23:42:02 +01:00
|
|
|
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
|
|
|
|
tuple = hashagg_batch_read(batch, &hash);
|
|
|
|
if (tuple == NULL)
|
|
|
|
break;
|
|
|
|
|
2020-07-26 23:55:52 +02:00
|
|
|
ExecStoreMinimalTuple(tuple, spillslot, true);
|
|
|
|
aggstate->tmpcontext->ecxt_outertuple = spillslot;
|
2020-03-18 23:42:02 +01:00
|
|
|
|
2020-07-26 23:55:52 +02:00
|
|
|
prepare_hash_slot(perhash,
|
|
|
|
aggstate->tmpcontext->ecxt_outertuple,
|
|
|
|
hashslot);
|
|
|
|
entry = LookupTupleHashEntryHash(
|
|
|
|
perhash->hashtable, hashslot, p_isnew, hash);
|
2020-03-18 23:42:02 +01:00
|
|
|
|
2020-07-26 23:55:52 +02:00
|
|
|
if (entry != NULL)
|
2020-03-18 23:42:02 +01:00
|
|
|
{
|
2020-07-26 23:55:52 +02:00
|
|
|
if (isnew)
|
|
|
|
initialize_hash_entry(aggstate, perhash->hashtable, entry);
|
|
|
|
aggstate->hash_pergroup[batch->setno] = entry->additional;
|
2020-03-18 23:42:02 +01:00
|
|
|
advance_aggregates(aggstate);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (!spill_initialized)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Avoid initializing the spill until we actually need it so
|
|
|
|
* that we don't assign tapes that will never be used.
|
|
|
|
*/
|
|
|
|
spill_initialized = true;
|
2021-10-18 13:30:00 +02:00
|
|
|
hashagg_spill_init(&spill, tapeset, batch->used_bits,
|
2020-07-29 08:15:47 +02:00
|
|
|
batch->input_card, aggstate->hashentrysize);
|
2020-03-18 23:42:02 +01:00
|
|
|
}
|
|
|
|
/* no memory for a new group, spill */
|
2020-07-26 23:55:52 +02:00
|
|
|
hashagg_spill_tuple(aggstate, &spill, spillslot, hash);
|
|
|
|
|
|
|
|
aggstate->hash_pergroup[batch->setno] = NULL;
|
2020-03-18 23:42:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reset per-input-tuple context after each tuple, but note that the
|
|
|
|
* hash lookups do this too
|
|
|
|
*/
|
|
|
|
ResetExprContext(aggstate->tmpcontext);
|
|
|
|
}
|
|
|
|
|
2021-10-18 13:30:00 +02:00
|
|
|
LogicalTapeClose(batch->input_tape);
|
2020-03-18 23:42:02 +01:00
|
|
|
|
|
|
|
/* change back to phase 0 */
|
|
|
|
aggstate->current_phase = 0;
|
|
|
|
aggstate->phase = &aggstate->phases[aggstate->current_phase];
|
|
|
|
|
|
|
|
if (spill_initialized)
|
|
|
|
{
|
|
|
|
hashagg_spill_finish(aggstate, &spill, batch->setno);
|
2020-09-16 06:34:05 +02:00
|
|
|
hash_agg_update_metrics(aggstate, true, spill.npartitions);
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
hash_agg_update_metrics(aggstate, true, 0);
|
|
|
|
|
|
|
|
aggstate->hash_spill_mode = false;
|
|
|
|
|
|
|
|
/* prepare to walk the first hash table */
|
|
|
|
select_current_set(aggstate, batch->setno, true);
|
|
|
|
ResetTupleHashIterator(aggstate->perhash[batch->setno].hashtable,
|
|
|
|
&aggstate->perhash[batch->setno].hashiter);
|
|
|
|
|
|
|
|
pfree(batch);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}

/*
 * ExecAgg for hashed case: retrieving groups from hash table
 *
 * After exhausting in-memory tuples, also try refilling the hash table using
 * previously-spilled tuples. Only returns NULL after all in-memory and
 * spilled tuples are exhausted.
 */
static TupleTableSlot *
agg_retrieve_hash_table(AggState *aggstate)
{
	TupleTableSlot *result = NULL;

	while (result == NULL)
	{
		result = agg_retrieve_hash_table_in_memory(aggstate);
		if (result == NULL)
		{
			if (!agg_refill_hash_table(aggstate))
			{
				aggstate->agg_done = true;
				break;
			}
		}
	}

	return result;
}

/*
 * Retrieve the groups from the in-memory hash tables without considering any
 * spilled tuples.
 */
static TupleTableSlot *
agg_retrieve_hash_table_in_memory(AggState *aggstate)
{
	ExprContext *econtext;
	AggStatePerAgg peragg;
	AggStatePerGroup pergroup;
	TupleHashEntryData *entry;
	TupleTableSlot *firstSlot;
	TupleTableSlot *result;
	AggStatePerHash perhash;

	/*
	 * get state info from node.
	 *
	 * econtext is the per-output-tuple expression context.
	 */
	econtext = aggstate->ss.ps.ps_ExprContext;
	peragg = aggstate->peragg;
	firstSlot = aggstate->ss.ss_ScanTupleSlot;

	/*
	 * Note that perhash (and therefore anything accessed through it) can
	 * change inside the loop, as we change between grouping sets.
	 */
	perhash = &aggstate->perhash[aggstate->current_set];

	/*
	 * We loop retrieving groups until we find one satisfying
	 * aggstate->ss.ps.qual
	 */
	for (;;)
	{
		TupleTableSlot *hashslot = perhash->hashslot;
		int			i;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Find the next entry in the hash table
		 */
		entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter);
		if (entry == NULL)
		{
			int			nextset = aggstate->current_set + 1;

			if (nextset < aggstate->num_hashes)
			{
				/*
				 * Switch to next grouping set, reinitialize, and restart the
				 * loop.
				 */
				select_current_set(aggstate, nextset, true);

				perhash = &aggstate->perhash[aggstate->current_set];

				ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter);

				continue;
			}
			else
			{
				return NULL;
			}
		}

		/*
		 * Clear the per-output-tuple context for each group
		 *
		 * We intentionally don't use ReScanExprContext here; if any aggs have
		 * registered shutdown callbacks, they mustn't be called yet, since we
		 * might not be done with that agg.
		 */
		ResetExprContext(econtext);

		/*
		 * Transform representative tuple back into one with the right
		 * columns.
		 */
		ExecStoreMinimalTuple(entry->firstTuple, hashslot, false);
		slot_getallattrs(hashslot);

		ExecClearTuple(firstSlot);
		memset(firstSlot->tts_isnull, true,
			   firstSlot->tts_tupleDescriptor->natts * sizeof(bool));
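
		/*
		 * The hash entry stores only the grouping columns, so every column
		 * of the scan tuple starts out NULL here; the loop below copies just
		 * the grouped-by values back into their original positions.
		 */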

		for (i = 0; i < perhash->numhashGrpCols; i++)
		{
			int			varNumber = perhash->hashGrpColIdxInput[i] - 1;

			firstSlot->tts_values[varNumber] = hashslot->tts_values[i];
			firstSlot->tts_isnull[varNumber] = hashslot->tts_isnull[i];
		}
		ExecStoreVirtualTuple(firstSlot);

		pergroup = (AggStatePerGroup) entry->additional;

		/*
		 * Use the representative input tuple for any references to
		 * non-aggregated input columns in the qual and tlist.
		 */
		econtext->ecxt_outertuple = firstSlot;

		prepare_projection_slot(aggstate,
								econtext->ecxt_outertuple,
								aggstate->current_set);

		finalize_aggregates(aggstate, peragg, pergroup);

		result = project_aggregates(aggstate);
		if (result)
			return result;
	}

	/* No more groups */
	return NULL;
}

/*
 * hashagg_spill_init
 *
 * Called after we have determined that spilling is necessary. Chooses the
 * number of partitions to create, and initializes them.
 */
static void
hashagg_spill_init(HashAggSpill *spill, LogicalTapeSet *tapeset, int used_bits,
				   double input_groups, double hashentrysize)
{
	int			npartitions;
	int			partition_bits;

	npartitions = hash_choose_num_partitions(input_groups, hashentrysize,
											 used_bits, &partition_bits);

	spill->partitions = palloc0(sizeof(LogicalTape *) * npartitions);
	spill->ntuples = palloc0(sizeof(int64) * npartitions);
	spill->hll_card = palloc0(sizeof(hyperLogLogState) * npartitions);

	for (int i = 0; i < npartitions; i++)
		spill->partitions[i] = LogicalTapeCreate(tapeset);

	spill->shift = 32 - used_bits - partition_bits;
	spill->mask = (npartitions - 1) << spill->shift;
	spill->npartitions = npartitions;
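
	/*
	 * Worked example (the numbers are illustrative, not taken from any
	 * particular run): with used_bits = 0 and partition_bits = 5 we get
	 * npartitions = 32, shift = 32 - 0 - 5 = 27, and mask = 31 << 27, so the
	 * partition number is read from the top five bits of the 32-bit hash.  A
	 * recursive spill of one of those partitions would start over with
	 * used_bits = 5 and consume the next-lower bits.
	 */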

	for (int i = 0; i < npartitions; i++)
		initHyperLogLog(&spill->hll_card[i], HASHAGG_HLL_BIT_WIDTH);
}

/*
 * hashagg_spill_tuple
 *
 * No room for new groups in the hash table. Save the tuple for later in the
 * appropriate partition.
 */
static Size
hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
					TupleTableSlot *inputslot, uint32 hash)
{
	TupleTableSlot *spillslot;
	int			partition;
	MinimalTuple tuple;
	LogicalTape *tape;
	int			total_written = 0;
	bool		shouldFree;

	Assert(spill->partitions != NULL);

	/* spill only attributes that we actually need */
	if (!aggstate->all_cols_needed)
	{
		spillslot = aggstate->hash_spill_wslot;
		slot_getsomeattrs(inputslot, aggstate->max_colno_needed);
		ExecClearTuple(spillslot);
		for (int i = 0; i < spillslot->tts_tupleDescriptor->natts; i++)
		{
			if (bms_is_member(i + 1, aggstate->colnos_needed))
			{
				spillslot->tts_values[i] = inputslot->tts_values[i];
				spillslot->tts_isnull[i] = inputslot->tts_isnull[i];
			}
			else
				spillslot->tts_isnull[i] = true;
		}
		ExecStoreVirtualTuple(spillslot);
	}
	else
		spillslot = inputslot;

	tuple = ExecFetchSlotMinimalTuple(spillslot, &shouldFree);

	partition = (hash & spill->mask) >> spill->shift;
	spill->ntuples[partition]++;
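
	/*
	 * Illustrative example (assumed values): with mask = 31 << 27 and
	 * shift = 27, as in the hashagg_spill_init() example above, a hash of
	 * 0xC0000000 has top bits 11000, so the tuple is routed to partition 24.
	 */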

	/*
	 * All hash values destined for a given partition have some bits in
	 * common, which causes bad HLL cardinality estimates. Hash the hash to
	 * get a more uniform distribution.
	 */
	addHyperLogLog(&spill->hll_card[partition], hash_bytes_uint32(hash));

	tape = spill->partitions[partition];

	LogicalTapeWrite(tape, (void *) &hash, sizeof(uint32));
	total_written += sizeof(uint32);

	LogicalTapeWrite(tape, (void *) tuple, tuple->t_len);
	total_written += tuple->t_len;
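
	/*
	 * Each spilled record on the tape is thus a uint32 hash followed by the
	 * complete MinimalTuple (whose own t_len word leads the tuple body);
	 * hashagg_batch_read() reads the record back in the same order.
	 */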

	if (shouldFree)
		pfree(tuple);

	return total_written;
}

/*
 * hashagg_batch_new
 *
 * Construct a HashAggBatch item, which represents one iteration of HashAgg to
 * be done.
 */
static HashAggBatch *
hashagg_batch_new(LogicalTape *input_tape, int setno,
				  int64 input_tuples, double input_card, int used_bits)
{
	HashAggBatch *batch = palloc0(sizeof(HashAggBatch));

	batch->setno = setno;
	batch->used_bits = used_bits;
	batch->input_tape = input_tape;
	batch->input_tuples = input_tuples;
	batch->input_card = input_card;

	return batch;
}

/*
 * hashagg_batch_read
 *		read the next tuple from a batch's tape.  Return NULL if no more.
 */
static MinimalTuple
hashagg_batch_read(HashAggBatch *batch, uint32 *hashp)
{
	LogicalTape *tape = batch->input_tape;
	MinimalTuple tuple;
	uint32		t_len;
	size_t		nread;
	uint32		hash;

	nread = LogicalTapeRead(tape, &hash, sizeof(uint32));
	if (nread == 0)
		return NULL;
	if (nread != sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
						tape, sizeof(uint32), nread)));
	if (hashp != NULL)
		*hashp = hash;

	nread = LogicalTapeRead(tape, &t_len, sizeof(t_len));
	if (nread != sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
						tape, sizeof(uint32), nread)));

	tuple = (MinimalTuple) palloc(t_len);
	tuple->t_len = t_len;
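
	/*
	 * t_len counts the whole MinimalTuple, including the length word just
	 * read, so only t_len - sizeof(uint32) bytes remain to be read from the
	 * tape.
	 */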

	nread = LogicalTapeRead(tape,
							(void *) ((char *) tuple + sizeof(uint32)),
							t_len - sizeof(uint32));
	if (nread != t_len - sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("unexpected EOF for tape %p: requested %zu bytes, read %zu bytes",
						tape, t_len - sizeof(uint32), nread)));

	return tuple;
}

/*
 * hashagg_finish_initial_spills
 *
 * After a HashAggBatch has been processed, it may have spilled tuples to
 * disk. If so, turn the spilled partitions into new batches that must later
 * be executed.
 */
static void
hashagg_finish_initial_spills(AggState *aggstate)
{
	int			setno;
	int			total_npartitions = 0;

	if (aggstate->hash_spills != NULL)
	{
		for (setno = 0; setno < aggstate->num_hashes; setno++)
		{
			HashAggSpill *spill = &aggstate->hash_spills[setno];

			total_npartitions += spill->npartitions;
			hashagg_spill_finish(aggstate, spill, setno);
		}

		/*
		 * We're not processing tuples from the outer plan any more; we're
		 * only processing batches of spilled tuples. The initial spill
		 * structures are no longer needed.
		 */
		pfree(aggstate->hash_spills);
		aggstate->hash_spills = NULL;
	}

	hash_agg_update_metrics(aggstate, false, total_npartitions);
	aggstate->hash_spill_mode = false;
}

/*
 * hashagg_spill_finish
 *
 * Transform spill partitions into new batches.
 */
static void
hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
{
	int			i;
	int			used_bits = 32 - spill->shift;
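
	/*
	 * used_bits records how many high-order hash bits this spill level has
	 * already consumed; the batches created below carry it forward so that
	 * hashagg_spill_init() partitions on fresh bits if a batch spills again.
	 */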

	if (spill->npartitions == 0)
		return;					/* didn't spill */

	for (i = 0; i < spill->npartitions; i++)
	{
		LogicalTape *tape = spill->partitions[i];
		HashAggBatch *new_batch;
		double		cardinality;

		/* if the partition is empty, don't create a new batch of work */
		if (spill->ntuples[i] == 0)
			continue;

		cardinality = estimateHyperLogLog(&spill->hll_card[i]);
		freeHyperLogLog(&spill->hll_card[i]);

		/* rewinding frees the buffer while not in use */
		LogicalTapeRewindForRead(tape, HASHAGG_READ_BUFFER_SIZE);

		new_batch = hashagg_batch_new(tape, setno,
									  spill->ntuples[i], cardinality,
									  used_bits);
		aggstate->hash_batches = lappend(aggstate->hash_batches, new_batch);
		aggstate->hash_batches_used++;
	}

	pfree(spill->ntuples);
	pfree(spill->hll_card);
	pfree(spill->partitions);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free resources related to a spilled HashAgg.
|
|
|
|
*/
static void
hashagg_reset_spill_state(AggState *aggstate)
{
	/* free spills from initial pass */
	if (aggstate->hash_spills != NULL)
	{
		int			setno;

		for (setno = 0; setno < aggstate->num_hashes; setno++)
		{
			HashAggSpill *spill = &aggstate->hash_spills[setno];

			pfree(spill->ntuples);
			pfree(spill->partitions);
		}
		pfree(aggstate->hash_spills);
		aggstate->hash_spills = NULL;
	}

	/* free batches */
	list_free_deep(aggstate->hash_batches);
	aggstate->hash_batches = NIL;

	/* close tape set */
	if (aggstate->hash_tapeset != NULL)
	{
		LogicalTapeSetClose(aggstate->hash_tapeset);
		aggstate->hash_tapeset = NULL;
	}
}
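
/*
 * Illustrative rescan-time usage (a sketch under the assumption that the
 * rescan path drives this function; not copied from this file):
 *
 *		hashagg_reset_spill_state(aggstate);
 *		ReScanExprContext(aggstate->hashcontext);
 *		build_hash_tables(aggstate);
 *
 * i.e. spill state is discarded first, then the hash memory is reset and
 * empty hash tables are rebuilt.
 */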

/* -----------------
 * ExecInitAgg
 *
 *	Creates the run-time information for the agg node produced by the
 *	planner and initializes its outer subtree.
 *
 * -----------------
 */
AggState *
ExecInitAgg(Agg *node, EState *estate, int eflags)
{
	AggState   *aggstate;
	AggStatePerAgg peraggs;
	AggStatePerTrans pertransstates;
	AggStatePerGroup *pergroups;
	Plan	   *outerPlan;
	ExprContext *econtext;
	TupleDesc	scanDesc;
	int			max_aggno;
	int			max_transno;
	int			numaggrefs;
	int			numaggs;
	int			numtrans;
	int			phase;
	int			phaseidx;
	ListCell   *l;
	Bitmapset  *all_grouped_cols = NULL;
	int			numGroupingSets = 1;
	int			numPhases;
	int			numHashes;
	int			i = 0;
	int			j = 0;
	bool		use_hashing = (node->aggstrategy == AGG_HASHED ||
							   node->aggstrategy == AGG_MIXED);
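	/*
	 * AGG_MIXED computes hashed grouping sets while the sorted phases run,
	 * so it needs the hashing infrastructure as well.
	 */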

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
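	/*
	 * Agg never supports backward scan or mark/restore, so the planner
	 * should not pass these flags.
	 */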

	/*
	 * create state structure
	 */
	aggstate = makeNode(AggState);
	aggstate->ss.ps.plan = (Plan *) node;
	aggstate->ss.ps.state = estate;
	aggstate->ss.ps.ExecProcNode = ExecAgg;

	aggstate->aggs = NIL;
	aggstate->numaggs = 0;
	aggstate->numtrans = 0;
	aggstate->aggstrategy = node->aggstrategy;
	aggstate->aggsplit = node->aggsplit;
	aggstate->maxsets = 0;
	aggstate->projected_set = -1;
	aggstate->current_set = 0;
	aggstate->peragg = NULL;
	aggstate->pertrans = NULL;
	aggstate->curperagg = NULL;
	aggstate->curpertrans = NULL;
	aggstate->input_done = false;
	aggstate->agg_done = false;
	aggstate->pergroups = NULL;
	aggstate->grp_firstTuple = NULL;
	aggstate->sort_in = NULL;
	aggstate->sort_out = NULL;

	/*
	 * phases[0] always exists, but is dummy in sorted/plain mode
	 */
	numPhases = (use_hashing ? 1 : 2);
	numHashes = (use_hashing ? 1 : 0);

	/*
	 * Calculate the maximum number of grouping sets in any phase; this
	 * determines the size of some allocations.  Also calculate the number of
	 * phases, since all hashed/mixed nodes contribute to only a single phase.
	 */
	if (node->groupingSets)
	{
		numGroupingSets = list_length(node->groupingSets);

		foreach(l, node->chain)
		{
			Agg		   *agg = lfirst(l);

			numGroupingSets = Max(numGroupingSets,
								  list_length(agg->groupingSets));

			/*
			 * additional AGG_HASHED aggs become part of phase 0, but all
			 * others add an extra phase.
			 */
			if (agg->aggstrategy != AGG_HASHED)
				++numPhases;
			else
				++numHashes;
		}
	}
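
	/*
	 * Worked example (hypothetical plan shape): an AGG_MIXED node whose
	 * chain holds one AGG_SORTED and two AGG_HASHED entries starts with
	 * numPhases = 1 and numHashes = 1; the sorted entry bumps numPhases to
	 * 2, while the two hashed entries bump numHashes to 3, because hashed
	 * grouping sets all execute inside phase 0.
	 */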

	aggstate->maxsets = numGroupingSets;
	aggstate->numphases = numPhases;

	aggstate->aggcontexts = (ExprContext **)
		palloc0(sizeof(ExprContext *) * numGroupingSets);

	/*
	 * Create expression contexts.  We need three or more, one for
	 * per-input-tuple processing, one for per-output-tuple processing, one
	 * for all the hashtables, and one for each grouping set.  The per-tuple
	 * memory context of the per-grouping-set ExprContexts (aggcontexts)
	 * replaces the standalone memory context formerly used to hold transition
	 * values.  We cheat a little by using ExecAssignExprContext() to build
	 * all of them.
	 *
	 * NOTE: the details of what is stored in aggcontexts and what is stored
	 * in the regular per-query memory context are driven by a simple
	 * decision: we want to reset the aggcontext at group boundaries (if not
	 * hashing) and in ExecReScanAgg to recover no-longer-wanted space.
	 */
	ExecAssignExprContext(estate, &aggstate->ss.ps);
	aggstate->tmpcontext = aggstate->ss.ps.ps_ExprContext;

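	/*
	 * Each ExecAssignExprContext() call below installs a fresh ExprContext
	 * in ps_ExprContext, so saving the pointer on every iteration leaves
	 * one private context per grouping set.
	 */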
	for (i = 0; i < numGroupingSets; ++i)
	{
		ExecAssignExprContext(estate, &aggstate->ss.ps);
		aggstate->aggcontexts[i] = aggstate->ss.ps.ps_ExprContext;
	}

	if (use_hashing)
		aggstate->hashcontext = CreateWorkExprContext(estate);

	ExecAssignExprContext(estate, &aggstate->ss.ps);
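	/*
	 * The last call leaves the per-output-tuple ExprContext installed in
	 * ps_ExprContext; the earlier ones were saved in tmpcontext and
	 * aggcontexts[].
	 */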

	/*
	 * Initialize child nodes.
	 *
	 * If we are doing a hashed aggregation then the child plan does not need
	 * to handle REWIND efficiently; see ExecReScanAgg.
	 */
	if (node->aggstrategy == AGG_HASHED)
		eflags &= ~EXEC_FLAG_REWIND;
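	/*
	 * On rescan, a hashed aggregation can typically re-emit results from
	 * its existing hash table instead of re-reading the child, so cheap
	 * REWIND support is not required of the outer plan.
	 */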
	outerPlan = outerPlan(node);
	outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags);

	/*
	 * initialize source tuple type.
	 */
	aggstate->ss.ps.outerops =
		ExecGetResultSlotOps(outerPlanState(&aggstate->ss),
							 &aggstate->ss.ps.outeropsfixed);
	aggstate->ss.ps.outeropsset = true;
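	/*
	 * Recording the outer slot type, and whether it is fixed, lets
	 * expression compilation skip or specialize tuple deforming; if
	 * grouping sets later feed tuplesorts, the outer slot may cease to be
	 * fixed, and phase initialization must account for that.
	 */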
|
|
|
|
|
Introduce notion of different types of slots (without implementing them).
Upcoming work intends to allow pluggable ways to introduce new ways of
storing table data. Accessing those table access methods from the
executor requires TupleTableSlots to be carry tuples in the native
format of such storage methods; otherwise there'll be a significant
conversion overhead.
Different access methods will require different data to store tuples
efficiently (just like virtual, minimal, heap already require fields
in TupleTableSlot). To allow that without requiring additional pointer
indirections, we want to have different structs (embedding
TupleTableSlot) for different types of slots. Thus different types of
slots are needed, which requires adapting creators of slots.
The slot that most efficiently can represent a type of tuple in an
executor node will often depend on the type of slot a child node
uses. Therefore we need to track the type of slot is returned by
nodes, so parent slots can create slots based on that.
Relatedly, JIT compilation of tuple deforming needs to know which type
of slot a certain expression refers to, so it can create an
appropriate deforming function for the type of tuple in the slot.
But not all nodes will only return one type of slot, e.g. an append
node will potentially return different types of slots for each of its
subplans.
Therefore add function that allows to query the type of a node's
result slot, and whether it'll always be the same type (whether it's
fixed). This can be queried using ExecGetResultSlotOps().
The scan, result, inner, outer type of slots are automatically
inferred from ExecInitScanTupleSlot(), ExecInitResultSlot(),
left/right subtrees respectively. If that's not correct for a node,
that can be overwritten using new fields in PlanState.
This commit does not introduce the actually abstracted implementation
of different kind of TupleTableSlots, that will be left for a followup
commit. The different types of slots introduced will, for now, still
use the same backing implementation.
While this already partially invalidates the big comment in
tuptable.h, it seems to make more sense to update it later, when the
different TupleTableSlot implementations actually exist.
Author: Ashutosh Bapat and Andres Freund, with changes by Amit Khandekar
Discussion: https://postgr.es/m/20181105210039.hh4vvi4vwoq5ba2q@alap3.anarazel.de
2018-11-16 07:00:30 +01:00
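For illustration, a minimal sketch of the pattern this commit describes, as a hypothetical node-initialization helper (the helper and its generic use of ScanState are invented; ExecGetResultSlotOps(), ExecGetResultType(), ExecInitScanTupleSlot(), outerPlanState() and TTSOpsVirtual are the real executor interfaces):

/* assumes #include "executor/executor.h" */
static void
my_init_scan_slot(EState *estate, ScanState *ss)
{
	bool		fixed;
	const TupleTableSlotOps *childops =
		ExecGetResultSlotOps(outerPlanState(ss), &fixed);

	/* mirror a fixed child slot type; otherwise fall back to virtual */
	ExecInitScanTupleSlot(estate, ss,
						  ExecGetResultType(outerPlanState(ss)),
						  fixed ? childops : &TTSOpsVirtual);
}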
|
|
|
ExecCreateScanSlotFromOuterPlan(estate, &aggstate->ss,
|
2019-07-25 23:22:52 +02:00
|
|
|
aggstate->ss.ps.outerops);
|
2018-02-16 06:55:31 +01:00
|
|
|
scanDesc = aggstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
|
2019-07-25 23:22:52 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If there are more than two phases (including a potential dummy phase
|
|
|
|
* 0), the input will be resorted using tuplesort, so we need a slot for that.
|
|
|
|
*/
|
|
|
|
if (numPhases > 2)
|
|
|
|
{
|
2018-11-16 07:00:30 +01:00
|
|
|
aggstate->sort_slot = ExecInitExtraTupleSlot(estate, scanDesc,
|
|
|
|
&TTSOpsMinimalTuple);
|
2002-12-05 16:50:39 +01:00
|
|
|
|
2019-07-25 23:22:52 +02:00
|
|
|
/*
|
|
|
|
* The output of the tuplesort and the output from the outer child
|
|
|
|
* might not use the same type of slot. In most cases the child will
|
|
|
|
* be a Sort, and thus return a TTSOpsMinimalTuple type slot - but the
|
2020-02-18 04:20:55 +01:00
|
|
|
* input can also be presorted due to an index, in which case it could be
|
|
|
|
* a different type of slot.
|
2019-07-25 23:22:52 +02:00
|
|
|
*
|
|
|
|
* XXX: For efficiency it would be good to instead/additionally
|
|
|
|
* generate expressions with corresponding settings of outerops* for
|
|
|
|
* the individual phases - deforming is often a bottleneck for
|
|
|
|
* aggregations with lots of rows per group. If there are multiple
|
|
|
|
* sorts, we know that all but the first use TTSOpsMinimalTuple (via
|
|
|
|
* the nodeAgg.c internal tuplesort).
|
|
|
|
*/
|
|
|
|
if (aggstate->ss.ps.outeropsfixed &&
|
|
|
|
aggstate->ss.ps.outerops != &TTSOpsMinimalTuple)
|
|
|
|
aggstate->ss.ps.outeropsfixed = false;
|
|
|
|
}
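To see why the sort slot just created must be a TTSOpsMinimalTuple slot (and hence why a differing, fixed outer slot type forces outeropsfixed to false above): the re-sorted input is read back via tuplesort_gettupleslot(), which stores a MinimalTuple into the supplied slot. A sketch, with a hypothetical helper wrapped around the real tuplesort API:

/* assumes #include "nodes/execnodes.h" and #include "utils/tuplesort.h" */
static void
drain_resorted_input(AggState *aggstate, Tuplesortstate *sort_in)
{
	/* each fetched row arrives in sort_slot as a MinimalTuple */
	while (tuplesort_gettupleslot(sort_in, true, false,
								  aggstate->sort_slot, NULL))
	{
		/* feed the row to the transition functions ... */
	}
}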
|
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
/*
|
2018-02-17 06:17:38 +01:00
|
|
|
* Initialize result type, slot and projection.
|
2002-12-05 16:50:39 +01:00
|
|
|
*/
|
2018-11-16 07:00:30 +01:00
|
|
|
ExecInitResultTupleSlotTL(&aggstate->ss.ps, &TTSOpsVirtual);
|
2007-02-02 01:07:03 +01:00
|
|
|
ExecAssignProjectionInfo(&aggstate->ss.ps, NULL);
|
2002-12-05 16:50:39 +01:00
|
|
|
|
2018-02-17 06:17:38 +01:00
|
|
|
/*
|
|
|
|
* initialize child expressions
|
|
|
|
*
|
|
|
|
* We expect the parser to have checked that no aggs contain other agg
|
|
|
|
* calls in their arguments (and just to be sure, we verify it again while
|
|
|
|
* initializing the plan node). This would make no sense under SQL
|
|
|
|
* semantics, and it's forbidden by the spec. Since that is so, we
|
|
|
|
* don't need to worry about evaluating the aggs in any particular order.
|
|
|
|
*
|
2020-11-24 09:45:00 +01:00
|
|
|
* Note: execExpr.c finds Aggrefs for us, and adds them to aggstate->aggs.
|
|
|
|
* Aggrefs in the qual are found here; Aggrefs in the targetlist are found
|
|
|
|
* during ExecAssignProjectionInfo, above.
|
2018-02-17 06:17:38 +01:00
|
|
|
*/
|
|
|
|
aggstate->ss.ps.qual =
|
|
|
|
ExecInitQual(node->plan.qual, (PlanState *) aggstate);
|
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
/*
|
Faster expression evaluation and targetlist projection.
This replaces the old, recursive tree-walk based evaluation with
non-recursive, opcode-dispatch-based expression evaluation.
Projection is now implemented as part of expression evaluation.
This both leads to significant performance improvements, and makes
future just-in-time compilation of expressions easier.
The speed gains primarily come from:
- non-recursive implementation reduces stack usage / overhead
- simple sub-expressions are implemented with a single jump, without
function calls
- sharing some state between different sub-expressions
- reduced amount of indirect, hard-to-predict memory accesses by laying
out operation metadata sequentially, including the avoidance of
nearly all of the previously used linked lists
- more code has been moved to expression initialization, avoiding
constant re-checks at evaluation time
Future just-in-time compilation (JIT) has become easier, as
demonstrated by released patches intended to be merged in a later
release, for primarily two reasons: Firstly, due to a stricter split
between expression initialization and evaluation, less code has to be
handled by the JIT. Secondly, due to the non-recursive nature of the
generated "instructions", less performance-critical code-paths can
easily be shared between interpreted and compiled evaluation.
The new framework allows for significant future optimizations. E.g.:
- basic infrastructure to later reduce the per-executor-startup
overhead of expression evaluation, by caching state in prepared
statements. That'd be helpful in OLTPish scenarios where
initialization overhead is measurable.
- optimizing the generated "code". A number of proposals for potential
work has already been made.
- optimizing the interpreter. Similarly a number of proposals have
been made here too.
The move of logic into the expression initialization step leads to some
backward-incompatible changes:
- Function permission checks are now done during expression
initialization, whereas previously they were done during
execution. In edge cases this can lead to errors being raised that
previously wouldn't have been, e.g. a NULL array being coerced to a
different array type previously didn't perform checks.
- The set of domain constraints to be checked is now evaluated once
during expression initialization, whereas previously it was rebuilt
every time a domain check was evaluated. For normal queries this
doesn't change much, but e.g. for plpgsql functions, which cache
ExprStates, the old set could stick around longer. The behavior
in this area might still change.
Author: Andres Freund, with significant changes by Tom Lane,
changes by Heikki Linnakangas
Reviewed-By: Tom Lane, Heikki Linnakangas
Discussion: https://postgr.es/m/20161206034955.bh33paeralxbtluv@alap3.anarazel.de
2017-03-14 23:45:36 +01:00
|
|
|
* We should now have found all Aggrefs in the targetlist and quals.
|
2002-12-05 16:50:39 +01:00
|
|
|
*/
|
2020-11-24 09:45:00 +01:00
|
|
|
numaggrefs = list_length(aggstate->aggs);
|
|
|
|
max_aggno = -1;
|
|
|
|
max_transno = -1;
|
|
|
|
foreach(l, aggstate->aggs)
|
|
|
|
{
|
|
|
|
Aggref *aggref = (Aggref *) lfirst(l);
|
|
|
|
|
|
|
|
max_aggno = Max(max_aggno, aggref->aggno);
|
|
|
|
max_transno = Max(max_transno, aggref->aggtransno);
|
|
|
|
}
|
|
|
|
numaggs = max_aggno + 1;
|
|
|
|
numtrans = max_transno + 1;
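As a standalone toy illustration (not PostgreSQL code) of the numbering the loop above relies on: the planner may assign two Aggrefs the same aggtransno when they can share a transition state (e.g. avg() and sum() over the same numeric column, which use the same transition function), so numtrans can end up smaller than numaggs. The values below are assumed for the example:

#include <stdio.h>

struct toy_aggref
{
	int			aggno;		/* identifies the aggregate's result */
	int			aggtransno; /* identifies its transition state */
};

int
main(void)
{
	/* two aggregates sharing one transition state */
	struct toy_aggref aggs[] = {{0, 0}, {1, 0}};
	int			max_aggno = -1;
	int			max_transno = -1;

	for (int i = 0; i < 2; i++)
	{
		if (aggs[i].aggno > max_aggno)
			max_aggno = aggs[i].aggno;
		if (aggs[i].aggtransno > max_transno)
			max_transno = aggs[i].aggtransno;
	}
	printf("numaggs = %d, numtrans = %d\n",
		   max_aggno + 1, max_transno + 1);	/* numaggs = 2, numtrans = 1 */
	return 0;
}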
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-01-11 00:54:24 +01:00
|
|
|
/*
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL-standard functionality allows aggregating data by several
different GROUP BY clauses at once. Each grouping set returns rows with
the columns it does not group by set to NULL; e.g. GROUP BY GROUPING
SETS ((a), (b)) returns a-grouped rows with b NULL and b-grouped rows
with a NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step, thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation, or a mix of hashing and sorting, is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of representing the chain of sort and aggregation steps as
full-blown planner and executor nodes, as earlier versions of the patch
did, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg nodes to describe each
phase, but they're not part of the plan tree; instead they are additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing GROUP BY plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows
skipping the sorting step if the underlying data is sorted by other
means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, nor has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
* For each phase, prepare grouping set data and fmgr lookup data for
|
|
|
|
* compare functions. Accumulate all_grouped_cols in passing.
|
2003-01-11 00:54:24 +01:00
|
|
|
*/
|
2015-05-16 03:40:59 +02:00
|
|
|
aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData));
|
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
aggstate->num_hashes = numHashes;
|
|
|
|
if (numHashes)
|
|
|
|
{
|
|
|
|
aggstate->perhash = palloc0(sizeof(AggStatePerHashData) * numHashes);
|
|
|
|
aggstate->phases[0].numsets = 0;
|
|
|
|
aggstate->phases[0].gset_lengths = palloc(numHashes * sizeof(int));
|
|
|
|
aggstate->phases[0].grouped_cols = palloc(numHashes * sizeof(Bitmapset *));
|
|
|
|
}
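As a worked picture of what the phase loop below fills in (all values here are assumed for illustration): for GROUP BY GROUPING SETS ((a), (b)) computed entirely by hashing, with a and b at attribute numbers 1 and 2, numHashes is 2 and phase 0 describes both single-column sets. A toy sketch using the real bitmapset API:

/* assumes #include "nodes/bitmapset.h" */
static void
toy_hashed_layout(void)
{
	Bitmapset  *set_a = bms_add_member(NULL, 1);	/* grouped_cols[0] = {a} */
	Bitmapset  *set_b = bms_add_member(NULL, 2);	/* grouped_cols[1] = {b} */
	Bitmapset  *all_cols = bms_add_members(bms_copy(set_a), set_b);

	/* phases[0].numsets = 2; gset_lengths[] = {1, 1}; all_cols = {a, b} */
	(void) all_cols;
}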
|
|
|
|
|
|
|
|
phase = 0;
|
|
|
|
for (phaseidx = 0; phaseidx <= list_length(node->chain); ++phaseidx)
|
2003-01-11 00:54:24 +01:00
|
|
|
{
|
2015-05-16 03:40:59 +02:00
|
|
|
Agg *aggnode;
|
|
|
|
Sort *sortnode;
|
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
if (phaseidx > 0)
|
2015-05-16 03:40:59 +02:00
|
|
|
{
|
Improve castNode notation by introducing list-extraction-specific variants.
This extends the castNode() notation introduced by commit 5bcab1114 to
provide, in one step, extraction of a list cell's pointer and coercion to
a concrete node type. For example, "lfirst_node(Foo, lc)" is the same
as "castNode(Foo, lfirst(lc))". Almost half of the uses of castNode
that have appeared so far include a list extraction call, so this is
pretty widely useful, and it saves a few more keystrokes compared to the
old way.
As with the previous patch, back-patch the addition of these macros to
pg_list.h, so that the notation will be available when back-patching.
Patch by me, after an idea of Andrew Gierth's.
Discussion: https://postgr.es/m/14197.1491841216@sss.pgh.pa.us
2017-04-10 19:51:29 +02:00
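A minimal sketch of the notation introduced above; the equivalence is the one stated by the commit message, while the helper itself is hypothetical:

/* assumes #include "postgres.h", "nodes/pg_list.h" and "nodes/plannodes.h" */
static Agg *
toy_first_agg(ListCell *lc)
{
	Agg		   *a1 = castNode(Agg, lfirst(lc)); /* older spelling */
	Agg		   *a2 = lfirst_node(Agg, lc);		/* new shorthand */

	Assert(a1 == a2);
	return a1;
}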
|
|
|
aggnode = list_nth_node(Agg, node->chain, phaseidx - 1);
|
2017-01-27 01:47:03 +01:00
|
|
|
sortnode = castNode(Sort, aggnode->plan.lefttree);
|
2015-05-16 03:40:59 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
aggnode = node;
|
|
|
|
sortnode = NULL;
|
|
|
|
}
|
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
Assert(phase <= 1 || sortnode);
|
2015-05-16 03:40:59 +02:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
if (aggnode->aggstrategy == AGG_HASHED
|
|
|
|
|| aggnode->aggstrategy == AGG_MIXED)
|
2015-05-16 03:40:59 +02:00
|
|
|
{
|
2017-03-27 05:20:54 +02:00
|
|
|
AggStatePerPhase phasedata = &aggstate->phases[0];
|
|
|
|
AggStatePerHash perhash;
|
|
|
|
Bitmapset *cols = NULL;
|
2015-05-16 03:40:59 +02:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
Assert(phase == 0);
|
|
|
|
i = phasedata->numsets++;
|
|
|
|
perhash = &aggstate->perhash[i];
|
2015-05-16 03:40:59 +02:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
/* phase 0 always points to the "real" Agg in the hash case */
|
|
|
|
phasedata->aggnode = node;
|
|
|
|
phasedata->aggstrategy = node->aggstrategy;
|
2015-05-16 03:40:59 +02:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
/* but the actual Agg node representing this hash is saved here */
|
|
|
|
perhash->aggnode = aggnode;
|
|
|
|
|
|
|
|
phasedata->gset_lengths[i] = perhash->numCols = aggnode->numCols;
|
|
|
|
|
|
|
|
for (j = 0; j < aggnode->numCols; ++j)
|
|
|
|
cols = bms_add_member(cols, aggnode->grpColIdx[j]);
|
|
|
|
|
|
|
|
phasedata->grouped_cols[i] = cols;
|
2015-05-16 03:40:59 +02:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
all_grouped_cols = bms_add_members(all_grouped_cols, cols);
|
|
|
|
continue;
|
2015-05-16 03:40:59 +02:00
|
|
|
}
|
2003-06-23 00:04:55 +02:00
|
|
|
else
|
2015-05-16 03:40:59 +02:00
|
|
|
{
|
2017-03-27 05:20:54 +02:00
|
|
|
AggStatePerPhase phasedata = &aggstate->phases[++phase];
|
|
|
|
int num_sets;
|

            phasedata->numsets = num_sets = list_length(aggnode->groupingSets);

            if (num_sets)
            {
                phasedata->gset_lengths = palloc(num_sets * sizeof(int));
                phasedata->grouped_cols = palloc(num_sets * sizeof(Bitmapset *));

                i = 0;
                foreach(l, aggnode->groupingSets)
                {
                    int         current_length = list_length(lfirst(l));
                    Bitmapset  *cols = NULL;

                    /* planner forces this to be correct */
                    for (j = 0; j < current_length; ++j)
                        cols = bms_add_member(cols, aggnode->grpColIdx[j]);

                    phasedata->grouped_cols[i] = cols;
                    phasedata->gset_lengths[i] = current_length;

                    ++i;
                }

                all_grouped_cols = bms_add_members(all_grouped_cols,
                                                   phasedata->grouped_cols[0]);
            }
            else
            {
                Assert(phaseidx == 0);

                phasedata->gset_lengths = NULL;
                phasedata->grouped_cols = NULL;
            }

            /*
             * If we are grouping, precompute fmgr lookup data for inner loop.
             */
            if (aggnode->aggstrategy == AGG_SORTED)
            {
                int         i = 0;

                Assert(aggnode->numCols > 0);

                /*
                 * Build a separate function for each subset of columns that
                 * need to be compared.
                 */
                phasedata->eqfunctions =
                    (ExprState **) palloc0(aggnode->numCols * sizeof(ExprState *));

                /* for each grouping set */
                for (i = 0; i < phasedata->numsets; i++)
                {
                    int         length = phasedata->gset_lengths[i];

                    if (phasedata->eqfunctions[length - 1] != NULL)
                        continue;

                    phasedata->eqfunctions[length - 1] =
                        execTuplesMatchPrepare(scanDesc,
                                               length,
                                               aggnode->grpColIdx,
                                               aggnode->grpOperators,
                                               aggnode->grpCollations,
                                               (PlanState *) aggstate);
                }

                /* and for all grouped columns, unless already computed */
                if (phasedata->eqfunctions[aggnode->numCols - 1] == NULL)
                {
                    phasedata->eqfunctions[aggnode->numCols - 1] =
                        execTuplesMatchPrepare(scanDesc,
                                               aggnode->numCols,
                                               aggnode->grpColIdx,
                                               aggnode->grpOperators,
                                               aggnode->grpCollations,
                                               (PlanState *) aggstate);
                }
            }

            phasedata->aggnode = aggnode;
            phasedata->aggstrategy = aggnode->aggstrategy;
            phasedata->sortnode = sortnode;
        }
    }
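
    /*
     * (For instance, per the grouping-sets commit message: GROUP BY GROUPING
     * SETS ((a, b), (a), ()) computes the same rows as three separate GROUP
     * BY queries conjoined by UNION ALL, but normally needs only one scan of
     * the underlying data; sets that share a sort order are evaluated within
     * a single phase set up in the loop above.)
     */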

    /*
     * Convert all_grouped_cols to a descending-order list.
     */
    i = -1;
    while ((i = bms_next_member(all_grouped_cols, i)) >= 0)
        aggstate->all_grouped_cols = lcons_int(i, aggstate->all_grouped_cols);
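
    /*
     * (bms_next_member visits members in ascending order and lcons_int
     * prepends, which is what produces the descending list.)
     */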

    /*
     * Set up aggregate-result storage in the output expr context, and also
     * allocate my private per-agg working storage
     */
    econtext = aggstate->ss.ps.ps_ExprContext;
    econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs);
    econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs);

    peraggs = (AggStatePerAgg) palloc0(sizeof(AggStatePerAggData) * numaggs);
    pertransstates = (AggStatePerTrans) palloc0(sizeof(AggStatePerTransData) * numtrans);

    aggstate->peragg = peraggs;
    aggstate->pertrans = pertransstates;
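
    /*
     * (all_pergroups holds one per-group state array per grouping set used
     * by the sort-based phases, followed by one entry per hash table.)
     */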
    aggstate->all_pergroups =
        (AggStatePerGroup *) palloc0(sizeof(AggStatePerGroup)
                                     * (numGroupingSets + numHashes));
    pergroups = aggstate->all_pergroups;

    if (node->aggstrategy != AGG_HASHED)
    {
        for (i = 0; i < numGroupingSets; i++)
        {
            pergroups[i] = (AggStatePerGroup) palloc0(sizeof(AggStatePerGroupData)
                                                      * numaggs);
        }

        aggstate->pergroups = pergroups;
        pergroups += numGroupingSets;
    }

    /*
     * Hashing can only appear in the initial phase.
     */
    if (use_hashing)
    {
        Plan       *outerplan = outerPlan(node);
        uint64      totalGroups = 0;
        int         i;

        aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
                                                       "HashAgg meta context",
                                                       ALLOCSET_DEFAULT_SIZES);
        aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
                                                            &TTSOpsMinimalTuple);
        aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
                                                            &TTSOpsVirtual);

        /* this is an array of pointers, not structures */
        aggstate->hash_pergroup = pergroups;

        aggstate->hashentrysize = hash_agg_entry_size(aggstate->numtrans,
                                                      outerplan->plan_width,
                                                      node->transitionSpace);

        /*
         * Consider all of the grouping sets together when setting the limits
         * and estimating the number of partitions. This can be inaccurate
         * when there is more than one grouping set, but should still be
         * reasonable.
         */
        for (i = 0; i < aggstate->num_hashes; i++)
            totalGroups += aggstate->perhash[i].aggnode->numGroups;

        hash_agg_set_limits(aggstate->hashentrysize, totalGroups, 0,
                            &aggstate->hash_mem_limit,
                            &aggstate->hash_ngroups_limit,
                            &aggstate->hash_planned_partitions);
        find_hash_columns(aggstate);

        /* Skip massive memory allocation if we are just doing EXPLAIN */
        if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
            build_hash_tables(aggstate);

        aggstate->table_filled = false;

        /* Initialize this to 1, meaning nothing spilled, yet */
        aggstate->hash_batches_used = 1;
    }
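
    /*
     * (Per the disk-based hash aggregation design: once hash table memory
     * use would exceed hash_mem_limit, execution enters spill mode, in which
     * tuples belonging to not-yet-seen groups are written out to spill
     * partitions and processed in later batches, while groups already in the
     * table continue to be advanced in place.)
     */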

    /*
     * Initialize current phase-dependent values to initial phase. The initial
     * phase is 1 (first sort pass) for all strategies that use sorting (if
     * hashing is being done too, then phase 0 is processed last); but if only
     * hashing is being done, then phase 0 is all there is.
     */
    if (node->aggstrategy == AGG_HASHED)
    {
        aggstate->current_phase = 0;
        initialize_phase(aggstate, 0);
        select_current_set(aggstate, 0, true);
    }
    else
    {
        aggstate->current_phase = 1;
        initialize_phase(aggstate, 1);
        select_current_set(aggstate, 0, false);
    }
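
    /*
     * (The third argument of select_current_set() indicates whether the
     * current grouping set is a hashed one, hence true only in the
     * AGG_HASHED case above.)
     */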

    /*
     * Perform lookups of aggregate function info, and initialize the
     * unchanging fields of the per-agg and per-trans data.
     */
    foreach(l, aggstate->aggs)
    {
        Aggref     *aggref = lfirst(l);
        AggStatePerAgg peragg;
        AggStatePerTrans pertrans;
        Oid         aggTransFnInputTypes[FUNC_MAX_ARGS];
        int         numAggTransFnArgs;
        int         numDirectArgs;
        HeapTuple   aggTuple;
        Form_pg_aggregate aggform;
        AclResult   aclresult;
        Oid         finalfn_oid;
        Oid         serialfn_oid,
                    deserialfn_oid;
        Oid         aggOwner;
        Expr       *finalfnexpr;
        Oid         aggtranstype;

        /* Planner should have assigned aggregate to correct level */
        Assert(aggref->agglevelsup == 0);
        /* ... and the split mode should match */
        Assert(aggref->aggsplit == aggstate->aggsplit);

        peragg = &peraggs[aggref->aggno];

        /* Check if we initialized the state for this aggregate already. */
        if (peragg->aggref != NULL)
            continue;

        peragg->aggref = aggref;
        peragg->transno = aggref->aggtransno;
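
        /*
         * (Several Aggrefs may share the same aggno/aggtransno; only the
         * first occurrence initializes the state, and duplicates are skipped
         * via the aggref != NULL test above.)
         */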

        /* Fetch the pg_aggregate row */
        aggTuple = SearchSysCache1(AGGFNOID,
                                   ObjectIdGetDatum(aggref->aggfnoid));
        if (!HeapTupleIsValid(aggTuple))
            elog(ERROR, "cache lookup failed for aggregate %u",
                 aggref->aggfnoid);
        aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);

        /* Check permission to call aggregate function */
        aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(),
                                     ACL_EXECUTE);
        if (aclresult != ACLCHECK_OK)
            aclcheck_error(aclresult, OBJECT_AGGREGATE,
                           get_func_name(aggref->aggfnoid));
        InvokeFunctionExecuteHook(aggref->aggfnoid);

        /* planner recorded transition state type in the Aggref itself */
        aggtranstype = aggref->aggtranstype;
        Assert(OidIsValid(aggtranstype));

        /* Final function only required if we're finalizing the aggregates */
        if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit))
            peragg->finalfn_oid = finalfn_oid = InvalidOid;
        else
            peragg->finalfn_oid = finalfn_oid = aggform->aggfinalfn;

        serialfn_oid = InvalidOid;
        deserialfn_oid = InvalidOid;

        /*
         * Check if serialization/deserialization is required. We only do it
         * for aggregates that have transtype INTERNAL.
         */
        if (aggtranstype == INTERNALOID)
        {
            /*
             * The planner should only have generated a serialize agg node if
             * every aggregate with an INTERNAL state has a serialization
             * function. Verify that.
             */
            if (DO_AGGSPLIT_SERIALIZE(aggstate->aggsplit))
            {
                /* serialization only valid when not running finalfn */
                Assert(DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));

                if (!OidIsValid(aggform->aggserialfn))
                    elog(ERROR, "serialfunc not provided for serialization aggregation");
                serialfn_oid = aggform->aggserialfn;
            }

            /* Likewise for deserialization functions */
            if (DO_AGGSPLIT_DESERIALIZE(aggstate->aggsplit))
            {
                /* deserialization only valid when combining states */
                Assert(DO_AGGSPLIT_COMBINE(aggstate->aggsplit));

                if (!OidIsValid(aggform->aggdeserialfn))
                    elog(ERROR, "deserialfunc not provided for deserialization aggregation");
                deserialfn_oid = aggform->aggdeserialfn;
            }
        }

        /* Check that aggregate owner has permission to call component fns */
        {
            HeapTuple   procTuple;

            procTuple = SearchSysCache1(PROCOID,
                                        ObjectIdGetDatum(aggref->aggfnoid));
            if (!HeapTupleIsValid(procTuple))
                elog(ERROR, "cache lookup failed for function %u",
                     aggref->aggfnoid);
            aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner;
            ReleaseSysCache(procTuple);

            if (OidIsValid(finalfn_oid))
            {
                aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner,
                                             ACL_EXECUTE);
                if (aclresult != ACLCHECK_OK)
                    aclcheck_error(aclresult, OBJECT_FUNCTION,
                                   get_func_name(finalfn_oid));
                InvokeFunctionExecuteHook(finalfn_oid);
            }
            if (OidIsValid(serialfn_oid))
            {
                aclresult = pg_proc_aclcheck(serialfn_oid, aggOwner,
                                             ACL_EXECUTE);
                if (aclresult != ACLCHECK_OK)
                    aclcheck_error(aclresult, OBJECT_FUNCTION,
                                   get_func_name(serialfn_oid));
                InvokeFunctionExecuteHook(serialfn_oid);
            }
            if (OidIsValid(deserialfn_oid))
            {
                aclresult = pg_proc_aclcheck(deserialfn_oid, aggOwner,
                                             ACL_EXECUTE);
                if (aclresult != ACLCHECK_OK)
                    aclcheck_error(aclresult, OBJECT_FUNCTION,
                                   get_func_name(deserialfn_oid));
                InvokeFunctionExecuteHook(deserialfn_oid);
            }
        }

        /*
         * Get actual datatypes of the (nominal) aggregate inputs. These
         * could be different from the agg's declared input types, when the
         * agg accepts ANY or a polymorphic type.
         */
        numAggTransFnArgs = get_aggregate_argtypes(aggref,
                                                   aggTransFnInputTypes);

        /* Count the "direct" arguments, if any */
        numDirectArgs = list_length(aggref->aggdirectargs);

        /* Detect how many arguments to pass to the finalfn */
        if (aggform->aggfinalextra)
            peragg->numFinalArgs = numAggTransFnArgs + 1;
        else
            peragg->numFinalArgs = numDirectArgs + 1;

        /* Initialize any direct-argument expressions */
        peragg->aggdirectargs = ExecInitExprList(aggref->aggdirectargs,
                                                 (PlanState *) aggstate);

        /*
         * build expression trees using actual argument & result types for the
         * finalfn, if it exists and is required.
         */
        if (OidIsValid(finalfn_oid))
        {
            build_aggregate_finalfn_expr(aggTransFnInputTypes,
                                         peragg->numFinalArgs,
                                         aggtranstype,
                                         aggref->aggtype,
                                         aggref->inputcollid,
                                         finalfn_oid,
                                         &finalfnexpr);
            fmgr_info(finalfn_oid, &peragg->finalfn);
            fmgr_info_set_expr((Node *) finalfnexpr, &peragg->finalfn);
        }

        /* get info about the output value's datatype */
        get_typlenbyval(aggref->aggtype,
                        &peragg->resulttypeLen,
                        &peragg->resulttypeByVal);

        /*
         * Build working state for invoking the transition function, if we
         * haven't done it already.
         */
        pertrans = &pertransstates[aggref->aggtransno];
        if (pertrans->aggref == NULL)
        {
            Datum       textInitVal;
            Datum       initValue;
            bool        initValueIsNull;
            Oid         transfn_oid;

            /*
             * If this aggregation is performing state combines, then instead
             * of using the transition function, we'll use the combine
             * function.
             */
            if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
            {
                transfn_oid = aggform->aggcombinefn;

                /* If not set then the planner messed up */
                if (!OidIsValid(transfn_oid))
                    elog(ERROR, "combinefn not set for aggregate function");
            }
            else
                transfn_oid = aggform->aggtransfn;

            aclresult = pg_proc_aclcheck(transfn_oid, aggOwner, ACL_EXECUTE);
            if (aclresult != ACLCHECK_OK)
                aclcheck_error(aclresult, OBJECT_FUNCTION,
                               get_func_name(transfn_oid));
            InvokeFunctionExecuteHook(transfn_oid);

            /*
             * initval is potentially null, so don't try to access it as a
             * struct field. Must do it the hard way with SysCacheGetAttr.
             */
            textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
                                          Anum_pg_aggregate_agginitval,
                                          &initValueIsNull);
            if (initValueIsNull)
                initValue = (Datum) 0;
            else
                initValue = GetAggInitVal(textInitVal, aggtranstype);
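
            /*
             * (GetAggInitVal converts the textual agginitval stored in
             * pg_aggregate into a Datum of the transition data type.)
             */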

            if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
            {
                Oid         combineFnInputTypes[] = {aggtranstype,
                                                     aggtranstype};

                /*
                 * When combining there's only one input, the to-be-combined
                 * transition value. The transition value is not counted
                 * here.
                 */
                pertrans->numTransInputs = 1;

                /* aggcombinefn always has two arguments of aggtranstype */
                build_pertrans_for_aggref(pertrans, aggstate, estate,
                                          aggref, transfn_oid, aggtranstype,
                                          serialfn_oid, deserialfn_oid,
                                          initValue, initValueIsNull,
                                          combineFnInputTypes, 2);

                /*
                 * Ensure that a combine function to combine INTERNAL states
                 * is not strict. This should have been checked during CREATE
                 * AGGREGATE, but the strict property could have been changed
                 * since then.
                 */
                if (pertrans->transfn.fn_strict && aggtranstype == INTERNALOID)
                    ereport(ERROR,
                            (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
                             errmsg("combine function with transition type %s must not be declared STRICT",
                                    format_type_be(aggtranstype))));
            }
            else
            {
                /* Detect how many arguments to pass to the transfn */
                if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
                    pertrans->numTransInputs = list_length(aggref->args);
                else
                    pertrans->numTransInputs = numAggTransFnArgs;

                build_pertrans_for_aggref(pertrans, aggstate, estate,
                                          aggref, transfn_oid, aggtranstype,
                                          serialfn_oid, deserialfn_oid,
                                          initValue, initValueIsNull,
                                          aggTransFnInputTypes,
                                          numAggTransFnArgs);

                /*
                 * If the transfn is strict and the initval is NULL, make sure
                 * input type and transtype are the same (or at least
                 * binary-compatible), so that it's OK to use the first
                 * aggregated input value as the initial transValue. This
                 * should have been checked at agg definition time, but we
                 * must check again in case the transfn's strictness property
                 * has been changed.
                 */
                if (pertrans->transfn.fn_strict && pertrans->initValueIsNull)
                {
                    if (numAggTransFnArgs <= numDirectArgs ||
                        !IsBinaryCoercible(aggTransFnInputTypes[numDirectArgs],
                                           aggtranstype))
                        ereport(ERROR,
                                (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
                                 errmsg("aggregate %u needs to have compatible input type and transition type",
                                        aggref->aggfnoid)));
                }
            }
        }
        else
            pertrans->aggshared = true;
        ReleaseSysCache(aggTuple);
    }

    /*
     * Update aggstate->numaggs to be the number of unique aggregates found.
     * Also set numtrans to the number of unique transition states found.
     */
    aggstate->numaggs = numaggs;
    aggstate->numtrans = numtrans;

    /*
     * Last, check whether any more aggregates got added onto the node while
     * we processed the expressions for the aggregate arguments (including not
     * only the regular arguments and FILTER expressions handled immediately
     * above, but any direct arguments we might've handled earlier). If so,
     * we have nested aggregate functions, which is semantically nonsensical,
     * so complain. (This should have been caught by the parser, so we don't
     * need to work hard on a helpful error message; but we defend against it
     * here anyway, just to be sure.)
     */
    if (numaggrefs != list_length(aggstate->aggs))
        ereport(ERROR,
                (errcode(ERRCODE_GROUPING_ERROR),
                 errmsg("aggregate function calls cannot be nested")));

    /*
     * Build expressions doing all the transition work at once. We build a
     * different one for each phase, as the number of transition function
     * invocations can differ between phases. Note this'll work both for
     * transition and combination functions (although there'll only be one
     * phase in the latter case).
     */
    for (phaseidx = 0; phaseidx < aggstate->numphases; phaseidx++)
    {
        AggStatePerPhase phase = &aggstate->phases[phaseidx];
        bool        dohash = false;
        bool        dosort = false;

        /* phase 0 doesn't necessarily exist */
        if (!phase->aggnode)
            continue;

        if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 1)
        {
            /*
             * Phase one, and only phase one, in a mixed agg performs both
             * sorting and aggregation.
             */
            dohash = true;
            dosort = true;
        }
        else if (aggstate->aggstrategy == AGG_MIXED && phaseidx == 0)
        {
            /*
             * No need to compute a transition function for an AGG_MIXED phase
             * 0 - the contents of the hashtables will have been computed
             * during phase 1.
             */
            continue;
        }
        else if (phase->aggstrategy == AGG_PLAIN ||
                 phase->aggstrategy == AGG_SORTED)
        {
            dohash = false;
            dosort = true;
        }
        else if (phase->aggstrategy == AGG_HASHED)
        {
            dohash = true;
            dosort = false;
        }
        else
            Assert(false);
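
        /*
         * (ExecBuildAggTrans compiles all the per-input-tuple transition
         * work for this phase into a single ExprState; the last argument
         * selects whether a NULL check on the input slot is built in.)
         */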
2016-12-01 01:08:11 +01:00
|
|
|
|
2020-03-05 02:20:20 +01:00
|
|
|
phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash,
|
|
|
|
false);
|
2016-12-01 01:08:11 +01:00
|
|
|
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
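A minimal sketch of the spill design described in the message above, assuming
hypothetical helpers (lookup_hash_entry_existing, hashagg_spill_tuple); this
is not the actual nodeAgg.c control flow, only an illustration of "advance
existing groups, spill would-be new ones".

/*
 * Hedged sketch of spill mode: once work_mem is exceeded, tuples whose
 * group already exists keep advancing it, while tuples that would create
 * a new group are written to a spill partition for a later batch.
 */
static void
advance_or_spill(AggState *aggstate, TupleTableSlot *slot, uint32 hash)
{
    AggStatePerGroup pergroup;

    /* look up the group, but never create one while in spill mode */
    pergroup = lookup_hash_entry_existing(aggstate, hash);  /* hypothetical */

    if (pergroup != NULL)
    {
        /* existing group: advance its transition state as usual */
        aggstate->all_pergroups[0] = pergroup;
        advance_aggregates(aggstate);
    }
    else
    {
        /* would-be new group: defer the tuple to a partitioned tape */
        hashagg_spill_tuple(aggstate, slot, hash);          /* hypothetical */
    }
}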
|
|
|
/* cache compiled expression for outer slot without NULL check */
|
|
|
|
phase->evaltrans_cache[0][0] = phase->evaltrans;
|
2018-01-09 22:25:38 +01:00
|
|
|
}
|
Restore nodeAgg.c's ability to check for improperly-nested aggregates.
While poking around in the aggregate logic, I noticed that commit
8ed3f11bb broke the logic in nodeAgg.c that purports to detect nested
aggregates, by moving initialization of regular aggregate argument
expressions out of the code segment that checks for that.
You could argue that this check is unnecessary, but it's not much code
so I'm inclined to keep it as a backstop against parser and planner
bugs. However, there's certainly zero value in checking only some of
the subexpressions.
We can make the check complete again, and as a bonus make it a good
deal more bulletproof against future mistakes of the same ilk, by
moving it out to the outermost level of ExecInitAgg. This means we
need to check only once per Agg node, not once per aggregate, which
also seems like a good thing --- if the check does find something
wrong, it's not urgent that we report it before the plan node
initialization finishes.
Since this requires remembering the original length of the aggs list,
I deleted a long-obsolete stanza that changed numaggs from 0 to 1.
That's so old it predates our decision that palloc(0) is a valid
operation, in (digs...) 2004, see commit 24a1e20f1.
In passing improve a few comments.
Back-patch to v10, just in case.
2017-10-16 01:19:18 +02:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
return aggstate;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Build the state needed to calculate a state value for an aggregate.
|
|
|
|
*
|
|
|
|
* This initializes all the fields in 'pertrans'. 'aggref' is the aggregate
|
2021-07-04 08:47:31 +02:00
|
|
|
* to initialize the state for. 'transfn_oid', 'aggtranstype', and the rest
|
2015-08-04 16:53:10 +02:00
|
|
|
* of the arguments could be calculated from 'aggref', but the caller has
|
|
|
|
* calculated them already, so might as well pass them.
|
2021-07-04 08:47:31 +02:00
|
|
|
*
|
|
|
|
* 'transfn_oid' may be either the Oid of the aggtransfn or the aggcombinefn.
|
2015-08-04 16:53:10 +02:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
build_pertrans_for_aggref(AggStatePerTrans pertrans,
|
|
|
|
AggState *aggstate, EState *estate,
|
|
|
|
Aggref *aggref,
|
2021-07-04 08:47:31 +02:00
|
|
|
Oid transfn_oid, Oid aggtranstype,
|
2016-03-29 21:04:05 +02:00
|
|
|
Oid aggserialfn, Oid aggdeserialfn,
|
2015-08-04 16:53:10 +02:00
|
|
|
Datum initValue, bool initValueIsNull,
|
|
|
|
Oid *inputTypes, int numArguments)
|
|
|
|
{
|
|
|
|
int numGroupingSets = Max(aggstate->maxsets, 1);
|
2021-07-04 08:47:31 +02:00
|
|
|
Expr *transfnexpr;
|
|
|
|
int numTransArgs;
|
2016-03-29 21:04:05 +02:00
|
|
|
Expr *serialfnexpr = NULL;
|
|
|
|
Expr *deserialfnexpr = NULL;
|
2015-08-04 16:53:10 +02:00
|
|
|
ListCell *lc;
|
|
|
|
int numInputs;
|
|
|
|
int numDirectArgs;
|
|
|
|
List *sortlist;
|
|
|
|
int numSortCols;
|
|
|
|
int numDistinctCols;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* Begin filling in the pertrans data */
|
|
|
|
pertrans->aggref = aggref;
|
2017-10-16 21:51:23 +02:00
|
|
|
pertrans->aggshared = false;
|
2015-08-04 16:53:10 +02:00
|
|
|
pertrans->aggCollation = aggref->inputcollid;
|
2021-07-04 08:47:31 +02:00
|
|
|
pertrans->transfn_oid = transfn_oid;
|
2016-03-29 21:04:05 +02:00
|
|
|
pertrans->serialfn_oid = aggserialfn;
|
|
|
|
pertrans->deserialfn_oid = aggdeserialfn;
|
2015-08-04 16:53:10 +02:00
|
|
|
pertrans->initValue = initValue;
|
|
|
|
pertrans->initValueIsNull = initValueIsNull;
|
|
|
|
|
|
|
|
/* Count the "direct" arguments, if any */
|
|
|
|
numDirectArgs = list_length(aggref->aggdirectargs);
|
|
|
|
|
|
|
|
/* Count the number of aggregated input columns */
|
|
|
|
pertrans->numInputs = numInputs = list_length(aggref->args);
|
|
|
|
|
|
|
|
pertrans->aggtranstype = aggtranstype;
|
|
|
|
|
2021-07-04 08:47:31 +02:00
|
|
|
/* account for the current transition state */
|
|
|
|
numTransArgs = pertrans->numTransInputs + 1;
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
/*
|
2021-07-04 08:47:31 +02:00
|
|
|
* Set up infrastructure for calling the transfn. Note that invtrans is
|
|
|
|
* not needed here.
|
2015-08-04 16:53:10 +02:00
|
|
|
*/
|
2021-07-04 08:47:31 +02:00
|
|
|
build_aggregate_transfn_expr(inputTypes,
|
|
|
|
numArguments,
|
|
|
|
numDirectArgs,
|
|
|
|
aggref->aggvariadic,
|
|
|
|
aggtranstype,
|
|
|
|
aggref->inputcollid,
|
|
|
|
transfn_oid,
|
|
|
|
InvalidOid,
|
|
|
|
&transfnexpr,
|
|
|
|
NULL);
|
Minimally fix partial aggregation for aggregates that don't have one argument.
For partial aggregation combine steps,
AggStatePerTrans->numTransInputs was set to the transition function's
number of inputs, rather than the combine function's number of
inputs (always 1).
That led partial aggregates with strict combine functions to
wrongly check for NOT NULL input as required by strictness. When the
aggregate wasn't exactly passed one argument, the strictness check was
either omitted (in the 0 args case) or too many arguments were
checked. In the latter case we'd read beyond the end of
FunctionCallInfoData->args (only in master).
AggStatePerTrans->numTransInputs actually has been wrong since
9.6, where partial aggregates were added. But it turns out to not be
an active problem in 9.6 and 10, because numTransInputs wasn't used at
all for combine functions: Before c253b722f6 there simply was no NULL
check for the input to strict trans functions, and after that the
check was simply hardcoded for the right offset in fcinfo, as it's
done by code specific to combine functions.
In bf6c614a2f2 (11) the strictness check was generalized, with common
code doing the strictness checks for both plain and combine transition
functions, based on numTransInputs. For combine functions this led to
not emitting an expression step to check for strict input in the 0
arguments case, and in the > 1 arguments case, we'd check too many
arguments. Due to the fact that the relevant fcinfo->isnull[2..] was
always zero-initialized (more or less by accident, by being part of
the AggStatePerTrans struct, which is palloc0'ed), there was no
observable damage in the latter case before a9c35cf85ca1f, we just
checked too many array elements.
Due to the changes in a9c35cf85ca1f, the > 1 argument bug became visible,
because these days fcinfo is a) dynamically allocated without being
zeroed b) exactly the length required for the number of specified
arguments (hardcoded to 2 in this case).
This commit only contains a fairly minimal fix, setting numTransInputs
to a hardcoded 1 when building a pertrans for a combine function. It
seems likely that we'll want to clean this up further (e.g. the
arguments to build_pertrans_for_aggref() aren't particularly meaningful
for combine functions). But the wrap date for 12 beta1 is coming up
fast, so it seems good to have a minimal fix in place.
Backpatch to 11. While AggStatePerTrans->numTransInputs was set
wrongly before that, the value was not used for combine functions.
Reported-By: Rajkumar Raghuwanshi
Diagnosed-By: Kyotaro Horiguchi, Jeevan Chalke, Andres Freund, David Rowley
Author: David Rowley, Kyotaro Horiguchi, Andres Freund
Discussion: https://postgr.es/m/CAKcux6=uZEyWyLw0N7HtR9OBc-sWEFeByEZC7t-KDf15FKxVew@mail.gmail.com
2019-05-20 03:01:06 +02:00
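A minimal sketch of the fix described above, assuming the branch shown is
representative; the actual patch hunk may differ.

/*
 * Hedged sketch: a combine function takes exactly one input (the state
 * value being merged), regardless of the aggregate's declared arguments,
 * so numTransInputs is hardcoded to 1 in that case.
 */
if (DO_AGGSPLIT_COMBINE(aggstate->aggsplit))
    pertrans->numTransInputs = 1;       /* combinefn: single input */
else
    pertrans->numTransInputs = list_length(aggref->args);  /* plain transfn */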
|
|
|
|
2021-07-04 08:47:31 +02:00
|
|
|
fmgr_info(transfn_oid, &pertrans->transfn);
|
|
|
|
fmgr_info_set_expr((Node *) transfnexpr, &pertrans->transfn);
|
2016-01-20 19:46:50 +01:00
|
|
|
|
2021-07-04 08:47:31 +02:00
|
|
|
pertrans->transfn_fcinfo =
|
|
|
|
(FunctionCallInfo) palloc(SizeForFunctionCallInfo(numTransArgs));
|
|
|
|
InitFunctionCallInfoData(*pertrans->transfn_fcinfo,
|
|
|
|
&pertrans->transfn,
|
|
|
|
numTransArgs,
|
|
|
|
pertrans->aggCollation,
|
|
|
|
(void *) aggstate, NULL);
|
2015-08-04 16:53:10 +02:00
|
|
|
|
|
|
|
/* get info about the state value's datatype */
|
|
|
|
get_typlenbyval(aggtranstype,
|
|
|
|
&pertrans->transtypeLen,
|
|
|
|
&pertrans->transtypeByVal);
|
|
|
|
|
2016-03-29 21:04:05 +02:00
|
|
|
if (OidIsValid(aggserialfn))
|
|
|
|
{
|
Fix type-safety problem with parallel aggregate serial/deserialization.
The original specification for this called for the deserialization function
to have signature "deserialize(serialtype) returns transtype", which is a
security violation if transtype is INTERNAL (which it always would be in
practice) and serialtype is not (which ditto). The patch blithely overrode
the opr_sanity check for that, which was sloppy-enough work in itself,
but the indisputable reason this cannot be allowed to stand is that CREATE
FUNCTION will reject such a signature and thus it'd be impossible for
extensions to create parallelizable aggregates.
The minimum fix to make the signature type-safe is to add a second, dummy
argument of type INTERNAL. But to lock it down a bit more and make misuse
of INTERNAL-accepting functions less likely, let's get rid of the ability
to specify a "serialtype" for an aggregate and just say that the only
useful serialtype is BYTEA --- which, in practice, is the only interesting
value anyway, due to the usefulness of the send/recv infrastructure for
this purpose. That means we only have to allow "serialize(internal)
returns bytea" and "deserialize(bytea, internal) returns internal" as
the signatures for these support functions.
In passing fix bogus signature of int4_avg_combine, which I found thanks
to adding an opr_sanity check on combinefunc signatures.
catversion bump due to removing pg_aggregate.aggserialtype and adjusting
signatures of assorted built-in functions.
David Rowley and Tom Lane
Discussion: <27247.1466185504@sss.pgh.pa.us>
2016-06-22 22:52:41 +02:00
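A minimal sketch of support functions with the only signatures now allowed,
per the message above; the function and struct names are hypothetical, not
part of this file.

#include "postgres.h"
#include "fmgr.h"

/* hypothetical transition state, for illustration only */
typedef struct MyTransState
{
    int64       count;
    int64       sum;
} MyTransState;

PG_FUNCTION_INFO_V1(my_serialfn);
PG_FUNCTION_INFO_V1(my_deserialfn);

/* serialize(internal) returns bytea */
Datum
my_serialfn(PG_FUNCTION_ARGS)
{
    MyTransState *state = (MyTransState *) PG_GETARG_POINTER(0);
    bytea      *result = (bytea *) palloc(VARHDRSZ + sizeof(MyTransState));

    SET_VARSIZE(result, VARHDRSZ + sizeof(MyTransState));
    memcpy(VARDATA(result), state, sizeof(MyTransState));
    PG_RETURN_BYTEA_P(result);
}

/* deserialize(bytea, internal) returns internal; the dummy second argument
 * exists only to make the signature type-safe against SQL-level misuse */
Datum
my_deserialfn(PG_FUNCTION_ARGS)
{
    bytea      *sstate = PG_GETARG_BYTEA_PP(0);
    MyTransState *result = (MyTransState *) palloc(sizeof(MyTransState));

    memcpy(result, VARDATA_ANY(sstate), sizeof(MyTransState));
    PG_RETURN_POINTER(result);
}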
|
|
|
build_aggregate_serialfn_expr(aggserialfn,
|
2016-03-29 21:04:05 +02:00
|
|
|
&serialfnexpr);
|
|
|
|
fmgr_info(aggserialfn, &pertrans->serialfn);
|
|
|
|
fmgr_info_set_expr((Node *) serialfnexpr, &pertrans->serialfn);
|
|
|
|
|
Change function call information to be variable length.
Before this change FunctionCallInfoData, the struct that arguments etc. for
V1 function calls are stored in, always had space for
FUNC_MAX_ARGS/100 arguments, storing datums and their nullness in two
arrays. For nearly every function call 100 arguments is far more than
needed, therefore wasting memory. Arg and argnull being two separate
arrays also guarantees that to access a single argument, two
cachelines have to be touched.
Change the layout so there's a single variable-length array with pairs
of value / isnull. That drastically reduces memory consumption for
most function calls (on x86-64 a two argument function now uses
64bytes, previously 936 bytes), and makes it very likely that argument
value and its nullness are on the same cacheline.
Arguments are stored in a new NullableDatum struct, which, due to
padding, needs more memory per argument than before. But as usually
far fewer arguments are stored, and individual arguments are cheaper
to access, that's still a clear win. It's likely that there's other
places where conversion to NullableDatum arrays would make sense,
e.g. TupleTableSlots, but that's for another commit.
Because the function call information is now variable-length,
allocations have to take the number of arguments into account. For
heap allocations that can be done with SizeForFunctionCallInfoData(),
for on-stack allocations there's a new LOCAL_FCINFO(name, nargs) macro
that helps to allocate an appropriately sized and aligned variable.
Some places with stack-allocated function call information don't know
the number of arguments at compile time, and currently variably sized
stack allocations aren't allowed in postgres. Therefore allow for
FUNC_MAX_ARGS space in these cases. They're not that common, so for
now that seems acceptable.
Because of the need to allocate FunctionCallInfo of the appropriate
size, older extensions may need to update their code. To avoid subtle
breakages, the FunctionCallInfoData struct has been renamed to
FunctionCallInfoBaseData. Most code only references FunctionCallInfo,
so that shouldn't cause much collateral damage.
This change is also a prerequisite for more efficient expression JIT
compilation (by allocating the function call information on the stack,
allowing LLVM to optimize it away); previously the size of the call
information caused problems inside LLVM's optimizer.
Author: Andres Freund
Reviewed-By: Tom Lane
Discussion: https://postgr.es/m/20180605172952.x34m5uz6ju6enaem@alap3.anarazel.de
2019-01-26 23:17:52 +01:00
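A minimal usage sketch of the variable-length FunctionCallInfo described
above, mirroring the allocation pattern used just below; calling int4pl
directly here is an illustrative assumption.

#include "postgres.h"
#include "fmgr.h"
#include "utils/fmgroids.h"

static int32
call_int4pl_example(void)
{
    FmgrInfo    flinfo;
    LOCAL_FCINFO(fcinfo, 2);    /* stack allocation sized for exactly 2 args */
    Datum       result;

    fmgr_info(F_INT4PL, &flinfo);
    InitFunctionCallInfoData(*fcinfo, &flinfo, 2, InvalidOid, NULL, NULL);

    /* arguments now live in a single array of value/isnull pairs */
    fcinfo->args[0].value = Int32GetDatum(40);
    fcinfo->args[0].isnull = false;
    fcinfo->args[1].value = Int32GetDatum(2);
    fcinfo->args[1].isnull = false;

    result = FunctionCallInvoke(fcinfo);
    return DatumGetInt32(result);   /* 42 */
}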
|
|
|
pertrans->serialfn_fcinfo =
|
|
|
|
(FunctionCallInfo) palloc(SizeForFunctionCallInfo(1));
|
|
|
|
InitFunctionCallInfoData(*pertrans->serialfn_fcinfo,
|
2016-03-29 21:04:05 +02:00
|
|
|
&pertrans->serialfn,
|
|
|
|
1,
|
Fix type-safety problem with parallel aggregate serial/deserialization.
2016-06-22 22:52:41 +02:00
|
|
|
InvalidOid,
|
2016-03-29 21:04:05 +02:00
|
|
|
(void *) aggstate, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (OidIsValid(aggdeserialfn))
|
|
|
|
{
|
Fix type-safety problem with parallel aggregate serial/deserialization.
2016-06-22 22:52:41 +02:00
|
|
|
build_aggregate_deserialfn_expr(aggdeserialfn,
|
|
|
|
&deserialfnexpr);
|
2016-03-29 21:04:05 +02:00
|
|
|
fmgr_info(aggdeserialfn, &pertrans->deserialfn);
|
|
|
|
fmgr_info_set_expr((Node *) deserialfnexpr, &pertrans->deserialfn);
|
|
|
|
|
Change function call information to be variable length.
2019-01-26 23:17:52 +01:00
|
|
|
pertrans->deserialfn_fcinfo =
|
|
|
|
(FunctionCallInfo) palloc(SizeForFunctionCallInfo(2));
|
|
|
|
InitFunctionCallInfoData(*pertrans->deserialfn_fcinfo,
|
2016-03-29 21:04:05 +02:00
|
|
|
&pertrans->deserialfn,
|
Fix type-safety problem with parallel aggregate serial/deserialization.
2016-06-22 22:52:41 +02:00
|
|
|
2,
|
|
|
|
InvalidOid,
|
2016-03-29 21:04:05 +02:00
|
|
|
(void *) aggstate, NULL);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
/*
|
|
|
|
* If we're doing either DISTINCT or ORDER BY for a plain agg, then we
|
|
|
|
* have a list of SortGroupClause nodes; fish out the data in them and
|
|
|
|
* stick them into arrays. We ignore ORDER BY for an ordered-set agg,
|
|
|
|
* however; the agg's transfn and finalfn are responsible for that.
|
|
|
|
*
|
|
|
|
* Note that by construction, if there is a DISTINCT clause then the ORDER
|
|
|
|
* BY clause is a prefix of it (see transformDistinctClause).
|
|
|
|
*/
|
|
|
|
if (AGGKIND_IS_ORDERED_SET(aggref->aggkind))
|
|
|
|
{
|
|
|
|
sortlist = NIL;
|
|
|
|
numSortCols = numDistinctCols = 0;
|
|
|
|
}
|
|
|
|
else if (aggref->aggdistinct)
|
|
|
|
{
|
|
|
|
sortlist = aggref->aggdistinct;
|
|
|
|
numSortCols = numDistinctCols = list_length(sortlist);
|
|
|
|
Assert(numSortCols >= list_length(aggref->aggorder));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
sortlist = aggref->aggorder;
|
|
|
|
numSortCols = list_length(sortlist);
|
|
|
|
numDistinctCols = 0;
|
|
|
|
}
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
pertrans->numSortCols = numSortCols;
|
|
|
|
pertrans->numDistinctCols = numDistinctCols;
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2017-10-16 21:24:36 +02:00
|
|
|
/*
|
|
|
|
* If we have either sorting or filtering to do, create a tupledesc and
|
|
|
|
* slot corresponding to the aggregated inputs (including sort
|
|
|
|
* expressions) of the agg.
|
|
|
|
*/
|
|
|
|
if (numSortCols > 0 || aggref->aggfilter)
|
2015-08-04 16:53:10 +02:00
|
|
|
{
|
Remove WITH OIDS support, change oid catalog column visibility.
Previously tables declared WITH OIDS, including a significant fraction
of the catalog tables, stored the oid column not as a normal column,
but as part of the tuple header.
This special column was not shown by default, which was somewhat odd,
as it's often (consider e.g. pg_class.oid) one of the more important
parts of a row. Neither pg_dump nor COPY included the contents of the
oid column by default.
The fact that the oid column was not an ordinary column necessitated a
significant amount of special case code to support oid columns. That
was already painful for the existing code, and upcoming work aiming to make
table storage pluggable would have required expanding and duplicating
that "specialness" significantly.
WITH OIDS has been deprecated since 2005 (commit ff02d0a05280e0).
Remove it.
Removing includes:
- CREATE TABLE and ALTER TABLE syntax for declaring the table to be
WITH OIDS has been removed (WITH (oids[ = true]) will error out)
- pg_dump does not support dumping tables declared WITH OIDS and will
issue a warning when dumping one (and ignore the oid column).
- restoring a pg_dump archive with pg_restore will warn when
restoring a table with oid contents (and ignore the oid column)
- COPY will refuse to load binary dump that includes oids.
- pg_upgrade will error out when encountering tables declared WITH
OIDS, they have to be altered to remove the oid column first.
- Functionality to access the oid of the last inserted row (like
plpgsql's RESULT_OID, spi's SPI_lastoid, ...) has been removed.
The syntax for declaring a table WITHOUT OIDS (or WITH (oids = false)
for CREATE TABLE) is still supported. While that requires a bit of
support code, it seems unnecessary to break applications / dumps that
do not use oids, and are explicit about not using them.
The biggest user of WITH OID columns was postgres' catalog. This
commit changes all 'magic' oid columns to be columns that are normally
declared and stored. To reduce unnecessary query breakage all the
newly added columns are still named 'oid', even if a table's column
naming scheme would indicate 'reloid' or such. This obviously
requires adapting a lot code, mostly replacing oid access via
HeapTupleGetOid() with access to the underlying Form_pg_*->oid column.
The bootstrap process now assigns oids for all oid columns in
genbki.pl that do not have an explicit value (starting at the largest
oid previously used); only oids assigned later will be above
FirstBootstrapObjectId. As the oid column now is a normal column the
special bootstrap syntax for oids has been removed.
Oids are not automatically assigned during insertion anymore, all
backend code explicitly assigns oids with GetNewOidWithIndex(). For
the rare case that insertions into the catalog via SQL are called for
the new pg_nextoid() function can be used (which only works on catalog
tables).
The fact that oid columns on system tables are now normal columns
means that they will be included in the set of columns expanded
by * (i.e. SELECT * FROM pg_class will now include the table's oid,
previously it did not). It'd not technically be hard to hide the oid
column by default, but that'd mean confusing behavior would either
have to be carried forward forever, or it'd cause breakage down the
line.
While it's not unlikely that further adjustments are needed, the
scope/invasiveness of the patch makes it worthwhile to merge this
now. It's painful to maintain externally, too complicated to commit
after the code freeze, and a dependency of a number of other
patches.
Catversion bump, for obvious reasons.
Author: Andres Freund, with contributions by John Naylor
Discussion: https://postgr.es/m/20180930034810.ywp2c7awz7opzcfr@alap3.anarazel.de
2018-11-21 00:36:57 +01:00
|
|
|
pertrans->sortdesc = ExecTypeFromTL(aggref->args);
|
2018-02-17 06:17:38 +01:00
|
|
|
pertrans->sortslot =
|
Introduce notion of different types of slots (without implementing them).
Upcoming work intends to allow pluggable ways to introduce new ways of
storing table data. Accessing those table access methods from the
executor requires TupleTableSlots to carry tuples in the native
format of such storage methods; otherwise there'll be a significant
conversion overhead.
Different access methods will require different data to store tuples
efficiently (just like virtual, minimal, heap already require fields
in TupleTableSlot). To allow that without requiring additional pointer
indirections, we want to have different structs (embedding
TupleTableSlot) for different types of slots. Thus different types of
slots are needed, which requires adapting creators of slots.
The slot that most efficiently can represent a type of tuple in an
executor node will often depend on the type of slot a child node
uses. Therefore we need to track the type of slot returned by
nodes, so parent slots can create slots based on that.
Relatedly, JIT compilation of tuple deforming needs to know which type
of slot a certain expression refers to, so it can create an
appropriate deforming function for the type of tuple in the slot.
But not all nodes will only return one type of slot, e.g. an append
node will potentially return different types of slots for each of its
subplans.
Therefore add a function that allows querying the type of a node's
result slot, and whether it'll always be the same type (whether it's
fixed). This can be queried using ExecGetResultSlotOps().
The scan, result, inner, outer type of slots are automatically
inferred from ExecInitScanTupleSlot(), ExecInitResultSlot(),
left/right subtrees respectively. If that's not correct for a node,
that can be overwritten using new fields in PlanState.
This commit does not introduce the actually abstracted implementation
of different kinds of TupleTableSlots; that will be left for a followup
commit. The different types of slots introduced will, for now, still
use the same backing implementation.
While this already partially invalidates the big comment in
tuptable.h, it seems to make more sense to update it later, when the
different TupleTableSlot implementations actually exist.
Author: Ashutosh Bapat and Andres Freund, with changes by Amit Khandekar
Discussion: https://postgr.es/m/20181105210039.hh4vvi4vwoq5ba2q@alap3.anarazel.de
2018-11-16 07:00:30 +01:00
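A minimal sketch of querying a child's slot type with ExecGetResultSlotOps(),
as introduced above; the surrounding variables are assumptions for
illustration.

/*
 * Hedged sketch: a parent node asks its outer child which slot type it
 * returns, and whether that type is fixed, before picking its own slots.
 */
const TupleTableSlotOps *ops;
bool        isfixed;

ops = ExecGetResultSlotOps(outerPlanState(aggstate), &isfixed);
if (isfixed && ops == &TTSOpsMinimalTuple)
    elog(DEBUG1, "outer child always returns minimal tuple slots");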
|
|
|
ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
|
|
|
|
&TTSOpsMinimalTuple);
|
2017-10-16 21:24:36 +02:00
|
|
|
}
|
2016-12-01 01:08:11 +01:00
|
|
|
|
2017-10-16 21:24:36 +02:00
|
|
|
if (numSortCols > 0)
|
|
|
|
{
|
2009-12-15 18:57:48 +01:00
|
|
|
/*
|
2015-08-04 16:53:10 +02:00
|
|
|
* We don't implement DISTINCT or ORDER BY aggs in the HASHED case
|
|
|
|
* (yet)
|
2009-12-15 18:57:48 +01:00
|
|
|
*/
|
2017-03-27 05:20:54 +02:00
|
|
|
Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED);
|
2015-08-04 16:53:10 +02:00
|
|
|
|
2021-07-04 08:47:31 +02:00
|
|
|
/* ORDER BY aggregates are not supported with partial aggregation */
|
|
|
|
Assert(!DO_AGGSPLIT_COMBINE(aggstate->aggsplit));
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
/* If we have only one input, we need its len/byval info. */
|
|
|
|
if (numInputs == 1)
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
get_typlenbyval(inputTypes[numDirectArgs],
|
|
|
|
&pertrans->inputtypeLen,
|
|
|
|
&pertrans->inputtypeByVal);
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
}
|
2015-08-04 16:53:10 +02:00
|
|
|
else if (numDistinctCols > 0)
|
1999-12-13 02:27:21 +01:00
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
/* we will need an extra slot to store prior values */
|
2018-02-17 06:17:38 +01:00
|
|
|
pertrans->uniqslot =
|
Introduce notion of different types of slots (without implementing them).
2018-11-16 07:00:30 +01:00
|
|
|
ExecInitExtraTupleSlot(estate, pertrans->sortdesc,
|
|
|
|
&TTSOpsMinimalTuple);
|
2009-12-15 18:57:48 +01:00
|
|
|
}
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
/* Extract the sort information for use later */
|
|
|
|
pertrans->sortColIdx =
|
|
|
|
(AttrNumber *) palloc(numSortCols * sizeof(AttrNumber));
|
|
|
|
pertrans->sortOperators =
|
|
|
|
(Oid *) palloc(numSortCols * sizeof(Oid));
|
|
|
|
pertrans->sortCollations =
|
|
|
|
(Oid *) palloc(numSortCols * sizeof(Oid));
|
|
|
|
pertrans->sortNullsFirst =
|
|
|
|
(bool *) palloc(numSortCols * sizeof(bool));
|
|
|
|
|
|
|
|
i = 0;
|
|
|
|
foreach(lc, sortlist)
|
2009-12-15 18:57:48 +01:00
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc);
|
|
|
|
TargetEntry *tle = get_sortgroupclause_tle(sortcl, aggref->args);
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
/* the parser should have made sure of this */
|
|
|
|
Assert(OidIsValid(sortcl->sortop));
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
pertrans->sortColIdx[i] = tle->resno;
|
|
|
|
pertrans->sortOperators[i] = sortcl->sortop;
|
|
|
|
pertrans->sortCollations[i] = exprCollation((Node *) tle->expr);
|
|
|
|
pertrans->sortNullsFirst[i] = sortcl->nulls_first;
|
|
|
|
i++;
|
2009-12-15 18:57:48 +01:00
|
|
|
}
|
2015-08-04 16:53:10 +02:00
|
|
|
Assert(i == numSortCols);
|
|
|
|
}
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
if (aggref->aggdistinct)
|
|
|
|
{
|
2018-02-16 06:55:31 +01:00
|
|
|
Oid *ops;
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
Assert(numArguments > 0);
|
2018-02-16 06:55:31 +01:00
|
|
|
Assert(list_length(aggref->aggdistinct) == numDistinctCols);
|
1999-12-13 02:27:21 +01:00
|
|
|
|
2018-02-16 06:55:31 +01:00
|
|
|
ops = palloc(numDistinctCols * sizeof(Oid));
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
i = 0;
|
|
|
|
foreach(lc, aggref->aggdistinct)
|
2018-02-16 06:55:31 +01:00
|
|
|
ops[i++] = ((SortGroupClause *) lfirst(lc))->eqop;
|
2009-12-15 18:57:48 +01:00
|
|
|
|
2018-02-16 06:55:31 +01:00
|
|
|
/* lookup / build the necessary comparators */
|
|
|
|
if (numDistinctCols == 1)
|
|
|
|
fmgr_info(get_opcode(ops[0]), &pertrans->equalfnOne);
|
|
|
|
else
|
|
|
|
pertrans->equalfnMulti =
|
|
|
|
execTuplesMatchPrepare(pertrans->sortdesc,
|
|
|
|
numDistinctCols,
|
|
|
|
pertrans->sortColIdx,
|
|
|
|
ops,
|
2019-03-22 12:09:32 +01:00
|
|
|
pertrans->sortCollations,
|
2018-02-16 06:55:31 +01:00
|
|
|
&aggstate->ss.ps);
|
|
|
|
pfree(ops);
|
1999-09-26 23:21:15 +02:00
|
|
|
}
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
pertrans->sortstates = (Tuplesortstate **)
|
|
|
|
palloc0(sizeof(Tuplesortstate *) * numGroupingSets);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
|
2002-04-11 22:00:18 +02:00
|
|
|
static Datum
|
|
|
|
GetAggInitVal(Datum textInitVal, Oid transtype)
|
|
|
|
{
|
|
|
|
Oid typinput,
|
2004-06-06 02:41:28 +02:00
|
|
|
typioparam;
|
|
|
|
char *strInitVal;
|
2002-04-11 22:00:18 +02:00
|
|
|
Datum initVal;
|
|
|
|
|
2004-06-06 02:41:28 +02:00
|
|
|
getTypeInputInfo(transtype, &typinput, &typioparam);
|
2008-03-25 23:42:46 +01:00
|
|
|
strInitVal = TextDatumGetCString(textInitVal);
|
2006-04-04 21:35:37 +02:00
|
|
|
initVal = OidInputFunctionCall(typinput, strInitVal,
|
|
|
|
typioparam, -1);
|
2002-04-11 22:00:18 +02:00
|
|
|
pfree(strInitVal);
|
|
|
|
return initVal;
|
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
void
|
2002-12-05 16:50:39 +01:00
|
|
|
ExecEndAgg(AggState *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-12-05 16:50:39 +01:00
|
|
|
PlanState *outerPlan;
|
2015-08-04 16:53:10 +02:00
|
|
|
int transno;
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows aggregating data by different
GROUP BY clauses at once. Each grouping set returns rows in which the
columns grouped by only in other sets are set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg nodes to describe each
phase, but they're not part of the plan tree; instead they're additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains), and easily allows
avoiding the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
int numGroupingSets = Max(node->maxsets, 1);
|
|
|
|
int setno;
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2020-06-19 07:24:27 +02:00
|
|
|
/*
|
|
|
|
* When ending a parallel worker, copy the statistics gathered by the
|
|
|
|
* worker back into shared memory so that it can be picked up by the main
|
|
|
|
* process to report in EXPLAIN ANALYZE.
|
|
|
|
*/
|
|
|
|
if (node->shared_info && IsParallelWorker())
|
|
|
|
{
|
|
|
|
AggregateInstrumentation *si;
|
|
|
|
|
|
|
|
Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
|
|
|
|
si = &node->shared_info->sinstrument[ParallelWorkerNumber];
|
|
|
|
si->hash_batches_used = node->hash_batches_used;
|
|
|
|
si->hash_disk_used = node->hash_disk_used;
|
|
|
|
si->hash_mem_peak = node->hash_mem_peak;
|
|
|
|
}
|
|
|
|
|
2002-11-06 23:31:24 +01:00
|
|
|
/* Make sure we have closed any open tuplesorts */
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
|
|
|
|
if (node->sort_in)
|
|
|
|
tuplesort_end(node->sort_in);
|
|
|
|
if (node->sort_out)
|
|
|
|
tuplesort_end(node->sort_out);
|
|
|
|
|
Disk-based Hash Aggregation.
2020-03-18 23:42:02 +01:00
|
|
|
hashagg_reset_spill_state(node);
|
|
|
|
|
|
|
|
if (node->hash_metacxt != NULL)
|
|
|
|
{
|
|
|
|
MemoryContextDelete(node->hash_metacxt);
|
|
|
|
node->hash_metacxt = NULL;
|
|
|
|
}
|
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
for (transno = 0; transno < node->numtrans; transno++)
|
2002-11-06 23:31:24 +01:00
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerTrans pertrans = &node->pertrans[transno];
|
2002-11-06 23:31:24 +01:00
|
|
|
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
for (setno = 0; setno < numGroupingSets; setno++)
|
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
if (pertrans->sortstates[setno])
|
|
|
|
tuplesort_end(pertrans->sortstates[setno]);
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
}
|
2002-11-06 23:31:24 +01:00
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
/* And ensure any agg shutdown callbacks have been called */
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
for (setno = 0; setno < numGroupingSets; setno++)
|
|
|
|
ReScanExprContext(node->aggcontexts[setno]);
|
2017-03-27 05:20:54 +02:00
|
|
|
if (node->hashcontext)
|
|
|
|
ReScanExprContext(node->hashcontext);
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
|
2000-07-12 04:37:39 +02:00
|
|
|
/*
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
* We don't actually free any ExprContexts here (see comment in
|
|
|
|
* ExecFreeExprContext), just unlinking the output one from the plan node
|
|
|
|
* suffices.
|
2000-07-12 04:37:39 +02:00
|
|
|
*/
|
2002-12-05 16:50:39 +01:00
|
|
|
ExecFreeExprContext(&node->ss.ps);
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2002-12-15 17:17:59 +01:00
|
|
|
/* clean up tuple table */
|
|
|
|
ExecClearTuple(node->ss.ss_ScanTupleSlot);
|
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
outerPlan = outerPlanState(node);
|
|
|
|
ExecEndNode(outerPlan);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
1998-02-13 04:26:53 +01:00
|
|
|
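/*
 * ExecReScanAgg
 *
 * Reset the aggregation node so that its input can be scanned again.
 */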
void
|
2010-07-12 19:01:06 +02:00
|
|
|
ExecReScanAgg(AggState *node)
|
1998-02-13 04:26:53 +01:00
|
|
|
{
|
2002-12-05 16:50:39 +01:00
|
|
|
ExprContext *econtext = node->ss.ps.ps_ExprContext;
|
2015-05-04 22:13:07 +02:00
|
|
|
PlanState *outerPlan = outerPlanState(node);
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
Agg *aggnode = (Agg *) node->ss.ps.plan;
|
2015-08-04 16:53:10 +02:00
|
|
|
int transno;
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
int numGroupingSets = Max(node->maxsets, 1);
|
|
|
|
int setno;
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2003-05-30 22:23:10 +02:00
|
|
|
node->agg_done = false;
|
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
if (node->aggstrategy == AGG_HASHED)
|
2003-05-30 22:23:10 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* In the hashed case, if we haven't yet built the hash table then we
|
|
|
|
* can just return; nothing done yet, so nothing to undo. If subnode's
|
|
|
|
* chgParam is not NULL then it will be re-scanned by ExecProcNode,
|
|
|
|
* else no reason to re-scan it at all.
|
|
|
|
*/
|
|
|
|
if (!node->table_filled)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then create new batches of work from the spilled partitions,
select one of the saved batches, and process it (possibly spilling
recursively). (A pseudocode sketch of this control flow appears after
ExecReScanAgg below.)
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
* If we do have the hash table, and it never spilled, and the subplan
|
|
|
|
* does not have any parameter changes, and none of our own parameter
|
|
|
|
* changes affect input expressions of the aggregated functions, then
|
|
|
|
* we can just rescan the existing hash table; no need to build it
|
|
|
|
* again.
|
2003-05-30 22:23:10 +02:00
|
|
|
*/
|
Disk-based Hash Aggregation.
2020-03-18 23:42:02 +01:00
|
|
|
if (outerPlan->chgParam == NULL && !node->hash_ever_spilled &&
|
2016-08-24 20:37:50 +02:00
|
|
|
!bms_overlap(node->ss.ps.chgParam, aggnode->aggParams))
|
2003-05-30 22:23:10 +02:00
|
|
|
{
|
2017-03-27 05:20:54 +02:00
|
|
|
ResetTupleHashIterator(node->perhash[0].hashtable,
|
|
|
|
&node->perhash[0].hashiter);
|
|
|
|
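/* Make hash grouping set 0 current again before re-reading the table */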
select_current_set(node, 0, true);
|
2003-05-30 22:23:10 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2002-11-06 23:31:24 +01:00
|
|
|
/* Make sure we have closed any open tuplesorts */
|
2015-08-04 16:53:10 +02:00
|
|
|
for (transno = 0; transno < node->numtrans; transno++)
|
2002-11-06 23:31:24 +01:00
|
|
|
{
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
for (setno = 0; setno < numGroupingSets; setno++)
|
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
AggStatePerTrans pertrans = &node->pertrans[transno];
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2015-08-04 16:53:10 +02:00
|
|
|
if (pertrans->sortstates[setno])
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
{
|
2015-08-04 16:53:10 +02:00
|
|
|
tuplesort_end(pertrans->sortstates[setno]);
|
|
|
|
pertrans->sortstates[setno] = NULL;
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
}
|
|
|
|
}
|
2002-11-06 23:31:24 +01:00
|
|
|
}
|
1998-02-13 04:26:53 +01:00
|
|
|
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
/*
|
|
|
|
* We don't need to ReScanExprContext the output tuple context here;
|
|
|
|
* ExecReScan already did it. But we do need to reset our per-grouping-set
|
|
|
|
* contexts, which may have transvalues stored in them. (We use rescan
|
|
|
|
* rather than just reset because transfns may have registered callbacks
|
2017-03-27 05:20:54 +02:00
|
|
|
* that need to be run now.) For the AGG_HASHED case, see below.
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
for (setno = 0; setno < numGroupingSets; setno++)
|
|
|
|
{
|
|
|
|
ReScanExprContext(node->aggcontexts[setno]);
|
|
|
|
}
|
Support ordered-set (WITHIN GROUP) aggregates.
2013-12-23 22:11:35 +01:00
|
|
|
|
2003-05-30 22:23:10 +02:00
|
|
|
/* Release first tuple of group, if we have made a copy */
|
2002-12-05 16:50:39 +01:00
|
|
|
if (node->grp_firstTuple != NULL)
|
2002-11-06 01:00:45 +01:00
|
|
|
{
|
2002-12-05 16:50:39 +01:00
|
|
|
heap_freetuple(node->grp_firstTuple);
|
|
|
|
node->grp_firstTuple = NULL;
|
2002-11-06 01:00:45 +01:00
|
|
|
}
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
ExecClearTuple(node->ss.ss_ScanTupleSlot);
|
2003-05-30 22:23:10 +02:00
|
|
|
|
|
|
|
/* Forget current agg values */
|
2002-12-05 16:50:39 +01:00
|
|
|
MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * node->numaggs);
|
|
|
|
MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * node->numaggs);
|
1998-02-26 05:46:47 +01:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
/*
|
|
|
|
* With AGG_HASHED/MIXED, the hash table is allocated in a sub-context of
|
|
|
|
* the hashcontext. This used to be an issue, but now, resetting a context
|
|
|
|
* automatically deletes sub-contexts too.
|
|
|
|
*/
|
|
|
|
if (node->aggstrategy == AGG_HASHED || node->aggstrategy == AGG_MIXED)
|
2002-11-06 23:31:24 +01:00
|
|
|
{
|
Disk-based Hash Aggregation.
2020-03-18 23:42:02 +01:00
|
|
|
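/* Discard any spill partitions and batches left over from the last scan */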
hashagg_reset_spill_state(node);
|
|
|
|
|
|
|
|
node->hash_ever_spilled = false;
|
|
|
|
node->hash_spill_mode = false;
|
|
|
|
node->hash_ngroups_current = 0;
|
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
ReScanExprContext(node->hashcontext);
|
2003-05-30 22:23:10 +02:00
|
|
|
/* Rebuild an empty hash table */
|
2020-02-19 19:15:16 +01:00
|
|
|
build_hash_tables(node);
|
2002-12-05 16:50:39 +01:00
|
|
|
node->table_filled = false;
|
2017-03-27 05:20:54 +02:00
|
|
|
/* iterator will be reset when the table is filled */
|
Disk-based Hash Aggregation.
2020-03-18 23:42:02 +01:00
|
|
|
|
|
|
|
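/*
 * Recompile the transition expressions to read fresh input tuples again
 * (no minimal spill-tuple slot, no extra null checks).
 */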
hashagg_recompile_expressions(node, false, false);
|
2002-11-06 23:31:24 +01:00
|
|
|
}
|
2017-03-27 05:20:54 +02:00
|
|
|
|
|
|
|
if (node->aggstrategy != AGG_HASHED)
|
2004-03-13 01:54:10 +01:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Reset the per-group state (in particular, mark transvalues null)
|
|
|
|
*/
|
2018-01-03 03:02:37 +01:00
|
|
|
for (setno = 0; setno < numGroupingSets; setno++)
|
|
|
|
{
|
|
|
|
MemSet(node->pergroups[setno], 0,
|
|
|
|
sizeof(AggStatePerGroupData) * node->numaggs);
|
|
|
|
}
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
/* reset to phase 1 */
|
|
|
|
initialize_phase(node, 1);
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
|
|
|
|
node->input_done = false;
|
|
|
|
node->projected_set = -1;
|
2004-03-13 01:54:10 +01:00
|
|
|
}
|
2002-11-06 23:31:24 +01:00
|
|
|
|
2015-05-04 22:13:07 +02:00
|
|
|
if (outerPlan->chgParam == NULL)
|
|
|
|
ExecReScan(outerPlan);
|
1999-12-13 02:27:21 +01:00
|
|
|
}
|
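
/*
 * Illustrative pseudocode sketch of the spill-mode control flow described
 * in the "Disk-based Hash Aggregation" commit message above; schematic
 * only, not actual nodeAgg.c logic.
 *
 *   foreach input tuple:
 *       if its group already exists in the hash table:
 *           advance that group's transition state
 *       else if not in spill mode and memory usage is below work_mem:
 *           create a new group in the hash table
 *       else:
 *           enter spill mode; write the tuple to a partitioned logical tape
 *   finalize and emit the groups accumulated in the hash table
 *   foreach spilled partition:
 *       process it as a new batch, spilling recursively if needed
 */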
2002-04-11 22:00:18 +02:00
|
|
|
|
2014-04-12 17:58:53 +02:00
|
|
|
|
|
|
|
/***********************************************************************
|
|
|
|
* API exposed to aggregate functions
|
|
|
|
***********************************************************************/
|
|
|
|
|
|
|
|
|
2010-02-08 21:39:52 +01:00
|
|
|
/*
|
|
|
|
* AggCheckCallContext - test if a SQL function is being called as an aggregate
|
|
|
|
*
|
|
|
|
* The transition and/or final functions of an aggregate may want to verify
|
|
|
|
* that they are being called as aggregates, rather than as plain SQL
|
|
|
|
* functions. They should use this function to do so. The return value
|
|
|
|
* is nonzero if being called as an aggregate, or zero if not. (Specific
|
|
|
|
* nonzero values are AGG_CONTEXT_AGGREGATE or AGG_CONTEXT_WINDOW, but more
|
|
|
|
* values could conceivably appear in future.)
|
|
|
|
*
|
|
|
|
* If aggcontext isn't NULL, the function also stores at *aggcontext the
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
* identity of the memory context that aggregate transition values are being
|
|
|
|
* stored in. Note that the same aggregate call site (flinfo) may be called
|
|
|
|
* interleaved on different transition values in different contexts, so it's
|
|
|
|
* not kosher to cache aggcontext under fn_extra. It is, however, kosher to
|
|
|
|
* cache it in the transvalue itself (for internal-type transvalues).
* (An illustrative usage sketch appears after this function.)
|
2010-02-08 21:39:52 +01:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
AggCheckCallContext(FunctionCallInfo fcinfo, MemoryContext *aggcontext)
|
|
|
|
{
|
|
|
|
if (fcinfo->context && IsA(fcinfo->context, AggState))
|
|
|
|
{
|
|
|
|
if (aggcontext)
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
{
|
|
|
|
AggState *aggstate = ((AggState *) fcinfo->context);
|
2017-03-27 05:20:54 +02:00
|
|
|
ExprContext *cxt = aggstate->curaggcontext;
|
2015-05-24 03:35:49 +02:00
|
|
|
|
Support GROUPING SETS, CUBE and ROLLUP.
2015-05-16 03:40:59 +02:00
|
|
|
*aggcontext = cxt->ecxt_per_tuple_memory;
|
|
|
|
}
|
2010-02-08 21:39:52 +01:00
|
|
|
return AGG_CONTEXT_AGGREGATE;
|
|
|
|
}
|
|
|
|
if (fcinfo->context && IsA(fcinfo->context, WindowAggState))
|
|
|
|
{
|
|
|
|
if (aggcontext)
|
2014-04-12 17:58:53 +02:00
|
|
|
*aggcontext = ((WindowAggState *) fcinfo->context)->curaggcontext;
|
2010-02-08 21:39:52 +01:00
|
|
|
return AGG_CONTEXT_WINDOW;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* this is just to prevent "uninitialized variable" warnings */
|
|
|
|
if (aggcontext)
|
|
|
|
*aggcontext = NULL;
|
|
|
|
return 0;
|
|
|
|
}
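
#ifdef NODEAGG_USAGE_SKETCH
/*
 * Usage sketch (not part of nodeAgg.c): a minimal transition function
 * showing the intended AggCheckCallContext() idiom.  The names
 * my_count_state and my_count_transfn are illustrative assumptions, not
 * existing code.  On the first call the state is allocated in the
 * long-lived aggcontext, so it survives across input rows.
 */
typedef struct my_count_state
{
	int64		count;
} my_count_state;

Datum
my_count_transfn(PG_FUNCTION_ARGS)
{
	MemoryContext aggcontext;
	my_count_state *state;

	if (!AggCheckCallContext(fcinfo, &aggcontext))
		elog(ERROR, "my_count_transfn called in non-aggregate context");

	if (PG_ARGISNULL(0))
		state = (my_count_state *)
			MemoryContextAllocZero(aggcontext, sizeof(my_count_state));
	else
		state = (my_count_state *) PG_GETARG_POINTER(0);

	state->count++;
	PG_RETURN_POINTER(state);
}
#endif							/* NODEAGG_USAGE_SKETCH */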

/*
 * AggGetAggref - allow an aggregate support function to get its Aggref
 *
 * If the function is being called as an aggregate support function,
 * return the Aggref node for the aggregate call.  Otherwise, return NULL.
 *
 * Aggregates sharing the same inputs and transition functions can get
 * merged into a single transition calculation.  If the transition function
 * calls AggGetAggref, it will get one of the Aggrefs for which it is
 * executing.  It must therefore not pay attention to the Aggref fields that
 * relate to the final function, as those are indeterminate.  But if a final
 * function calls AggGetAggref, it will get a precise result.
 *
 * Note that if an aggregate is being used as a window function, this will
 * return NULL.  We could provide a similar function to return the relevant
 * WindowFunc node in such cases, but it's not needed yet.
 */
Aggref *
AggGetAggref(FunctionCallInfo fcinfo)
{
	if (fcinfo->context && IsA(fcinfo->context, AggState))
	{
		AggState   *aggstate = (AggState *) fcinfo->context;
		AggStatePerAgg curperagg;
		AggStatePerTrans curpertrans;

		/* check curperagg (valid when in a final function) */
		curperagg = aggstate->curperagg;

		if (curperagg)
			return curperagg->aggref;

		/* check curpertrans (valid when in a transition function) */
		curpertrans = aggstate->curpertrans;

		if (curpertrans)
			return curpertrans->aggref;
	}
	return NULL;
}
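
#ifdef NODEAGG_USAGE_SKETCH
/*
 * Usage sketch (illustrative only): an ordered-set aggregate's support
 * function can inspect its Aggref to learn how it was invoked, e.g. how
 * many WITHIN GROUP direct arguments were supplied.  The name
 * my_num_direct_args is an assumption, not existing code.
 */
static int
my_num_direct_args(FunctionCallInfo fcinfo)
{
	Aggref	   *aggref = AggGetAggref(fcinfo);

	if (aggref == NULL)
		elog(ERROR, "not called as an aggregate support function");

	/* aggdirectargs holds the WITHIN GROUP direct-argument expressions */
	return list_length(aggref->aggdirectargs);
}
#endif							/* NODEAGG_USAGE_SKETCH */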

/*
 * AggGetTempMemoryContext - fetch short-term memory context for aggregates
 *
 * This is useful in agg final functions; the context returned is one that
 * the final function can safely reset as desired.  This isn't useful for
 * transition functions, since the context returned MAY (we don't promise)
 * be the same as the context those are called in.
 *
 * As above, this is currently not useful for aggs called as window functions.
 */
MemoryContext
AggGetTempMemoryContext(FunctionCallInfo fcinfo)
{
	if (fcinfo->context && IsA(fcinfo->context, AggState))
	{
		AggState   *aggstate = (AggState *) fcinfo->context;

		return aggstate->tmpcontext->ecxt_per_tuple_memory;
	}
	return NULL;
}
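
#ifdef NODEAGG_USAGE_SKETCH
/*
 * Usage sketch (illustrative only): a final function can do its transient
 * work in the short-term context and reset it when done, rather than
 * leaking allocations into a longer-lived context.  my_final_scratch_work
 * is an assumed name.
 */
static void
my_final_scratch_work(FunctionCallInfo fcinfo)
{
	MemoryContext tmpcontext = AggGetTempMemoryContext(fcinfo);
	MemoryContext oldcontext;

	if (tmpcontext == NULL)
		elog(ERROR, "not called as an aggregate final function");

	oldcontext = MemoryContextSwitchTo(tmpcontext);
	/* ... build throwaway data structures here ... */
	MemoryContextSwitchTo(oldcontext);

	/* safe per the contract above: the finalfn may reset this context */
	MemoryContextReset(tmpcontext);
}
#endif							/* NODEAGG_USAGE_SKETCH */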

/*
 * AggStateIsShared - find out whether transition state is shared
 *
 * If the function is being called as an aggregate support function,
 * return true if the aggregate's transition state is shared across
 * multiple aggregates, false if it is not.
 *
 * Returns true if not called as an aggregate support function.
 * This is intended as a conservative answer, i.e. "no, you'd better not
 * scribble on your input".  In particular, it will return true if the
 * aggregate is being used as a window function, which is a scenario
 * in which changing the transition state is a bad idea.  We might
 * want to refine the behavior for the window case in future.
 */
bool
AggStateIsShared(FunctionCallInfo fcinfo)
{
	if (fcinfo->context && IsA(fcinfo->context, AggState))
	{
		AggState   *aggstate = (AggState *) fcinfo->context;
		AggStatePerAgg curperagg;
		AggStatePerTrans curpertrans;

		/* check curperagg (valid when in a final function) */
		curperagg = aggstate->curperagg;

		if (curperagg)
			return aggstate->pertrans[curperagg->transno].aggshared;

		/* check curpertrans (valid when in a transition function) */
		curpertrans = aggstate->curpertrans;

		if (curpertrans)
			return curpertrans->aggshared;
	}
	return true;
}
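
#ifdef NODEAGG_USAGE_SKETCH
/*
 * Usage sketch (illustrative only): a final function that wants to finalize
 * destructively (say, sort an accumulated array in place) must first check
 * AggStateIsShared().  my_array_state and the helper name are assumptions.
 */
typedef struct my_array_state
{
	int			nvalues;
	Datum		values[FLEXIBLE_ARRAY_MEMBER];
} my_array_state;

static my_array_state *
my_prepare_destructive_final(FunctionCallInfo fcinfo, my_array_state *state)
{
	if (AggStateIsShared(fcinfo))
	{
		/* state may feed other aggregates: scribble on a copy instead */
		Size		sz = offsetof(my_array_state, values) +
			state->nvalues * sizeof(Datum);
		my_array_state *copy = (my_array_state *) palloc(sz);

		memcpy(copy, state, sz);
		return copy;
	}
	/* sole owner: safe to modify the input state in place */
	return state;
}
#endif							/* NODEAGG_USAGE_SKETCH */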

/*
 * AggRegisterCallback - register a cleanup callback for an aggregate
 *
 * This is useful for aggs to register shutdown callbacks, which will ensure
 * that non-memory resources are freed.  The callback will occur just before
 * the associated aggcontext (as returned by AggCheckCallContext) is reset,
 * either between groups or as a result of rescanning the query.  The callback
 * will NOT be called on error paths.  The typical use-case is for freeing of
 * tuplestores or tuplesorts maintained in aggcontext, or pins held by slots
 * created by the agg functions.  (The callback will not be called until after
 * the result of the finalfn is no longer needed, so it's safe for the finalfn
 * to return data that will be freed by the callback.)
 *
 * As above, this is currently not useful for aggs called as window functions.
 */
void
AggRegisterCallback(FunctionCallInfo fcinfo,
					ExprContextCallbackFunction func,
					Datum arg)
{
	if (fcinfo->context && IsA(fcinfo->context, AggState))
	{
		AggState   *aggstate = (AggState *) fcinfo->context;
		ExprContext *cxt = aggstate->curaggcontext;

		RegisterExprContextCallback(cxt, func, arg);

		return;
	}
	elog(ERROR, "aggregate function cannot register a callback in this context");
}
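
#ifdef NODEAGG_USAGE_SKETCH
/*
 * Usage sketch (illustrative only): how an ordered-set aggregate might make
 * sure its tuplesort is released when the aggcontext is reset between
 * groups or on rescan.  my_sort_shutdown and my_register_sort_cleanup are
 * assumed names; the callback signature is ExprContextCallbackFunction,
 * i.e. void (*)(Datum).
 */
static void
my_sort_shutdown(Datum arg)
{
	Tuplesortstate *sortstate = (Tuplesortstate *) DatumGetPointer(arg);

	tuplesort_end(sortstate);
}

static void
my_register_sort_cleanup(FunctionCallInfo fcinfo, Tuplesortstate *sortstate)
{
	AggRegisterCallback(fcinfo, my_sort_shutdown,
						PointerGetDatum(sortstate));
}
#endif							/* NODEAGG_USAGE_SKETCH */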

/* ----------------------------------------------------------------
 *						Parallel Query Support
 * ----------------------------------------------------------------
 */
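
#ifdef NODEAGG_USAGE_SKETCH
/*
 * Layout sketch (simplified; the real SharedAggInfo definition lives in the
 * executor headers): a fixed header followed by one instrumentation slot
 * per worker.  This is why every size computation below is
 * offsetof(SharedAggInfo, sinstrument) plus nworkers times
 * sizeof(AggregateInstrumentation).
 */
typedef struct SharedAggInfoSketch
{
	int			num_workers;
	AggregateInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
} SharedAggInfoSketch;
#endif							/* NODEAGG_USAGE_SKETCH */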

/* ----------------------------------------------------------------
 *		ExecAggEstimate
 *
 *		Estimate space required to propagate aggregate statistics.
 * ----------------------------------------------------------------
 */
void
ExecAggEstimate(AggState *node, ParallelContext *pcxt)
{
	Size		size;

	/* don't need this if not instrumenting or no workers */
	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
		return;

	size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation));
	size = add_size(size, offsetof(SharedAggInfo, sinstrument));
	shm_toc_estimate_chunk(&pcxt->estimator, size);
	shm_toc_estimate_keys(&pcxt->estimator, 1);
}
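
#ifdef NODEAGG_USAGE_SKETCH
/*
 * Arithmetic sketch (illustrative only): mul_size() and add_size() are the
 * overflow-checked counterparts of * and + on Size; they error out rather
 * than silently wrapping.  my_agg_shared_size is an assumed helper name.
 */
static Size
my_agg_shared_size(int nworkers)
{
	return add_size(offsetof(SharedAggInfo, sinstrument),
					mul_size(nworkers, sizeof(AggregateInstrumentation)));
}
#endif							/* NODEAGG_USAGE_SKETCH */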

/* ----------------------------------------------------------------
 *		ExecAggInitializeDSM
 *
 *		Initialize DSM space for aggregate statistics.
 * ----------------------------------------------------------------
 */
void
ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt)
{
	Size		size;

	/* don't need this if not instrumenting or no workers */
	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
		return;

	size = offsetof(SharedAggInfo, sinstrument)
		+ pcxt->nworkers * sizeof(AggregateInstrumentation);
	node->shared_info = shm_toc_allocate(pcxt->toc, size);
	/* ensure any unfilled slots will contain zeroes */
	memset(node->shared_info, 0, size);
	node->shared_info->num_workers = pcxt->nworkers;
	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
				   node->shared_info);
}

/* ----------------------------------------------------------------
 *		ExecAggInitializeWorker
 *
 *		Attach worker to DSM space for aggregate statistics.
 * ----------------------------------------------------------------
 */
void
ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt)
{
	node->shared_info =
		shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
}

/* ----------------------------------------------------------------
 *		ExecAggRetrieveInstrumentation
 *
 *		Transfer aggregate statistics from DSM to private memory.
 * ----------------------------------------------------------------
 */
void
ExecAggRetrieveInstrumentation(AggState *node)
{
	Size		size;
	SharedAggInfo *si;

	if (node->shared_info == NULL)
		return;

	size = offsetof(SharedAggInfo, sinstrument)
		+ node->shared_info->num_workers * sizeof(AggregateInstrumentation);
	si = palloc(size);
	memcpy(si, node->shared_info, size);
	node->shared_info = si;
}
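
#ifdef NODEAGG_USAGE_SKETCH
/*
 * Consumption sketch (illustrative only): once the instrumentation has been
 * copied into backend-local memory above, it stays readable even after the
 * DSM segment is detached.  my_report_agg_workers is an assumed helper name.
 */
static void
my_report_agg_workers(AggState *node)
{
	if (node->shared_info == NULL)
		return;
	elog(DEBUG1, "aggregate instrumentation retained for %d workers",
		 node->shared_info->num_workers);
}
#endif							/* NODEAGG_USAGE_SKETCH */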