/*-------------------------------------------------------------------------
 *
 * nodeAgg.h
 *	  prototypes for nodeAgg.c
 *
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/executor/nodeAgg.h
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#ifndef NODEAGG_H
|
|
|
|
#define NODEAGG_H
|
|
|
|
|
2020-06-19 07:24:27 +02:00
|
|
|
#include "access/parallel.h"
|
2002-12-05 16:50:39 +01:00
|
|
|
#include "nodes/execnodes.h"
|
1997-11-26 02:14:33 +01:00
|
|
|
|
2018-01-09 22:25:38 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* AggStatePerTransData - per aggregate state value information
|
|
|
|
*
|
|
|
|
* Working state for updating the aggregate's state value, by calling the
|
|
|
|
* transition function with an input row. This struct does not store the
|
|
|
|
* information needed to produce the final aggregate result from the transition
|
|
|
|
* state, that's stored in AggStatePerAggData instead. This separation allows
|
|
|
|
* multiple aggregate results to be produced from a single state value.
|
|
|
|
*/
|
|
|
|
typedef struct AggStatePerTransData
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* These values are set up during ExecInitAgg() and do not change
|
|
|
|
* thereafter:
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Link to an Aggref expr this state value is for.
|
|
|
|
*
|
|
|
|
* There can be multiple Aggref's sharing the same state value, so long as
|
|
|
|
* the inputs and transition functions are identical and the final
|
|
|
|
* functions are not read-write. This points to the first one of them.
|
|
|
|
*/
|
|
|
|
Aggref *aggref;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Is this state value actually being shared by more than one Aggref?
|
|
|
|
*/
|
|
|
|
bool aggshared;
|
|
|
|
|
Improve performance of ORDER BY / DISTINCT aggregates
ORDER BY / DISTINCT aggreagtes have, since implemented in Postgres, been
executed by always performing a sort in nodeAgg.c to sort the tuples in
the current group into the correct order before calling the transition
function on the sorted tuples. This was not great as often there might be
an index that could have provided pre-sorted input and allowed the
transition functions to be called as the rows come in, rather than having
to store them in a tuplestore in order to sort them once all the tuples
for the group have arrived.
Here we change the planner so it requests a path with a sort order which
supports the most amount of ORDER BY / DISTINCT aggregate functions and
add new code to the executor to allow it to support the processing of
ORDER BY / DISTINCT aggregates where the tuples are already sorted in the
correct order.
Since there can be many ORDER BY / DISTINCT aggregates in any given query
level, it's very possible that we can't find an order that suits all of
these aggregates. The sort order that the planner chooses is simply the
one that suits the most aggregate functions. We take the most strictly
sorted variation of each order and see how many aggregate functions can
use that, then we try again with the order of the remaining aggregates to
see if another order would suit more aggregate functions. For example:
SELECT agg(a ORDER BY a),agg2(a ORDER BY a,b) ...
would request the sort order to be {a, b} because {a} is a subset of the
sort order of {a,b}, but;
SELECT agg(a ORDER BY a),agg2(a ORDER BY c) ...
would just pick a plan ordered by {a} (we give precedence to aggregates
which are earlier in the targetlist).
SELECT agg(a ORDER BY a),agg2(a ORDER BY b),agg3(a ORDER BY b) ...
would choose to order by {b} since two aggregates suit that vs just one
that requires input ordered by {a}.
Author: David Rowley
Reviewed-by: Ronan Dunklau, James Coleman, Ranier Vilela, Richard Guo, Tom Lane
Discussion: https://postgr.es/m/CAApHDvpHzfo92%3DR4W0%2BxVua3BUYCKMckWAmo-2t_KiXN-wYH%3Dw%40mail.gmail.com
2022-08-02 13:11:45 +02:00
|
|
|
/*
|
|
|
|
* True for ORDER BY and DISTINCT Aggrefs that are not aggpresorted.
|
|
|
|
*/
|
|
|
|
bool aggsortrequired;
|
|
|
|
|
2018-01-09 22:25:38 +01:00
|
|
|
/*
|
|
|
|
* Number of aggregated input columns. This includes ORDER BY expressions
|
|
|
|
* in both the plain-agg and ordered-set cases. Ordered-set direct args
|
|
|
|
* are not counted, though.
|
|
|
|
*/
|
|
|
|
int numInputs;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Number of aggregated input columns to pass to the transfn. This
|
|
|
|
* includes the ORDER BY columns for ordered-set aggs, but not for plain
|
|
|
|
* aggs. (This doesn't count the transition state value!)
|
|
|
|
*/
|
|
|
|
int numTransInputs;
|
|
|
|
|
|
|
|
/* Oid of the state transition or combine function */
|
|
|
|
Oid transfn_oid;
|
|
|
|
|
|
|
|
/* Oid of the serialization function or InvalidOid */
|
|
|
|
Oid serialfn_oid;
|
|
|
|
|
|
|
|
/* Oid of the deserialization function or InvalidOid */
|
|
|
|
Oid deserialfn_oid;
|
|
|
|
|
|
|
|
/* Oid of state value's datatype */
|
|
|
|
Oid aggtranstype;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* fmgr lookup data for transition function or combine function. Note in
|
|
|
|
* particular that the fn_strict flag is kept here.
|
|
|
|
*/
|
|
|
|
FmgrInfo transfn;
|
|
|
|
|
|
|
|
/* fmgr lookup data for serialization function */
|
|
|
|
FmgrInfo serialfn;
|
|
|
|
|
|
|
|
/* fmgr lookup data for deserialization function */
|
|
|
|
FmgrInfo deserialfn;
|
|
|
|
|
|
|
|
/* Input collation derived for aggregate */
|
|
|
|
Oid aggCollation;
|
|
|
|
|
|
|
|
/* number of sorting columns */
|
|
|
|
int numSortCols;
|
|
|
|
|
|
|
|
/* number of sorting columns to consider in DISTINCT comparisons */
|
|
|
|
/* (this is either zero or the same as numSortCols) */
|
|
|
|
int numDistinctCols;
|
|
|
|
|
|
|
|
/* deconstructed sorting information (arrays of length numSortCols) */
|
|
|
|
AttrNumber *sortColIdx;
|
|
|
|
Oid *sortOperators;
|
|
|
|
Oid *sortCollations;
|
|
|
|
bool *sortNullsFirst;
|
|
|
|
|
|
|
|
/*
|
2018-02-16 06:55:31 +01:00
|
|
|
* Comparators for input columns --- only set/used when aggregate has
|
|
|
|
* DISTINCT flag. equalfnOne version is used for single-column
|
2018-04-01 21:01:28 +02:00
|
|
|
* comparisons, equalfnMulti for the case of multiple columns.
|
2018-01-09 22:25:38 +01:00
|
|
|
*/
|
2018-02-16 06:55:31 +01:00
|
|
|
FmgrInfo equalfnOne;
|
|
|
|
ExprState *equalfnMulti;
|
2018-01-09 22:25:38 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* initial value from pg_aggregate entry
|
|
|
|
*/
|
|
|
|
Datum initValue;
|
|
|
|
bool initValueIsNull;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need the len and byval info for the agg's input and transition data
|
|
|
|
* types in order to know how to copy/delete values.
|
|
|
|
*
|
|
|
|
* Note that the info for the input type is used only when handling
|
|
|
|
* DISTINCT aggs with just one argument, so there is only one input type.
|
|
|
|
*/
|
|
|
|
int16 inputtypeLen,
|
|
|
|
transtypeLen;
|
|
|
|
bool inputtypeByVal,
|
|
|
|
transtypeByVal;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Slots for holding the evaluated input arguments. These are set up
|
|
|
|
* during ExecInitAgg() and then used for each input row requiring either
|
|
|
|
* FILTER or ORDER BY/DISTINCT processing.
|
|
|
|
*/
|
|
|
|
TupleTableSlot *sortslot; /* current input tuple */
|
|
|
|
TupleTableSlot *uniqslot; /* used for multi-column DISTINCT */
|
|
|
|
TupleDesc sortdesc; /* descriptor of input tuples */
|
Improve performance of ORDER BY / DISTINCT aggregates
ORDER BY / DISTINCT aggreagtes have, since implemented in Postgres, been
executed by always performing a sort in nodeAgg.c to sort the tuples in
the current group into the correct order before calling the transition
function on the sorted tuples. This was not great as often there might be
an index that could have provided pre-sorted input and allowed the
transition functions to be called as the rows come in, rather than having
to store them in a tuplestore in order to sort them once all the tuples
for the group have arrived.
Here we change the planner so it requests a path with a sort order which
supports the most amount of ORDER BY / DISTINCT aggregate functions and
add new code to the executor to allow it to support the processing of
ORDER BY / DISTINCT aggregates where the tuples are already sorted in the
correct order.
Since there can be many ORDER BY / DISTINCT aggregates in any given query
level, it's very possible that we can't find an order that suits all of
these aggregates. The sort order that the planner chooses is simply the
one that suits the most aggregate functions. We take the most strictly
sorted variation of each order and see how many aggregate functions can
use that, then we try again with the order of the remaining aggregates to
see if another order would suit more aggregate functions. For example:
SELECT agg(a ORDER BY a),agg2(a ORDER BY a,b) ...
would request the sort order to be {a, b} because {a} is a subset of the
sort order of {a,b}, but;
SELECT agg(a ORDER BY a),agg2(a ORDER BY c) ...
would just pick a plan ordered by {a} (we give precedence to aggregates
which are earlier in the targetlist).
SELECT agg(a ORDER BY a),agg2(a ORDER BY b),agg3(a ORDER BY b) ...
would choose to order by {b} since two aggregates suit that vs just one
that requires input ordered by {a}.
Author: David Rowley
Reviewed-by: Ronan Dunklau, James Coleman, Ranier Vilela, Richard Guo, Tom Lane
Discussion: https://postgr.es/m/CAApHDvpHzfo92%3DR4W0%2BxVua3BUYCKMckWAmo-2t_KiXN-wYH%3Dw%40mail.gmail.com
2022-08-02 13:11:45 +02:00
|
|
|
Datum lastdatum; /* used for single-column DISTINCT */
|
|
|
|
bool lastisnull; /* used for single-column DISTINCT */
|
|
|
|
bool haslast; /* got a last value for DISTINCT check */
|
2018-01-09 22:25:38 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* These values are working state that is initialized at the start of an
|
|
|
|
* input tuple group and updated for each input tuple.
|
|
|
|
*
|
|
|
|
* For a simple (non DISTINCT/ORDER BY) aggregate, we just feed the input
|
|
|
|
* values straight to the transition function. If it's DISTINCT or
|
|
|
|
* requires ORDER BY, we pass the input values into a Tuplesort object;
|
|
|
|
* then at completion of the input tuple group, we scan the sorted values,
|
|
|
|
* eliminate duplicates if needed, and run the transition function on the
|
|
|
|
* rest.
|
|
|
|
*
|
|
|
|
* We need a separate tuplesort for each grouping set.
|
|
|
|
*/
|
|
|
|
|
|
|
|
Tuplesortstate **sortstates; /* sort objects, if DISTINCT or ORDER BY */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This field is a pre-initialized FunctionCallInfo struct used for
|
|
|
|
* calling this aggregate's transfn. We save a few cycles per row by not
|
|
|
|
* re-initializing the unchanging fields; which isn't much, but it seems
|
|
|
|
* worth the extra space consumption.
|
|
|
|
*/
|
Change function call information to be variable length.
Before this change FunctionCallInfoData, the struct arguments etc for
V1 function calls are stored in, always had space for
FUNC_MAX_ARGS/100 arguments, storing datums and their nullness in two
arrays. For nearly every function call 100 arguments is far more than
needed, therefore wasting memory. Arg and argnull being two separate
arrays also guarantees that to access a single argument, two
cachelines have to be touched.
Change the layout so there's a single variable-length array with pairs
of value / isnull. That drastically reduces memory consumption for
most function calls (on x86-64 a two argument function now uses
64bytes, previously 936 bytes), and makes it very likely that argument
value and its nullness are on the same cacheline.
Arguments are stored in a new NullableDatum struct, which, due to
padding, needs more memory per argument than before. But as usually
far fewer arguments are stored, and individual arguments are cheaper
to access, that's still a clear win. It's likely that there's other
places where conversion to NullableDatum arrays would make sense,
e.g. TupleTableSlots, but that's for another commit.
Because the function call information is now variable-length
allocations have to take the number of arguments into account. For
heap allocations that can be done with SizeForFunctionCallInfoData(),
for on-stack allocations there's a new LOCAL_FCINFO(name, nargs) macro
that helps to allocate an appropriately sized and aligned variable.
Some places with stack allocation function call information don't know
the number of arguments at compile time, and currently variably sized
stack allocations aren't allowed in postgres. Therefore allow for
FUNC_MAX_ARGS space in these cases. They're not that common, so for
now that seems acceptable.
Because of the need to allocate FunctionCallInfo of the appropriate
size, older extensions may need to update their code. To avoid subtle
breakages, the FunctionCallInfoData struct has been renamed to
FunctionCallInfoBaseData. Most code only references FunctionCallInfo,
so that shouldn't cause much collateral damage.
This change is also a prerequisite for more efficient expression JIT
compilation (by allocating the function call information on the stack,
allowing LLVM to optimize it away); previously the size of the call
information caused problems inside LLVM's optimizer.
Author: Andres Freund
Reviewed-By: Tom Lane
Discussion: https://postgr.es/m/20180605172952.x34m5uz6ju6enaem@alap3.anarazel.de
2019-01-26 23:17:52 +01:00
|
|
|
FunctionCallInfo transfn_fcinfo;
|
2018-01-09 22:25:38 +01:00
|
|
|
|
|
|
|
/* Likewise for serialization and deserialization functions */
|
Change function call information to be variable length.
Before this change FunctionCallInfoData, the struct arguments etc for
V1 function calls are stored in, always had space for
FUNC_MAX_ARGS/100 arguments, storing datums and their nullness in two
arrays. For nearly every function call 100 arguments is far more than
needed, therefore wasting memory. Arg and argnull being two separate
arrays also guarantees that to access a single argument, two
cachelines have to be touched.
Change the layout so there's a single variable-length array with pairs
of value / isnull. That drastically reduces memory consumption for
most function calls (on x86-64 a two argument function now uses
64bytes, previously 936 bytes), and makes it very likely that argument
value and its nullness are on the same cacheline.
Arguments are stored in a new NullableDatum struct, which, due to
padding, needs more memory per argument than before. But as usually
far fewer arguments are stored, and individual arguments are cheaper
to access, that's still a clear win. It's likely that there's other
places where conversion to NullableDatum arrays would make sense,
e.g. TupleTableSlots, but that's for another commit.
Because the function call information is now variable-length
allocations have to take the number of arguments into account. For
heap allocations that can be done with SizeForFunctionCallInfoData(),
for on-stack allocations there's a new LOCAL_FCINFO(name, nargs) macro
that helps to allocate an appropriately sized and aligned variable.
Some places with stack allocation function call information don't know
the number of arguments at compile time, and currently variably sized
stack allocations aren't allowed in postgres. Therefore allow for
FUNC_MAX_ARGS space in these cases. They're not that common, so for
now that seems acceptable.
Because of the need to allocate FunctionCallInfo of the appropriate
size, older extensions may need to update their code. To avoid subtle
breakages, the FunctionCallInfoData struct has been renamed to
FunctionCallInfoBaseData. Most code only references FunctionCallInfo,
so that shouldn't cause much collateral damage.
This change is also a prerequisite for more efficient expression JIT
compilation (by allocating the function call information on the stack,
allowing LLVM to optimize it away); previously the size of the call
information caused problems inside LLVM's optimizer.
Author: Andres Freund
Reviewed-By: Tom Lane
Discussion: https://postgr.es/m/20180605172952.x34m5uz6ju6enaem@alap3.anarazel.de
2019-01-26 23:17:52 +01:00
|
|
|
FunctionCallInfo serialfn_fcinfo;
|
2018-01-09 22:25:38 +01:00
|
|
|
|
Change function call information to be variable length.
Before this change FunctionCallInfoData, the struct arguments etc for
V1 function calls are stored in, always had space for
FUNC_MAX_ARGS/100 arguments, storing datums and their nullness in two
arrays. For nearly every function call 100 arguments is far more than
needed, therefore wasting memory. Arg and argnull being two separate
arrays also guarantees that to access a single argument, two
cachelines have to be touched.
Change the layout so there's a single variable-length array with pairs
of value / isnull. That drastically reduces memory consumption for
most function calls (on x86-64 a two argument function now uses
64bytes, previously 936 bytes), and makes it very likely that argument
value and its nullness are on the same cacheline.
Arguments are stored in a new NullableDatum struct, which, due to
padding, needs more memory per argument than before. But as usually
far fewer arguments are stored, and individual arguments are cheaper
to access, that's still a clear win. It's likely that there's other
places where conversion to NullableDatum arrays would make sense,
e.g. TupleTableSlots, but that's for another commit.
Because the function call information is now variable-length
allocations have to take the number of arguments into account. For
heap allocations that can be done with SizeForFunctionCallInfoData(),
for on-stack allocations there's a new LOCAL_FCINFO(name, nargs) macro
that helps to allocate an appropriately sized and aligned variable.
Some places with stack allocation function call information don't know
the number of arguments at compile time, and currently variably sized
stack allocations aren't allowed in postgres. Therefore allow for
FUNC_MAX_ARGS space in these cases. They're not that common, so for
now that seems acceptable.
Because of the need to allocate FunctionCallInfo of the appropriate
size, older extensions may need to update their code. To avoid subtle
breakages, the FunctionCallInfoData struct has been renamed to
FunctionCallInfoBaseData. Most code only references FunctionCallInfo,
so that shouldn't cause much collateral damage.
This change is also a prerequisite for more efficient expression JIT
compilation (by allocating the function call information on the stack,
allowing LLVM to optimize it away); previously the size of the call
information caused problems inside LLVM's optimizer.
Author: Andres Freund
Reviewed-By: Tom Lane
Discussion: https://postgr.es/m/20180605172952.x34m5uz6ju6enaem@alap3.anarazel.de
2019-01-26 23:17:52 +01:00
|
|
|
FunctionCallInfo deserialfn_fcinfo;
|
2018-01-09 22:25:38 +01:00
|
|
|
} AggStatePerTransData;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* AggStatePerAggData - per-aggregate information
|
|
|
|
*
|
|
|
|
* This contains the information needed to call the final function, to produce
|
|
|
|
* a final aggregate result from the state value. If there are multiple
|
|
|
|
* identical Aggrefs in the query, they can all share the same per-agg data.
|
|
|
|
*
|
|
|
|
* These values are set up during ExecInitAgg() and do not change thereafter.
|
|
|
|
*/
|
|
|
|
typedef struct AggStatePerAggData
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Link to an Aggref expr this state value is for.
|
|
|
|
*
|
|
|
|
* There can be multiple identical Aggref's sharing the same per-agg. This
|
|
|
|
* points to the first one of them.
|
|
|
|
*/
|
|
|
|
Aggref *aggref;
|
|
|
|
|
|
|
|
/* index to the state value which this agg should use */
|
|
|
|
int transno;
|
|
|
|
|
|
|
|
/* Optional Oid of final function (may be InvalidOid) */
|
|
|
|
Oid finalfn_oid;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* fmgr lookup data for final function --- only valid when finalfn_oid is
|
|
|
|
* not InvalidOid.
|
|
|
|
*/
|
|
|
|
FmgrInfo finalfn;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Number of arguments to pass to the finalfn. This is always at least 1
|
|
|
|
* (the transition state value) plus any ordered-set direct args. If the
|
|
|
|
* finalfn wants extra args then we pass nulls corresponding to the
|
|
|
|
* aggregated input columns.
|
|
|
|
*/
|
|
|
|
int numFinalArgs;
|
|
|
|
|
|
|
|
/* ExprStates for any direct-argument expressions */
|
|
|
|
List *aggdirectargs;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need the len and byval info for the agg's result data type in order
|
|
|
|
* to know how to copy/delete values.
|
|
|
|
*/
|
|
|
|
int16 resulttypeLen;
|
|
|
|
bool resulttypeByVal;
|
|
|
|
|
|
|
|
/*
|
2018-05-21 17:41:42 +02:00
|
|
|
* "shareable" is false if this agg cannot share state values with other
|
2018-01-09 22:25:38 +01:00
|
|
|
* aggregates because the final function is read-write.
|
|
|
|
*/
|
2018-05-21 17:41:42 +02:00
|
|
|
bool shareable;
|
2018-01-09 22:25:38 +01:00
|
|
|
} AggStatePerAggData;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* AggStatePerGroupData - per-aggregate-per-group working state
|
|
|
|
*
|
|
|
|
* These values are working state that is initialized at the start of
|
|
|
|
* an input tuple group and updated for each input tuple.
|
|
|
|
*
|
|
|
|
* In AGG_PLAIN and AGG_SORTED modes, we have a single array of these
|
|
|
|
* structs (pointed to by aggstate->pergroup); we re-use the array for
|
|
|
|
* each input group, if it's AGG_SORTED mode. In AGG_HASHED mode, the
|
|
|
|
* hash table contains an array of these structs for each tuple group.
|
|
|
|
*
|
|
|
|
* Logically, the sortstate field belongs in this struct, but we do not
|
|
|
|
* keep it here for space reasons: we don't support DISTINCT aggregates
|
|
|
|
* in AGG_HASHED mode, so there's no reason to use up a pointer field
|
|
|
|
* in every entry of the hashtable.
|
|
|
|
*/
|
|
|
|
typedef struct AggStatePerGroupData
|
|
|
|
{
|
2018-01-24 08:20:02 +01:00
|
|
|
#define FIELDNO_AGGSTATEPERGROUPDATA_TRANSVALUE 0
|
2018-01-09 22:25:38 +01:00
|
|
|
Datum transValue; /* current transition value */
|
2018-01-24 08:20:02 +01:00
|
|
|
#define FIELDNO_AGGSTATEPERGROUPDATA_TRANSVALUEISNULL 1
|
2018-01-09 22:25:38 +01:00
|
|
|
bool transValueIsNull;
|
|
|
|
|
2018-01-24 08:20:02 +01:00
|
|
|
#define FIELDNO_AGGSTATEPERGROUPDATA_NOTRANSVALUE 2
|
2018-01-09 22:25:38 +01:00
|
|
|
bool noTransValue; /* true if transValue not set yet */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: noTransValue initially has the same value as transValueIsNull,
|
|
|
|
* and if true both are cleared to false at the same time. They are not
|
|
|
|
* the same though: if transfn later returns a NULL, we want to keep that
|
|
|
|
* NULL and not auto-replace it with a later input value. Only the first
|
|
|
|
* non-NULL input will be auto-substituted.
|
|
|
|
*/
|
|
|
|
} AggStatePerGroupData;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* AggStatePerPhaseData - per-grouping-set-phase state
|
|
|
|
*
|
|
|
|
* Grouping sets are divided into "phases", where a single phase can be
|
|
|
|
* processed in one pass over the input. If there is more than one phase, then
|
|
|
|
* at the end of input from the current phase, state is reset and another pass
|
|
|
|
* taken over the data which has been re-sorted in the mean time.
|
|
|
|
*
|
|
|
|
* Accordingly, each phase specifies a list of grouping sets and group clause
|
|
|
|
* information, plus each phase after the first also has a sort order.
|
|
|
|
*/
|
|
|
|
typedef struct AggStatePerPhaseData
|
|
|
|
{
|
|
|
|
AggStrategy aggstrategy; /* strategy for this phase */
|
|
|
|
int numsets; /* number of grouping sets (or 0) */
|
|
|
|
int *gset_lengths; /* lengths of grouping sets */
|
|
|
|
Bitmapset **grouped_cols; /* column groupings for rollup */
|
2018-02-16 06:55:31 +01:00
|
|
|
ExprState **eqfunctions; /* expression returning equality, indexed by
|
|
|
|
* nr of cols to compare */
|
2018-01-09 22:25:38 +01:00
|
|
|
Agg *aggnode; /* Agg node for phase data */
|
|
|
|
Sort *sortnode; /* Sort node for input ordering for phase */
|
|
|
|
|
|
|
|
ExprState *evaltrans; /* evaluation of transition functions */
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
|
|
|
|
/*----------
|
|
|
|
* Cached variants of the compiled expression.
|
|
|
|
* first subscript: 0: outerops; 1: TTSOpsMinimalTuple
|
|
|
|
* second subscript: 0: no NULL check; 1: with NULL check
|
1996-07-09 08:22:35 +02:00
|
|
|
*----------
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
*/
|
|
|
|
ExprState *evaltrans_cache[2][2];
|
2018-01-09 22:25:38 +01:00
|
|
|
} AggStatePerPhaseData;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* AggStatePerHashData - per-hashtable state
|
|
|
|
*
|
|
|
|
* When doing grouping sets with hashing, we have one of these for each
|
|
|
|
* grouping set. (When doing hashing without grouping sets, we have just one of
|
|
|
|
* them.)
|
|
|
|
*/
|
|
|
|
typedef struct AggStatePerHashData
|
|
|
|
{
|
|
|
|
TupleHashTable hashtable; /* hash table with one entry per group */
|
|
|
|
TupleHashIterator hashiter; /* for iterating through hash table */
|
|
|
|
TupleTableSlot *hashslot; /* slot for loading hash table */
|
|
|
|
FmgrInfo *hashfunctions; /* per-grouping-field hash fns */
|
2018-02-16 06:55:31 +01:00
|
|
|
Oid *eqfuncoids; /* per-grouping-field equality fns */
|
2018-01-09 22:25:38 +01:00
|
|
|
int numCols; /* number of hash key columns */
|
|
|
|
int numhashGrpCols; /* number of columns in hash table */
|
|
|
|
int largestGrpColIdx; /* largest col required for hashing */
|
|
|
|
AttrNumber *hashGrpColIdxInput; /* hash col indices in input slot */
|
2019-07-22 03:01:50 +02:00
|
|
|
AttrNumber *hashGrpColIdxHash; /* indices in hash table tuples */
|
2018-01-09 22:25:38 +01:00
|
|
|
Agg *aggnode; /* original Agg node, for numGroups etc. */
|
|
|
|
} AggStatePerHashData;
|
|
|
|
|
|
|
|
|
2006-02-28 05:10:28 +01:00
|
|
|
extern AggState *ExecInitAgg(Agg *node, EState *estate, int eflags);
|
2002-12-05 16:50:39 +01:00
|
|
|
extern void ExecEndAgg(AggState *node);
|
2010-07-12 19:01:06 +02:00
|
|
|
extern void ExecReScanAgg(AggState *node);
|
2001-10-28 07:26:15 +01:00
|
|
|
|
2020-04-04 04:52:16 +02:00
|
|
|
extern Size hash_agg_entry_size(int numTrans, Size tupleWidth,
|
2020-02-06 20:49:56 +01:00
|
|
|
Size transitionSpace);
|
2020-07-29 08:15:47 +02:00
|
|
|
extern void hash_agg_set_limits(double hashentrysize, double input_groups,
|
Disk-based Hash Aggregation.
While performing hash aggregation, track memory usage when adding new
groups to a hash table. If the memory usage exceeds work_mem, enter
"spill mode".
In spill mode, new groups are not created in the hash table(s), but
existing groups continue to be advanced if input tuples match. Tuples
that would cause a new group to be created are instead spilled to a
logical tape to be processed later.
The tuples are spilled in a partitioned fashion. When all tuples from
the outer plan are processed (either by advancing the group or
spilling the tuple), finalize and emit the groups from the hash
table. Then, create new batches of work from the spilled partitions,
and select one of the saved batches and process it (possibly spilling
recursively).
Author: Jeff Davis
Reviewed-by: Tomas Vondra, Adam Lee, Justin Pryzby, Taylor Vesely, Melanie Plageman
Discussion: https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel@j-davis.com
2020-03-18 23:42:02 +01:00
|
|
|
int used_bits, Size *mem_limit,
|
|
|
|
uint64 *ngroups_limit, int *num_partitions);
|
2005-01-28 20:34:28 +01:00
|
|
|
|
2020-06-19 07:24:27 +02:00
|
|
|
/* parallel instrumentation support */
|
|
|
|
extern void ExecAggEstimate(AggState *node, ParallelContext *pcxt);
|
|
|
|
extern void ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt);
|
|
|
|
extern void ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt);
|
|
|
|
extern void ExecAggRetrieveInstrumentation(AggState *node);
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
#endif /* NODEAGG_H */
|