1999-10-18 00:15:09 +02:00
|
|
|
/*-------------------------------------------------------------------------
 *
 * tuplesort.h
 *	  Generalized tuple sorting routines.
 *
 * This module handles sorting of heap tuples, index tuples, or single
 * Datums (and could easily support other kinds of sortable objects,
 * if necessary).  It works efficiently for both small and large amounts
 * of data.  Small amounts are sorted in-memory using qsort().  Large
 * amounts are sorted using temporary files and a standard external sort
 * algorithm.  Parallel sorts use a variant of this external sort
 * algorithm, and are typically only used for large amounts of data.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/utils/tuplesort.h
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#ifndef TUPLESORT_H
|
|
|
|
#define TUPLESORT_H
|
|
|
|
|
|
|
|
#include "access/itup.h"
|
2006-06-27 18:53:02 +02:00
|
|
|
#include "executor/tuptable.h"
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
#include "storage/dsm.h"
|
2022-07-27 07:28:26 +02:00
|
|
|
#include "utils/logtape.h"
|
2008-06-19 02:46:06 +02:00
|
|
|
#include "utils/relcache.h"
|
2022-07-27 07:28:26 +02:00
|
|
|
#include "utils/sortsupport.h"
|
1999-10-18 00:15:09 +02:00
|
|
|
|
|
|
|
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
/*
 * Tuplesortstate and Sharedsort are opaque types whose details are not
 * known outside tuplesort.c.
 */
typedef struct Tuplesortstate Tuplesortstate;
typedef struct Sharedsort Sharedsort;
|
|
|
|
|
|
|
|
/*
 * Tuplesort parallel coordination state, allocated by each participant in
 * local memory.  Participant caller initializes everything.  See usage notes
 * below.
 */
typedef struct SortCoordinateData
{
	/* Worker process?  If not, must be leader. */
	bool		isWorker;

	/*
	 * Leader-process-passed number of participants known launched (workers
	 * set this to -1).  Includes state within leader needed for it to
	 * participate as a worker, if any.
	 */
	int			nParticipants;

	/* Private opaque state (points to shared memory) */
	Sharedsort *sharedsort;
} SortCoordinateData;

typedef struct SortCoordinateData *SortCoordinate;
|
1999-10-18 00:15:09 +02:00
|
|
|
|
2017-08-29 19:22:49 +02:00
|
|
|
/*
 * Data structures for reporting sort statistics.  Note that
 * TuplesortInstrumentation can't contain any pointers because we
 * sometimes put it in shared memory.
 *
 * The parallel-sort infrastructure relies on having a zero TuplesortMethod
 * to indicate that a worker never did anything, so we assign zero to
 * SORT_TYPE_STILL_IN_PROGRESS.  The other values of this enum can be
 * OR'ed together to represent a situation where different workers used
 * different methods, so we need a separate bit for each one.  Keep the
 * NUM_TUPLESORTMETHODS constant in sync with the number of bits!
 */
typedef enum
{
	SORT_TYPE_STILL_IN_PROGRESS = 0,
	SORT_TYPE_TOP_N_HEAPSORT = 1 << 0,
	SORT_TYPE_QUICKSORT = 1 << 1,
	SORT_TYPE_EXTERNAL_SORT = 1 << 2,
	SORT_TYPE_EXTERNAL_MERGE = 1 << 3
} TuplesortMethod;

#define NUM_TUPLESORTMETHODS 4
|
|
|
|
|
2017-08-29 19:22:49 +02:00
|
|
|
/* Kind of resource that spaceUsed (below) was measured in */
typedef enum
{
	SORT_SPACE_TYPE_DISK,
	SORT_SPACE_TYPE_MEMORY
} TuplesortSpaceType;
|
|
|
|
|
2022-04-04 12:24:59 +02:00
|
|
|
/* Bitwise option flags for tuple sorts */
#define TUPLESORT_NONE					0

/* specifies whether non-sequential access to the sort result is required */
#define TUPLESORT_RANDOMACCESS			(1 << 0)

/* specifies if the tuplesort is able to support bounded sorts */
#define TUPLESORT_ALLOWBOUNDED			(1 << 1)
|
|
|
|
|
2017-08-29 19:22:49 +02:00
|
|
|
/* Sort statistics reported by tuplesort_get_stats(); pointer-free, see above */
typedef struct TuplesortInstrumentation
{
	TuplesortMethod sortMethod; /* sort algorithm used */
	TuplesortSpaceType spaceType;	/* type of space spaceUsed represents */
	int64		spaceUsed;		/* space consumption, in kB */
} TuplesortInstrumentation;
|
|
|
|
|
2022-07-27 07:28:26 +02:00
|
|
|
/*
 * The objects we actually sort are SortTuple structs.  These contain
 * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
 * which is a separate palloc chunk --- we assume it is just one chunk and
 * can be freed by a simple pfree() (except during merge, when we use a
 * simple slab allocator).  SortTuples also contain the tuple's first key
 * column in Datum/nullflag format, and a source/input tape number that
 * tracks which tape each heap element/slot belongs to during merging.
 *
 * Storing the first key column lets us save heap_getattr or index_getattr
 * calls during tuple comparisons.  We could extract and save all the key
 * columns not just the first, but this would increase code complexity and
 * overhead, and wouldn't actually save any comparison cycles in the common
 * case where the first key determines the comparison result.  Note that
 * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
 *
 * There is one special case: when the sort support infrastructure provides an
 * "abbreviated key" representation, where the key is (typically) a pass by
 * value proxy for a pass by reference type.  In this case, the abbreviated key
 * is stored in datum1 in place of the actual first key column.
 *
 * When sorting single Datums, the data value is represented directly by
 * datum1/isnull1 for pass by value types (or null values).  If the datatype is
 * pass-by-reference and isnull1 is false, then "tuple" points to a separately
 * palloc'd data value, otherwise "tuple" is NULL.  The value of datum1 is then
 * either the same pointer as "tuple", or is an abbreviated key value as
 * described above.  Accordingly, "tuple" is always used in preference to
 * datum1 as the authoritative value for pass-by-reference cases.
 */
typedef struct
{
	void	   *tuple;			/* the tuple itself */
	Datum		datum1;			/* value of first key column */
	bool		isnull1;		/* is first key column NULL? */
	int			srctape;		/* source tape number */
} SortTuple;
|
|
|
|
|
|
|
|
/*
 * Comparator for two SortTuples; result follows qsort() conventions
 * (<0, 0, >0 according as a<b, a=b, a>b).
 */
typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
									Tuplesortstate *state);
|
|
|
|
|
|
|
|
/*
 * The public part of a Tuple sort operation state.  This data structure
 * contains the definition of sort-variant-specific interface methods and
 * the part of Tuple sort operation state required by their implementations.
 */
typedef struct
{
	/*
	 * These function pointers decouple the routines that must know what kind
	 * of tuple we are sorting from the routines that don't need to know it.
	 * They are set up by the tuplesort_begin_xxx routines.
	 *
	 * Function to compare two tuples; result is per qsort() convention, ie:
	 * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
	 * qsort_arg_comparator.
	 */
	SortTupleComparator comparetup;

	/*
	 * Alter datum1 representation in the SortTuple's array back from the
	 * abbreviated key to the first column value.
	 */
	void		(*removeabbrev) (Tuplesortstate *state, SortTuple *stups,
								 int count);

	/*
	 * Function to write a stored tuple onto tape.  The representation of the
	 * tuple on tape need not be the same as it is in memory.
	 */
	void		(*writetup) (Tuplesortstate *state, LogicalTape *tape,
							 SortTuple *stup);

	/*
	 * Function to read a stored tuple from tape back into memory. 'len' is
	 * the already-read length of the stored tuple.  The tuple is allocated
	 * from the slab memory arena, or is palloc'd, see
	 * tuplesort_readtup_alloc().
	 */
	void		(*readtup) (Tuplesortstate *state, SortTuple *stup,
							LogicalTape *tape, unsigned int len);

	/*
	 * Function to do some specific release of resources for the sort
	 * variant.  In particular, this function should free everything stored
	 * in the "arg" field, which wouldn't be cleared on reset of the Tuple
	 * sort memory contexts.  This can be NULL if nothing specific needs to
	 * be done.
	 */
	void		(*freestate) (Tuplesortstate *state);

	/*
	 * The subsequent fields are used in the implementations of the functions
	 * above.
	 */
	MemoryContext maincontext;	/* memory context for tuple sort metadata that
								 * persists across multiple batches */
	MemoryContext sortcontext;	/* memory context holding most sort data */
	MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */

	/*
	 * Whether SortTuple's datum1 and isnull1 members are maintained by the
	 * above routines.  If not, some sort specializations are disabled.
	 */
	bool		haveDatum1;

	/*
	 * The sortKeys variable is used by every case other than the hash index
	 * case; it is set by tuplesort_begin_xxx.  tupDesc is only used by the
	 * MinimalTuple and CLUSTER routines, though.
	 */
	int			nKeys;			/* number of columns in sort key */
	SortSupport sortKeys;		/* array of length nKeys */

	/*
	 * This variable is shared by the single-key MinimalTuple case and the
	 * Datum case (which both use qsort_ssup()).  Otherwise, it's NULL.  The
	 * presence of a value in this field is also checked by various sort
	 * specialization functions as an optimization when comparing the leading
	 * key in a tiebreak situation to determine if there are any subsequent
	 * keys to sort on.
	 */
	SortSupport onlyKey;

	int			sortopt;		/* Bitmask of flags used to setup sort */

	bool		tuples;			/* Can SortTuple.tuple ever be set? */

	void	   *arg;			/* Specific information for the sort variant */
} TuplesortPublic;
|
|
|
|
|
|
|
|
/*
 * Sort parallel code from state for sort__start probes:
 * 0 = serial sort, 1 = parallel worker, 2 = parallel leader.
 *
 * The macro argument and the ||-condition are fully parenthesized so the
 * expansion is robust against complex argument expressions and does not
 * rely on implicit || versus ?: precedence.
 */
#define PARALLEL_SORT(coordinate) \
	(((coordinate) == NULL || (coordinate)->sharedsort == NULL) ? 0 : \
	 ((coordinate)->isWorker ? 1 : 2))
|
|
|
|
|
|
|
|
/*
 * Fetch the public-facing part of a Tuplesortstate.  The argument is
 * parenthesized so the cast binds to the whole expression.
 */
#define TuplesortstateGetPublic(state) ((TuplesortPublic *) (state))
|
|
|
|
|
|
|
|
/*
 * Read exactly "len" bytes from "tape" into "ptr", or error out.
 *
 * "len" is evaluated only once (the original expansion evaluated it twice,
 * which was a hazard for arguments with side effects).
 */
#define LogicalTapeReadExact(tape, ptr, len) \
	do { \
		size_t		len_ = (size_t) (len); \
		if (LogicalTapeRead(tape, ptr, len_) != len_) \
			elog(ERROR, "unexpected end of data"); \
	} while (0)
|
2017-08-29 19:22:49 +02:00
|
|
|
|
1999-10-18 00:15:09 +02:00
|
|
|
/*
 * We provide multiple interfaces to what is essentially the same code,
 * since different callers have different data to be sorted and want to
 * specify the sort key information differently.  There are two APIs for
 * sorting HeapTuples and two more for sorting IndexTuples.  Yet another
 * API supports sorting bare Datums.
 *
 * Serial sort callers should pass NULL for their coordinate argument.
 *
 * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
 * preserve the system columns (tuple identity and transaction visibility
 * info).  The sort keys are specified by column numbers within the tuples
 * and sort operator OIDs.  We save some cycles by passing and returning the
 * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
 * have to be converted to MinimalTuples).  This API works well for sorts
 * executed as parts of plan trees.
 *
 * The "cluster" API stores/sorts full HeapTuples including all visibility
 * info.  The sort keys are specified by reference to a btree index that is
 * defined on the relation to be sorted.  Note that putheaptuple/getheaptuple
 * go with this API, not the "begin_heap" one!
 *
 * The "index_btree" API stores/sorts IndexTuples (preserving all their
 * header fields).  The sort keys are specified by a btree index definition.
 *
 * The "index_hash" API is similar to index_btree, but the tuples are
 * actually sorted by their hash codes not the raw data.
 *
 * Parallel sort callers are required to coordinate multiple tuplesort states
 * in a leader process and one or more worker processes.  The leader process
 * must launch workers, and have each perform an independent "partial"
 * tuplesort, typically fed by the parallel heap interface.  The leader later
 * produces the final output (internally, it merges runs output by workers).
 *
 * Callers must do the following to perform a sort in parallel using multiple
 * worker processes:
 *
 * 1. Request tuplesort-private shared memory for n workers.  Use
 *    tuplesort_estimate_shared() to get the required size.
 * 2. Have leader process initialize allocated shared memory using
 *    tuplesort_initialize_shared().  Launch workers.
 * 3. Initialize a coordinate argument within both the leader process, and
 *    for each worker process.  This has a pointer to the shared
 *    tuplesort-private structure, as well as some caller-initialized fields.
 *    Leader's coordinate argument reliably indicates number of workers
 *    launched (this is unused by workers).
 * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine,
 *    (passing the coordinate argument) within each worker.  The workMem
 *    arguments need not be identical.  All other arguments should match
 *    exactly, though.
 * 5. tuplesort_attach_shared() should be called by all workers.  Feed tuples
 *    to each worker, and call tuplesort_performsort() within each when input
 *    is exhausted.
 * 6. Call tuplesort_end() in each worker process.  Worker processes can shut
 *    down once tuplesort_end() returns.
 * 7. Begin a tuplesort in the leader using the same tuplesort_begin*
 *    routine, passing a leader-appropriate coordinate argument (this can
 *    happen as early as during step 3, actually, since we only need to know
 *    the number of workers successfully launched).  The leader must now wait
 *    for workers to finish.  Caller must use own mechanism for ensuring that
 *    next step isn't reached until all workers have called and returned from
 *    tuplesort_performsort().  (Note that it's okay if workers have already
 *    also called tuplesort_end() by then.)
 * 8. Call tuplesort_performsort() in leader.  Consume output using the
 *    appropriate tuplesort_get* routine.  Leader can skip this step if
 *    tuplesort turns out to be unnecessary.
 * 9. Call tuplesort_end() in leader.
 *
 * This division of labor assumes nothing about how input tuples are produced,
 * but does require that caller combine the state of multiple tuplesorts for
 * any purpose other than producing the final output.  For example, callers
 * must consider that tuplesort_get_stats() reports on only one worker's role
 * in a sort (or the leader's role), and not statistics for the sort as a
 * whole.
 *
 * Note that callers may use the leader process to sort runs as if it was an
 * independent worker process (prior to the process performing a leader sort
 * to produce the final sorted output).  Doing so only requires a second
 * "partial" tuplesort within the leader process, initialized like that of a
 * worker process.  The steps above don't touch on this directly.  The only
 * difference is that the tuplesort_attach_shared() call is never needed within
 * leader process, because the backend as a whole holds the shared fileset
 * reference.  A worker Tuplesortstate in leader is expected to do exactly the
 * same amount of total initial processing work as a worker process
 * Tuplesortstate, since the leader process has nothing else to do before
 * workers finish.
 *
 * Note that only a very small amount of memory will be allocated prior to
 * the leader state first consuming input, and that workers will free the
 * vast majority of their memory upon returning from tuplesort_performsort().
 * Callers can rely on this to arrange for memory to be used in a way that
 * respects a workMem-style budget across an entire parallel sort operation.
 *
 * Callers are responsible for parallel safety in general.  However, they
 * can at least rely on there being no parallel safety hazards within
 * tuplesort, because tuplesort thinks of the sort as several independent
 * sorts whose results are combined.  Since, in general, the behavior of
 * sort operators is immutable, caller need only worry about the parallel
 * safety of whatever the process is through which input tuples are
 * generated (typically, caller uses a parallel heap scan).
 */
|
|
|
|
|
2022-07-27 07:28:26 +02:00
|
|
|
|
|
|
|
/* Core sort machinery (tuplesort.c) shared by all sort variants */
extern Tuplesortstate *tuplesort_begin_common(int workMem,
											  SortCoordinate coordinate,
											  int sortopt);
extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
extern bool tuplesort_used_bound(Tuplesortstate *state);
extern void tuplesort_puttuple_common(Tuplesortstate *state,
									  SortTuple *tuple, bool useAbbrev);
extern void tuplesort_performsort(Tuplesortstate *state);
extern bool tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
									  SortTuple *stup);
extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
								 bool forward);
extern void tuplesort_end(Tuplesortstate *state);
extern void tuplesort_reset(Tuplesortstate *state);

/* Statistics reporting (see TuplesortInstrumentation above) */
extern void tuplesort_get_stats(Tuplesortstate *state,
								TuplesortInstrumentation *stats);
extern const char *tuplesort_method_name(TuplesortMethod m);
extern const char *tuplesort_space_type_name(TuplesortSpaceType t);

extern int	tuplesort_merge_order(int64 allowedMem);

/* Parallel sort coordination (see usage notes above) */
extern Size tuplesort_estimate_shared(int nWorkers);
extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
										dsm_segment *seg);
extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);
|
|
|
|
|
|
|
|
/*
 * These routines may only be called if TUPLESORT_RANDOMACCESS was specified
 * during tuplesort_begin_*.  Additionally backwards scan in gettuple/getdatum
 * also require TUPLESORT_RANDOMACCESS.  Note that parallel sorts do not
 * support random access.
 */
extern void tuplesort_rescan(Tuplesortstate *state);
extern void tuplesort_markpos(Tuplesortstate *state);
extern void tuplesort_restorepos(Tuplesortstate *state);

/* Allocate memory for a tuple being read back from tape (see readtup) */
extern void *tuplesort_readtup_alloc(Tuplesortstate *state, Size tuplen);
|
|
|
|
|
|
|
|
|
|
|
|
/* tuplesortvariants.c */

/* Begin a sort of MinimalTuples keyed by column numbers / operator OIDs */
extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
											int nkeys, AttrNumber *attNums,
											Oid *sortOperators, Oid *sortCollations,
											bool *nullsFirstFlags,
											int workMem, SortCoordinate coordinate,
											int sortopt);
/* Begin a sort of full HeapTuples keyed by a btree index definition */
extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
											   Relation indexRel,
											   Relation heaprel,
											   int workMem,
											   SortCoordinate coordinate,
											   int sortopt);
|
Provide database object names as separate fields in error messages.
This patch addresses the problem that applications currently have to
extract object names from possibly-localized textual error messages,
if they want to know for example which index caused a UNIQUE_VIOLATION
failure. It adds new error message fields to the wire protocol, which
can carry the name of a table, table column, data type, or constraint
associated with the error. (Since the protocol spec has always instructed
clients to ignore unrecognized field types, this should not create any
compatibility problem.)
Support for providing these new fields has been added to just a limited set
of error reports (mainly, those in the "integrity constraint violation"
SQLSTATE class), but we will doubtless add them to more calls in future.
Pavel Stehule, reviewed and extensively revised by Peter Geoghegan, with
additional hacking by Tom Lane.
2013-01-29 23:06:26 +01:00
|
|
|
extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
|
|
|
|
Relation indexRel,
|
2008-03-17 00:15:08 +01:00
|
|
|
bool enforceUnique,
|
2022-02-03 11:29:54 +01:00
|
|
|
bool uniqueNullsNotDistinct,
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
int workMem, SortCoordinate coordinate,
|
2022-04-04 12:24:59 +02:00
|
|
|
int sortopt);
|
Provide database object names as separate fields in error messages.
This patch addresses the problem that applications currently have to
extract object names from possibly-localized textual error messages,
if they want to know for example which index caused a UNIQUE_VIOLATION
failure. It adds new error message fields to the wire protocol, which
can carry the name of a table, table column, data type, or constraint
associated with the error. (Since the protocol spec has always instructed
clients to ignore unrecognized field types, this should not create any
compatibility problem.)
Support for providing these new fields has been added to just a limited set
of error reports (mainly, those in the "integrity constraint violation"
SQLSTATE class), but we will doubtless add them to more calls in future.
Pavel Stehule, reviewed and extensively revised by Peter Geoghegan, with
additional hacking by Tom Lane.
2013-01-29 23:06:26 +01:00
|
|
|
extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
|
|
|
|
Relation indexRel,
|
Expand hash indexes more gradually.
Since hash indexes typically have very few overflow pages, adding a
new splitpoint essentially doubles the on-disk size of the index,
which can lead to large and abrupt increases in disk usage (and
perhaps long delays on occasion). To mitigate this problem to some
degree, divide larger splitpoints into four equal phases. This means
that, for example, instead of growing from 4GB to 8GB all at once, a
hash index will now grow from 4GB to 5GB to 6GB to 7GB to 8GB, which
is perhaps still not as smooth as we'd like but certainly an
improvement.
This changes the on-disk format of the metapage, so bump HASH_VERSION
from 2 to 3. This will force a REINDEX of all existing hash indexes,
but that's probably a good idea anyway. First, hash indexes from
pre-10 versions of PostgreSQL could easily be corrupted, and we don't
want to confuse corruption carried over from an older release with any
corruption caused despite the new write-ahead logging in v10. Second,
it will let us remove some backward-compatibility code added by commit
293e24e507838733aba4748b514536af2d39d7f2.
Mithun Cy, reviewed by Amit Kapila, Jesper Pedersen and me. Regression
test outputs updated by me.
Discussion: http://postgr.es/m/CAD__OuhG6F1gQLCgMQNnMNgoCvOLQZz9zKYJQNYvYmmJoM42gA@mail.gmail.com
Discussion: http://postgr.es/m/CA+TgmoYty0jCf-pa+m+vYUJ716+AxM7nv_syvyanyf5O-L_i2A@mail.gmail.com
2017-04-04 05:46:33 +02:00
|
|
|
uint32 high_mask,
|
|
|
|
uint32 low_mask,
|
|
|
|
uint32 max_buckets,
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
int workMem, SortCoordinate coordinate,
|
2022-04-04 12:24:59 +02:00
|
|
|
int sortopt);
|
2020-09-17 10:33:40 +02:00
|
|
|
/*
 * Begin a sort of index tuples for building a GiST index on 'heapRel'.
 * Arguments are as for tuplesort_begin_heap().
 */
extern Tuplesortstate *tuplesort_begin_index_gist(Relation heapRel,
												  Relation indexRel,
												  int workMem, SortCoordinate coordinate,
												  int sortopt);
extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
|
2011-04-13 01:19:24 +02:00
|
|
|
Oid sortOperator, Oid sortCollation,
|
|
|
|
bool nullsFirstFlag,
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
int workMem, SortCoordinate coordinate,
|
2022-04-04 12:24:59 +02:00
|
|
|
int sortopt);
|
1999-10-18 00:15:09 +02:00
|
|
|
|
2006-06-27 18:53:02 +02:00
|
|
|
extern void tuplesort_puttupleslot(Tuplesortstate *state,
|
|
|
|
TupleTableSlot *slot);
|
2010-10-08 02:00:28 +02:00
|
|
|
/* Feed one input HeapTuple into a CLUSTER-style sort. */
extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
|
|
|
|
Relation rel, ItemPointer self,
|
|
|
|
Datum *values, bool *isnull);
|
1999-12-13 02:27:21 +01:00
|
|
|
extern void tuplesort_putdatum(Tuplesortstate *state, Datum val,
|
|
|
|
bool isNull);
|
|
|
|
|
2006-06-27 18:53:02 +02:00
|
|
|
/*
 * Fetch the next tuple from a completed heap-tuple sort into 'slot', in the
 * requested direction.  Returns false when no more tuples remain.  If 'copy'
 * the slot receives its own copy of the tuple; if 'abbrev' is non-NULL it
 * receives the tuple's abbreviated sort key.
 */
extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
								   bool copy, TupleTableSlot *slot, Datum *abbrev);
/* Fetch the next HeapTuple from a completed CLUSTER sort, or NULL if done. */
extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
/* Fetch the next IndexTuple from a completed index-build sort, or NULL. */
extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
/*
 * Fetch the next Datum from a completed Datum sort into *val / *isNull.
 * Returns false when no more datums remain.  If 'copy', pass-by-reference
 * values are copied into the caller's memory context; if 'abbrev' is
 * non-NULL it receives the abbreviated sort key.
 */
extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy,
							   Datum *val, bool *isNull, Datum *abbrev);
#endif /* TUPLESORT_H */
|