/*-------------------------------------------------------------------------
*
* tuplesortvariants.c
* Implementation of tuple sorting variants.
*
* This module handles the sorting of heap tuples, index tuples, or single
* Datums. The implementation is based on the generalized tuple sorting
* facility given in tuplesort.c. Support for other kinds of sortable
* objects could easily be added here, in another module, or even in an
* extension.
*
*
* Copyright (c) 2022-2023, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/utils/sort/tuplesortvariants.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/brin_tuple.h"
#include "access/hash.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "catalog/index.h"
#include "executor/executor.h"
#include "pg_trace.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/guc.h"
#include "utils/tuplesort.h"
/* sort-type codes for sort__start probes */
#define HEAP_SORT 0
#define INDEX_SORT 1
#define DATUM_SORT 2
#define CLUSTER_SORT 3
static void removeabbrev_heap(Tuplesortstate *state, SortTuple *stups,
int count);
static void removeabbrev_cluster(Tuplesortstate *state, SortTuple *stups,
int count);
static void removeabbrev_index(Tuplesortstate *state, SortTuple *stups,
int count);
static void removeabbrev_index_brin(Tuplesortstate *state, SortTuple *stups,
int count);
static void removeabbrev_datum(Tuplesortstate *state, SortTuple *stups,
int count);
static int comparetup_heap(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_heap_tiebreak(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void writetup_heap(Tuplesortstate *state, LogicalTape *tape,
SortTuple *stup);
static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int len);
static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_cluster_tiebreak(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape,
SortTuple *stup);
static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int tuplen);
static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_index_hash_tiebreak(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_index_brin(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void writetup_index(Tuplesortstate *state, LogicalTape *tape,
SortTuple *stup);
static void readtup_index(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int len);
static void writetup_index_brin(Tuplesortstate *state, LogicalTape *tape,
SortTuple *stup);
static void readtup_index_brin(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int len);
static int comparetup_datum(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static int comparetup_datum_tiebreak(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void writetup_datum(Tuplesortstate *state, LogicalTape *tape,
SortTuple *stup);
static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int len);
static void freestate_cluster(Tuplesortstate *state);
/*
* Data structure pointed to by "TuplesortPublic.arg" for the CLUSTER case.
* Set by tuplesort_begin_cluster.
*/
typedef struct
{
TupleDesc tupDesc;
IndexInfo *indexInfo; /* info about index being used for reference */
EState *estate; /* for evaluating index expressions */
} TuplesortClusterArg;
/*
* Data structure pointed to by "TuplesortPublic.arg" for the IndexTuple case.
* Set by tuplesort_begin_index_xxx and used only by the IndexTuple routines.
*/
typedef struct
{
Relation heapRel; /* table the index is being built on */
Relation indexRel; /* index being built */
} TuplesortIndexArg;
/*
* Data structure pointed to by "TuplesortPublic.arg" for the index_btree subcase.
*/
typedef struct
{
TuplesortIndexArg index;
bool enforceUnique; /* complain if we find duplicate tuples */
bool uniqueNullsNotDistinct; /* unique constraint null treatment */
} TuplesortIndexBTreeArg;
/*
* Data structure pointed to by "TuplesortPublic.arg" for the index_hash subcase.
*/
typedef struct
{
TuplesortIndexArg index;
uint32 high_mask; /* masks for sortable part of hash code */
uint32 low_mask;
uint32 max_buckets;
} TuplesortIndexHashArg;
/*
* Data structure pointed to by "TuplesortPublic.arg" for the index_brin subcase.
*/
typedef struct
{
TuplesortIndexArg index;
/* XXX do we need something here? */
} TuplesortIndexBrinArg;
/*
* Data structure pointed to by "TuplesortPublic.arg" for the Datum case.
* Set by tuplesort_begin_datum and used only by the DatumTuple routines.
*/
typedef struct
{
/* the datatype OID of the Datums to be sorted */
Oid datumType;
/* we need typelen in order to know how to copy the Datums. */
int datumTypeLen;
} TuplesortDatumArg;
/*
* Computing the size of a BrinTuple from the tuple alone is difficult, so we
* track the length alongside the tuple referenced by the SortTuple. That's
* what BrinSortTuple is meant to do - it's essentially a BrinTuple prefixed
* by its length.
*/
typedef struct BrinSortTuple
{
Size tuplen;
BrinTuple tuple;
} BrinSortTuple;
/* Size of the BrinSortTuple, given length of the BrinTuple. */
#define BRINSORTTUPLE_SIZE(len) (offsetof(BrinSortTuple, tuple) + (len))
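/*
* For illustration, a caller allocates a BrinSortTuple via this macro, as
* tuplesort_putbrintuple() does below:
*
*	bstup = palloc(BRINSORTTUPLE_SIZE(size));
*/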
Tuplesortstate *
tuplesort_begin_heap(TupleDesc tupDesc,
int nkeys, AttrNumber *attNums,
Oid *sortOperators, Oid *sortCollations,
bool *nullsFirstFlags,
int workMem, SortCoordinate coordinate, int sortopt)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
sortopt);
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext;
int i;
oldcontext = MemoryContextSwitchTo(base->maincontext);
Assert(nkeys > 0);
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
nkeys, workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif
base->nKeys = nkeys;
TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
false, /* no unique check */
nkeys,
workMem,
sortopt & TUPLESORT_RANDOMACCESS,
PARALLEL_SORT(coordinate));
base->removeabbrev = removeabbrev_heap;
base->comparetup = comparetup_heap;
base->comparetup_tiebreak = comparetup_heap_tiebreak;
base->writetup = writetup_heap;
base->readtup = readtup_heap;
base->haveDatum1 = true;
base->arg = tupDesc; /* assume we need not copy tupDesc */
/* Prepare SortSupport data for each column */
base->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
for (i = 0; i < nkeys; i++)
{
SortSupport sortKey = base->sortKeys + i;
Assert(attNums[i] != 0);
Assert(sortOperators[i] != 0);
sortKey->ssup_cxt = CurrentMemoryContext;
sortKey->ssup_collation = sortCollations[i];
sortKey->ssup_nulls_first = nullsFirstFlags[i];
sortKey->ssup_attno = attNums[i];
/* Convey if abbreviation optimization is applicable in principle */
sortKey->abbreviate = (i == 0 && base->haveDatum1);
PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
}
/*
* The "onlyKey" optimization cannot be used with abbreviated keys, since
* tie-breaker comparisons may be required. Typically, the optimization
* is only of value to pass-by-value types anyway, whereas abbreviated
* keys are typically only of value to pass-by-reference types.
*/
if (nkeys == 1 && !base->sortKeys->abbrev_converter)
base->onlyKey = base->sortKeys;
MemoryContextSwitchTo(oldcontext);
return state;
}
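/*
* Illustrative sketch of the expected call sequence (the setup of tupDesc,
* the key arrays, and the slot is assumed to be done by the caller; this is
* not a prescription of any particular call site):
*
*	state = tuplesort_begin_heap(tupDesc, nkeys, attNums,
*								 sortOperators, sortCollations,
*								 nullsFirstFlags, work_mem,
*								 NULL, TUPLESORT_NONE);
*	while (... more input tuples ...)
*		tuplesort_puttupleslot(state, slot);
*	tuplesort_performsort(state);
*	while (tuplesort_gettupleslot(state, true, false, slot, NULL))
*		... consume tuples in sorted order ...
*	tuplesort_end(state);
*/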
Tuplesortstate *
tuplesort_begin_cluster(TupleDesc tupDesc,
Relation indexRel,
int workMem,
SortCoordinate coordinate, int sortopt)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
sortopt);
TuplesortPublic *base = TuplesortstateGetPublic(state);
BTScanInsert indexScanKey;
MemoryContext oldcontext;
TuplesortClusterArg *arg;
int i;
Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
oldcontext = MemoryContextSwitchTo(base->maincontext);
arg = (TuplesortClusterArg *) palloc0(sizeof(TuplesortClusterArg));
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
RelationGetNumberOfAttributes(indexRel),
workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif
base->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
false, /* no unique check */
base->nKeys,
workMem,
sortopt & TUPLESORT_RANDOMACCESS,
PARALLEL_SORT(coordinate));
base->removeabbrev = removeabbrev_cluster;
base->comparetup = comparetup_cluster;
base->comparetup_tiebreak = comparetup_cluster_tiebreak;
base->writetup = writetup_cluster;
base->readtup = readtup_cluster;
base->freestate = freestate_cluster;
base->arg = arg;
arg->indexInfo = BuildIndexInfo(indexRel);
/*
* If we don't have a simple leading attribute, we don't currently
* initialize datum1, so disable optimizations that require it.
*/
if (arg->indexInfo->ii_IndexAttrNumbers[0] == 0)
base->haveDatum1 = false;
else
base->haveDatum1 = true;
arg->tupDesc = tupDesc; /* assume we need not copy tupDesc */
indexScanKey = _bt_mkscankey(indexRel, NULL);
if (arg->indexInfo->ii_Expressions != NULL)
{
TupleTableSlot *slot;
ExprContext *econtext;
/*
* We will need to use FormIndexDatum to evaluate the index
* expressions. To do that, we need an EState, as well as a
* TupleTableSlot to put the table tuples into. The econtext's
* scantuple has to point to that slot, too.
*/
arg->estate = CreateExecutorState();
slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
econtext = GetPerTupleExprContext(arg->estate);
econtext->ecxt_scantuple = slot;
}
/* Prepare SortSupport data for each column */
base->sortKeys = (SortSupport) palloc0(base->nKeys *
sizeof(SortSupportData));
for (i = 0; i < base->nKeys; i++)
{
SortSupport sortKey = base->sortKeys + i;
ScanKey scanKey = indexScanKey->scankeys + i;
int16 strategy;
sortKey->ssup_cxt = CurrentMemoryContext;
sortKey->ssup_collation = scanKey->sk_collation;
sortKey->ssup_nulls_first =
(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
sortKey->ssup_attno = scanKey->sk_attno;
/* Convey if abbreviation optimization is applicable in principle */
sortKey->abbreviate = (i == 0 && base->haveDatum1);
Assert(sortKey->ssup_attno != 0);
strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
BTGreaterStrategyNumber : BTLessStrategyNumber;
PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
}
pfree(indexScanKey);
MemoryContextSwitchTo(oldcontext);
return state;
}
Tuplesortstate *
tuplesort_begin_index_btree(Relation heapRel,
Relation indexRel,
bool enforceUnique,
bool uniqueNullsNotDistinct,
int workMem,
SortCoordinate coordinate,
int sortopt)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
sortopt);
TuplesortPublic *base = TuplesortstateGetPublic(state);
BTScanInsert indexScanKey;
TuplesortIndexBTreeArg *arg;
MemoryContext oldcontext;
int i;
oldcontext = MemoryContextSwitchTo(base->maincontext);
arg = (TuplesortIndexBTreeArg *) palloc(sizeof(TuplesortIndexBTreeArg));
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin index sort: unique = %c, workMem = %d, randomAccess = %c",
enforceUnique ? 't' : 'f',
workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif
base->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
enforceUnique,
base->nKeys,
workMem,
sortopt & TUPLESORT_RANDOMACCESS,
PARALLEL_SORT(coordinate));
base->removeabbrev = removeabbrev_index;
base->comparetup = comparetup_index_btree;
base->comparetup_tiebreak = comparetup_index_btree_tiebreak;
base->writetup = writetup_index;
base->readtup = readtup_index;
base->haveDatum1 = true;
base->arg = arg;
arg->index.heapRel = heapRel;
arg->index.indexRel = indexRel;
arg->enforceUnique = enforceUnique;
arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct;
indexScanKey = _bt_mkscankey(indexRel, NULL);
/* Prepare SortSupport data for each column */
base->sortKeys = (SortSupport) palloc0(base->nKeys *
sizeof(SortSupportData));
for (i = 0; i < base->nKeys; i++)
{
SortSupport sortKey = base->sortKeys + i;
ScanKey scanKey = indexScanKey->scankeys + i;
int16 strategy;
sortKey->ssup_cxt = CurrentMemoryContext;
sortKey->ssup_collation = scanKey->sk_collation;
sortKey->ssup_nulls_first =
(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
sortKey->ssup_attno = scanKey->sk_attno;
/* Convey if abbreviation optimization is applicable in principle */
sortKey->abbreviate = (i == 0 && base->haveDatum1);
Assert(sortKey->ssup_attno != 0);
strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
BTGreaterStrategyNumber : BTLessStrategyNumber;
PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
}
pfree(indexScanKey);
MemoryContextSwitchTo(oldcontext);
return state;
}
Tuplesortstate *
tuplesort_begin_index_hash(Relation heapRel,
Relation indexRel,
uint32 high_mask,
uint32 low_mask,
uint32 max_buckets,
int workMem,
SortCoordinate coordinate,
int sortopt)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
sortopt);
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext;
TuplesortIndexHashArg *arg;
oldcontext = MemoryContextSwitchTo(base->maincontext);
arg = (TuplesortIndexHashArg *) palloc(sizeof(TuplesortIndexHashArg));
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
"max_buckets = 0x%x, workMem = %d, randomAccess = %c",
high_mask,
low_mask,
max_buckets,
workMem,
sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif
base->nKeys = 1; /* Only one sort column, the hash code */
base->removeabbrev = removeabbrev_index;
base->comparetup = comparetup_index_hash;
base->comparetup_tiebreak = comparetup_index_hash_tiebreak;
base->writetup = writetup_index;
base->readtup = readtup_index;
base->haveDatum1 = true;
base->arg = arg;
arg->index.heapRel = heapRel;
arg->index.indexRel = indexRel;
arg->high_mask = high_mask;
arg->low_mask = low_mask;
arg->max_buckets = max_buckets;
MemoryContextSwitchTo(oldcontext);
return state;
}
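/*
* The masks and max_buckets saved here are consumed by the hash comparison
* routines below, which reduce each tuple's hash key to its bucket number
* (via _hash_hashkey2bucket()), so the sort effectively orders index tuples
* by bucket.
*/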
Tuplesortstate *
tuplesort_begin_index_gist(Relation heapRel,
Relation indexRel,
int workMem,
SortCoordinate coordinate,
int sortopt)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
sortopt);
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext;
TuplesortIndexBTreeArg *arg;
int i;
oldcontext = MemoryContextSwitchTo(base->maincontext);
arg = (TuplesortIndexBTreeArg *) palloc(sizeof(TuplesortIndexBTreeArg));
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin index sort: workMem = %d, randomAccess = %c",
workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif
base->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
base->removeabbrev = removeabbrev_index;
base->comparetup = comparetup_index_btree;
base->comparetup_tiebreak = comparetup_index_btree_tiebreak;
base->writetup = writetup_index;
base->readtup = readtup_index;
base->haveDatum1 = true;
base->arg = arg;
arg->index.heapRel = heapRel;
arg->index.indexRel = indexRel;
arg->enforceUnique = false;
arg->uniqueNullsNotDistinct = false;
/* Prepare SortSupport data for each column */
base->sortKeys = (SortSupport) palloc0(base->nKeys *
sizeof(SortSupportData));
for (i = 0; i < base->nKeys; i++)
{
SortSupport sortKey = base->sortKeys + i;
sortKey->ssup_cxt = CurrentMemoryContext;
sortKey->ssup_collation = indexRel->rd_indcollation[i];
sortKey->ssup_nulls_first = false;
sortKey->ssup_attno = i + 1;
/* Convey if abbreviation optimization is applicable in principle */
sortKey->abbreviate = (i == 0 && base->haveDatum1);
Assert(sortKey->ssup_attno != 0);
/* Look for a sort support function */
PrepareSortSupportFromGistIndexRel(indexRel, sortKey);
}
MemoryContextSwitchTo(oldcontext);
return state;
}
Tuplesortstate *
tuplesort_begin_index_brin(Relation heapRel,
Relation indexRel,
int workMem,
SortCoordinate coordinate,
int sortopt)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
sortopt);
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext;
TuplesortIndexBrinArg *arg;
oldcontext = MemoryContextSwitchTo(base->maincontext);
arg = (TuplesortIndexBrinArg *) palloc(sizeof(TuplesortIndexBrinArg));
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin index sort: workMem = %d, randomAccess = %c",
workMem,
sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif
base->nKeys = 1; /* Only one sort column, the block number */
base->removeabbrev = removeabbrev_index_brin;
base->comparetup = comparetup_index_brin;
base->writetup = writetup_index_brin;
base->readtup = readtup_index_brin;
base->haveDatum1 = true;
base->arg = arg;
arg->index.heapRel = heapRel;
arg->index.indexRel = indexRel;
MemoryContextSwitchTo(oldcontext);
return state;
}
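/*
* In a parallel BRIN index build, each worker feeds the range summaries it
* produces into this sort with tuplesort_putbrintuple(), and the leader
* reads them back in block-number order with tuplesort_getbrintuple(),
* merging summaries that describe the same page range before inserting
* them into the index.
*/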
Tuplesortstate *
tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
bool nullsFirstFlag, int workMem,
SortCoordinate coordinate, int sortopt)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
sortopt);
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortDatumArg *arg;
MemoryContext oldcontext;
int16 typlen;
bool typbyval;
oldcontext = MemoryContextSwitchTo(base->maincontext);
arg = (TuplesortDatumArg *) palloc(sizeof(TuplesortDatumArg));
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin datum sort: workMem = %d, randomAccess = %c",
workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif
base->nKeys = 1; /* always a one-column sort */
TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
false, /* no unique check */
1,
workMem,
sortopt & TUPLESORT_RANDOMACCESS,
PARALLEL_SORT(coordinate));
base->removeabbrev = removeabbrev_datum;
base->comparetup = comparetup_datum;
base->comparetup_tiebreak = comparetup_datum_tiebreak;
base->writetup = writetup_datum;
base->readtup = readtup_datum;
base->haveDatum1 = true;
base->arg = arg;
arg->datumType = datumType;
/* lookup necessary attributes of the datum type */
get_typlenbyval(datumType, &typlen, &typbyval);
arg->datumTypeLen = typlen;
base->tuples = !typbyval;
/* Prepare SortSupport data */
base->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
base->sortKeys->ssup_cxt = CurrentMemoryContext;
base->sortKeys->ssup_collation = sortCollation;
base->sortKeys->ssup_nulls_first = nullsFirstFlag;
/*
* Abbreviation is possible here only for by-reference types. In theory,
* a pass-by-value datatype could have an abbreviated form that is cheaper
* to compare. In a tuple sort, we could support that, because we can
* always extract the original datum from the tuple as needed. Here, we
* can't, because a datum sort only stores a single copy of the datum; the
* "tuple" field of each SortTuple is NULL.
*/
base->sortKeys->abbreviate = !typbyval;
PrepareSortSupportFromOrderingOp(sortOperator, base->sortKeys);
/*
* The "onlyKey" optimization cannot be used with abbreviated keys, since
* tie-breaker comparisons may be required. Typically, the optimization
* is only of value to pass-by-value types anyway, whereas abbreviated
* keys are typically only of value to pass-by-reference types.
*/
if (!base->sortKeys->abbrev_converter)
base->onlyKey = base->sortKeys;
MemoryContextSwitchTo(oldcontext);
return state;
}
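/*
* Illustrative sketch of a single-Datum sort (the "<" operator OID variable
* int4_lt_opr and the work_mem value are assumptions of the example, not
* requirements of this API):
*
*	state = tuplesort_begin_datum(INT4OID, int4_lt_opr, InvalidOid,
*								  false, work_mem, NULL, TUPLESORT_NONE);
*	tuplesort_putdatum(state, Int32GetDatum(42), false);
*	...
*	tuplesort_performsort(state);
*	while (tuplesort_getdatum(state, true, true, &val, &isnull, NULL))
*		... consume values in sorted order ...
*	tuplesort_end(state);
*/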
/*
* Accept one tuple while collecting input data for sort.
*
* Note that the input data is always copied; the caller need not save it.
*/
void
tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
TupleDesc tupDesc = (TupleDesc) base->arg;
SortTuple stup;
MinimalTuple tuple;
HeapTupleData htup;
/* copy the tuple into sort storage */
tuple = ExecCopySlotMinimalTuple(slot);
stup.tuple = (void *) tuple;
/* set up first-column key value */
htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
stup.datum1 = heap_getattr(&htup,
base->sortKeys[0].ssup_attno,
tupDesc,
&stup.isnull1);
tuplesort_puttuple_common(state, &stup,
base->sortKeys->abbrev_converter &&
!stup.isnull1);
MemoryContextSwitchTo(oldcontext);
}
/*
* Accept one tuple while collecting input data for sort.
*
* Note that the input data is always copied; the caller need not save it.
*/
void
tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
{
SortTuple stup;
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg;
/* copy the tuple into sort storage */
tup = heap_copytuple(tup);
stup.tuple = (void *) tup;
/*
* set up first-column key value, and potentially abbreviate, if it's a
* simple column
*/
if (base->haveDatum1)
{
stup.datum1 = heap_getattr(tup,
arg->indexInfo->ii_IndexAttrNumbers[0],
arg->tupDesc,
&stup.isnull1);
}
tuplesort_puttuple_common(state, &stup,
base->haveDatum1 &&
base->sortKeys->abbrev_converter &&
!stup.isnull1);
MemoryContextSwitchTo(oldcontext);
}
/*
* Collect one index tuple while collecting input data for sort, building
* it from caller-supplied values.
*/
void
tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
ItemPointer self, const Datum *values,
const bool *isnull)
{
SortTuple stup;
IndexTuple tuple;
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortIndexArg *arg = (TuplesortIndexArg *) base->arg;
stup.tuple = index_form_tuple_context(RelationGetDescr(rel), values,
isnull, base->tuplecontext);
tuple = ((IndexTuple) stup.tuple);
tuple->t_tid = *self;
/* set up first-column key value */
stup.datum1 = index_getattr(tuple,
1,
RelationGetDescr(arg->indexRel),
&stup.isnull1);
tuplesort_puttuple_common(state, &stup,
base->sortKeys &&
base->sortKeys->abbrev_converter &&
!stup.isnull1);
}
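/*
* Note on tuplesort_putindextuplevalues(): unlike the other puttuple
* routines above, it needs no explicit MemoryContextSwitchTo, because
* index_form_tuple_context() builds the IndexTuple directly in
* base->tuplecontext.
*/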
/*
* Collect one BRIN tuple while collecting input data for sort.
*/
void
tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
{
SortTuple stup;
BrinSortTuple *bstup;
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
/* allocate space for the whole BRIN sort tuple */
bstup = palloc(BRINSORTTUPLE_SIZE(size));
bstup->tuplen = size;
memcpy(&bstup->tuple, tuple, size);
stup.tuple = bstup;
stup.datum1 = tuple->bt_blkno;
stup.isnull1 = false;
tuplesort_puttuple_common(state, &stup,
base->sortKeys &&
base->sortKeys->abbrev_converter &&
!stup.isnull1);
MemoryContextSwitchTo(oldcontext);
}
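/*
* Note on tuplesort_putbrintuple(): datum1 is set to the range's start
* block number (bt_blkno), which is the single sort key compared by
* comparetup_index_brin().
*/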
/*
* Accept one Datum while collecting input data for sort.
*
* If the Datum is of a pass-by-ref type, the value will be copied.
*/
void
tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
TuplesortDatumArg *arg = (TuplesortDatumArg *) base->arg;
SortTuple stup;
/*
* Pass-by-value types or null values are just stored directly in
* stup.datum1 (and stup.tuple is not used and set to NULL).
*
* Non-null pass-by-reference values need to be copied into memory we
* control, and possibly abbreviated. The copied value is pointed to by
* stup.tuple and is treated as the canonical copy (e.g. to return via
* tuplesort_getdatum or when writing to tape); stup.datum1 gets the
* abbreviated value if abbreviation is happening, otherwise it's
* identical to stup.tuple.
*/
if (isNull || !base->tuples)
{
/*
* Set datum1 to zeroed representation for NULLs (to be consistent,
* and to support cheap inequality tests for NULL abbreviated keys).
*/
stup.datum1 = !isNull ? val : (Datum) 0;
stup.isnull1 = isNull;
stup.tuple = NULL; /* no separate storage */
}
else
{
stup.isnull1 = false;
stup.datum1 = datumCopy(val, false, arg->datumTypeLen);
stup.tuple = DatumGetPointer(stup.datum1);
}
tuplesort_puttuple_common(state, &stup,
base->tuples &&
base->sortKeys->abbrev_converter && !isNull);
MemoryContextSwitchTo(oldcontext);
}
/*
* Fetch the next tuple in either forward or back direction.
* If successful, put tuple in slot and return true; else, clear the slot
* and return false.
*
* Caller may optionally be passed back abbreviated value (on true return
* value) when abbreviation was used, which can be used to cheaply avoid
* equality checks that might otherwise be required. Caller can safely make a
* determination of "non-equal tuple" based on simple binary inequality. A
* NULL value in leading attribute will set abbreviated value to zeroed
* representation, which caller may rely on in abbreviated inequality check.
*
* If copy is true, the slot receives a tuple that's been copied into the
* caller's memory context, so that it will stay valid regardless of future
* manipulations of the tuplesort's state (up to and including deleting the
* tuplesort). If copy is false, the slot will just receive a pointer to a
* tuple held within the tuplesort, which is more efficient, but only safe for
* callers that are prepared to have any subsequent manipulation of the
* tuplesort's state invalidate slot contents.
*/
bool
tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
TupleTableSlot *slot, Datum *abbrev)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
SortTuple stup;
if (!tuplesort_gettuple_common(state, forward, &stup))
stup.tuple = NULL;
MemoryContextSwitchTo(oldcontext);
if (stup.tuple)
{
/* Record abbreviated key for caller */
if (base->sortKeys->abbrev_converter && abbrev)
*abbrev = stup.datum1;
if (copy)
stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
return true;
}
else
{
ExecClearTuple(slot);
return false;
}
}
/*
* Fetch the next tuple in either forward or back direction.
* Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
* context, and must not be freed by caller. Caller may not rely on tuple
* remaining valid after any further manipulation of tuplesort.
*/
HeapTuple
tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
SortTuple stup;
if (!tuplesort_gettuple_common(state, forward, &stup))
stup.tuple = NULL;
MemoryContextSwitchTo(oldcontext);
return stup.tuple;
}
/*
* Fetch the next index tuple in either forward or back direction.
* Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
* context, and must not be freed by caller. Caller may not rely on tuple
* remaining valid after any further manipulation of tuplesort.
*/
IndexTuple
tuplesort_getindextuple(Tuplesortstate *state, bool forward)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
SortTuple stup;
if (!tuplesort_gettuple_common(state, forward, &stup))
stup.tuple = NULL;
MemoryContextSwitchTo(oldcontext);
return (IndexTuple) stup.tuple;
}
/*
* Fetch the next BRIN tuple in either forward or back direction.
* Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
* context, and must not be freed by caller. Caller may not rely on tuple
* remaining valid after any further manipulation of tuplesort.
*/
BrinTuple *
tuplesort_getbrintuple(Tuplesortstate *state, Size *len, bool forward)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
SortTuple stup;
BrinSortTuple *btup;
if (!tuplesort_gettuple_common(state, forward, &stup))
stup.tuple = NULL;
MemoryContextSwitchTo(oldcontext);
if (!stup.tuple)
return NULL;
btup = (BrinSortTuple *) stup.tuple;
*len = btup->tuplen;
return &btup->tuple;
}
/*
* Fetch the next Datum in either forward or back direction.
* Returns false if no more datums.
*
* If the Datum is of a pass-by-ref type, the returned value is freshly
* palloc'd in the caller's context, and is now owned by the caller (this
* differs from
* similar routines for other types of tuplesorts).
*
* Caller may optionally be passed back abbreviated value (on true return
* value) when abbreviation was used, which can be used to cheaply avoid
* equality checks that might otherwise be required. Caller can safely make a
* determination of "non-equal tuple" based on simple binary inequality. A
* NULL value will have a zeroed abbreviated value representation, which caller
* may rely on in abbreviated inequality check.
*
* For byref Datums, if copy is true, *val is set to a copy of the Datum
* copied into the caller's memory context, so that it will stay valid
* regardless of future manipulations of the tuplesort's state (up to and
* including deleting the tuplesort). If copy is false, *val will just be
* set to a pointer to the Datum held within the tuplesort, which is more
* efficient, but only safe for callers that are prepared to have any
* subsequent manipulation of the tuplesort's state invalidate slot contents.
* For byval Datums, the value of the 'copy' parameter has no effect.
*/
bool
tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy,
Datum *val, bool *isNull, Datum *abbrev)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
TuplesortDatumArg *arg = (TuplesortDatumArg *) base->arg;
SortTuple stup;
if (!tuplesort_gettuple_common(state, forward, &stup))
{
MemoryContextSwitchTo(oldcontext);
return false;
}
/* Ensure we copy into caller's memory context */
MemoryContextSwitchTo(oldcontext);
/* Record abbreviated key for caller */
if (base->sortKeys->abbrev_converter && abbrev)
*abbrev = stup.datum1;
if (stup.isnull1 || !base->tuples)
{
*val = stup.datum1;
*isNull = stup.isnull1;
}
else
{
/* use stup.tuple because stup.datum1 may be an abbreviation */
if (copy)
*val = datumCopy(PointerGetDatum(stup.tuple), false,
arg->datumTypeLen);
else
*val = PointerGetDatum(stup.tuple);
*isNull = false;
}
return true;
}
/*
* Routines specialized for HeapTuple (actually MinimalTuple) case
*/
static void
removeabbrev_heap(Tuplesortstate *state, SortTuple *stups, int count)
{
int i;
TuplesortPublic *base = TuplesortstateGetPublic(state);
for (i = 0; i < count; i++)
{
HeapTupleData htup;
htup.t_len = ((MinimalTuple) stups[i].tuple)->t_len +
MINIMAL_TUPLE_OFFSET;
htup.t_data = (HeapTupleHeader) ((char *) stups[i].tuple -
MINIMAL_TUPLE_OFFSET);
stups[i].datum1 = heap_getattr(&htup,
base->sortKeys[0].ssup_attno,
(TupleDesc) base->arg,
&stups[i].isnull1);
}
}
static int
comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
SortSupport sortKey = base->sortKeys;
int32 compare;
/* Compare the leading sort key */
compare = ApplySortComparator(a->datum1, a->isnull1,
b->datum1, b->isnull1,
sortKey);
if (compare != 0)
return compare;
/* Compare additional sort keys */
return comparetup_heap_tiebreak(a, b, state);
}
static int
comparetup_heap_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
SortSupport sortKey = base->sortKeys;
HeapTupleData ltup;
HeapTupleData rtup;
TupleDesc tupDesc;
int nkey;
int32 compare;
AttrNumber attno;
Datum datum1,
datum2;
bool isnull1,
isnull2;
ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
tupDesc = (TupleDesc) base->arg;
if (sortKey->abbrev_converter)
{
attno = sortKey->ssup_attno;
datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
compare = ApplySortAbbrevFullComparator(datum1, isnull1,
datum2, isnull2,
sortKey);
if (compare != 0)
return compare;
}
sortKey++;
for (nkey = 1; nkey < base->nKeys; nkey++, sortKey++)
{
attno = sortKey->ssup_attno;
datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
compare = ApplySortComparator(datum1, isnull1,
datum2, isnull2,
sortKey);
if (compare != 0)
return compare;
}
return 0;
}
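/*
* Note on the abbreviated-key handling above: datum1 may hold only an
* abbreviation of the leading key, so when the abbreviated comparison in
* comparetup_heap() ties, the tiebreak re-compares the first key with the
* authoritative (full) comparator before moving on to the remaining keys.
*/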
static void
writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
MinimalTuple tuple = (MinimalTuple) stup->tuple;
/* the part of the MinimalTuple we'll write: */
char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
/* total on-disk footprint: */
unsigned int tuplen = tupbodylen + sizeof(int);
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
LogicalTapeWrite(tape, tupbody, tupbodylen);
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
}
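/*
* Tape record layout used by writetup_heap/readtup_heap: an unsigned int
* length word that counts itself plus the tuple body, then the MinimalTuple
* body with its first MINIMAL_TUPLE_DATA_OFFSET bytes omitted, then (for
* TUPLESORT_RANDOMACCESS sorts) a trailing copy of the length word so the
* tape can also be read backwards.
*/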
static void
readtup_heap(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int len)
{
unsigned int tupbodylen = len - sizeof(int);
unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
MinimalTuple tuple = (MinimalTuple) tuplesort_readtup_alloc(state, tuplen);
char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
TuplesortPublic *base = TuplesortstateGetPublic(state);
HeapTupleData htup;
/* read in the tuple proper */
tuple->t_len = tuplen;
LogicalTapeReadExact(tape, tupbody, tupbodylen);
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
stup->tuple = (void *) tuple;
/* set up first-column key value */
htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
stup->datum1 = heap_getattr(&htup,
base->sortKeys[0].ssup_attno,
(TupleDesc) base->arg,
&stup->isnull1);
}
/*
* Routines specialized for the CLUSTER case (HeapTuple data, with
* comparisons per a btree index definition)
*/
static void
removeabbrev_cluster(Tuplesortstate *state, SortTuple *stups, int count)
{
int i;
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg;
for (i = 0; i < count; i++)
{
HeapTuple tup;
tup = (HeapTuple) stups[i].tuple;
stups[i].datum1 = heap_getattr(tup,
arg->indexInfo->ii_IndexAttrNumbers[0],
arg->tupDesc,
&stups[i].isnull1);
}
}
static int
comparetup_cluster(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
SortSupport sortKey = base->sortKeys;
int32 compare;
/* Compare the leading sort key, if it's simple */
if (base->haveDatum1)
{
compare = ApplySortComparator(a->datum1, a->isnull1,
b->datum1, b->isnull1,
sortKey);
if (compare != 0)
return compare;
}
return comparetup_cluster_tiebreak(a, b, state);
}
static int
comparetup_cluster_tiebreak(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg;
SortSupport sortKey = base->sortKeys;
HeapTuple ltup;
HeapTuple rtup;
TupleDesc tupDesc;
int nkey;
int32 compare = 0;
Datum datum1,
datum2;
bool isnull1,
isnull2;
ltup = (HeapTuple) a->tuple;
rtup = (HeapTuple) b->tuple;
tupDesc = arg->tupDesc;
/* Compare the leading sort key, if it's simple */
if (base->haveDatum1)
{
if (sortKey->abbrev_converter)
{
AttrNumber leading = arg->indexInfo->ii_IndexAttrNumbers[0];
datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
compare = ApplySortAbbrevFullComparator(datum1, isnull1,
datum2, isnull2,
sortKey);
}
if (compare != 0 || base->nKeys == 1)
return compare;
/* Compare additional columns the hard way */
sortKey++;
nkey = 1;
}
else
{
/* Must compare all keys the hard way */
nkey = 0;
}
if (arg->indexInfo->ii_Expressions == NULL)
{
/* If not expression index, just compare the proper heap attrs */
for (; nkey < base->nKeys; nkey++, sortKey++)
{
AttrNumber attno = arg->indexInfo->ii_IndexAttrNumbers[nkey];
datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
compare = ApplySortComparator(datum1, isnull1,
datum2, isnull2,
sortKey);
if (compare != 0)
return compare;
}
}
else
{
/*
* In the expression index case, compute the whole index tuple and
* then compare values. It would perhaps be faster to compute only as
* many columns as we need to compare, but that would require
* duplicating all the logic in FormIndexDatum.
*/
Datum l_index_values[INDEX_MAX_KEYS];
bool l_index_isnull[INDEX_MAX_KEYS];
Datum r_index_values[INDEX_MAX_KEYS];
bool r_index_isnull[INDEX_MAX_KEYS];
TupleTableSlot *ecxt_scantuple;
/* Reset context each time to prevent memory leakage */
ResetPerTupleExprContext(arg->estate);
ecxt_scantuple = GetPerTupleExprContext(arg->estate)->ecxt_scantuple;
ExecStoreHeapTuple(ltup, ecxt_scantuple, false);
FormIndexDatum(arg->indexInfo, ecxt_scantuple, arg->estate,
l_index_values, l_index_isnull);
ExecStoreHeapTuple(rtup, ecxt_scantuple, false);
FormIndexDatum(arg->indexInfo, ecxt_scantuple, arg->estate,
r_index_values, r_index_isnull);
for (; nkey < base->nKeys; nkey++, sortKey++)
{
compare = ApplySortComparator(l_index_values[nkey],
l_index_isnull[nkey],
r_index_values[nkey],
r_index_isnull[nkey],
sortKey);
if (compare != 0)
return compare;
}
}
return 0;
}
static void
writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
HeapTuple tuple = (HeapTuple) stup->tuple;
unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
/* We need to store t_self, but not other fields of HeapTupleData */
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
LogicalTapeWrite(tape, &tuple->t_self, sizeof(ItemPointerData));
LogicalTapeWrite(tape, tuple->t_data, tuple->t_len);
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
}
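/*
* Tape record layout used by writetup_cluster/readtup_cluster: a length
* word counting itself, the ItemPointerData and the tuple body; then
* t_self; then the HeapTuple data; then (for TUPLESORT_RANDOMACCESS) a
* trailing copy of the length word.
*/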
static void
readtup_cluster(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int tuplen)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg;
unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
HeapTuple tuple = (HeapTuple) tuplesort_readtup_alloc(state,
t_len + HEAPTUPLESIZE);
/* Reconstruct the HeapTupleData header */
tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
tuple->t_len = t_len;
LogicalTapeReadExact(tape, &tuple->t_self, sizeof(ItemPointerData));
/* We don't currently bother to reconstruct t_tableOid */
tuple->t_tableOid = InvalidOid;
/* Read in the tuple body */
LogicalTapeReadExact(tape, tuple->t_data, tuple->t_len);
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
stup->tuple = (void *) tuple;
/* set up first-column key value, if it's a simple column */
if (base->haveDatum1)
stup->datum1 = heap_getattr(tuple,
arg->indexInfo->ii_IndexAttrNumbers[0],
arg->tupDesc,
&stup->isnull1);
}
static void
freestate_cluster(Tuplesortstate *state)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg;
/* Free any execution state created for CLUSTER case */
if (arg->estate != NULL)
{
ExprContext *econtext = GetPerTupleExprContext(arg->estate);
ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
FreeExecutorState(arg->estate);
}
}
/*
* Routines specialized for IndexTuple case
*
* The btree and hash cases require separate comparison functions, but the
* IndexTuple representation is the same so the copy/write/read support
* functions can be shared.
*/
static void
removeabbrev_index(Tuplesortstate *state, SortTuple *stups, int count)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortIndexArg *arg = (TuplesortIndexArg *) base->arg;
int i;
for (i = 0; i < count; i++)
{
IndexTuple tuple;
tuple = stups[i].tuple;
stups[i].datum1 = index_getattr(tuple,
1,
RelationGetDescr(arg->indexRel),
&stups[i].isnull1);
}
}
static int
comparetup_index_btree(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
/*
* This is similar to comparetup_heap(), but expects index tuples. There
* is also special handling for enforcing uniqueness, and special
* treatment for equal keys at the end.
*/
TuplesortPublic *base = TuplesortstateGetPublic(state);
SortSupport sortKey = base->sortKeys;
int32 compare;
/* Compare the leading sort key */
compare = ApplySortComparator(a->datum1, a->isnull1,
b->datum1, b->isnull1,
sortKey);
if (compare != 0)
return compare;
/* Compare additional sort keys */
return comparetup_index_btree_tiebreak(a, b, state);
}
static int
comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortIndexBTreeArg *arg = (TuplesortIndexBTreeArg *) base->arg;
SortSupport sortKey = base->sortKeys;
IndexTuple tuple1;
IndexTuple tuple2;
int keysz;
TupleDesc tupDes;
bool equal_hasnull = false;
int nkey;
int32 compare;
Datum datum1,
datum2;
bool isnull1,
isnull2;
tuple1 = (IndexTuple) a->tuple;
tuple2 = (IndexTuple) b->tuple;
keysz = base->nKeys;
tupDes = RelationGetDescr(arg->index.indexRel);
if (sortKey->abbrev_converter)
{
datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
compare = ApplySortAbbrevFullComparator(datum1, isnull1,
datum2, isnull2,
sortKey);
if (compare != 0)
return compare;
}
/* they are equal, so we only need to examine one null flag */
if (a->isnull1)
equal_hasnull = true;
sortKey++;
for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
{
datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
compare = ApplySortComparator(datum1, isnull1,
datum2, isnull2,
sortKey);
if (compare != 0)
return compare; /* done when we find unequal attributes */
/* they are equal, so we only need to examine one null flag */
if (isnull1)
equal_hasnull = true;
}
/*
* If btree has asked us to enforce uniqueness, complain if two equal
* tuples are detected (unless there was at least one NULL field and NULLS
* NOT DISTINCT was not set).
*
* It is sufficient to make the test here, because if two tuples are equal
* they *must* get compared at some stage of the sort --- otherwise the
* sort algorithm wouldn't have checked whether one must appear before the
* other.
*/
if (arg->enforceUnique && !(!arg->uniqueNullsNotDistinct && equal_hasnull))
{
Datum values[INDEX_MAX_KEYS];
bool isnull[INDEX_MAX_KEYS];
char *key_desc;
/*
* Some rather brain-dead implementations of qsort (such as the one in
* QNX 4) will sometimes call the comparison routine to compare a
* value to itself, but we always use our own implementation, which
* does not.
*/
Assert(tuple1 != tuple2);
index_deform_tuple(tuple1, tupDes, values, isnull);
key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull);
ereport(ERROR,
(errcode(ERRCODE_UNIQUE_VIOLATION),
errmsg("could not create unique index \"%s\"",
RelationGetRelationName(arg->index.indexRel)),
key_desc ? errdetail("Key %s is duplicated.", key_desc) :
errdetail("Duplicate keys exist."),
errtableconstraint(arg->index.heapRel,
RelationGetRelationName(arg->index.indexRel))));
}
/*
* If key values are equal, we sort on ItemPointer. This is required for
* btree indexes, since heap TID is treated as an implicit last key
* attribute in order to ensure that all keys in the index are physically
* unique.
*/
{
BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
if (blk1 != blk2)
return (blk1 < blk2) ? -1 : 1;
}
{
OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
if (pos1 != pos2)
return (pos1 < pos2) ? -1 : 1;
}
/* ItemPointer values should never be equal */
Assert(false);
return 0;
}
static int
comparetup_index_hash(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
Bucket bucket1;
Bucket bucket2;
uint32 hash1;
uint32 hash2;
IndexTuple tuple1;
IndexTuple tuple2;
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortIndexHashArg *arg = (TuplesortIndexHashArg *) base->arg;
/*
* Fetch hash keys and mask off bits we don't want to sort by, so that the
* initial sort is just on the bucket number. We know that the first
* column of the index tuple is the hash key.
*/
Assert(!a->isnull1);
bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
arg->max_buckets, arg->high_mask,
arg->low_mask);
Assert(!b->isnull1);
bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
arg->max_buckets, arg->high_mask,
arg->low_mask);
if (bucket1 > bucket2)
return 1;
else if (bucket1 < bucket2)
return -1;
/*
* If bucket values are equal, sort by hash values. This allows us to
* insert directly onto bucket/overflow pages, where the index tuples are
* stored in hash order to allow fast binary search within each page.
*/
hash1 = DatumGetUInt32(a->datum1);
hash2 = DatumGetUInt32(b->datum1);
if (hash1 > hash2)
return 1;
else if (hash1 < hash2)
return -1;
/*
* If hash values are equal, we sort on ItemPointer. This does not affect
* validity of the finished index, but it may be useful to have index
* scans in physical order.
*/
tuple1 = (IndexTuple) a->tuple;
tuple2 = (IndexTuple) b->tuple;
{
BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
if (blk1 != blk2)
return (blk1 < blk2) ? -1 : 1;
}
{
OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
if (pos1 != pos2)
return (pos1 < pos2) ? -1 : 1;
}
/* ItemPointer values should never be equal */
Assert(false);
return 0;
}
/*
* Sorting for hash indexes only uses one sort key, so this shouldn't ever be
* called. It's only here for consistency.
*/
static int
comparetup_index_hash_tiebreak(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
Assert(false);
return 0;
}
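/*
 * On-tape format shared by the btree and hash index cases: an unsigned int
 * length word (counting itself plus the index tuple), the IndexTuple
 * verbatim, and, for TUPLESORT_RANDOMACCESS sorts, a trailing copy of the
 * length word so the run can also be read backwards.
 */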
static void
writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
IndexTuple tuple = (IndexTuple) stup->tuple;
unsigned int tuplen;
tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
LogicalTapeWrite(tape, tuple, IndexTupleSize(tuple));
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
}
static void
readtup_index(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int len)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortIndexArg *arg = (TuplesortIndexArg *) base->arg;
unsigned int tuplen = len - sizeof(unsigned int);
IndexTuple tuple = (IndexTuple) tuplesort_readtup_alloc(state, tuplen);
LogicalTapeReadExact(tape, tuple, tuplen);
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
stup->tuple = (void *) tuple;
/* set up first-column key value */
stup->datum1 = index_getattr(tuple,
1,
RelationGetDescr(arg->indexRel),
&stup->isnull1);
}
}

/*
* Routines specialized for BrinTuple case
*/
static void
removeabbrev_index_brin(Tuplesortstate *state, SortTuple *stups, int count)
{
int i;
for (i = 0; i < count; i++)
{
BrinSortTuple *tuple;
tuple = stups[i].tuple;
stups[i].datum1 = tuple->tuple.bt_blkno;
}
}
static int
comparetup_index_brin(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
Assert(TuplesortstateGetPublic(state)->haveDatum1);
if (DatumGetUInt32(a->datum1) > DatumGetUInt32(b->datum1))
return 1;
if (DatumGetUInt32(a->datum1) < DatumGetUInt32(b->datum1))
return -1;
/* silence compilers */
return 0;
}
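/*
 * Only the BrinTuple itself goes to tape, preceded by a length word that
 * counts itself plus the tuple; the in-memory BrinSortTuple wrapper (its
 * tuplen field) is recreated by readtup_index_brin() from that length word.
 */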
static void
writetup_index_brin(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
BrinSortTuple *tuple = (BrinSortTuple *) stup->tuple;
unsigned int tuplen = tuple->tuplen;
tuplen = tuplen + sizeof(tuplen);
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
LogicalTapeWrite(tape, &tuple->tuple, tuple->tuplen);
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
}
static void
readtup_index_brin(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int len)
{
BrinSortTuple *tuple;
TuplesortPublic *base = TuplesortstateGetPublic(state);
unsigned int tuplen = len - sizeof(unsigned int);
/*
* Allocate space for the BRIN sort tuple, which is BrinTuple with an
* extra length field.
*/
tuple = (BrinSortTuple *) tuplesort_readtup_alloc(state,
BRINSORTTUPLE_SIZE(tuplen));
tuple->tuplen = tuplen;
LogicalTapeReadExact(tape, &tuple->tuple, tuplen);
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
stup->tuple = (void *) tuple;
/* set up first-column key value, which is block number */
stup->datum1 = tuple->tuple.bt_blkno;
}

/*
* Routines specialized for DatumTuple case
*/
static void
removeabbrev_datum(Tuplesortstate *state, SortTuple *stups, int count)
{
int i;
for (i = 0; i < count; i++)
stups[i].datum1 = PointerGetDatum(stups[i].tuple);
}
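/*
 * Compare on the single sort key; if the datum1 values compare equal,
 * comparetup_datum_tiebreak() re-checks with the full comparator when
 * abbreviated keys are in use.
 */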
static int
comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
int compare;
compare = ApplySortComparator(a->datum1, a->isnull1,
b->datum1, b->isnull1,
base->sortKeys);
if (compare != 0)
return compare;
return comparetup_datum_tiebreak(a, b, state);
}
static int
comparetup_datum_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
int32 compare = 0;
/* if we have abbreviations, then "tuple" has the original value */
if (base->sortKeys->abbrev_converter)
compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
PointerGetDatum(b->tuple), b->isnull1,
base->sortKeys);
return compare;
}
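/*
 * On-tape format for the Datum case: a length word followed by the payload,
 * which is empty for a NULL, the Datum itself for pass-by-value types, or
 * the full datum contents for pass-by-reference types.  readtup_datum()
 * relies on the zero-length payload to recognize NULLs.
 */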
static void
writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
TuplesortDatumArg *arg = (TuplesortDatumArg *) base->arg;
void *waddr;
unsigned int tuplen;
unsigned int writtenlen;
if (stup->isnull1)
{
waddr = NULL;
tuplen = 0;
}
else if (!base->tuples)
{
waddr = &stup->datum1;
tuplen = sizeof(Datum);
}
else
{
waddr = stup->tuple;
tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, arg->datumTypeLen);
Assert(tuplen != 0);
}
writtenlen = tuplen + sizeof(unsigned int);
LogicalTapeWrite(tape, &writtenlen, sizeof(writtenlen));
LogicalTapeWrite(tape, waddr, tuplen);
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeWrite(tape, &writtenlen, sizeof(writtenlen));
}
static void
readtup_datum(Tuplesortstate *state, SortTuple *stup,
LogicalTape *tape, unsigned int len)
{
TuplesortPublic *base = TuplesortstateGetPublic(state);
unsigned int tuplen = len - sizeof(unsigned int);
if (tuplen == 0)
{
/* it's NULL */
stup->datum1 = (Datum) 0;
stup->isnull1 = true;
stup->tuple = NULL;
}
else if (!base->tuples)
{
Assert(tuplen == sizeof(Datum));
LogicalTapeReadExact(tape, &stup->datum1, tuplen);
stup->isnull1 = false;
stup->tuple = NULL;
}
else
{
void *raddr = tuplesort_readtup_alloc(state, tuplen);
LogicalTapeReadExact(tape, raddr, tuplen);
stup->datum1 = PointerGetDatum(raddr);
stup->isnull1 = false;
stup->tuple = raddr;
}
if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
}