/*------------------------------------------------------------------------- * * tuplesortvariants.c * Implementation of tuple sorting variants. * * This module handles the sorting of heap tuples, index tuples, or single * Datums. The implementation is based on the generalized tuple sorting * facility given in tuplesort.c. Support other kinds of sortable objects * could be easily added here, another module, or even an extension. * * * Copyright (c) 2022-2024, PostgreSQL Global Development Group * * IDENTIFICATION * src/backend/utils/sort/tuplesortvariants.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/brin_tuple.h" #include "access/hash.h" #include "access/htup_details.h" #include "access/nbtree.h" #include "catalog/index.h" #include "executor/executor.h" #include "pg_trace.h" #include "utils/datum.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/tuplesort.h" /* sort-type codes for sort__start probes */ #define HEAP_SORT 0 #define INDEX_SORT 1 #define DATUM_SORT 2 #define CLUSTER_SORT 3 static void removeabbrev_heap(Tuplesortstate *state, SortTuple *stups, int count); static void removeabbrev_cluster(Tuplesortstate *state, SortTuple *stups, int count); static void removeabbrev_index(Tuplesortstate *state, SortTuple *stups, int count); static void removeabbrev_index_brin(Tuplesortstate *state, SortTuple *stups, int count); static void removeabbrev_datum(Tuplesortstate *state, SortTuple *stups, int count); static int comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static int comparetup_heap_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static void writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup); static void readtup_heap(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len); static int comparetup_cluster(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); 
static int comparetup_cluster_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup); static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int tuplen); static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static int comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static int comparetup_index_hash_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static int comparetup_index_brin(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static void writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup); static void readtup_index(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len); static void writetup_index_brin(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup); static void readtup_index_brin(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len); static int comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static int comparetup_datum_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static void writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup); static void readtup_datum(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len); static void freestate_cluster(Tuplesortstate *state); /* * Data structure pointed by "TuplesortPublic.arg" for the CLUSTER case. Set by * the tuplesort_begin_cluster. 
 */
typedef struct
{
	TupleDesc	tupDesc;
	IndexInfo  *indexInfo;		/* info about index being used for reference */
	EState	   *estate;			/* for evaluating index expressions */
} TuplesortClusterArg;

/*
 * Data structure pointed by "TuplesortPublic.arg" for the IndexTuple case.
 * Set by tuplesort_begin_index_xxx and used only by the IndexTuple routines.
 */
typedef struct
{
	Relation	heapRel;		/* table the index is being built on */
	Relation	indexRel;		/* index being built */
} TuplesortIndexArg;

/*
 * Data structure pointed by "TuplesortPublic.arg" for the index_btree subcase.
 */
typedef struct
{
	TuplesortIndexArg index;

	bool		enforceUnique;	/* complain if we find duplicate tuples */
	bool		uniqueNullsNotDistinct; /* unique constraint null treatment */
} TuplesortIndexBTreeArg;

/*
 * Data structure pointed by "TuplesortPublic.arg" for the index_hash subcase.
 */
typedef struct
{
	TuplesortIndexArg index;

	uint32		high_mask;		/* masks for sortable part of hash code */
	uint32		low_mask;
	uint32		max_buckets;
} TuplesortIndexHashArg;

/*
 * Data structure pointed by "TuplesortPublic.arg" for the Datum case.
 * Set by tuplesort_begin_datum and used only by the DatumTuple routines.
 */
typedef struct
{
	/* the datatype oid of Datum's to be sorted */
	Oid			datumType;
	/* we need typelen in order to know how to copy the Datums. */
	int			datumTypeLen;
} TuplesortDatumArg;

/*
 * Computing BrinTuple size with only the tuple is difficult, so we want to
 * track the length referenced by the SortTuple.  That's what BrinSortTuple
 * is meant to do - it's essentially a BrinTuple prefixed by its length.
 */
typedef struct BrinSortTuple
{
	Size		tuplen;
	BrinTuple	tuple;
} BrinSortTuple;

/* Size of the BrinSortTuple, given length of the BrinTuple.
 */
#define BRINSORTTUPLE_SIZE(len)		(offsetof(BrinSortTuple, tuple) + (len))


/*
 * Begin a sort of heap (minimal) tuples, ordered by the nkeys columns given
 * in attNums/sortOperators/sortCollations/nullsFirstFlags.  Returns a
 * Tuplesortstate wired up with the heap-tuple sort routines.
 */
Tuplesortstate *
tuplesort_begin_heap(TupleDesc tupDesc,
					 int nkeys, AttrNumber *attNums,
					 Oid *sortOperators, Oid *sortCollations,
					 bool *nullsFirstFlags,
					 int workMem, SortCoordinate coordinate, int sortopt)
{
	Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
												   sortopt);
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext;
	int			i;

	/* all state set up here must live in the sort's main context */
	oldcontext = MemoryContextSwitchTo(base->maincontext);

	Assert(nkeys > 0);

#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG,
			 "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
			 nkeys, workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif

	base->nKeys = nkeys;

	TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
								false,	/* no unique check */
								nkeys,
								workMem,
								sortopt & TUPLESORT_RANDOMACCESS,
								PARALLEL_SORT(coordinate));

	/* install the heap-tuple callbacks used by tuplesort.c */
	base->removeabbrev = removeabbrev_heap;
	base->comparetup = comparetup_heap;
	base->comparetup_tiebreak = comparetup_heap_tiebreak;
	base->writetup = writetup_heap;
	base->readtup = readtup_heap;
	base->haveDatum1 = true;
	base->arg = tupDesc;		/* assume we need not copy tupDesc */

	/* Prepare SortSupport data for each column */
	base->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));

	for (i = 0; i < nkeys; i++)
	{
		SortSupport sortKey = base->sortKeys + i;

		Assert(attNums[i] != 0);
		Assert(sortOperators[i] != 0);

		sortKey->ssup_cxt = CurrentMemoryContext;
		sortKey->ssup_collation = sortCollations[i];
		sortKey->ssup_nulls_first = nullsFirstFlags[i];
		sortKey->ssup_attno = attNums[i];
		/* Convey if abbreviation optimization is applicable in principle */
		sortKey->abbreviate = (i == 0 && base->haveDatum1);

		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
	}

	/*
	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
	 * tie-breaker comparisons may be required.  Typically, the optimization
	 * is only of value to pass-by-value types anyway, whereas abbreviated
	 * keys are typically only of value to pass-by-reference types.
	 */
	if (nkeys == 1 && !base->sortKeys->abbrev_converter)
		base->onlyKey = base->sortKeys;

	MemoryContextSwitchTo(oldcontext);

	return state;
}

/*
 * Begin a sort of heap tuples ordered per the given btree index definition,
 * as used by CLUSTER.  Comparisons follow the index's key columns.
 */
Tuplesortstate *
tuplesort_begin_cluster(TupleDesc tupDesc,
						Relation indexRel,
						int workMem,
						SortCoordinate coordinate, int sortopt)
{
	Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
												   sortopt);
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	BTScanInsert indexScanKey;
	MemoryContext oldcontext;
	TuplesortClusterArg *arg;
	int			i;

	/* only btree indexes are supported for CLUSTER ordering */
	Assert(indexRel->rd_rel->relam == BTREE_AM_OID);

	oldcontext = MemoryContextSwitchTo(base->maincontext);
	arg = (TuplesortClusterArg *) palloc0(sizeof(TuplesortClusterArg));

#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG,
			 "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
			 RelationGetNumberOfAttributes(indexRel),
			 workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif

	base->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);

	TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
								false,	/* no unique check */
								base->nKeys,
								workMem,
								sortopt & TUPLESORT_RANDOMACCESS,
								PARALLEL_SORT(coordinate));

	base->removeabbrev = removeabbrev_cluster;
	base->comparetup = comparetup_cluster;
	base->comparetup_tiebreak = comparetup_cluster_tiebreak;
	base->writetup = writetup_cluster;
	base->readtup = readtup_cluster;
	base->freestate = freestate_cluster;
	base->arg = arg;

	arg->indexInfo = BuildIndexInfo(indexRel);

	/*
	 * If we don't have a simple leading attribute, we don't currently
	 * initialize datum1, so disable optimizations that require it.
	 */
	if (arg->indexInfo->ii_IndexAttrNumbers[0] == 0)
		base->haveDatum1 = false;
	else
		base->haveDatum1 = true;

	arg->tupDesc = tupDesc;		/* assume we need not copy tupDesc */

	indexScanKey = _bt_mkscankey(indexRel, NULL);

	if (arg->indexInfo->ii_Expressions != NULL)
	{
		TupleTableSlot *slot;
		ExprContext *econtext;

		/*
		 * We will need to use FormIndexDatum to evaluate the index
		 * expressions.  To do that, we need an EState, as well as a
		 * TupleTableSlot to put the table tuples into.  The econtext's
		 * scantuple has to point to that slot, too.
		 */
		arg->estate = CreateExecutorState();
		slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
		econtext = GetPerTupleExprContext(arg->estate);
		econtext->ecxt_scantuple = slot;
	}

	/* Prepare SortSupport data for each column */
	base->sortKeys = (SortSupport) palloc0(base->nKeys *
										   sizeof(SortSupportData));

	for (i = 0; i < base->nKeys; i++)
	{
		SortSupport sortKey = base->sortKeys + i;
		ScanKey		scanKey = indexScanKey->scankeys + i;
		int16		strategy;

		sortKey->ssup_cxt = CurrentMemoryContext;
		sortKey->ssup_collation = scanKey->sk_collation;
		sortKey->ssup_nulls_first =
			(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
		sortKey->ssup_attno = scanKey->sk_attno;
		/* Convey if abbreviation optimization is applicable in principle */
		sortKey->abbreviate = (i == 0 && base->haveDatum1);

		Assert(sortKey->ssup_attno != 0);

		/* DESC index columns sort with the "greater" strategy */
		strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
			BTGreaterStrategyNumber : BTLessStrategyNumber;

		PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
	}

	pfree(indexScanKey);

	MemoryContextSwitchTo(oldcontext);

	return state;
}

/*
 * Begin a sort of index tuples for a btree index build, ordered per the
 * index definition; optionally complains about duplicates if enforceUnique.
 */
Tuplesortstate *
tuplesort_begin_index_btree(Relation heapRel,
							Relation indexRel,
							bool enforceUnique,
							bool uniqueNullsNotDistinct,
							int workMem,
							SortCoordinate coordinate,
							int sortopt)
{
	Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
												   sortopt);
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	BTScanInsert indexScanKey;
	TuplesortIndexBTreeArg *arg;
	MemoryContext oldcontext;
	int			i;

	oldcontext = MemoryContextSwitchTo(base->maincontext);
	arg = (TuplesortIndexBTreeArg *) palloc(sizeof(TuplesortIndexBTreeArg));

#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG,
			 "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
			 enforceUnique ? 't' : 'f',
			 workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif

	base->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);

	TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
								enforceUnique,
								base->nKeys,
								workMem,
								sortopt & TUPLESORT_RANDOMACCESS,
								PARALLEL_SORT(coordinate));

	base->removeabbrev = removeabbrev_index;
	base->comparetup = comparetup_index_btree;
	base->comparetup_tiebreak = comparetup_index_btree_tiebreak;
	base->writetup = writetup_index;
	base->readtup = readtup_index;
	base->haveDatum1 = true;
	base->arg = arg;

	arg->index.heapRel = heapRel;
	arg->index.indexRel = indexRel;
	arg->enforceUnique = enforceUnique;
	arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct;

	indexScanKey = _bt_mkscankey(indexRel, NULL);

	/* Prepare SortSupport data for each column */
	base->sortKeys = (SortSupport) palloc0(base->nKeys *
										   sizeof(SortSupportData));

	for (i = 0; i < base->nKeys; i++)
	{
		SortSupport sortKey = base->sortKeys + i;
		ScanKey		scanKey = indexScanKey->scankeys + i;
		int16		strategy;

		sortKey->ssup_cxt = CurrentMemoryContext;
		sortKey->ssup_collation = scanKey->sk_collation;
		sortKey->ssup_nulls_first =
			(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
		sortKey->ssup_attno = scanKey->sk_attno;
		/* Convey if abbreviation optimization is applicable in principle */
		sortKey->abbreviate = (i == 0 && base->haveDatum1);

		Assert(sortKey->ssup_attno != 0);

		strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
			BTGreaterStrategyNumber : BTLessStrategyNumber;

		PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
	}

	pfree(indexScanKey);

	MemoryContextSwitchTo(oldcontext);

	return state;
}

/*
 * Begin a sort of index tuples for a hash index build; ordering is by the
 * masked hash code (see comparetup_index_hash), not by SortSupport keys.
 */
Tuplesortstate *
tuplesort_begin_index_hash(Relation heapRel,
						   Relation indexRel,
						   uint32 high_mask,
						   uint32 low_mask,
						   uint32 max_buckets,
						   int workMem,
						   SortCoordinate coordinate,
						   int sortopt)
{
	Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
												   sortopt);
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext;
	TuplesortIndexHashArg *arg;

	oldcontext = MemoryContextSwitchTo(base->maincontext);

	arg = (TuplesortIndexHashArg *) palloc(sizeof(TuplesortIndexHashArg));

#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG,
			 "begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
			 "max_buckets = 0x%x, workMem = %d, randomAccess = %c",
			 high_mask,
			 low_mask,
			 max_buckets,
			 workMem,
			 sortopt & TUPLESORT_RANDOMACCESS ?
			 't' : 'f');
#endif

	base->nKeys = 1;			/* Only one sort column, the hash code */

	base->removeabbrev = removeabbrev_index;
	base->comparetup = comparetup_index_hash;
	base->comparetup_tiebreak = comparetup_index_hash_tiebreak;
	base->writetup = writetup_index;
	base->readtup = readtup_index;
	base->haveDatum1 = true;
	base->arg = arg;

	arg->index.heapRel = heapRel;
	arg->index.indexRel = indexRel;

	arg->high_mask = high_mask;
	arg->low_mask = low_mask;
	arg->max_buckets = max_buckets;

	MemoryContextSwitchTo(oldcontext);

	return state;
}

/*
 * Begin a sort of index tuples for a GiST index build.  Reuses the btree
 * arg struct with uniqueness enforcement disabled; sort support comes from
 * the index's sortsupport functions.
 */
Tuplesortstate *
tuplesort_begin_index_gist(Relation heapRel,
						   Relation indexRel,
						   int workMem,
						   SortCoordinate coordinate,
						   int sortopt)
{
	Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
												   sortopt);
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext;
	TuplesortIndexBTreeArg *arg;
	int			i;

	oldcontext = MemoryContextSwitchTo(base->maincontext);

	arg = (TuplesortIndexBTreeArg *) palloc(sizeof(TuplesortIndexBTreeArg));

#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG,
			 "begin index sort: workMem = %d, randomAccess = %c",
			 workMem, sortopt & TUPLESORT_RANDOMACCESS ?
			 't' : 'f');
#endif

	base->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);

	base->removeabbrev = removeabbrev_index;
	base->comparetup = comparetup_index_btree;
	base->comparetup_tiebreak = comparetup_index_btree_tiebreak;
	base->writetup = writetup_index;
	base->readtup = readtup_index;
	base->haveDatum1 = true;
	base->arg = arg;

	arg->index.heapRel = heapRel;
	arg->index.indexRel = indexRel;
	arg->enforceUnique = false;
	arg->uniqueNullsNotDistinct = false;

	/* Prepare SortSupport data for each column */
	base->sortKeys = (SortSupport) palloc0(base->nKeys *
										   sizeof(SortSupportData));

	for (i = 0; i < base->nKeys; i++)
	{
		SortSupport sortKey = base->sortKeys + i;

		sortKey->ssup_cxt = CurrentMemoryContext;
		sortKey->ssup_collation = indexRel->rd_indcollation[i];
		sortKey->ssup_nulls_first = false;
		sortKey->ssup_attno = i + 1;
		/* Convey if abbreviation optimization is applicable in principle */
		sortKey->abbreviate = (i == 0 && base->haveDatum1);

		Assert(sortKey->ssup_attno != 0);

		/* Look for a sort support function */
		PrepareSortSupportFromGistIndexRel(indexRel, sortKey);
	}

	MemoryContextSwitchTo(oldcontext);

	return state;
}

/*
 * Begin a sort of BRIN tuples, ordered by block number (stored in datum1);
 * no SortSupport keys are needed.
 */
Tuplesortstate *
tuplesort_begin_index_brin(int workMem,
						   SortCoordinate coordinate,
						   int sortopt)
{
	Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
												   sortopt);
	TuplesortPublic *base = TuplesortstateGetPublic(state);

#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG,
			 "begin index sort: workMem = %d, randomAccess = %c",
			 workMem, sortopt & TUPLESORT_RANDOMACCESS ?
			 't' : 'f');
#endif

	base->nKeys = 1;			/* Only one sort column, the block number */

	base->removeabbrev = removeabbrev_index_brin;
	base->comparetup = comparetup_index_brin;
	base->writetup = writetup_index_brin;
	base->readtup = readtup_index_brin;
	base->haveDatum1 = true;
	base->arg = NULL;

	return state;
}

/*
 * Begin a sort of single Datums of type datumType, ordered by sortOperator.
 */
Tuplesortstate *
tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
					  bool nullsFirstFlag, int workMem,
					  SortCoordinate coordinate, int sortopt)
{
	Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
												   sortopt);
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	TuplesortDatumArg *arg;
	MemoryContext oldcontext;
	int16		typlen;
	bool		typbyval;

	oldcontext = MemoryContextSwitchTo(base->maincontext);
	arg = (TuplesortDatumArg *) palloc(sizeof(TuplesortDatumArg));

#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG,
			 "begin datum sort: workMem = %d, randomAccess = %c",
			 workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
#endif

	base->nKeys = 1;			/* always a one-column sort */

	TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
								false,	/* no unique check */
								1,
								workMem,
								sortopt & TUPLESORT_RANDOMACCESS,
								PARALLEL_SORT(coordinate));

	base->removeabbrev = removeabbrev_datum;
	base->comparetup = comparetup_datum;
	base->comparetup_tiebreak = comparetup_datum_tiebreak;
	base->writetup = writetup_datum;
	base->readtup = readtup_datum;
	base->haveDatum1 = true;
	base->arg = arg;

	arg->datumType = datumType;

	/* lookup necessary attributes of the datum type */
	get_typlenbyval(datumType, &typlen, &typbyval);
	arg->datumTypeLen = typlen;
	base->tuples = !typbyval;

	/* Prepare SortSupport data */
	base->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));

	base->sortKeys->ssup_cxt = CurrentMemoryContext;
	base->sortKeys->ssup_collation = sortCollation;
	base->sortKeys->ssup_nulls_first = nullsFirstFlag;

	/*
	 * Abbreviation is possible here only for by-reference types.  In theory,
	 * a pass-by-value datatype could have an abbreviated form that is cheaper
	 * to compare.  In a tuple sort, we could support that, because we can
	 * always extract the original datum from the tuple as needed.  Here, we
	 * can't, because a datum sort only stores a single copy of the datum; the
	 * "tuple" field of each SortTuple is NULL.
	 */
	base->sortKeys->abbreviate = !typbyval;

	PrepareSortSupportFromOrderingOp(sortOperator, base->sortKeys);

	/*
	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
	 * tie-breaker comparisons may be required.  Typically, the optimization
	 * is only of value to pass-by-value types anyway, whereas abbreviated
	 * keys are typically only of value to pass-by-reference types.
	 */
	if (!base->sortKeys->abbrev_converter)
		base->onlyKey = base->sortKeys;

	MemoryContextSwitchTo(oldcontext);

	return state;
}

/*
 * Accept one tuple while collecting input data for sort.
 *
 * Note that the input data is always copied; the caller need not save it.
 */
void
tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
	TupleDesc	tupDesc = (TupleDesc) base->arg;
	SortTuple	stup;
	MinimalTuple tuple;
	HeapTupleData htup;

	/* copy the tuple into sort storage */
	tuple = ExecCopySlotMinimalTuple(slot);
	stup.tuple = (void *) tuple;
	/* set up first-column key value */
	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
	htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
	stup.datum1 = heap_getattr(&htup,
							   base->sortKeys[0].ssup_attno,
							   tupDesc,
							   &stup.isnull1);

	tuplesort_puttuple_common(state, &stup,
							  base->sortKeys->abbrev_converter &&
							  !stup.isnull1);

	MemoryContextSwitchTo(oldcontext);
}

/*
 * Accept one tuple while collecting input data for sort.
 *
 * Note that the input data is always copied; the caller need not save it.
 */
void
tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
{
	SortTuple	stup;
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
	TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg;

	/* copy the tuple into sort storage */
	tup = heap_copytuple(tup);
	stup.tuple = (void *) tup;

	/*
	 * set up first-column key value, and potentially abbreviate, if it's a
	 * simple column
	 */
	if (base->haveDatum1)
	{
		stup.datum1 = heap_getattr(tup,
								   arg->indexInfo->ii_IndexAttrNumbers[0],
								   arg->tupDesc,
								   &stup.isnull1);
	}

	tuplesort_puttuple_common(state, &stup,
							  base->haveDatum1 &&
							  base->sortKeys->abbrev_converter &&
							  !stup.isnull1);

	MemoryContextSwitchTo(oldcontext);
}

/*
 * Collect one index tuple while collecting input data for sort, building
 * it from caller-supplied values.
 */
void
tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
							  ItemPointer self, const Datum *values,
							  const bool *isnull)
{
	SortTuple	stup;
	IndexTuple	tuple;
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	TuplesortIndexArg *arg = (TuplesortIndexArg *) base->arg;

	/* the formed tuple is allocated directly in the sort's tuple context */
	stup.tuple = index_form_tuple_context(RelationGetDescr(rel), values,
										  isnull, base->tuplecontext);
	tuple = ((IndexTuple) stup.tuple);
	tuple->t_tid = *self;
	/* set up first-column key value */
	stup.datum1 = index_getattr(tuple,
								1,
								RelationGetDescr(arg->indexRel),
								&stup.isnull1);

	tuplesort_puttuple_common(state, &stup,
							  base->sortKeys &&
							  base->sortKeys->abbrev_converter &&
							  !stup.isnull1);
}

/*
 * Collect one BRIN tuple while collecting input data for sort.
 */
void
tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
{
	SortTuple	stup;
	BrinSortTuple *bstup;
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);

	/* allocate space for the whole BRIN sort tuple */
	bstup = palloc(BRINSORTTUPLE_SIZE(size));

	bstup->tuplen = size;
	memcpy(&bstup->tuple, tuple, size);

	stup.tuple = bstup;
	stup.datum1 = tuple->bt_blkno;	/* sort key is the block number */
	stup.isnull1 = false;

	tuplesort_puttuple_common(state, &stup,
							  base->sortKeys &&
							  base->sortKeys->abbrev_converter &&
							  !stup.isnull1);

	MemoryContextSwitchTo(oldcontext);
}

/*
 * Accept one Datum while collecting input data for sort.
 *
 * If the Datum is pass-by-ref type, the value will be copied.
 */
void
tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
	TuplesortDatumArg *arg = (TuplesortDatumArg *) base->arg;
	SortTuple	stup;

	/*
	 * Pass-by-value types or null values are just stored directly in
	 * stup.datum1 (and stup.tuple is not used and set to NULL).
	 *
	 * Non-null pass-by-reference values need to be copied into memory we
	 * control, and possibly abbreviated.  The copied value is pointed to by
	 * stup.tuple and is treated as the canonical copy (e.g. to return via
	 * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
	 * abbreviated value if abbreviation is happening, otherwise it's
	 * identical to stup.tuple.
	 */

	if (isNull || !base->tuples)
	{
		/*
		 * Set datum1 to zeroed representation for NULLs (to be consistent,
		 * and to support cheap inequality tests for NULL abbreviated keys).
		 */
		stup.datum1 = !isNull ? val : (Datum) 0;
		stup.isnull1 = isNull;
		stup.tuple = NULL;		/* no separate storage */
	}
	else
	{
		stup.isnull1 = false;
		stup.datum1 = datumCopy(val, false, arg->datumTypeLen);
		stup.tuple = DatumGetPointer(stup.datum1);
	}

	tuplesort_puttuple_common(state, &stup,
							  base->tuples &&
							  base->sortKeys->abbrev_converter && !isNull);

	MemoryContextSwitchTo(oldcontext);
}

/*
 * Fetch the next tuple in either forward or back direction.
 * If successful, put tuple in slot and return true; else, clear the slot
 * and return false.
 *
 * Caller may optionally be passed back abbreviated value (on true return
 * value) when abbreviation was used, which can be used to cheaply avoid
 * equality checks that might otherwise be required.  Caller can safely make a
 * determination of "non-equal tuple" based on simple binary inequality.  A
 * NULL value in leading attribute will set abbreviated value to zeroed
 * representation, which caller may rely on in abbreviated inequality check.
 *
 * If copy is true, the slot receives a tuple that's been copied into the
 * caller's memory context, so that it will stay valid regardless of future
 * manipulations of the tuplesort's state (up to and including deleting the
 * tuplesort).  If copy is false, the slot will just receive a pointer to a
 * tuple held within the tuplesort, which is more efficient, but only safe for
 * callers that are prepared to have any subsequent manipulation of the
 * tuplesort's state invalidate slot contents.
 */
bool
tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
					   TupleTableSlot *slot, Datum *abbrev)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
	SortTuple	stup;

	if (!tuplesort_gettuple_common(state, forward, &stup))
		stup.tuple = NULL;

	MemoryContextSwitchTo(oldcontext);

	if (stup.tuple)
	{
		/* Record abbreviated key for caller */
		if (base->sortKeys->abbrev_converter && abbrev)
			*abbrev = stup.datum1;

		/* copy (in caller's context, per above switch) if requested */
		if (copy)
			stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);

		ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
		return true;
	}
	else
	{
		ExecClearTuple(slot);
		return false;
	}
}

/*
 * Fetch the next tuple in either forward or back direction.
 * Returns NULL if no more tuples.  Returned tuple belongs to tuplesort memory
 * context, and must not be freed by caller.  Caller may not rely on tuple
 * remaining valid after any further manipulation of tuplesort.
 */
HeapTuple
tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
	SortTuple	stup;

	if (!tuplesort_gettuple_common(state, forward, &stup))
		stup.tuple = NULL;

	MemoryContextSwitchTo(oldcontext);

	return stup.tuple;
}

/*
 * Fetch the next index tuple in either forward or back direction.
 * Returns NULL if no more tuples.  Returned tuple belongs to tuplesort memory
 * context, and must not be freed by caller.  Caller may not rely on tuple
 * remaining valid after any further manipulation of tuplesort.
 */
IndexTuple
tuplesort_getindextuple(Tuplesortstate *state, bool forward)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
	SortTuple	stup;

	if (!tuplesort_gettuple_common(state, forward, &stup))
		stup.tuple = NULL;

	MemoryContextSwitchTo(oldcontext);

	return (IndexTuple) stup.tuple;
}

/*
 * Fetch the next BRIN tuple in either forward or back direction.
 * Returns NULL if no more tuples.  Returned tuple belongs to tuplesort memory
 * context, and must not be freed by caller.  Caller may not rely on tuple
 * remaining valid after any further manipulation of tuplesort.
 */
BrinTuple *
tuplesort_getbrintuple(Tuplesortstate *state, Size *len, bool forward)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
	SortTuple	stup;
	BrinSortTuple *btup;

	if (!tuplesort_gettuple_common(state, forward, &stup))
		stup.tuple = NULL;

	MemoryContextSwitchTo(oldcontext);

	if (!stup.tuple)
		return NULL;

	/* unwrap the BrinSortTuple to return length and the bare BrinTuple */
	btup = (BrinSortTuple *) stup.tuple;

	*len = btup->tuplen;

	return &btup->tuple;
}

/*
 * Fetch the next Datum in either forward or back direction.
 * Returns false if no more datums.
 *
 * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
 * in caller's context, and is now owned by the caller (this differs from
 * similar routines for other types of tuplesorts).
 *
 * Caller may optionally be passed back abbreviated value (on true return
 * value) when abbreviation was used, which can be used to cheaply avoid
 * equality checks that might otherwise be required.  Caller can safely make a
 * determination of "non-equal tuple" based on simple binary inequality.  A
 * NULL value will have a zeroed abbreviated value representation, which caller
 * may rely on in abbreviated inequality check.
 *
 * For byref Datums, if copy is true, *val is set to a copy of the Datum
 * copied into the caller's memory context, so that it will stay valid
 * regardless of future manipulations of the tuplesort's state (up to and
 * including deleting the tuplesort).  If copy is false, *val will just be
 * set to a pointer to the Datum held within the tuplesort, which is more
 * efficient, but only safe for callers that are prepared to have any
 * subsequent manipulation of the tuplesort's state invalidate slot contents.
 * For byval Datums, the value of the 'copy' parameter has no effect.
 */
bool
tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy,
				   Datum *val, bool *isNull, Datum *abbrev)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	MemoryContext oldcontext = MemoryContextSwitchTo(base->sortcontext);
	TuplesortDatumArg *arg = (TuplesortDatumArg *) base->arg;
	SortTuple	stup;

	if (!tuplesort_gettuple_common(state, forward, &stup))
	{
		MemoryContextSwitchTo(oldcontext);
		return false;
	}

	/* Ensure we copy into caller's memory context */
	MemoryContextSwitchTo(oldcontext);

	/* Record abbreviated key for caller */
	if (base->sortKeys->abbrev_converter && abbrev)
		*abbrev = stup.datum1;

	if (stup.isnull1 || !base->tuples)
	{
		*val = stup.datum1;
		*isNull = stup.isnull1;
	}
	else
	{
		/* use stup.tuple because stup.datum1 may be an abbreviation */
		if (copy)
			*val = datumCopy(PointerGetDatum(stup.tuple), false,
							 arg->datumTypeLen);
		else
			*val = PointerGetDatum(stup.tuple);
		*isNull = false;
	}

	return true;
}

/*
 * Routines specialized for HeapTuple (actually MinimalTuple) case
 */

static void
removeabbrev_heap(Tuplesortstate *state, SortTuple *stups, int count)
{
	int			i;
	TuplesortPublic *base = TuplesortstateGetPublic(state);

	/* replace each abbreviated datum1 with the real leading-column value */
	for (i = 0; i < count; i++)
	{
		HeapTupleData htup;

		htup.t_len = ((MinimalTuple) stups[i].tuple)->t_len +
			MINIMAL_TUPLE_OFFSET;
		htup.t_data = (HeapTupleHeader) ((char *) stups[i].tuple -
										 MINIMAL_TUPLE_OFFSET);
		stups[i].datum1 = heap_getattr(&htup,
base->sortKeys[0].ssup_attno, (TupleDesc) base->arg, &stups[i].isnull1); } } static int comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) { TuplesortPublic *base = TuplesortstateGetPublic(state); SortSupport sortKey = base->sortKeys; int32 compare; /* Compare the leading sort key */ compare = ApplySortComparator(a->datum1, a->isnull1, b->datum1, b->isnull1, sortKey); if (compare != 0) return compare; /* Compare additional sort keys */ return comparetup_heap_tiebreak(a, b, state); } static int comparetup_heap_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) { TuplesortPublic *base = TuplesortstateGetPublic(state); SortSupport sortKey = base->sortKeys; HeapTupleData ltup; HeapTupleData rtup; TupleDesc tupDesc; int nkey; int32 compare; AttrNumber attno; Datum datum1, datum2; bool isnull1, isnull2; ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); tupDesc = (TupleDesc) base->arg; if (sortKey->abbrev_converter) { attno = sortKey->ssup_attno; datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); compare = ApplySortAbbrevFullComparator(datum1, isnull1, datum2, isnull2, sortKey); if (compare != 0) return compare; } sortKey++; for (nkey = 1; nkey < base->nKeys; nkey++, sortKey++) { attno = sortKey->ssup_attno; datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); compare = ApplySortComparator(datum1, isnull1, datum2, isnull2, sortKey); if (compare != 0) return compare; } return 0; } static void writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) { TuplesortPublic *base = TuplesortstateGetPublic(state); MinimalTuple tuple = (MinimalTuple) stup->tuple; /* 
the part of the MinimalTuple we'll write: */ char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; /* total on-disk footprint: */ unsigned int tuplen = tupbodylen + sizeof(int); LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); LogicalTapeWrite(tape, tupbody, tupbodylen); if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */ LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); } static void readtup_heap(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len) { unsigned int tupbodylen = len - sizeof(int); unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; MinimalTuple tuple = (MinimalTuple) tuplesort_readtup_alloc(state, tuplen); char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; TuplesortPublic *base = TuplesortstateGetPublic(state); HeapTupleData htup; /* read in the tuple proper */ tuple->t_len = tuplen; LogicalTapeReadExact(tape, tupbody, tupbodylen); if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? 
*/ LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); stup->tuple = (void *) tuple; /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); stup->datum1 = heap_getattr(&htup, base->sortKeys[0].ssup_attno, (TupleDesc) base->arg, &stup->isnull1); } /* * Routines specialized for the CLUSTER case (HeapTuple data, with * comparisons per a btree index definition) */ static void removeabbrev_cluster(Tuplesortstate *state, SortTuple *stups, int count) { int i; TuplesortPublic *base = TuplesortstateGetPublic(state); TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg; for (i = 0; i < count; i++) { HeapTuple tup; tup = (HeapTuple) stups[i].tuple; stups[i].datum1 = heap_getattr(tup, arg->indexInfo->ii_IndexAttrNumbers[0], arg->tupDesc, &stups[i].isnull1); } } static int comparetup_cluster(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) { TuplesortPublic *base = TuplesortstateGetPublic(state); SortSupport sortKey = base->sortKeys; int32 compare; /* Compare the leading sort key, if it's simple */ if (base->haveDatum1) { compare = ApplySortComparator(a->datum1, a->isnull1, b->datum1, b->isnull1, sortKey); if (compare != 0) return compare; } return comparetup_cluster_tiebreak(a, b, state); } static int comparetup_cluster_tiebreak(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) { TuplesortPublic *base = TuplesortstateGetPublic(state); TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg; SortSupport sortKey = base->sortKeys; HeapTuple ltup; HeapTuple rtup; TupleDesc tupDesc; int nkey; int32 compare = 0; Datum datum1, datum2; bool isnull1, isnull2; ltup = (HeapTuple) a->tuple; rtup = (HeapTuple) b->tuple; tupDesc = arg->tupDesc; /* Compare the leading sort key, if it's simple */ if (base->haveDatum1) { if (sortKey->abbrev_converter) { AttrNumber leading = arg->indexInfo->ii_IndexAttrNumbers[0]; datum1 = heap_getattr(ltup, leading, 
tupDesc, &isnull1); datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); compare = ApplySortAbbrevFullComparator(datum1, isnull1, datum2, isnull2, sortKey); } if (compare != 0 || base->nKeys == 1) return compare; /* Compare additional columns the hard way */ sortKey++; nkey = 1; } else { /* Must compare all keys the hard way */ nkey = 0; } if (arg->indexInfo->ii_Expressions == NULL) { /* If not expression index, just compare the proper heap attrs */ for (; nkey < base->nKeys; nkey++, sortKey++) { AttrNumber attno = arg->indexInfo->ii_IndexAttrNumbers[nkey]; datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); compare = ApplySortComparator(datum1, isnull1, datum2, isnull2, sortKey); if (compare != 0) return compare; } } else { /* * In the expression index case, compute the whole index tuple and * then compare values. It would perhaps be faster to compute only as * many columns as we need to compare, but that would require * duplicating all the logic in FormIndexDatum. 
*/ Datum l_index_values[INDEX_MAX_KEYS]; bool l_index_isnull[INDEX_MAX_KEYS]; Datum r_index_values[INDEX_MAX_KEYS]; bool r_index_isnull[INDEX_MAX_KEYS]; TupleTableSlot *ecxt_scantuple; /* Reset context each time to prevent memory leakage */ ResetPerTupleExprContext(arg->estate); ecxt_scantuple = GetPerTupleExprContext(arg->estate)->ecxt_scantuple; ExecStoreHeapTuple(ltup, ecxt_scantuple, false); FormIndexDatum(arg->indexInfo, ecxt_scantuple, arg->estate, l_index_values, l_index_isnull); ExecStoreHeapTuple(rtup, ecxt_scantuple, false); FormIndexDatum(arg->indexInfo, ecxt_scantuple, arg->estate, r_index_values, r_index_isnull); for (; nkey < base->nKeys; nkey++, sortKey++) { compare = ApplySortComparator(l_index_values[nkey], l_index_isnull[nkey], r_index_values[nkey], r_index_isnull[nkey], sortKey); if (compare != 0) return compare; } } return 0; } static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) { TuplesortPublic *base = TuplesortstateGetPublic(state); HeapTuple tuple = (HeapTuple) stup->tuple; unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); /* We need to store t_self, but not other fields of HeapTupleData */ LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); LogicalTapeWrite(tape, &tuple->t_self, sizeof(ItemPointerData)); LogicalTapeWrite(tape, tuple->t_data, tuple->t_len); if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? 
*/ LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); } static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int tuplen) { TuplesortPublic *base = TuplesortstateGetPublic(state); TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg; unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); HeapTuple tuple = (HeapTuple) tuplesort_readtup_alloc(state, t_len + HEAPTUPLESIZE); /* Reconstruct the HeapTupleData header */ tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); tuple->t_len = t_len; LogicalTapeReadExact(tape, &tuple->t_self, sizeof(ItemPointerData)); /* We don't currently bother to reconstruct t_tableOid */ tuple->t_tableOid = InvalidOid; /* Read in the tuple body */ LogicalTapeReadExact(tape, tuple->t_data, tuple->t_len); if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */ LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); stup->tuple = (void *) tuple; /* set up first-column key value, if it's a simple column */ if (base->haveDatum1) stup->datum1 = heap_getattr(tuple, arg->indexInfo->ii_IndexAttrNumbers[0], arg->tupDesc, &stup->isnull1); } static void freestate_cluster(Tuplesortstate *state) { TuplesortPublic *base = TuplesortstateGetPublic(state); TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg; /* Free any execution state created for CLUSTER case */ if (arg->estate != NULL) { ExprContext *econtext = GetPerTupleExprContext(arg->estate); ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); FreeExecutorState(arg->estate); } } /* * Routines specialized for IndexTuple case * * The btree and hash cases require separate comparison functions, but the * IndexTuple representation is the same so the copy/write/read support * functions can be shared. 
 */

/*
 * Replace abbreviated datum1 values with the real first index attribute,
 * fetched from the stored IndexTuple.
 */
static void
removeabbrev_index(Tuplesortstate *state, SortTuple *stups, int count)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	TuplesortIndexArg *arg = (TuplesortIndexArg *) base->arg;
	int			i;

	for (i = 0; i < count; i++)
	{
		IndexTuple	tuple;

		tuple = stups[i].tuple;
		stups[i].datum1 = index_getattr(tuple,
										1,
										RelationGetDescr(arg->indexRel),
										&stups[i].isnull1);
	}
}

static int
comparetup_index_btree(const SortTuple *a, const SortTuple *b,
					   Tuplesortstate *state)
{
	/*
	 * This is similar to comparetup_heap(), but expects index tuples.  There
	 * is also special handling for enforcing uniqueness, and special
	 * treatment for equal keys at the end.
	 */
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	SortSupport sortKey = base->sortKeys;
	int32		compare;

	/* Compare the leading sort key */
	compare = ApplySortComparator(a->datum1, a->isnull1,
								  b->datum1, b->isnull1,
								  sortKey);
	if (compare != 0)
		return compare;

	/* Compare additional sort keys */
	return comparetup_index_btree_tiebreak(a, b, state);
}

/*
 * Full multi-key comparison for btree index tuples, used once the leading
 * (possibly abbreviated) key compared equal.  Also enforces uniqueness when
 * requested, and falls back to heap TID order so that fully-equal keys still
 * have a deterministic ordering.
 */
static int
comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b,
								Tuplesortstate *state)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	TuplesortIndexBTreeArg *arg = (TuplesortIndexBTreeArg *) base->arg;
	SortSupport sortKey = base->sortKeys;
	IndexTuple	tuple1;
	IndexTuple	tuple2;
	int			keysz;
	TupleDesc	tupDes;
	bool		equal_hasnull = false;
	int			nkey;
	int32		compare;
	Datum		datum1,
				datum2;
	bool		isnull1,
				isnull2;

	tuple1 = (IndexTuple) a->tuple;
	tuple2 = (IndexTuple) b->tuple;
	keysz = base->nKeys;
	tupDes = RelationGetDescr(arg->index.indexRel);

	if (sortKey->abbrev_converter)
	{
		/* Leading key matched only on abbreviations; redo with full values */
		datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
		datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);

		compare = ApplySortAbbrevFullComparator(datum1, isnull1,
												datum2, isnull2,
												sortKey);
		if (compare != 0)
			return compare;
	}

	/* they are equal, so we only need to examine one null flag */
	if (a->isnull1)
		equal_hasnull = true;

	sortKey++;
	for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
	{
		datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
		datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);

		compare = ApplySortComparator(datum1, isnull1,
									  datum2, isnull2,
									  sortKey);
		if (compare != 0)
			return compare;		/* done when we find unequal attributes */

		/* they are equal, so we only need to examine one null flag */
		if (isnull1)
			equal_hasnull = true;
	}

	/*
	 * If btree has asked us to enforce uniqueness, complain if two equal
	 * tuples are detected (unless there was at least one NULL field and NULLS
	 * NOT DISTINCT was not set).
	 *
	 * It is sufficient to make the test here, because if two tuples are equal
	 * they *must* get compared at some stage of the sort --- otherwise the
	 * sort algorithm wouldn't have checked whether one must appear before the
	 * other.
	 */
	if (arg->enforceUnique && !(!arg->uniqueNullsNotDistinct && equal_hasnull))
	{
		Datum		values[INDEX_MAX_KEYS];
		bool		isnull[INDEX_MAX_KEYS];
		char	   *key_desc;

		/*
		 * Some rather brain-dead implementations of qsort (such as the one in
		 * QNX 4) will sometimes call the comparison routine to compare a
		 * value to itself, but we always use our own implementation, which
		 * does not.
		 */
		Assert(tuple1 != tuple2);

		index_deform_tuple(tuple1, tupDes, values, isnull);

		key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull);

		ereport(ERROR,
				(errcode(ERRCODE_UNIQUE_VIOLATION),
				 errmsg("could not create unique index \"%s\"",
						RelationGetRelationName(arg->index.indexRel)),
				 key_desc ? errdetail("Key %s is duplicated.", key_desc) :
				 errdetail("Duplicate keys exist."),
				 errtableconstraint(arg->index.heapRel,
									RelationGetRelationName(arg->index.indexRel))));
	}

	/*
	 * If key values are equal, we sort on ItemPointer.  This is required for
	 * btree indexes, since heap TID is treated as an implicit last key
	 * attribute in order to ensure that all keys in the index are physically
	 * unique.
	 */
	{
		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);

		if (blk1 != blk2)
			return (blk1 < blk2) ? -1 : 1;
	}
	{
		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);

		if (pos1 != pos2)
			return (pos1 < pos2) ? -1 : 1;
	}

	/* ItemPointer values should never be equal */
	Assert(false);

	return 0;
}

static int
comparetup_index_hash(const SortTuple *a, const SortTuple *b,
					  Tuplesortstate *state)
{
	Bucket		bucket1;
	Bucket		bucket2;
	uint32		hash1;
	uint32		hash2;
	IndexTuple	tuple1;
	IndexTuple	tuple2;
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	TuplesortIndexHashArg *arg = (TuplesortIndexHashArg *) base->arg;

	/*
	 * Fetch hash keys and mask off bits we don't want to sort by, so that the
	 * initial sort is just on the bucket number.  We know that the first
	 * column of the index tuple is the hash key.
	 */
	Assert(!a->isnull1);
	bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
								   arg->max_buckets, arg->high_mask,
								   arg->low_mask);
	Assert(!b->isnull1);
	bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
								   arg->max_buckets, arg->high_mask,
								   arg->low_mask);
	if (bucket1 > bucket2)
		return 1;
	else if (bucket1 < bucket2)
		return -1;

	/*
	 * If bucket values are equal, sort by hash values.  This allows us to
	 * insert directly onto bucket/overflow pages, where the index tuples are
	 * stored in hash order to allow fast binary search within each page.
	 */
	hash1 = DatumGetUInt32(a->datum1);
	hash2 = DatumGetUInt32(b->datum1);
	if (hash1 > hash2)
		return 1;
	else if (hash1 < hash2)
		return -1;

	/*
	 * If hash values are equal, we sort on ItemPointer.  This does not affect
	 * validity of the finished index, but it may be useful to have index
	 * scans in physical order.
	 */
	tuple1 = (IndexTuple) a->tuple;
	tuple2 = (IndexTuple) b->tuple;

	{
		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);

		if (blk1 != blk2)
			return (blk1 < blk2) ? -1 : 1;
	}
	{
		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);

		if (pos1 != pos2)
			return (pos1 < pos2) ? -1 : 1;
	}

	/* ItemPointer values should never be equal */
	Assert(false);

	return 0;
}

/*
 * Sorting for hash indexes only uses one sort key, so this shouldn't ever be
 * called.  It's only here for consistency.
 */
static int
comparetup_index_hash_tiebreak(const SortTuple *a, const SortTuple *b,
							   Tuplesortstate *state)
{
	Assert(false);

	return 0;
}

/*
 * Write an IndexTuple to tape with leading (and, for random access,
 * trailing) length words.
 */
static void
writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	IndexTuple	tuple = (IndexTuple) stup->tuple;
	unsigned int tuplen;

	tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
	LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
	LogicalTapeWrite(tape, tuple, IndexTupleSize(tuple));
	if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
		LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
}

/*
 * Read back an IndexTuple written by writetup_index and recompute the
 * first-column key value for the SortTuple.
 */
static void
readtup_index(Tuplesortstate *state, SortTuple *stup,
			  LogicalTape *tape, unsigned int len)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	TuplesortIndexArg *arg = (TuplesortIndexArg *) base->arg;
	unsigned int tuplen = len - sizeof(unsigned int);
	IndexTuple	tuple = (IndexTuple) tuplesort_readtup_alloc(state, tuplen);

	LogicalTapeReadExact(tape, tuple, tuplen);
	if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
		LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
	stup->tuple = (void *) tuple;
	/* set up first-column key value */
	stup->datum1 = index_getattr(tuple,
								 1,
								 RelationGetDescr(arg->indexRel),
								 &stup->isnull1);
}

/*
 * Routines specialized for BrinTuple case
 */

/* BRIN sorts on the block number, which is never abbreviated; just reload it. */
static void
removeabbrev_index_brin(Tuplesortstate *state, SortTuple *stups, int count)
{
	int			i;

	for (i = 0; i < count; i++)
	{
		BrinSortTuple *tuple;

		tuple = stups[i].tuple;
		stups[i].datum1 = tuple->tuple.bt_blkno;
	}
}

/* Compare BRIN sort tuples by block number alone (stored in datum1). */
static int
comparetup_index_brin(const SortTuple *a, const SortTuple *b,
					  Tuplesortstate *state)
{
	Assert(TuplesortstateGetPublic(state)->haveDatum1);

	if (DatumGetUInt32(a->datum1) > DatumGetUInt32(b->datum1))
		return 1;

	if (DatumGetUInt32(a->datum1) < DatumGetUInt32(b->datum1))
		return -1;

	/* silence compilers */
	return 0;
}

/*
 * Write the BrinTuple part of a BrinSortTuple to tape, with leading (and,
 * for random access, trailing) length words.
 */
static void
writetup_index_brin(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	BrinSortTuple *tuple = (BrinSortTuple *) stup->tuple;
	unsigned int tuplen = tuple->tuplen;

	tuplen = tuplen + sizeof(tuplen);
	LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
	LogicalTapeWrite(tape, &tuple->tuple, tuple->tuplen);
	if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
		LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
}

/*
 * Read back a BrinTuple written by writetup_index_brin, wrapping it in a
 * BrinSortTuple and restoring datum1 (the block number).
 */
static void
readtup_index_brin(Tuplesortstate *state, SortTuple *stup,
				   LogicalTape *tape, unsigned int len)
{
	BrinSortTuple *tuple;
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	unsigned int tuplen = len - sizeof(unsigned int);

	/*
	 * Allocate space for the BRIN sort tuple, which is BrinTuple with an
	 * extra length field.
	 */
	tuple = (BrinSortTuple *) tuplesort_readtup_alloc(state,
													  BRINSORTTUPLE_SIZE(tuplen));

	tuple->tuplen = tuplen;

	LogicalTapeReadExact(tape, &tuple->tuple, tuplen);
	if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
		LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
	stup->tuple = (void *) tuple;

	/* set up first-column key value, which is block number */
	stup->datum1 = tuple->tuple.bt_blkno;
}

/*
 * Routines specialized for DatumTuple case
 */

/* With abbreviation, "tuple" holds the original datum; restore it as datum1. */
static void
removeabbrev_datum(Tuplesortstate *state, SortTuple *stups, int count)
{
	int			i;

	for (i = 0; i < count; i++)
		stups[i].datum1 = PointerGetDatum(stups[i].tuple);
}

static int
comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	int			compare;

	/* Compare the leading (and only) sort key */
	compare = ApplySortComparator(a->datum1, a->isnull1,
								  b->datum1, b->isnull1,
								  base->sortKeys);
	if (compare != 0)
		return compare;

	return comparetup_datum_tiebreak(a, b, state);
}

static int
comparetup_datum_tiebreak(const SortTuple *a, const SortTuple *b,
						  Tuplesortstate *state)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	int32		compare = 0;

	/* if we have abbreviations, then "tuple" has the original value */

	if (base->sortKeys->abbrev_converter)
		compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
												PointerGetDatum(b->tuple), b->isnull1,
												base->sortKeys);

	return compare;
}

/*
 * Write a Datum to tape.  NULLs are written as a zero-length payload; byval
 * datums as sizeof(Datum) raw bytes; byref datums as their datumGetSize()
 * image.  A trailing length word is added for random-access sorts.
 */
static void
writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	TuplesortDatumArg *arg = (TuplesortDatumArg *) base->arg;
	void	   *waddr;
	unsigned int tuplen;
	unsigned int writtenlen;

	if (stup->isnull1)
	{
		waddr = NULL;
		tuplen = 0;
	}
	else if (!base->tuples)
	{
		waddr = &stup->datum1;
		tuplen = sizeof(Datum);
	}
	else
	{
		waddr = stup->tuple;
		tuplen = datumGetSize(PointerGetDatum(stup->tuple), false,
							  arg->datumTypeLen);
		Assert(tuplen != 0);
	}

	writtenlen = tuplen + sizeof(unsigned int);

	LogicalTapeWrite(tape, &writtenlen, sizeof(writtenlen));
	LogicalTapeWrite(tape, waddr, tuplen);
	if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
		LogicalTapeWrite(tape, &writtenlen, sizeof(writtenlen));
}

/*
 * Read back a Datum written by writetup_datum, reversing its three storage
 * forms (NULL, byval, byref).
 */
static void
readtup_datum(Tuplesortstate *state, SortTuple *stup,
			  LogicalTape *tape, unsigned int len)
{
	TuplesortPublic *base = TuplesortstateGetPublic(state);
	unsigned int tuplen = len - sizeof(unsigned int);

	if (tuplen == 0)
	{
		/* it's NULL */
		stup->datum1 = (Datum) 0;
		stup->isnull1 = true;
		stup->tuple = NULL;
	}
	else if (!base->tuples)
	{
		Assert(tuplen == sizeof(Datum));
		LogicalTapeReadExact(tape, &stup->datum1, tuplen);
		stup->isnull1 = false;
		stup->tuple = NULL;
	}
	else
	{
		void	   *raddr = tuplesort_readtup_alloc(state, tuplen);

		LogicalTapeReadExact(tape, raddr, tuplen);
		stup->datum1 = PointerGetDatum(raddr);
		stup->isnull1 = false;
		stup->tuple = raddr;
	}

	if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */
		LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
}