diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index ce8b674e41..625f490af8 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -717,6 +717,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; + /* Abbreviation is not supported here */ + sortKey->abbreviate = false; AssertState(sortKey->ssup_attno != 0); diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 5de2b39d10..d2856a379e 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -2301,6 +2301,12 @@ compute_scalar_stats(VacAttrStatsP stats, /* We always use the default collation for statistics */ ssup.ssup_collation = DEFAULT_COLLATION_OID; ssup.ssup_nulls_first = false; + /* + * For now, don't perform abbreviated key conversion, because full values + * are required for MCV slot generation. Supporting that optimization + * would necessitate teaching compare_scalars() to call a tie-breaker. + */ + ssup.abbreviate = false; PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup); diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 08088ea3c0..8079d97764 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -363,6 +363,10 @@ initialize_aggregates(AggState *aggstate, * We use a plain Datum sorter when there's a single input column; * otherwise sort the full tuple. (See comments for * process_ordered_aggregate_single.) + * + * In the future, we should consider forcing the + * tuplesort_begin_heap() case when the abbreviated key + * optimization can thereby be used, even when numInputs is 1. */ peraggstate->sortstate = (peraggstate->numInputs == 1) ? 
diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c index 4e200a8e34..0c814f0e72 100644 --- a/src/backend/executor/nodeMergeAppend.c +++ b/src/backend/executor/nodeMergeAppend.c @@ -137,6 +137,15 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) sortKey->ssup_nulls_first = node->nullsFirst[i]; sortKey->ssup_attno = node->sortColIdx[i]; + /* + * It isn't feasible to perform abbreviated key conversion, since + * tuples are pulled into mergestate's binary heap as needed. It would + * likely be counter-productive to convert tuples into an abbreviated + * representation as they're pulled up, so opt out of that additional + * optimization entirely. + */ + sortKey->abbreviate = false; + PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); } diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c index 2a5a7acf94..15742c574a 100644 --- a/src/backend/executor/nodeMergejoin.c +++ b/src/backend/executor/nodeMergejoin.c @@ -229,6 +229,14 @@ MJExamineQuals(List *mergeclauses, elog(ERROR, "cannot merge using non-equality operator %u", qual->opno); + /* + * sortsupport routine must know if abbreviation optimization is + * applicable in principle. It is never applicable for merge joins + * because there is no convenient opportunity to convert to alternative + * representation. + */ + clause->ssup.abbreviate = false; + /* And get the matching support or comparison function */ Assert(clause->ssup.comparator == NULL); sortfunc = get_opfamily_proc(opfamily, diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile index 949ee5e41d..fe4781a8e8 100644 --- a/src/backend/lib/Makefile +++ b/src/backend/lib/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/lib top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = ilist.o binaryheap.o pairingheap.o rbtree.o stringinfo.o +OBJS = ilist.o binaryheap.o hyperloglog.o pairingheap.o rbtree.o stringinfo.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c new file mode 100644 index 0000000000..1157e9ad76 --- /dev/null +++ b/src/backend/lib/hyperloglog.c @@ -0,0 +1,228 @@ +/*------------------------------------------------------------------------- + * + * hyperloglog.c + * HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "postgres.h" + +#include + +#include "lib/hyperloglog.h" + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +static inline uint8 rho(uint32 x, uint8 b); + +/* + * Initialize HyperLogLog track state + * + * bwidth is bit width (so register size will be 2 to the power of bwidth). + * Must be between 4 and 16 inclusive. + */ +void +initHyperLogLog(hyperLogLogState *cState, uint8 bwidth) +{ + double alpha; + + if (bwidth < 4 || bwidth > 16) + elog(ERROR, "bit width must be between 4 and 16 inclusive"); + + cState->registerWidth = bwidth; + cState->nRegisters = 1 << bwidth; + cState->arrSize = sizeof(uint8) * cState->nRegisters + 1; + + /* + * Initialize hashes array to zero, not negative infinity, per discussion + * of the coupon collector problem in the HyperLogLog paper + */ + cState->hashesArr = palloc0(cState->arrSize); + + /* + * "alpha" is a value that for each possible number of registers (m) is + * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z + * is "the indicator function" through which we finally compute E, + * estimated cardinality). 
+ */ + switch (cState->nRegisters) + { + case 16: + alpha = 0.673; + break; + case 32: + alpha = 0.697; + break; + case 64: + alpha = 0.709; + break; + default: + alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters); + } + + /* + * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog + * estimate E + */ + cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters; +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. + */ +void +addHyperLogLog(hyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << cState->registerWidth, + BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + cState->hashesArr[index] = Max(count, cState->hashesArr[index]); +} + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateHyperLogLog(hyperLogLogState *cState) +{ + double result; + double sum = 0.0; + int i; + + for (i = 0; i < cState->nRegisters; i++) + { + sum += 1.0 / pow(2.0, cState->hashesArr[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = cState->alphaMM / sum; + + if (result <= (5.0 / 2.0) * cState->nRegisters) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < cState->nRegisters; i++) + { + if (cState->hashesArr[i] == 0) + zero_count++; + } + + if (zero_count != 0) + result = cState->nRegisters * log((double) cState->nRegisters / + zero_count); + } + 
+ else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + +/* + * Merges the registers of one HyperLogLog state (oState) into another + * (cState), so that cState then estimates the cardinality of their union. + * + * The number of registers in each must match. + */ +void +mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState) +{ + int r; + + if (cState->nRegisters != oState->nRegisters) + elog(ERROR, "number of registers mismatch: %zu != %zu", + cState->nRegisters, oState->nRegisters); + + for (r = 0; r < cState->nRegisters; ++r) + { + cState->hashesArr[r] = Max(cState->hashesArr[r], oState->hashesArr[r]); + } +} + + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering first 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + while (j <= b && !(x & 0x80000000)) + { + j++; + x <<= 1; + } + + return j; +} diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c index 869a83b185..f9a5f7f93f 100644 --- a/src/backend/utils/adt/orderedsetaggs.c +++ b/src/backend/utils/adt/orderedsetaggs.c @@ -266,7 +266,13 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples) osastate->qstate = qstate; osastate->gcontext = gcontext; - /* Initialize tuplesort object */ + /* + * Initialize tuplesort object. + * + * In the future, we should consider forcing the tuplesort_begin_heap() + * case when the abbreviated key optimization can thereby be used, even + * when !use_tuples. 
+ */ if (use_tuples) osastate->sortstate = tuplesort_begin_heap(qstate->tupdesc, qstate->numSortCols, diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index e95ed88366..71d47380ac 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -17,9 +17,11 @@ #include #include +#include "access/hash.h" #include "access/tuptoaster.h" #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "lib/hyperloglog.h" #include "libpq/md5.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -32,6 +34,9 @@ #include "utils/pg_locale.h" #include "utils/sortsupport.h" +#ifdef DEBUG_ABBREV_KEYS +#define DEBUG_elog_output DEBUG1 +#endif /* GUC variable */ int bytea_output = BYTEA_OUTPUT_HEX; @@ -54,10 +59,12 @@ typedef struct typedef struct { - char *buf1; /* 1st string */ - char *buf2; /* 2nd string */ + char *buf1; /* 1st string, or abbreviation original string buf */ + char *buf2; /* 2nd string, or abbreviation strxfrm() buf */ int buflen1; int buflen2; + hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ + hyperLogLogState full_card; /* Full key cardinality state */ #ifdef HAVE_LOCALE_T pg_locale_t locale; #endif @@ -78,6 +85,9 @@ typedef struct static void btsortsupport_worker(SortSupport ssup, Oid collid); static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup); static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup); +static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup); +static Datum bttext_abbrev_convert(Datum original, SortSupport ssup); +static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup); static int32 text_length(Datum str); static text *text_catenate(text *t1, text *t2); static text *text_substring(Datum str, @@ -1735,27 +1745,54 @@ btsortsupport_worker(SortSupport ssup, Oid collid) { TextSortSupport *tss; - /* - * If LC_COLLATE = C, we can make things quite a bit faster by using - * memcmp() rather than strcoll(). 
To minimize the per-comparison - * overhead, we make this decision just once for the whole sort. - */ - if (lc_collate_is_c(collid)) - { - ssup->comparator = bttextfastcmp_c; - return; - } - /* * WIN32 requires complex hacks when the database encoding is UTF-8 (except * when using the "C" collation). For now, we don't optimize that case. */ #ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8) + if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid)) return; #endif /* + * On platforms where the abbreviated key for text optimization might have + * bad worst case performance, it may be useful to avoid it entirely by + * disabling it at compile time. Having only 4 byte datums could make + * worst-case performance drastically more likely, for example. Moreover, + * Darwin's strxfrm() implementation is known to not effectively + * concentrate a significant amount of entropy from the original string in + * earlier transformed blobs. It's possible that other supported platforms + * are similarly encumbered. + * + * Any reasonable implementation will pack primary weights into the start + * of returned blobs. The canonical algorithm's implementation is + * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION + * ALGORITHM"), section 4, "Main algorithm". Section 4.3, "Form Sort Key" + * is of particular interest: + * + * http://www.unicode.org/reports/tr10/#Step_3 + * + * The collation algorithm standard goes on to state: + * + * "By default, the algorithm makes use of three fully-customizable levels. + * For the Latin script, these levels correspond roughly to: + * + * alphabetic ordering + * + * diacritic ordering + * + * case ordering. + * + * A final level may be used for tie-breaking between strings not otherwise + * distinguished." + * + * It is generally expected that most non-equal keys will have their + * comparisons resolved at the primary level. 
If enough comparisons can be + * resolved with just 4 or 8 byte abbreviated keys, this optimization is + * very effective (although if there are many tie-breakers that largely + * only perform cheap memcmp() calls, that is also much faster than the + * unoptimized case - see bttext_abbrev_abort()). + * * We may need a collation-sensitive comparison. To make things faster, * we'll figure out the collation based on the locale id and cache the * result. Also, since strxfrm()/strcoll() require NUL-terminated inputs, @@ -1788,13 +1825,47 @@ btsortsupport_worker(SortSupport ssup, Oid collid) #endif } - tss->buf1 = palloc(TEXTBUFLEN); - tss->buflen1 = TEXTBUFLEN; - tss->buf2 = palloc(TEXTBUFLEN); - tss->buflen2 = TEXTBUFLEN; + /* + * If LC_COLLATE = C, we can make things quite a bit faster by using + * memcmp() rather than strcoll(). To minimize the per-comparison + * overhead, we make this decision just once for the whole sort. + * + * There is no reason to not at least perform fmgr elision on builds where + * abbreviation is disabled. + */ + if (lc_collate_is_c(collid)) + ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_c; + else + ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_locale; - ssup->ssup_extra = tss; - ssup->comparator = bttextfastcmp_locale; + if (!lc_collate_is_c(collid) || ssup->abbreviate) + { + /* + * Abbreviated case requires temp buffers for strxfrm() copying. + * bttextfastcmp_locale() also uses these buffers (even if abbreviation + * isn't used), while bttextfastcmp_c() does not. 
+ */ + tss->buf1 = palloc(TEXTBUFLEN); + tss->buflen1 = TEXTBUFLEN; + tss->buf2 = palloc(TEXTBUFLEN); + tss->buflen2 = TEXTBUFLEN; + ssup->ssup_extra = tss; + } + + if (!ssup->abbreviate) + return; + + initHyperLogLog(&tss->abbr_card, 10); + initHyperLogLog(&tss->full_card, 10); + + /* + * Change comparator to be abbreviation-based -- abbreviated version will + * probably ultimately be used during sorting proper, but core code may + * switch back to authoritative comparator should abbreviation be aborted + */ + ssup->comparator = bttextcmp_abbrev; + ssup->abbrev_converter = bttext_abbrev_convert; + ssup->abbrev_abort = bttext_abbrev_abort; } /* @@ -1903,6 +1974,225 @@ done: return result; } +/* + * Abbreviated key comparison func + */ +static int +bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup) +{ + char *a = (char *) &x; + char *b = (char *) &y; + int result; + + result = memcmp(a, b, sizeof(Datum)); + + /* + * When result = 0, the core system will call bttextfastcmp_c() or + * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm() + * blobs cannot indicate *equality* authoritatively, for the same reason + * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp(). + */ + return result; +} + +/* + * Conversion routine for sortsupport. Converts original text to abbreviated + * key representation. Our encoding strategy is simple -- pack the leading + * sizeof(Datum) bytes of a strxfrm() blob into a Datum. + */ +static Datum +bttext_abbrev_convert(Datum original, SortSupport ssup) +{ + TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + text *authoritative = DatumGetTextPP(original); + + /* working state */ + Datum res; + char *pres; + int len; + Size bsize; + uint32 hash; + + /* + * Abbreviated key representation is a pass-by-value Datum that is treated + * as a char array by the specialized comparator bttextcmp_abbrev(). 
+ */ + pres = (char *) &res; + /* memset(), so any non-overwritten bytes are NUL */ + memset(pres, 0, sizeof(Datum)); + len = VARSIZE_ANY_EXHDR(authoritative); + + /* By convention, we use buffer 1 to store and NUL-terminate text */ + if (len >= tss->buflen1) + { + pfree(tss->buf1); + tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize)); + tss->buf1 = palloc(tss->buflen1); + } + + /* Just like strcoll(), strxfrm() expects a NUL-terminated string */ + memcpy(tss->buf1, VARDATA_ANY(authoritative), len); + tss->buf1[len] = '\0'; + + /* Don't leak memory here */ + if (PointerGetDatum(authoritative) != original) + pfree(authoritative); + +retry: + + /* + * There is no special handling of the C locale here, unlike with + * varstr_cmp(). strxfrm() is used indifferently. + */ +#ifdef HAVE_LOCALE_T + if (tss->locale) + bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale); + else +#endif + bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2); + + if (bsize >= tss->buflen2) + { + /* + * The C standard states that the contents of the buffer is now + * unspecified. Grow buffer, and retry. + */ + pfree(tss->buf2); + tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize)); + tss->buf2 = palloc(tss->buflen2); + goto retry; + } + + /* + * Maintain approximate cardinality of both abbreviated keys and original, + * authoritative keys using HyperLogLog. Used as cheap insurance against + * the worst case, where we do many string transformations for no saving in + * full strcoll()-based comparisons. These statistics are used by + * bttext_abbrev_abort(). + * + * First, Hash key proper, or a significant fraction of it. Mix in length + * in order to compensate for cases where differences are past + * CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. 
+ */ + hash = DatumGetUInt32(hash_any((unsigned char *) tss->buf1, Min(len, PG_CACHE_LINE_SIZE))); + + if (len > PG_CACHE_LINE_SIZE) + hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + + addHyperLogLog(&tss->full_card, hash); + + memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize)); + + /* Hash abbreviated key */ +#if SIZEOF_DATUM == 8 + { + uint32 lohalf, + hihalf; + + lohalf = (uint32) res; + hihalf = (uint32) (res >> 32); + hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf)); + } +#else /* SIZEOF_DATUM != 8 */ + hash = DatumGetUInt32(hash_uint32((uint32) res)); +#endif + + addHyperLogLog(&tss->abbr_card, hash); + + /* + * Every Datum byte is always compared. This is safe because the strxfrm() + * blob is itself NUL terminated, leaving no danger of misinterpreting any + * NUL bytes not intended to be interpreted as logically representing + * termination. + */ + return res; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization, using + * heuristic rules. Returns value indicating if the abbreviation optimization + * should be aborted, based on its projected effectiveness. + */ +static bool +bttext_abbrev_abort(int memtupcount, SortSupport ssup) +{ + TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + double abbrev_distinct, key_distinct; + + Assert(ssup->abbreviate); + + /* Have a little patience */ + if (memtupcount < 20) + return false; + + abbrev_distinct = estimateHyperLogLog(&tss->abbr_card); + key_distinct = estimateHyperLogLog(&tss->full_card); + + /* + * Clamp cardinality estimates to at least one distinct value. While NULLs + * are generally disregarded, if only NULL values were seen so far, that + * might misrepresent costs if we failed to clamp. + */ + if (abbrev_distinct <= 1.0) + abbrev_distinct = 1.0; + + if (key_distinct <= 1.0) + key_distinct = 1.0; + + /* + * In the worst case all abbreviated keys are identical, while at the same + * time there are differences within full key strings not captured in + * abbreviations. 
+ */ +#ifdef DEBUG_ABBREV_KEYS + { + double norm_abbrev_card = abbrev_distinct / (double) memtupcount; + + elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card); + } +#endif + + /* + * If the number of distinct abbreviated keys approximately matches the + * number of distinct authoritative original keys, that's reason enough to + * proceed. We can win even with a very low cardinality set if most + * tie-breakers only memcmp(). This is by far the most important + * consideration. + * + * While comparisons that are resolved at the abbreviated key level are + * considerably cheaper than tie-breakers resolved with memcmp(), both of + * those two outcomes are so much cheaper than a full strcoll() once + * sorting is underway that it doesn't seem worth it to weigh abbreviated + * cardinality against the overall size of the set in order to more + * accurately model costs. Assume that an abbreviated comparison, and an + * abbreviated comparison with a cheap memcmp()-based authoritative + * resolution are equivalent. + */ + if (abbrev_distinct > key_distinct * 0.05) + return false; + + /* + * Abort abbreviation strategy. + * + * The worst case, where all abbreviated keys are identical while all + * original strings differ will typically only see a regression of about + * 10% in execution time for small to medium sized lists of strings. + * Whereas on modern CPUs where cache stalls are the dominant cost, we can + * often expect very large improvements, particularly with sets of strings + * of moderately high to high abbreviated cardinality. There is little to + * lose but much to gain, which our strategy reflects. + */ +#ifdef DEBUG_ABBREV_KEYS + elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. 
abbrev_distinct: %f, key_distinct: %f", + memtupcount, abbrev_distinct, key_distinct); + /* Actually abort only when debugging is disabled */ + return false; +#endif + + return true; +} + Datum text_larger(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 4aa3972841..6d3aa889bc 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -150,7 +150,10 @@ bool optimize_bounded_sort = true; * When sorting single Datums, the data value is represented directly by * datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false, * then datum1 points to a separately palloc'd data value that is also pointed - * to by the "tuple" pointer; otherwise "tuple" is NULL. + * to by the "tuple" pointer; otherwise "tuple" is NULL. There is one special + * case: when the sort support infrastructure provides an "abbreviated key" + * representation, where the key is (typically) a pass by value proxy for a + * pass by reference type. * * While building initial runs, tupindex holds the tuple's run number. During * merge passes, we re-use it to hold the input tape number that each tuple in @@ -346,6 +349,14 @@ struct Tuplesortstate */ SortSupport onlyKey; + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the Datum sort case and + * hash index case). Tracks the intervals at which the optimization's + * effectiveness is tested. + */ + int64 abbrevNext; /* Tuple # at which to next check applicability */ + /* * These variables are specific to the CLUSTER case; they are set by * tuplesort_begin_cluster. 
@@ -442,6 +453,7 @@ struct Tuplesortstate static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess); static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); static void inittapes(Tuplesortstate *state); static void selectnewtape(Tuplesortstate *state); static void mergeruns(Tuplesortstate *state); @@ -619,6 +631,7 @@ tuplesort_begin_heap(TupleDesc tupDesc, state->readtup = readtup_heap; state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; /* Prepare SortSupport data for each column */ state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); @@ -634,11 +647,19 @@ tuplesort_begin_heap(TupleDesc tupDesc, sortKey->ssup_collation = sortCollations[i]; sortKey->ssup_nulls_first = nullsFirstFlags[i]; sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); } - if (nkeys == 1) + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization is + * only of value to pass-by-value types anyway, whereas abbreviated keys + * are typically only of value to pass-by-reference types. 
+ */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) state->onlyKey = state->sortKeys; MemoryContextSwitchTo(oldcontext); @@ -680,6 +701,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, state->copytup = copytup_cluster; state->writetup = writetup_cluster; state->readtup = readtup_cluster; + state->abbrevNext = 10; state->indexInfo = BuildIndexInfo(indexRel); @@ -719,6 +741,8 @@ tuplesort_begin_cluster(TupleDesc tupDesc, sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); AssertState(sortKey->ssup_attno != 0); @@ -768,6 +792,7 @@ tuplesort_begin_index_btree(Relation heapRel, state->copytup = copytup_index; state->writetup = writetup_index; state->readtup = readtup_index; + state->abbrevNext = 10; state->heapRel = heapRel; state->indexRel = indexRel; @@ -791,6 +816,8 @@ tuplesort_begin_index_btree(Relation heapRel, sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); AssertState(sortKey->ssup_attno != 0); @@ -883,6 +910,13 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, state->onlyKey->ssup_cxt = CurrentMemoryContext; state->onlyKey->ssup_collation = sortCollation; state->onlyKey->ssup_nulls_first = nullsFirstFlag; + /* + * Conversion to abbreviated representation infeasible in the Datum case. + * It must be possible to subsequently fetch original datum values within + * tuplesort_getdatum(), which would require special-case preservation of + * original values. 
+ */ + state->onlyKey->abbreviate = false; PrepareSortSupportFromOrderingOp(sortOperator, state->onlyKey); @@ -928,6 +962,19 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound) state->bounded = true; state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; } /* @@ -1186,15 +1233,63 @@ tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, { MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); SortTuple stup; + Datum original; + IndexTuple tuple; stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); - ((IndexTuple) stup.tuple)->t_tid = *self; + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; USEMEM(state, GetMemoryChunkSpace(stup.tuple)); /* set up first-column key value */ - stup.datum1 = index_getattr((IndexTuple) stup.tuple, - 1, - RelationGetDescr(state->indexRel), - &stup.isnull1); + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + if (!state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to "void" representation (to be + * consistent). 
+ */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just handled). + * Note that we rely on all tuples copied so far actually being + * contained within memtuples array. + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + puttuple_common(state, &stup); MemoryContextSwitchTo(oldcontext); @@ -1359,6 +1454,47 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple) } } +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may + * indicate that abbreviation should not proceed. 
+ */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - callers must store original (non-abbreviated) datums */ + return true; + } + + return false; +} + /* * All tuples have been provided; finish the sort. */ @@ -2853,6 +2989,12 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) TupleDesc tupDesc; int nkey; int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + /* Compare the leading sort key */ compare = ApplySortComparator(a->datum1, a->isnull1, @@ -2867,14 +3009,25 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + sortKey++; for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) { - AttrNumber attno = sortKey->ssup_attno; - Datum datum1, - datum2; - bool isnull1, - isnull2; + attno = sortKey->ssup_attno; datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); @@ -2897,6 +3050,7 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) * MinimalTuple using the exported 
interface for that. */ TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; MinimalTuple tuple; HeapTupleData htup; @@ -2907,10 +3061,58 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); - stup->datum1 = heap_getattr(&htup, - state->sortKeys[0].ssup_attno, - state->tupDesc, - &stup->isnull1); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to "void" representation (to be + * consistent). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just handled). + * Note that we rely on all tuples copied so far actually being + * contained within memtuples array. 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } } static void @@ -2980,13 +3182,35 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b, TupleDesc tupDesc; int nkey; int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_KeyAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; /* Compare the leading sort key, if it's simple */ - if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + if (leading != 0) { compare = ApplySortComparator(a->datum1, a->isnull1, b->datum1, b->isnull1, sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } if (compare != 0 || state->nKeys == 1) return compare; /* Compare additional columns the hard way */ @@ -2999,22 +3223,13 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b, nkey = 0; } - /* Compare additional sort keys */ - ltup = (HeapTuple) a->tuple; - rtup = (HeapTuple) b->tuple; - if (state->indexInfo->ii_Expressions == NULL) { /* If not expression index, just compare the proper heap attrs */ - tupDesc = state->tupDesc; for (; nkey < state->nKeys; nkey++, sortKey++) { AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; - Datum datum1, - datum2; - bool isnull1, - isnull2; datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); @@ -3072,17 
+3287,67 @@ static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) { HeapTuple tuple = (HeapTuple) tup; + Datum original; /* copy the tuple into sort storage */ tuple = heap_copytuple(tuple); stup->tuple = (void *) tuple; USEMEM(state, GetMemoryChunkSpace(tuple)); - /* set up first-column key value, if it's a simple column */ - if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) - stup->datum1 = heap_getattr(tuple, - state->indexInfo->ii_KeyAttrNumbers[0], - state->tupDesc, - &stup->isnull1); + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_KeyAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to "void" representation (to be + * consistent). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just handled). + * Note that we rely on all tuples copied so far actually being + * contained within memtuples array. 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } } static void @@ -3162,6 +3427,11 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b, bool equal_hasnull = false; int nkey; int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + /* Compare the leading sort key */ compare = ApplySortComparator(a->datum1, a->isnull1, @@ -3170,23 +3440,31 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b, if (compare != 0) return compare; - /* they are equal, so we only need to examine one null flag */ - if (a->isnull1) - equal_hasnull = true; - /* Compare additional sort keys */ tuple1 = (IndexTuple) a->tuple; tuple2 = (IndexTuple) b->tuple; keysz = state->nKeys; tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + sortKey++; for (nkey = 2; nkey <= keysz; nkey++, sortKey++) { - Datum datum1, - datum2; - bool isnull1, - isnull2; - datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); @@ -3313,6 +3591,7 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) IndexTuple tuple = (IndexTuple) tup; unsigned int tuplen = IndexTupleSize(tuple); IndexTuple newtuple; + Datum original; /* copy the tuple into sort storage */ newtuple = (IndexTuple) palloc(tuplen); @@ -3320,10 +3599,54 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) USEMEM(state, GetMemoryChunkSpace(newtuple));
stup->tuple = (void *) newtuple; /* set up first-column key value */ - stup->datum1 = index_getattr(newtuple, - 1, - RelationGetDescr(state->indexRel), - &stup->isnull1); + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to "void" representation (to be + * consistent). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just handled). + * Note that we rely on all tuples copied so far actually being + * contained within memtuples array. + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } } static void diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h new file mode 100644 index 0000000000..a6cbffc4c3 --- /dev/null +++ b/src/include/lib/hyperloglog.h @@ -0,0 +1,67 @@ +/* + * hyperloglog.h + * + * A simple HyperLogLog cardinality estimator implementation + * + * Portions Copyright (c) 2014, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. The copyright terms of Ohno's + * original version (the MIT license) follow.
+ * + * src/include/lib/hyperloglog.h + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HYPERLOGLOG_H +#define HYPERLOGLOG_H + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. 
+ * + * hyperLogLogState + * + * registerWidth register width, in bits ("k") + * nRegisters number of registers + * alphaMM alpha * m ^ 2 (see initHyperLogLog()) + * hashesArr array of hashes + * arrSize size of hashesArr + */ +typedef struct hyperLogLogState +{ + uint8 registerWidth; + Size nRegisters; + double alphaMM; + uint8 *hashesArr; + Size arrSize; +} hyperLogLogState; + +extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth); +extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash); +extern double estimateHyperLogLog(hyperLogLogState *cState); +extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState); + +#endif /* HYPERLOGLOG_H */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index cb35697ad0..5cfc0ae1e8 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -214,13 +214,13 @@ #endif /* - * Assumed cache line size. This doesn't affect correctness, but can be - * used for low-level optimizations. Currently, this is only used to pad - * some data structures in xlog.c, to ensure that highly-contended fields - * are on different cache lines. Too small a value can hurt performance due - * to false sharing, while the only downside of too large a value is a few - * bytes of wasted memory. The default is 128, which should be large enough - * for all supported platforms. + * Assumed cache line size. This doesn't affect correctness, but can be used + * for low-level optimizations. Currently, this is used to pad some data + * structures in xlog.c, to ensure that highly-contended fields are on + * different cache lines. Too small a value can hurt performance due to false + * sharing, while the only downside of too large a value is a few bytes of + * wasted memory. The default is 128, which should be large enough for all + * supported platforms. 
*/ #define PG_CACHE_LINE_SIZE 128 diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h index f379bb4702..62fedfaaad 100644 --- a/src/include/utils/sortsupport.h +++ b/src/include/utils/sortsupport.h @@ -21,7 +21,12 @@ * required to provide all of them. The BTSORTSUPPORT function should * simply not set any function pointers for mechanisms it doesn't support. * Opclasses that provide BTSORTSUPPORT and don't provide a comparator - * function will have a shim set up by sort support automatically. + * function will have a shim set up by sort support automatically. However, + * opclasses that support the optional additional abbreviated key capability + * must always provide an authoritative comparator used to tie-break + * inconclusive abbreviated comparisons and also used when aborting + * abbreviation. Furthermore, a converter and abort/costing function must be + * provided. * * All sort support functions will be passed the address of the * SortSupportData struct when called, so they can use it to store @@ -93,12 +98,96 @@ typedef struct SortSupportData * than, equal to, or greater than y. Note that x and y are guaranteed * not null, and there is no way to return null either. Do not return * INT_MIN, as callers are allowed to negate the result before using it. + * + * This may be either the authoritative comparator, or the abbreviated + * comparator. Core code may switch this over the initial preference of an + * opclass support function despite originally indicating abbreviation was + * applicable, by assigning the authoritative comparator back. */ int (*comparator) (Datum x, Datum y, SortSupport ssup); /* - * Additional sort-acceleration functions might be added here later. + * "Abbreviated key" infrastructure follows. 
+ * + * All callbacks must be set by sortsupport opclasses that make use of this + * optional additional infrastructure (unless for whatever reasons the + * opclass doesn't proceed with abbreviation, in which case + * abbrev_converter must not be set). + * + * This allows opclass authors to supply a conversion routine, used to + * create an alternative representation of the underlying type (an + * "abbreviated key"). Typically, this representation is an ad-hoc, + * pass-by-value Datum format that only the opclass has knowledge of. An + * alternative comparator, used only with this alternative representation + * must also be provided (which is assigned to "comparator"). This + * representation is a simple approximation of the original Datum. It must + * be possible to compare datums of this representation with each other + * using the supplied alternative comparator, and have any non-zero return + * value be a reliable proxy for what a proper comparison would indicate. + * Returning zero from the alternative comparator does not indicate + * equality, as with a conventional support routine 1, though -- it + * indicates that it wasn't possible to determine how the two abbreviated + * values compared. A proper comparison, using "abbrev_full_comparator"/ + * ApplySortAbbrevFullComparator() is therefore required. In many cases + * this results in most or all comparisons only using the cheap alternative + * comparison func, which is typically implemented as code that compiles to + * just a few CPU instructions. CPU cache miss penalties are expensive; to + * get good overall performance, sort infrastructure must heavily weigh + * cache performance. + * + * Opclass authors must consider the final cardinality of abbreviated keys + * when devising an encoding scheme. It's possible for a strategy to work + * better than an alternative strategy with one usage pattern, while the + * reverse might be true for another usage pattern. All of these factors + * must be considered. 
*/ + + /* + * "abbreviate" concerns whether or not the abbreviated key optimization is + * applicable in principle (that is, the sortsupport routine needs to know + * if it's dealing with a key where an abbreviated representation can + * usefully be packed together. Conventionally, this is the leading + * attribute key). Note, however, that in order to determine that + * abbreviation is not in play, the core code always checks whether or not + * the opclass has set abbrev_converter. This is a one way, one time + * message to the opclass. + */ + bool abbreviate; + + /* + * Converter to abbreviated format, from original representation. Core + * code uses this callback to convert from a pass-by-reference "original" + * Datum to a pass-by-value abbreviated key Datum. Note that original is + * guaranteed NOT NULL, because it doesn't make sense to factor NULLness + * into ad-hoc cost model. + * + * abbrev_converter is tested to see if abbreviation is in play. Core code + * may set it to NULL to indicate abbreviation should not be used (which is + * something sortsupport routines need not concern themselves with). + * However, sortsupport routines must not set it when it is immediately + * established that abbreviation should not proceed (e.g., for !abbreviate + * calls, or platform-specific impediments to using abbreviation). + */ + Datum (*abbrev_converter) (Datum original, SortSupport ssup); + + /* + * abbrev_abort callback allows clients to verify that the current strategy + * is working out, using a sortsupport routine defined ad-hoc cost model. + * If there are a lot of duplicate abbreviated keys in practice, it's useful + * to be able to abandon the strategy before paying too high a cost in + * conversion (perhaps certain opclass-specific adaptations are useful + * too).
+ */ + bool (*abbrev_abort) (int memtupcount, SortSupport ssup); + + /* + * Full, authoritative comparator for key that an abbreviated + * representation was generated for, used when an abbreviated comparison + * was inconclusive (by calling ApplySortAbbrevFullComparator()), or used to + * replace "comparator" when core system ultimately decides against + * abbreviation. + */ + int (*abbrev_full_comparator) (Datum x, Datum y, SortSupport ssup); } SortSupportData; @@ -110,6 +199,9 @@ typedef struct SortSupportData extern int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup); +extern int ApplySortAbbrevFullComparator(Datum datum1, bool isNull1, + Datum datum2, bool isNull2, + SortSupport ssup); #endif /* !PG_USE_INLINE */ #if defined(PG_USE_INLINE) || defined(SORTSUPPORT_INCLUDE_DEFINITIONS) /* @@ -148,6 +240,44 @@ ApplySortComparator(Datum datum1, bool isNull1, return compare; } + +/* + * Apply a sort comparator function and return a 3-way comparison using full, + * authoritative comparator. This takes care of handling reverse-sort and + * NULLs-ordering properly. + */ +STATIC_IF_INLINE int +ApplySortAbbrevFullComparator(Datum datum1, bool isNull1, + Datum datum2, bool isNull2, + SortSupport ssup) +{ + int compare; + + if (isNull1) + { + if (isNull2) + compare = 0; /* NULL "=" NULL */ + else if (ssup->ssup_nulls_first) + compare = -1; /* NULL "<" NOT_NULL */ + else + compare = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull2) + { + if (ssup->ssup_nulls_first) + compare = 1; /* NOT_NULL ">" NULL */ + else + compare = -1; /* NOT_NULL "<" NULL */ + } + else + { + compare = (*ssup->abbrev_full_comparator) (datum1, datum2, ssup); + if (ssup->ssup_reverse) + compare = -compare; + } + + return compare; +} #endif /*-- PG_USE_INLINE || SORTSUPPORT_INCLUDE_DEFINITIONS */ /* Other functions in utils/sort/sortsupport.c */