diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 0c4fcbe0d3..cd339931be 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -29,13 +29,16 @@ #include "access/hash.h" #include "catalog/pg_type.h" #include "funcapi.h" +#include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "utils/array.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/int8.h" #include "utils/numeric.h" +#include "utils/sortsupport.h" /* ---------- * Uncomment the following to enable compilation of dump_numeric() @@ -57,6 +60,12 @@ * are easy. Also, it's actually more efficient if NBASE is rather less than * sqrt(INT_MAX), so that there is "headroom" for mul_var and div_var_fast to * postpone processing carries. + * + * Values of NBASE other than 10000 are considered of historical interest only + * and are no longer supported in any sense; no mechanism exists for the client + * to discover the base, so every client supporting binary mode expects the + * base-10000 format. If you plan to change this, also note the numeric + * abbreviation code, which assumes NBASE=10000. * ---------- */ @@ -274,6 +283,29 @@ typedef struct } generate_series_numeric_fctx; +/* ---------- + * Sort support. + * ---------- + */ +typedef struct +{ + void *buf; /* buffer for short varlenas */ + int64 input_count; /* number of non-null values seen */ + bool estimating; /* true if estimating cardinality */ + + hyperLogLogState abbr_card; /* cardinality estimator */ +} NumericSortSupport; + +#define NUMERIC_ABBREV_BITS (SIZEOF_DATUM * BITS_PER_BYTE) +#ifdef SIZEOF_DATUM == 8 +#define DatumGetNumericAbbrev(d) ((int64) d) +#define NUMERIC_ABBREV_NAN Int64GetDatum(PG_INT64_MIN) +#else +#define DatumGetNumericAbbrev(d) ((int32) d) +#define NUMERIC_ABBREV_NAN Int32GetDatum(PG_INT32_MIN) +#endif + + /* ---------- * Some preinitialized constants * ---------- @@ -409,6 +441,13 @@ static void int128_to_numericvar(int128 val, NumericVar *var); static double numeric_to_double_no_overflow(Numeric num); static double numericvar_to_double_no_overflow(NumericVar *var); +static Datum numeric_abbrev_convert(Datum original_datum, SortSupport ssup); +static bool numeric_abbrev_abort(int memtupcount, SortSupport ssup); +static int numeric_fast_cmp(Datum x, Datum y, SortSupport ssup); +static int numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup); + +static Datum numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss); + static int cmp_numerics(Numeric num1, Numeric num2); static int cmp_var(NumericVar *var1, NumericVar *var2); static int cmp_var_common(const NumericDigit *var1digits, int var1ndigits, @@ -1507,9 +1546,428 @@ compute_bucket(Numeric operand, Numeric bound1, Numeric bound2, * Note: btree indexes need these routines not to leak memory; therefore, * be careful to free working copies of toasted datums. Most places don't * need to be so careful. + * + * Sort support: + * + * We implement the sortsupport strategy routine in order to get the benefit of + * abbreviation. The ordinary numeric comparison can be quite slow as a result + * of palloc/pfree cycles (due to detoasting packed values for alignment); + * while this could be worked on itself, the abbreviation strategy gives more + * speedup in many common cases. + * + * Two different representations are used for the abbreviated form, one in + * int32 and one in int64, whichever fits into a by-value Datum. In both cases + * the representation is negated relative to the original value, because we use + * the largest negative value for NaN, which sorts higher than other values. We + * convert the absolute value of the numeric to a 31-bit or 63-bit positive + * value, and then negate it if the original number was positive. + * + * We abort the abbreviation process if the abbreviation cardinality is below + * 0.01% of the row count (1 per 10k non-null rows). The actual break-even + * point is somewhat below that, perhaps 1 per 30k (at 1 per 100k there's a + * very small penalty), but we don't want to build up too many abbreviated + * values before first testing for abort, so we take the slightly pessimistic + * number. We make no attempt to estimate the cardinality of the real values, + * since it plays no part in the cost model here (if the abbreviation is equal, + * the cost of comparing equal and unequal underlying values is comparable). + * We discontinue even checking for abort (saving us the hashing overhead) if + * the estimated cardinality gets to 100k; that would be enough to support many + * billions of rows while doing no worse than breaking even. + * * ---------------------------------------------------------------------- */ +/* + * Sort support strategy routine. + */ +Datum +numeric_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = numeric_fast_cmp; + + if (ssup->abbreviate) + { + NumericSortSupport *nss; + MemoryContext oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + nss = palloc(sizeof(NumericSortSupport)); + + /* + * palloc a buffer for handling unaligned packed values in addition to + * the support struct + */ + nss->buf = palloc(VARATT_SHORT_MAX + VARHDRSZ + 1); + + nss->input_count = 0; + nss->estimating = true; + initHyperLogLog(&nss->abbr_card, 10); + + ssup->ssup_extra = nss; + + ssup->abbrev_full_comparator = ssup->comparator; + ssup->comparator = numeric_cmp_abbrev; + ssup->abbrev_converter = numeric_abbrev_convert; + ssup->abbrev_abort = numeric_abbrev_abort; + + MemoryContextSwitchTo(oldcontext); + } + + PG_RETURN_VOID(); +} + +/* + * Abbreviate a numeric datum, handling NaNs and detoasting + * (must not leak memory!) + */ +static Datum +numeric_abbrev_convert(Datum original_datum, SortSupport ssup) +{ + NumericSortSupport *nss = ssup->ssup_extra; + void *original_varatt = PG_DETOAST_DATUM_PACKED(original_datum); + Numeric value; + Datum result; + + nss->input_count += 1; + + /* + * This is to handle packed datums without needing a palloc/pfree cycle; + * we keep and reuse a buffer large enough to handle any short datum. + */ + if (VARATT_IS_SHORT(original_varatt)) + { + void *buf = nss->buf; + Size sz = VARSIZE_SHORT(original_varatt) - VARHDRSZ_SHORT; + + Assert(sz <= VARATT_SHORT_MAX - VARHDRSZ_SHORT); + + SET_VARSIZE(buf, VARHDRSZ + sz); + memcpy(VARDATA(buf), VARDATA_SHORT(original_varatt), sz); + + value = (Numeric) buf; + } + else + value = (Numeric) original_varatt; + + if (NUMERIC_IS_NAN(value)) + { + result = NUMERIC_ABBREV_NAN; + } + else + { + NumericVar var; + + init_var_from_num(value, &var); + + result = numeric_abbrev_convert_var(&var, nss); + } + + /* should happen only for external/compressed toasts */ + if ((Pointer) original_varatt != DatumGetPointer(original_datum)) + pfree(original_varatt); + + return result; +} + +/* + * Consider whether to abort abbreviation. + * + * We pay no attention to the cardinality of the non-abbreviated data. There is + * no reason to do so: unlike text, we have no fast check for equal values, so + * we pay the full overhead whenever the abbreviations are equal regardless of + * whether the underlying values are also equal. + */ +static bool +numeric_abbrev_abort(int memtupcount, SortSupport ssup) +{ + NumericSortSupport *nss = ssup->ssup_extra; + double abbr_card; + + if (memtupcount < 10000 || nss->input_count < 10000 || !nss->estimating) + return false; + + abbr_card = estimateHyperLogLog(&nss->abbr_card); + + /* + * If we have >100k distinct values, then even if we were sorting many + * billion rows we'd likely still break even, and the penalty of undoing + * that many rows of abbrevs would probably not be worth it. Stop even + * counting at that point. + */ + if (abbr_card > 100000.0) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "numeric_abbrev: estimation ends at cardinality %f" + " after " INT64_FORMAT " values (%d rows)", + abbr_card, nss->input_count, memtupcount); +#endif + nss->estimating = false; + return false; + } + + /* + * Target minimum cardinality is 1 per ~10k of non-null inputs. (The + * break even point is somewhere between one per 100k rows, where + * abbreviation has a very slight penalty, and 1 per 10k where it wins by + * a measurable percentage.) We use the relatively pessimistic 10k + * threshold, and add a 0.5 row fudge factor, because it allows us to + * abort earlier on genuinely pathological data where we've had exactly + * one abbreviated value in the first 10k (non-null) rows. + */ + if (abbr_card < nss->input_count / 10000.0 + 0.5) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "numeric_abbrev: aborting abbreviation at cardinality %f" + " below threshold %f after " INT64_FORMAT " values (%d rows)", + abbr_card, nss->input_count / 10000.0 + 0.5, + nss->input_count, memtupcount); +#endif + return true; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "numeric_abbrev: cardinality %f" + " after " INT64_FORMAT " values (%d rows)", + abbr_card, nss->input_count, memtupcount); +#endif + + return false; +} + +/* + * Non-fmgr interface to the comparison routine to allow sortsupport to elide + * the fmgr call. The saving here is small given how slow numeric comparisons + * are, but it is a required part of the sort support API when abbreviations + * are performed. + * + * Two palloc/pfree cycles could be saved here by using persistent buffers for + * aligning short-varlena inputs, but this has not so far been considered to + * be worth the effort. + */ +static int +numeric_fast_cmp(Datum x, Datum y, SortSupport ssup) +{ + Numeric nx = DatumGetNumeric(x); + Numeric ny = DatumGetNumeric(y); + int result; + + result = cmp_numerics(nx, ny); + + if ((Pointer) nx != DatumGetPointer(x)) + pfree(nx); + if ((Pointer) ny != DatumGetPointer(y)) + pfree(ny); + + return result; +} + +/* + * Compare abbreviations of values. (Abbreviations may be equal where the true + * values differ, but if the abbreviations differ, they must reflect the + * ordering of the true values.) + */ +static int +numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup) +{ + /* + * NOTE WELL: this is intentionally backwards, because the abbreviation is + * negated relative to the original value, to handle NaN. + */ + if (DatumGetNumericAbbrev(x) < DatumGetNumericAbbrev(y)) + return 1; + if (DatumGetNumericAbbrev(x) > DatumGetNumericAbbrev(y)) + return -1; + return 0; +} + +/* + * Abbreviate a NumericVar according to the available bit size. + * + * The 31-bit value is constructed as: + * + * 0 + 7bits digit weight + 24 bits digit value + * + * where the digit weight is in single decimal digits, not digit words, and + * stored in excess-44 representation[1]. The 24-bit digit value is the 7 most + * significant decimal digits of the value converted to binary. Values whose + * weights would fall outside the representable range are rounded off to zero + * (which is also used to represent actual zeros) or to 0x7FFFFFFF (which + * otherwise cannot occur). Abbreviation therefore fails to gain any advantage + * where values are outside the range 10^-44 to 10^83, which is not considered + * to be a serious limitation, or when values are of the same magnitude and + * equal in the first 7 decimal digits, which is considered to be an + * unavoidable limitation given the available bits. (Stealing three more bits + * to compare another digit would narrow the range of representable weights by + * a factor of 8, which starts to look like a real limiting factor.) + * + * (The value 44 for the excess is essentially arbitrary) + * + * The 63-bit value is constructed as: + * + * 0 + 7bits weight + 4 x 14-bit packed digit words + * + * The weight in this case is again stored in excess-44, but this time it is + * the original weight in digit words (i.e. powers of 10000). The first four + * digit words of the value (if present; trailing zeros are assumed as needed) + * are packed into 14 bits each to form the rest of the value. Again, + * out-of-range values are rounded off to 0 or 0x7FFFFFFFFFFFFFFF. The + * representable range in this case is 10^-176 to 10^332, which is considered + * to be good enough for all practical purposes, and comparison of 4 words + * means that at least 13 decimal digits are compared, which is considered to + * be a reasonable compromise between effectiveness and efficiency in computing + * the abbreviation. + * + * (The value 44 for the excess is even more arbitrary here, it was chosen just + * to match the value used in the 31-bit case) + * + * [1] - Excess-k representation means that the value is offset by adding 'k' + * and then treated as unsigned, so the smallest representable value is stored + * with all bits zero. This allows simple comparisons to work on the composite + * value. + */ + +#if NUMERIC_ABBREV_BITS == 64 + +static Datum +numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss) +{ + int ndigits = var->ndigits; + int weight = var->weight; + int64 result; + + if (ndigits == 0 || weight < -44) + { + result = 0; + } + else if (weight > 83) + { + result = PG_INT64_MAX; + } + else + { + result = ((int64) (weight + 44) << 56); + + switch (ndigits) + { + default: + result |= ((int64) var->digits[3]); + /* FALLTHROUGH */ + case 3: + result |= ((int64) var->digits[2]) << 14; + /* FALLTHROUGH */ + case 2: + result |= ((int64) var->digits[1]) << 28; + /* FALLTHROUGH */ + case 1: + result |= ((int64) var->digits[0]) << 42; + break; + } + } + + /* the abbrev is negated relative to the original */ + if (var->sign == NUMERIC_POS) + result = -result; + + if (nss->estimating) + { + uint32 tmp = ((uint32) result + ^ (uint32) ((uint64) result >> 32)); + + addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + } + + return Int64GetDatum(result); +} + +#endif /* NUMERIC_ABBREV_BITS == 64 */ + +#if NUMERIC_ABBREV_BITS == 32 + +static Datum +numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss) +{ + int ndigits = var->ndigits; + int weight = var->weight; + int32 result; + + if (ndigits == 0 || weight < -11) + { + result = 0; + } + else if (weight > 20) + { + result = PG_INT32_MAX; + } + else + { + NumericDigit nxt1 = (ndigits > 1) ? var->digits[1] : 0; + + weight = (weight + 11) * 4; + + result = var->digits[0]; + + /* + * "result" now has 1 to 4 nonzero decimal digits. We pack in more + * digits to make 7 in total (largest we can fit in 24 bits) + */ + + if (result > 999) + { + /* already have 4 digits, add 3 more */ + result = (result * 1000) + (nxt1 / 10); + weight += 3; + } + else if (result > 99) + { + /* already have 3 digits, add 4 more */ + result = (result * 10000) + nxt1; + weight += 2; + } + else if (result > 9) + { + NumericDigit nxt2 = (ndigits > 2) ? var->digits[2] : 0; + + /* already have 2 digits, add 5 more */ + result = (result * 100000) + (nxt1 * 10) + (nxt2 / 1000); + weight += 1; + } + else + { + NumericDigit nxt2 = (ndigits > 2) ? var->digits[2] : 0; + + /* already have 1 digit, add 6 more */ + result = (result * 1000000) + (nxt1 * 100) + (nxt2 / 100); + } + + result = result | (weight << 24); + } + + /* the abbrev is negated relative to the original */ + if (var->sign == NUMERIC_POS) + result = -result; + + if (nss->estimating) + { + uint32 tmp = (uint32) result; + + addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + } + + return Int32GetDatum(result); +} + +#endif /* NUMERIC_ABBREV_BITS == 32 */ + +/* + * Ordinary (non-sortsupport) comparisons follow. + */ Datum numeric_cmp(PG_FUNCTION_ARGS) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index b8a0f9f20c..492c093e4d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -112,9 +112,6 @@ extern char *temp_tablespaces; extern bool ignore_checksum_failure; extern bool synchronize_seqscans; -#ifdef TRACE_SORT -extern bool trace_sort; -#endif #ifdef TRACE_SYNCSCAN extern bool trace_syncscan; #endif diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index b87d080c69..c20c634367 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201503302 +#define CATALOG_VERSION_NO 201504021 #endif diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h index 86f81cb0d9..a54d11fdbd 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -118,6 +118,7 @@ DATA(insert ( 1984 829 829 1 836 )); DATA(insert ( 1986 19 19 1 359 )); DATA(insert ( 1986 19 19 2 3135 )); DATA(insert ( 1988 1700 1700 1 1769 )); +DATA(insert ( 1988 1700 1700 2 3283 )); DATA(insert ( 1989 26 26 1 356 )); DATA(insert ( 1989 26 26 2 3134 )); DATA(insert ( 1991 30 30 1 404 )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 8890ade7ed..d90ecc5396 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2370,6 +2370,8 @@ DATA(insert OID = 1767 ( numeric_larger PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 DESCR("larger of two"); DATA(insert OID = 1769 ( numeric_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "1700 1700" _null_ _null_ _null_ _null_ numeric_cmp _null_ _null_ _null_ )); DESCR("less-equal-greater"); +DATA(insert OID = 3283 ( numeric_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2278 "2281" _null_ _null_ _null_ _null_ numeric_sortsupport _null_ _null_ _null_ )); +DESCR("sort support"); DATA(insert OID = 1771 ( numeric_uminus PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 1700 "1700" _null_ _null_ _null_ _null_ numeric_uminus _null_ _null_ _null_ )); DATA(insert OID = 1779 ( int8 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 20 "1700" _null_ _null_ _null_ _null_ numeric_int8 _null_ _null_ _null_ )); DESCR("convert numeric to int8"); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 631064158a..33a453f602 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -978,6 +978,7 @@ extern Datum numeric_round(PG_FUNCTION_ARGS); extern Datum numeric_trunc(PG_FUNCTION_ARGS); extern Datum numeric_ceil(PG_FUNCTION_ARGS); extern Datum numeric_floor(PG_FUNCTION_ARGS); +extern Datum numeric_sortsupport(PG_FUNCTION_ARGS); extern Datum numeric_cmp(PG_FUNCTION_ARGS); extern Datum numeric_eq(PG_FUNCTION_ARGS); extern Datum numeric_ne(PG_FUNCTION_ARGS); diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index d3100d1781..ff78b70b96 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -258,6 +258,10 @@ extern int tcp_keepalives_idle; extern int tcp_keepalives_interval; extern int tcp_keepalives_count; +#ifdef TRACE_SORT +extern bool trace_sort; +#endif + /* * Functions exported by guc.c */