Use abbreviated keys for faster sorting of numeric datums.

Andrew Gierth, reviewed by Peter Geoghegan, with further tweaks by me.
This commit is contained in:
Robert Haas 2015-04-02 14:02:55 -04:00
parent 00ee6c7672
commit abd94bcac4
7 changed files with 467 additions and 4 deletions

View File

@ -29,13 +29,16 @@
#include "access/hash.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/int8.h"
#include "utils/numeric.h"
#include "utils/sortsupport.h"
/* ----------
* Uncomment the following to enable compilation of dump_numeric()
@ -57,6 +60,12 @@
* are easy. Also, it's actually more efficient if NBASE is rather less than
* sqrt(INT_MAX), so that there is "headroom" for mul_var and div_var_fast to
* postpone processing carries.
*
* Values of NBASE other than 10000 are considered of historical interest only
* and are no longer supported in any sense; no mechanism exists for the client
* to discover the base, so every client supporting binary mode expects the
* base-10000 format. If you plan to change this, also note the numeric
* abbreviation code, which assumes NBASE=10000.
* ----------
*/
@ -274,6 +283,29 @@ typedef struct
} generate_series_numeric_fctx;
/* ----------
* Sort support.
* ----------
*/
typedef struct
{
void *buf; /* buffer for short varlenas */
int64 input_count; /* number of non-null values seen */
bool estimating; /* true if estimating cardinality */
hyperLogLogState abbr_card; /* cardinality estimator */
} NumericSortSupport;
#define NUMERIC_ABBREV_BITS (SIZEOF_DATUM * BITS_PER_BYTE)
#ifdef SIZEOF_DATUM == 8
#define DatumGetNumericAbbrev(d) ((int64) d)
#define NUMERIC_ABBREV_NAN Int64GetDatum(PG_INT64_MIN)
#else
#define DatumGetNumericAbbrev(d) ((int32) d)
#define NUMERIC_ABBREV_NAN Int32GetDatum(PG_INT32_MIN)
#endif
/* ----------
* Some preinitialized constants
* ----------
@ -409,6 +441,13 @@ static void int128_to_numericvar(int128 val, NumericVar *var);
static double numeric_to_double_no_overflow(Numeric num);
static double numericvar_to_double_no_overflow(NumericVar *var);
static Datum numeric_abbrev_convert(Datum original_datum, SortSupport ssup);
static bool numeric_abbrev_abort(int memtupcount, SortSupport ssup);
static int numeric_fast_cmp(Datum x, Datum y, SortSupport ssup);
static int numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss);
static int cmp_numerics(Numeric num1, Numeric num2);
static int cmp_var(NumericVar *var1, NumericVar *var2);
static int cmp_var_common(const NumericDigit *var1digits, int var1ndigits,
@ -1507,9 +1546,428 @@ compute_bucket(Numeric operand, Numeric bound1, Numeric bound2,
* Note: btree indexes need these routines not to leak memory; therefore,
* be careful to free working copies of toasted datums. Most places don't
* need to be so careful.
*
* Sort support:
*
* We implement the sortsupport strategy routine in order to get the benefit of
* abbreviation. The ordinary numeric comparison can be quite slow as a result
* of palloc/pfree cycles (due to detoasting packed values for alignment);
* while this could be worked on itself, the abbreviation strategy gives more
* speedup in many common cases.
*
* Two different representations are used for the abbreviated form, one in
* int32 and one in int64, whichever fits into a by-value Datum. In both cases
* the representation is negated relative to the original value, because we use
* the largest negative value for NaN, which sorts higher than other values. We
* convert the absolute value of the numeric to a 31-bit or 63-bit positive
* value, and then negate it if the original number was positive.
*
* We abort the abbreviation process if the abbreviation cardinality is below
* 0.01% of the row count (1 per 10k non-null rows). The actual break-even
* point is somewhat below that, perhaps 1 per 30k (at 1 per 100k there's a
* very small penalty), but we don't want to build up too many abbreviated
* values before first testing for abort, so we take the slightly pessimistic
* number. We make no attempt to estimate the cardinality of the real values,
* since it plays no part in the cost model here (if the abbreviation is equal,
* the cost of comparing equal and unequal underlying values is comparable).
* We discontinue even checking for abort (saving us the hashing overhead) if
* the estimated cardinality gets to 100k; that would be enough to support many
* billions of rows while doing no worse than breaking even.
*
* ----------------------------------------------------------------------
*/
/*
* Sort support strategy routine.
*/
Datum
numeric_sortsupport(PG_FUNCTION_ARGS)
{
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
ssup->comparator = numeric_fast_cmp;
if (ssup->abbreviate)
{
NumericSortSupport *nss;
MemoryContext oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
nss = palloc(sizeof(NumericSortSupport));
/*
* palloc a buffer for handling unaligned packed values in addition to
* the support struct
*/
nss->buf = palloc(VARATT_SHORT_MAX + VARHDRSZ + 1);
nss->input_count = 0;
nss->estimating = true;
initHyperLogLog(&nss->abbr_card, 10);
ssup->ssup_extra = nss;
ssup->abbrev_full_comparator = ssup->comparator;
ssup->comparator = numeric_cmp_abbrev;
ssup->abbrev_converter = numeric_abbrev_convert;
ssup->abbrev_abort = numeric_abbrev_abort;
MemoryContextSwitchTo(oldcontext);
}
PG_RETURN_VOID();
}
/*
* Abbreviate a numeric datum, handling NaNs and detoasting
* (must not leak memory!)
*/
static Datum
numeric_abbrev_convert(Datum original_datum, SortSupport ssup)
{
NumericSortSupport *nss = ssup->ssup_extra;
void *original_varatt = PG_DETOAST_DATUM_PACKED(original_datum);
Numeric value;
Datum result;
nss->input_count += 1;
/*
* This is to handle packed datums without needing a palloc/pfree cycle;
* we keep and reuse a buffer large enough to handle any short datum.
*/
if (VARATT_IS_SHORT(original_varatt))
{
void *buf = nss->buf;
Size sz = VARSIZE_SHORT(original_varatt) - VARHDRSZ_SHORT;
Assert(sz <= VARATT_SHORT_MAX - VARHDRSZ_SHORT);
SET_VARSIZE(buf, VARHDRSZ + sz);
memcpy(VARDATA(buf), VARDATA_SHORT(original_varatt), sz);
value = (Numeric) buf;
}
else
value = (Numeric) original_varatt;
if (NUMERIC_IS_NAN(value))
{
result = NUMERIC_ABBREV_NAN;
}
else
{
NumericVar var;
init_var_from_num(value, &var);
result = numeric_abbrev_convert_var(&var, nss);
}
/* should happen only for external/compressed toasts */
if ((Pointer) original_varatt != DatumGetPointer(original_datum))
pfree(original_varatt);
return result;
}
/*
* Consider whether to abort abbreviation.
*
* We pay no attention to the cardinality of the non-abbreviated data. There is
* no reason to do so: unlike text, we have no fast check for equal values, so
* we pay the full overhead whenever the abbreviations are equal regardless of
* whether the underlying values are also equal.
*/
static bool
numeric_abbrev_abort(int memtupcount, SortSupport ssup)
{
NumericSortSupport *nss = ssup->ssup_extra;
double abbr_card;
if (memtupcount < 10000 || nss->input_count < 10000 || !nss->estimating)
return false;
abbr_card = estimateHyperLogLog(&nss->abbr_card);
/*
* If we have >100k distinct values, then even if we were sorting many
* billion rows we'd likely still break even, and the penalty of undoing
* that many rows of abbrevs would probably not be worth it. Stop even
* counting at that point.
*/
if (abbr_card > 100000.0)
{
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"numeric_abbrev: estimation ends at cardinality %f"
" after " INT64_FORMAT " values (%d rows)",
abbr_card, nss->input_count, memtupcount);
#endif
nss->estimating = false;
return false;
}
/*
* Target minimum cardinality is 1 per ~10k of non-null inputs. (The
* break even point is somewhere between one per 100k rows, where
* abbreviation has a very slight penalty, and 1 per 10k where it wins by
* a measurable percentage.) We use the relatively pessimistic 10k
* threshold, and add a 0.5 row fudge factor, because it allows us to
* abort earlier on genuinely pathological data where we've had exactly
* one abbreviated value in the first 10k (non-null) rows.
*/
if (abbr_card < nss->input_count / 10000.0 + 0.5)
{
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"numeric_abbrev: aborting abbreviation at cardinality %f"
" below threshold %f after " INT64_FORMAT " values (%d rows)",
abbr_card, nss->input_count / 10000.0 + 0.5,
nss->input_count, memtupcount);
#endif
return true;
}
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"numeric_abbrev: cardinality %f"
" after " INT64_FORMAT " values (%d rows)",
abbr_card, nss->input_count, memtupcount);
#endif
return false;
}
/*
* Non-fmgr interface to the comparison routine to allow sortsupport to elide
* the fmgr call. The saving here is small given how slow numeric comparisons
* are, but it is a required part of the sort support API when abbreviations
* are performed.
*
* Two palloc/pfree cycles could be saved here by using persistent buffers for
* aligning short-varlena inputs, but this has not so far been considered to
* be worth the effort.
*/
static int
numeric_fast_cmp(Datum x, Datum y, SortSupport ssup)
{
Numeric nx = DatumGetNumeric(x);
Numeric ny = DatumGetNumeric(y);
int result;
result = cmp_numerics(nx, ny);
if ((Pointer) nx != DatumGetPointer(x))
pfree(nx);
if ((Pointer) ny != DatumGetPointer(y))
pfree(ny);
return result;
}
/*
* Compare abbreviations of values. (Abbreviations may be equal where the true
* values differ, but if the abbreviations differ, they must reflect the
* ordering of the true values.)
*/
static int
numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup)
{
/*
* NOTE WELL: this is intentionally backwards, because the abbreviation is
* negated relative to the original value, to handle NaN.
*/
if (DatumGetNumericAbbrev(x) < DatumGetNumericAbbrev(y))
return 1;
if (DatumGetNumericAbbrev(x) > DatumGetNumericAbbrev(y))
return -1;
return 0;
}
/*
* Abbreviate a NumericVar according to the available bit size.
*
* The 31-bit value is constructed as:
*
* 0 + 7bits digit weight + 24 bits digit value
*
* where the digit weight is in single decimal digits, not digit words, and
* stored in excess-44 representation[1]. The 24-bit digit value is the 7 most
* significant decimal digits of the value converted to binary. Values whose
* weights would fall outside the representable range are rounded off to zero
* (which is also used to represent actual zeros) or to 0x7FFFFFFF (which
* otherwise cannot occur). Abbreviation therefore fails to gain any advantage
* where values are outside the range 10^-44 to 10^83, which is not considered
* to be a serious limitation, or when values are of the same magnitude and
* equal in the first 7 decimal digits, which is considered to be an
* unavoidable limitation given the available bits. (Stealing three more bits
* to compare another digit would narrow the range of representable weights by
* a factor of 8, which starts to look like a real limiting factor.)
*
* (The value 44 for the excess is essentially arbitrary)
*
* The 63-bit value is constructed as:
*
* 0 + 7bits weight + 4 x 14-bit packed digit words
*
* The weight in this case is again stored in excess-44, but this time it is
* the original weight in digit words (i.e. powers of 10000). The first four
* digit words of the value (if present; trailing zeros are assumed as needed)
* are packed into 14 bits each to form the rest of the value. Again,
* out-of-range values are rounded off to 0 or 0x7FFFFFFFFFFFFFFF. The
* representable range in this case is 10^-176 to 10^332, which is considered
* to be good enough for all practical purposes, and comparison of 4 words
* means that at least 13 decimal digits are compared, which is considered to
* be a reasonable compromise between effectiveness and efficiency in computing
* the abbreviation.
*
* (The value 44 for the excess is even more arbitrary here, it was chosen just
* to match the value used in the 31-bit case)
*
* [1] - Excess-k representation means that the value is offset by adding 'k'
* and then treated as unsigned, so the smallest representable value is stored
* with all bits zero. This allows simple comparisons to work on the composite
* value.
*/
#if NUMERIC_ABBREV_BITS == 64
static Datum
numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss)
{
int ndigits = var->ndigits;
int weight = var->weight;
int64 result;
if (ndigits == 0 || weight < -44)
{
result = 0;
}
else if (weight > 83)
{
result = PG_INT64_MAX;
}
else
{
result = ((int64) (weight + 44) << 56);
switch (ndigits)
{
default:
result |= ((int64) var->digits[3]);
/* FALLTHROUGH */
case 3:
result |= ((int64) var->digits[2]) << 14;
/* FALLTHROUGH */
case 2:
result |= ((int64) var->digits[1]) << 28;
/* FALLTHROUGH */
case 1:
result |= ((int64) var->digits[0]) << 42;
break;
}
}
/* the abbrev is negated relative to the original */
if (var->sign == NUMERIC_POS)
result = -result;
if (nss->estimating)
{
uint32 tmp = ((uint32) result
^ (uint32) ((uint64) result >> 32));
addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
}
return Int64GetDatum(result);
}
#endif /* NUMERIC_ABBREV_BITS == 64 */
#if NUMERIC_ABBREV_BITS == 32
static Datum
numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss)
{
int ndigits = var->ndigits;
int weight = var->weight;
int32 result;
if (ndigits == 0 || weight < -11)
{
result = 0;
}
else if (weight > 20)
{
result = PG_INT32_MAX;
}
else
{
NumericDigit nxt1 = (ndigits > 1) ? var->digits[1] : 0;
weight = (weight + 11) * 4;
result = var->digits[0];
/*
* "result" now has 1 to 4 nonzero decimal digits. We pack in more
* digits to make 7 in total (largest we can fit in 24 bits)
*/
if (result > 999)
{
/* already have 4 digits, add 3 more */
result = (result * 1000) + (nxt1 / 10);
weight += 3;
}
else if (result > 99)
{
/* already have 3 digits, add 4 more */
result = (result * 10000) + nxt1;
weight += 2;
}
else if (result > 9)
{
NumericDigit nxt2 = (ndigits > 2) ? var->digits[2] : 0;
/* already have 2 digits, add 5 more */
result = (result * 100000) + (nxt1 * 10) + (nxt2 / 1000);
weight += 1;
}
else
{
NumericDigit nxt2 = (ndigits > 2) ? var->digits[2] : 0;
/* already have 1 digit, add 6 more */
result = (result * 1000000) + (nxt1 * 100) + (nxt2 / 100);
}
result = result | (weight << 24);
}
/* the abbrev is negated relative to the original */
if (var->sign == NUMERIC_POS)
result = -result;
if (nss->estimating)
{
uint32 tmp = (uint32) result;
addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
}
return Int32GetDatum(result);
}
#endif /* NUMERIC_ABBREV_BITS == 32 */
/*
* Ordinary (non-sortsupport) comparisons follow.
*/
Datum
numeric_cmp(PG_FUNCTION_ARGS)

View File

@ -112,9 +112,6 @@ extern char *temp_tablespaces;
extern bool ignore_checksum_failure;
extern bool synchronize_seqscans;
#ifdef TRACE_SORT
extern bool trace_sort;
#endif
#ifdef TRACE_SYNCSCAN
extern bool trace_syncscan;
#endif

View File

@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201503302
#define CATALOG_VERSION_NO 201504021
#endif

View File

@ -118,6 +118,7 @@ DATA(insert ( 1984 829 829 1 836 ));
DATA(insert ( 1986 19 19 1 359 ));
DATA(insert ( 1986 19 19 2 3135 ));
DATA(insert ( 1988 1700 1700 1 1769 ));
DATA(insert ( 1988 1700 1700 2 3283 ));
DATA(insert ( 1989 26 26 1 356 ));
DATA(insert ( 1989 26 26 2 3134 ));
DATA(insert ( 1991 30 30 1 404 ));

View File

@ -2370,6 +2370,8 @@ DATA(insert OID = 1767 ( numeric_larger PGNSP PGUID 12 1 0 0 0 f f f f t f i 2
DESCR("larger of two");
DATA(insert OID = 1769 ( numeric_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "1700 1700" _null_ _null_ _null_ _null_ numeric_cmp _null_ _null_ _null_ ));
DESCR("less-equal-greater");
DATA(insert OID = 3283 ( numeric_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2278 "2281" _null_ _null_ _null_ _null_ numeric_sortsupport _null_ _null_ _null_ ));
DESCR("sort support");
DATA(insert OID = 1771 ( numeric_uminus PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 1700 "1700" _null_ _null_ _null_ _null_ numeric_uminus _null_ _null_ _null_ ));
DATA(insert OID = 1779 ( int8 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 20 "1700" _null_ _null_ _null_ _null_ numeric_int8 _null_ _null_ _null_ ));
DESCR("convert numeric to int8");

View File

@ -978,6 +978,7 @@ extern Datum numeric_round(PG_FUNCTION_ARGS);
extern Datum numeric_trunc(PG_FUNCTION_ARGS);
extern Datum numeric_ceil(PG_FUNCTION_ARGS);
extern Datum numeric_floor(PG_FUNCTION_ARGS);
extern Datum numeric_sortsupport(PG_FUNCTION_ARGS);
extern Datum numeric_cmp(PG_FUNCTION_ARGS);
extern Datum numeric_eq(PG_FUNCTION_ARGS);
extern Datum numeric_ne(PG_FUNCTION_ARGS);

View File

@ -258,6 +258,10 @@ extern int tcp_keepalives_idle;
extern int tcp_keepalives_interval;
extern int tcp_keepalives_count;
#ifdef TRACE_SORT
extern bool trace_sort;
#endif
/*
* Functions exported by guc.c
*/