From 2b8b2852bbc54f02e26131a966e62c432144dc93 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sun, 2 Jul 2023 10:24:13 +0200 Subject: [PATCH] Introduce bloom_filter_size for BRIN bloom opclass Move the calculation of Bloom filter parameters (for BRIN indexes) into a separate function to make reuse easier. At the moment we only call it from one place, but that may change and it's easier to read anyway. Reviewed-by: Heikki Linnakangas Discussion: https://postgr.es/m/0e1f3350-c9cf-ab62-43a5-5dae314de89c%40enterprisedb.com --- src/backend/access/brin/brin_bloom.c | 63 +++++++++++++++++++++------- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/src/backend/access/brin/brin_bloom.c b/src/backend/access/brin/brin_bloom.c index 568faf1cd5..f47eb81012 100644 --- a/src/backend/access/brin/brin_bloom.c +++ b/src/backend/access/brin/brin_bloom.c @@ -259,6 +259,48 @@ typedef struct BloomFilter char data[FLEXIBLE_ARRAY_MEMBER]; } BloomFilter; +/* + * bloom_filter_size + * Calculate Bloom filter parameters (nbits, nbytes, nhashes). + * + * Given expected number of distinct values and desired false positive rate, + * calculates the optimal parameters of the Bloom filter. + * + * The resulting parameters are returned through nbytesp (number of bytes), + * nbitsp (number of bits) and nhashesp (number of hash functions). If a + * pointer is NULL, the parameter is not returned. + */ +static void +bloom_filter_size(int ndistinct, double false_positive_rate, + int *nbytesp, int *nbitsp, int *nhashesp) +{ + double k; + int nbits, + nbytes; + + /* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */ + nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2)); + + /* round m to whole bytes */ + nbytes = ((nbits + 7) / 8); + nbits = nbytes * 8; + + /* + * round(log(2.0) * m / ndistinct), but assume round() may not be + * available on Windows + */ + k = log(2.0) * nbits / ndistinct; + k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k); + + if (nbytesp) + *nbytesp = nbytes; + + if (nbitsp) + *nbitsp = nbits; + + if (nhashesp) + *nhashesp = (int) k; +} /* * bloom_init @@ -275,19 +317,15 @@ bloom_init(int ndistinct, double false_positive_rate) int nbits; /* size of filter / number of bits */ int nbytes; /* size of filter / number of bytes */ - - double k; /* number of hash functions */ + int nhashes; /* number of hash functions */ Assert(ndistinct > 0); Assert((false_positive_rate >= BLOOM_MIN_FALSE_POSITIVE_RATE) && (false_positive_rate < BLOOM_MAX_FALSE_POSITIVE_RATE)); - /* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */ - nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2)); - - /* round m to whole bytes */ - nbytes = ((nbits + 7) / 8); - nbits = nbytes * 8; + /* calculate bloom filter size / parameters */ + bloom_filter_size(ndistinct, false_positive_rate, + &nbytes, &nbits, &nhashes); /* * Reject filters that are obviously too large to store on a page. @@ -310,13 +348,6 @@ bloom_init(int ndistinct, double false_positive_rate) elog(ERROR, "the bloom filter is too large (%d > %zu)", nbytes, BloomMaxFilterSize); - /* - * round(log(2.0) * m / ndistinct), but assume round() may not be - * available on Windows - */ - k = log(2.0) * nbits / ndistinct; - k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k); - /* * We allocate the whole filter. Most of it is going to be 0 bits, so the * varlena is easy to compress. @@ -326,7 +357,7 @@ bloom_init(int ndistinct, double false_positive_rate) filter = (BloomFilter *) palloc0(len); filter->flags = 0; - filter->nhashes = (int) k; + filter->nhashes = nhashes; filter->nbits = nbits; SET_VARSIZE(filter, len);