diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile index 191ea9bca2..3c1ee1df83 100644 --- a/src/backend/lib/Makefile +++ b/src/backend/lib/Makefile @@ -13,6 +13,6 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global OBJS = binaryheap.o bipartite_match.o bloomfilter.o dshash.o hyperloglog.o \ - ilist.o knapsack.o pairingheap.o rbtree.o stringinfo.o + ilist.o integerset.o knapsack.o pairingheap.o rbtree.o stringinfo.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/README b/src/backend/lib/README index ae5debe1bc..f2fb591237 100644 --- a/src/backend/lib/README +++ b/src/backend/lib/README @@ -13,6 +13,8 @@ hyperloglog.c - a streaming cardinality estimator ilist.c - single and double-linked lists +integerset.c - a data structure for holding large set of integers + knapsack.c - knapsack problem solver pairingheap.c - a pairing heap diff --git a/src/backend/lib/integerset.c b/src/backend/lib/integerset.c new file mode 100644 index 0000000000..fcbb2bd0db --- /dev/null +++ b/src/backend/lib/integerset.c @@ -0,0 +1,1019 @@ +/*------------------------------------------------------------------------- + * + * integerset.c + * Data structure to hold a large set of 64-bit integers efficiently + * + * IntegerSet provides an in-memory data structure to hold a set of + * arbitrary 64-bit integers. Internally, the values are stored in a + * B-tree, with a special packed representation at the leaf level using + * the Simple-8b algorithm, which can pack hold clusters of nearby values + * very tightly. + * + * Memory consumption depends on the number of values stored, but also + * on how far the values are from each other. In the best case, with + * long runs of consecutive integers, memory consumption can be as low as + * 0.1 bytes per integer. In the worst case, if integers are more than + * 2^32 apart, it uses about 8 bytes per integer. In typical use, the + * consumption per integer is somewhere between those extremes, depending + * on the range of integers stored, and how "clustered" they are. + * + * + * Interface + * --------- + * + * intset_create - Create a new empty set. + * intset_add_member - Add an integer to the set. + * intset_is_member - Test if an integer is in the set + * intset_begin_iterate - Begin iterating through all integers in set + * intset_iterate_next - Return next integer + * + * intset_create() creates the set in the current memory context. Note + * that there is no function to free an integer set. If you need to do that, + * create a dedicated memory context to hold it, and destroy the memory + * context instead. + * + * + * Limitations + * ----------- + * + * - Values must be added in order. (Random insertions would require + * splitting nodes, which hasn't been implemented.) + * + * - Values cannot be added while iteration is in progress. + * + * - No support for removing values. + * + * None of these limitations are fundamental to the data structure, and + * could be lifted if needed, by writing some new code. But the current + * users of this facility don't need them. + * + * + * References + * ---------- + * + * Simple-8b encoding is based on: + * + * Vo Ngoc Anh , Alistair Moffat, Index compression using 64-bit words, + * Software - Practice & Experience, v.40 n.2, p.131-147, February 2010 + * (https://doi.org/10.1002/spe.948) + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/lib/integerset.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "lib/integerset.h" +#include "port/pg_bitutils.h" +#include "utils/memutils.h" + + +/* + * Maximum number of integers that can be encoded in a single Single-8b + * codeword. (Defined here before anything else, so that we can size arrays + * using this). + */ +#define SIMPLE8B_MAX_VALUES_PER_CODEWORD 240 + +/* + * Parameters for shape of the in-memory B-tree. + * + * These set the size of each internal and leaf node. They don't necessarily + * need to be the same, because the tree is just an in-memory structure. + * With the default 64, each node is about 1 kb. + * + * If you change these, you must recalculate MAX_TREE_LEVELS, too! + */ +#define MAX_INTERNAL_ITEMS 64 +#define MAX_LEAF_ITEMS 64 + +/* + * Maximum height of the tree. + * + * MAX_TREE_ITEMS is calculated from the "fan-out" of the B-tree. The + * theoretical maximum number of items that we can store in a set is 2^64, + * so MAX_TREE_LEVELS should be set so that: + * + * MAX_LEAF_ITEMS * MAX_INTERNAL_ITEMS ^ (MAX_TREE_LEVELS - 1) >= 2^64. + * + * In practice, we'll need far fewer levels, because you will run out of + * memory long before reaching that number, but let's be conservative. + */ +#define MAX_TREE_LEVELS 11 + +/* + * Node structures, for the in-memory B-tree. + * + * An internal node holds a number of downlink pointers to leaf nodes, or + * to internal nodes on lower level. For each downlink, the key value + * corresponding the lower level node is stored in a sorted array. The + * stored key values are low keys. In other words, if the downlink has value + * X, then all items stored on that child are >= X. + * + * Each leaf node holds a number of "items", with a varying number of + * integers packed into each item. Each item consists of two 64-bit words: + * The first word holds first integer stored in the item, in plain format. + * The second word contains between 0 and 240 more integers, packed using + * Simple-8b encoding. By storing the first integer in plain, unpacked, + * format, we can use binary search to quickly find an item that holds (or + * would hold) a particular integer. And by storing the rest in packed form, + * we still get pretty good memory density, if there are clusters of integers + * with similar values. + * + * Each leaf node also has a pointer to the next leaf node, so that the leaf + * nodes can be easily walked from beginning to end, when iterating. + */ +typedef struct intset_node intset_node; +typedef struct intset_leaf_node intset_leaf_node; +typedef struct intset_internal_node intset_internal_node; + +/* Common structure of both leaf and internal nodes. */ +struct intset_node +{ + uint16 level; + uint16 num_items; +}; + +/* Internal node */ +struct intset_internal_node +{ + /* common header, must match intset_node */ + uint16 level; /* >= 1 on internal nodes */ + uint16 num_items; + + /* + * 'values' is an array of key values, and 'downlinks' are pointers to + * lower-level nodes, corresponding to the key values. + */ + uint64 values[MAX_INTERNAL_ITEMS]; + intset_node *downlinks[MAX_INTERNAL_ITEMS]; +}; + +/* Leaf node */ +typedef struct +{ + uint64 first; /* first integer in this item */ + uint64 codeword; /* simple8b encoded differences from 'first' */ +} leaf_item; + +#define MAX_VALUES_PER_LEAF_ITEM (1 + SIMPLE8B_MAX_VALUES_PER_CODEWORD) + +struct intset_leaf_node +{ + /* common header, must match intset_node */ + uint16 level; /* 0 on leafs */ + uint16 num_items; + + intset_leaf_node *next; /* right sibling, if any */ + + leaf_item items[MAX_LEAF_ITEMS]; +}; + +/* + * We buffer insertions in a simple array, before packing and inserting them + * into the B-tree. MAX_BUFFERED_VALUES sets the size of the buffer. The + * encoder assumes that it is large enough, that we can always fill a leaf + * item with buffered new items. In other words, MAX_BUFFERED_VALUES must be + * larger than MAX_VALUES_PER_LEAF_ITEM. For efficiency, make it much larger. + */ +#define MAX_BUFFERED_VALUES (MAX_VALUES_PER_LEAF_ITEM * 2) + +/* + * IntegerSet is the top-level object representing the set. + * + * The integers are stored in an in-memory B-tree structure, and an array + * for newly-added integers. IntegerSet also tracks information about memory + * usage, as well as the current position, when iterating the set with + * intset_begin_iterate / intset_iterate_next. + */ +struct IntegerSet +{ + /* + * 'context' is the memory context holding this integer set and all its + * tree nodes. + * + * 'mem_used' tracks the amount of memory used. We don't do anything with + * it in integerset.c itself, but the callers can ask for it with + * intset_memory_usage(). + */ + MemoryContext context; + uint64 mem_used; + + uint64 num_entries; /* total # of values in the set */ + uint64 highest_value; /* highest value stored in this set */ + + /* + * B-tree to hold the packed values. + * + * 'rightmost_nodes' hold pointers to the rightmost node on each level. + * rightmost_parent[0] is rightmost leaf, rightmost_parent[1] is its + * parent, and so forth, all the way up to the root. These are needed when + * adding new values. (Currently, we require that new values are added at + * the end.) + */ + int num_levels; /* height of the tree */ + intset_node *root; /* root node */ + intset_node *rightmost_nodes[MAX_TREE_LEVELS]; + intset_leaf_node *leftmost_leaf; /* leftmost leaf node */ + + /* + * Holding area for new items that haven't been inserted to the tree yet. + */ + uint64 buffered_values[MAX_BUFFERED_VALUES]; + int num_buffered_values; + + /* + * Iterator support. + * + * 'iter_values' is an array of integers ready to be returned to the + * caller. 'item_node' and 'item_itemno' point to the leaf node, and item + * within the leaf node, to get the next batch of values from. + * + * Normally, 'iter_values' points 'iter_values_buf', which holds items + * decoded from a leaf item. But after we have scanned the whole B-tree, + * we iterate through all the unbuffered values, too, by pointing + * iter_values to 'buffered_values'. + */ + uint64 *iter_values; + int iter_num_values; /* number of elements in 'iter_values' */ + int iter_valueno; /* index into 'iter_values' */ + intset_leaf_node *iter_node; /* current leaf node */ + int iter_itemno; /* next item 'iter_node' to decode */ + + uint64 iter_values_buf[MAX_VALUES_PER_LEAF_ITEM]; +}; + +/* + * prototypes for internal functions. + */ +static void intset_update_upper(IntegerSet *intset, int level, + intset_node *new_node, uint64 new_node_item); +static void intset_flush_buffered_values(IntegerSet *intset); + +static int intset_binsrch_uint64(uint64 value, uint64 *arr, int arr_elems, + bool nextkey); +static int intset_binsrch_leaf(uint64 value, leaf_item *arr, int arr_elems, + bool nextkey); + +static uint64 simple8b_encode(uint64 *ints, int *num_encoded, uint64 base); +static int simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base); +static bool simple8b_contains(uint64 codeword, uint64 key, uint64 base); + + +/* + * Create a new, initially empty, integer set. + * + * The integer set is created in the current memory context. + */ +IntegerSet * +intset_create(void) +{ + IntegerSet *intset; + + /* + * Allocate the IntegerSet object in the current memory context. Remember + * the context, so that we will do all subsequent allocations in the same + * context, too, regardless of which memory context is current when new + * integers are added to the set. + */ + intset = (IntegerSet *) palloc(sizeof(IntegerSet)); + intset->context = CurrentMemoryContext; + intset->mem_used = GetMemoryChunkSpace(intset); + + intset->num_entries = 0; + intset->highest_value = 0; + + intset->num_levels = 0; + intset->root = NULL; + memset(intset->rightmost_nodes, 0, sizeof(intset->rightmost_nodes)); + intset->leftmost_leaf = NULL; + + intset->num_buffered_values = 0; + + intset->iter_node = NULL; + intset->iter_itemno = 0; + intset->iter_valueno = 0; + intset->iter_num_values = 0; + + return intset; +} + +/* + * Allocate a new node. + */ +static intset_internal_node * +intset_new_internal_node(IntegerSet *intset) +{ + intset_internal_node *n; + + n = (intset_internal_node *) MemoryContextAlloc(intset->context, + sizeof(intset_internal_node)); + intset->mem_used += GetMemoryChunkSpace(n); + + n->level = 0; /* caller must set */ + n->num_items = 0; + + return n; +} + +static intset_leaf_node * +intset_new_leaf_node(IntegerSet *intset) +{ + intset_leaf_node *n; + + n = (intset_leaf_node *) MemoryContextAlloc(intset->context, + sizeof(intset_leaf_node)); + intset->mem_used += GetMemoryChunkSpace(n); + + n->level = 0; + n->num_items = 0; + n->next = NULL; + + return n; +} + +/* + * Return the number of entries in the integer set. + */ +uint64 +intset_num_entries(IntegerSet *intset) +{ + return intset->num_entries; +} + +/* + * Return the amount of memory used by the integer set. + */ +uint64 +intset_memory_usage(IntegerSet *intset) +{ + return intset->mem_used; +} + +/* + * Add a value to the set. + * + * Values must be added in order. + */ +void +intset_add_member(IntegerSet *intset, uint64 x) +{ + if (intset->iter_node) + elog(ERROR, "cannot add new values to integer set when iteration is in progress"); + + if (x <= intset->highest_value && intset->num_entries > 0) + elog(ERROR, "cannot add value to integer set out of order"); + + if (intset->num_buffered_values >= MAX_BUFFERED_VALUES) + { + /* Time to flush our buffer */ + intset_flush_buffered_values(intset); + Assert(intset->num_buffered_values < MAX_BUFFERED_VALUES); + } + + /* Add it to the buffer of newly-added values */ + intset->buffered_values[intset->num_buffered_values] = x; + intset->num_buffered_values++; + intset->num_entries++; + intset->highest_value = x; +} + +/* + * Take a batch of buffered values, and pack them into the B-tree. + */ +static void +intset_flush_buffered_values(IntegerSet *intset) +{ + uint64 *values = intset->buffered_values; + uint64 num_values = intset->num_buffered_values; + int num_packed = 0; + intset_leaf_node *leaf; + + leaf = (intset_leaf_node *) intset->rightmost_nodes[0]; + + /* + * If the tree is completely empty, create the first leaf page, which is + * also the root. + */ + if (leaf == NULL) + { + /* + * This is the very first item in the set. + * + * Allocate root node. It's also a leaf. + */ + leaf = intset_new_leaf_node(intset); + + intset->root = (intset_node *) leaf; + intset->leftmost_leaf = leaf; + intset->rightmost_nodes[0] = (intset_node *) leaf; + intset->num_levels = 1; + } + + /* + * If there are less than MAX_VALUES_PER_LEAF_ITEM values in the buffer, + * stop. In most cases, we cannot encode that many values in a single + * value, but this way, the encoder doesn't have to worry about running + * out of input. + */ + while (num_values - num_packed >= MAX_VALUES_PER_LEAF_ITEM) + { + leaf_item item; + int num_encoded; + + /* + * Construct the next leaf item, packing as many buffered values as + * possible. + */ + item.first = values[num_packed]; + item.codeword = simple8b_encode(&values[num_packed + 1], + &num_encoded, + item.first); + + /* + * Add the item to the node, allocating a new node if the old one is + * full. + */ + if (leaf->num_items >= MAX_LEAF_ITEMS) + { + /* Allocate new leaf and link it to the tree */ + intset_leaf_node *old_leaf = leaf; + + leaf = intset_new_leaf_node(intset); + old_leaf->next = leaf; + intset->rightmost_nodes[0] = (intset_node *) leaf; + intset_update_upper(intset, 1, (intset_node *) leaf, item.first); + } + leaf->items[leaf->num_items++] = item; + + num_packed += 1 + num_encoded; + } + + /* + * Move any remaining buffered values to the beginning of the array. + */ + if (num_packed < intset->num_buffered_values) + { + memmove(&intset->buffered_values[0], + &intset->buffered_values[num_packed], + (intset->num_buffered_values - num_packed) * sizeof(uint64)); + } + intset->num_buffered_values -= num_packed; +} + +/* + * Insert a downlink into parent node, after creating a new node. + * + * Recurses if the parent node is full, too. + */ +static void +intset_update_upper(IntegerSet *intset, int level, intset_node *child, + uint64 child_key) +{ + intset_internal_node *parent; + + Assert(level > 0); + + /* + * Create a new root node, if necessary. + */ + if (level >= intset->num_levels) + { + intset_node *oldroot = intset->root; + uint64 downlink_key; + + /* MAX_TREE_LEVELS should be more than enough, this shouldn't happen */ + if (intset->num_levels == MAX_TREE_LEVELS) + elog(ERROR, "could not expand integer set, maximum number of levels reached"); + intset->num_levels++; + + /* + * Get the first value on the old root page, to be used as the + * downlink. + */ + if (intset->root->level == 0) + downlink_key = ((intset_leaf_node *) oldroot)->items[0].first; + else + downlink_key = ((intset_internal_node *) oldroot)->values[0]; + + parent = intset_new_internal_node(intset); + parent->level = level; + parent->values[0] = downlink_key; + parent->downlinks[0] = oldroot; + parent->num_items = 1; + + intset->root = (intset_node *) parent; + intset->rightmost_nodes[level] = (intset_node *) parent; + } + + /* + * Place the downlink on the parent page. + */ + parent = (intset_internal_node *) intset->rightmost_nodes[level]; + + if (parent->num_items < MAX_INTERNAL_ITEMS) + { + parent->values[parent->num_items] = child_key; + parent->downlinks[parent->num_items] = child; + parent->num_items++; + } + else + { + /* + * Doesn't fit. Allocate new parent, with the downlink as the first + * item on it, and recursively insert the downlink to the new parent + * to the grandparent. + */ + parent = intset_new_internal_node(intset); + parent->level = level; + parent->values[0] = child_key; + parent->downlinks[0] = child; + parent->num_items = 1; + + intset->rightmost_nodes[level] = (intset_node *) parent; + + intset_update_upper(intset, level + 1, (intset_node *) parent, child_key); + } +} + +/* + * Does the set contain the given value? + */ +bool +intset_is_member(IntegerSet *intset, uint64 x) +{ + intset_node *node; + intset_leaf_node *leaf; + int level; + int itemno; + leaf_item *item; + + /* + * The value might be in the buffer of newly-added values. + */ + if (intset->num_buffered_values > 0 && x >= intset->buffered_values[0]) + { + int itemno; + + itemno = intset_binsrch_uint64(x, + intset->buffered_values, + intset->num_buffered_values, + false); + if (itemno >= intset->num_buffered_values) + return false; + else + return intset->buffered_values[itemno] == x; + } + + /* + * Start from the root, and walk down the B-tree to find the right leaf + * node. + */ + if (!intset->root) + return false; + node = intset->root; + for (level = intset->num_levels - 1; level > 0; level--) + { + intset_internal_node *n = (intset_internal_node *) node; + + Assert(node->level == level); + + itemno = intset_binsrch_uint64(x, n->values, n->num_items, true); + if (itemno == 0) + return false; + node = n->downlinks[itemno - 1]; + } + Assert(node->level == 0); + leaf = (intset_leaf_node *) node; + + /* + * Binary search the right item on the leaf page + */ + itemno = intset_binsrch_leaf(x, leaf->items, leaf->num_items, true); + if (itemno == 0) + return false; + item = &leaf->items[itemno - 1]; + + /* Is this a match to the first value on the item? */ + if (item->first == x) + return true; + Assert(x > item->first); + + /* Is it in the packed codeword? */ + if (simple8b_contains(item->codeword, x, item->first)) + return true; + + return false; +} + +/* + * Begin in-order scan through all the values. + * + * While the iteration is in-progress, you cannot add new values to the set. + */ +void +intset_begin_iterate(IntegerSet *intset) +{ + intset->iter_node = intset->leftmost_leaf; + intset->iter_itemno = 0; + intset->iter_valueno = 0; + intset->iter_num_values = 0; + intset->iter_values = intset->iter_values_buf; +} + +/* + * Returns the next integer, when iterating. + * + * intset_begin_iterate() must be called first. intset_iterate_next() returns + * the next value in the set. Returns true, if there was another value, and + * stores the value in *next. Otherwise, returns false. + */ +bool +intset_iterate_next(IntegerSet *intset, uint64 *next) +{ + for (;;) + { + if (intset->iter_valueno < intset->iter_num_values) + { + *next = intset->iter_values[intset->iter_valueno++]; + return true; + } + + /* Our queue is empty, decode next leaf item */ + if (intset->iter_node && intset->iter_itemno < intset->iter_node->num_items) + { + /* We have reached end of this packed item. Step to the next one. */ + leaf_item *item; + int num_decoded; + + item = &intset->iter_node->items[intset->iter_itemno++]; + + intset->iter_values[0] = item->first; + num_decoded = simple8b_decode(item->codeword, &intset->iter_values[1], item->first); + intset->iter_num_values = num_decoded + 1; + + intset->iter_valueno = 0; + continue; + } + + /* No more items on this leaf, step to next node */ + if (intset->iter_node) + { + /* No more matches on this bucket. Step to the next node. */ + intset->iter_node = intset->iter_node->next; + intset->iter_itemno = 0; + intset->iter_valueno = 0; + intset->iter_num_values = 0; + continue; + } + + /* + * We have reached the end of the B-tree. But we might still have + * some integers in the buffer of newly-added values. + */ + if (intset->iter_values == intset->iter_values_buf) + { + intset->iter_values = intset->buffered_values; + intset->iter_num_values = intset->num_buffered_values; + continue; + } + + break; + } + + /* No more results. */ + *next = 0; + return false; +} + +/* + * intset_binsrch_uint64() -- search a sorted array of uint64s + * + * Returns the first position with key equal or less than the given key. + * The returned position would be the "insert" location for the given key, + * that is, the position where the new key should be inserted to. + * + * 'nextkey' affects the behavior on equal keys. If true, and there is an + * equal key in the array, this returns the position immediately after the + * equal key. If false, this returns the position of the equal key itself. + */ +static int +intset_binsrch_uint64(uint64 item, uint64 *arr, int arr_elems, bool nextkey) +{ + int low, + high, + mid; + + low = 0; + high = arr_elems; + while (high > low) + { + mid = low + (high - low) / 2; + + if (nextkey) + { + if (item >= arr[mid]) + low = mid + 1; + else + high = mid; + } + else + { + if (item > arr[mid]) + low = mid + 1; + else + high = mid; + } + } + + return low; +} + +/* same, but for an array of leaf items */ +static int +intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey) +{ + int low, + high, + mid; + + low = 0; + high = arr_elems; + while (high > low) + { + mid = low + (high - low) / 2; + + if (nextkey) + { + if (item >= arr[mid].first) + low = mid + 1; + else + high = mid; + } + else + { + if (item > arr[mid].first) + low = mid + 1; + else + high = mid; + } + } + + return low; +} + +/* + * Simple-8b encoding. + * + * Simple-8b algorithm packs between 1 and 240 integers into 64-bit words, + * called "codewords". The number of integers packed into a single codeword + * depends on the integers being packed: small integers are encoded using + * fewer bits than large integers. A single codeword can store a single + * 60-bit integer, or two 30-bit integers, for example. + * + * Since we're storing a unique, sorted, set of integers, we actually encode + * the *differences* between consecutive integers. That way, clusters of + * integers that are close to each other are packed efficiently, regardless + * of the absolute values. + * + * In Simple-8b, each codeword consists of a 4-bit selector, which indicates + * how many integers are encoded in the codeword, and the encoded integers + * packed into the remaining 60 bits. The selector allows for 16 different + * ways of using the remaining 60 bits, "modes". The number of integers + * packed into a single codeword is listed in the simple8b_modes table below. + * For example, consider the following codeword: + * + * 20-bit integer 20-bit integer 20-bit integer + * 1101 00000000000000010010 01111010000100100000 00000000000000010100 + * ^ + * selector + * + * The selector 1101 is 13 in decimal. From the modes table below, we see + * that it means that the codeword encodes three 12-bit integers. In decimal, + * those integers are 18, 500000 and 20. Because we encode deltas rather than + * absolute values, the actual values that they represent are 18, 500018 and + * 500038. + * + * Modes 0 and 1 are a bit special; they encode a run of 240 or 120 zeros + * (which means 240 or 120 consecutive integers, since we're encoding the + * the deltas between integers), without using the rest of the codeword bits + * for anything. + * + * Simple-8b cannot encode integers larger than 60 bits. Values larger than + * that are always stored in the 'first' field of a leaf item, never in the + * packed codeword. If there is a sequence of integers that are more than + * 2^60 apart, the codeword will go unused on those items. To represent that, + * we use a magic EMPTY_CODEWORD codeword. + */ +static const struct +{ + uint8 bits_per_int; + uint8 num_ints; +} simple8b_modes[17] = +{ + {0, 240}, /* mode 0: 240 zeros */ + {0, 120}, /* mode 1: 120 zeros */ + {1, 60}, /* mode 2: sixty 1-bit integers */ + {2, 30}, /* mode 3: thirty 2-bit integers */ + {3, 20}, /* mode 4: twenty 3-bit integers */ + {4, 15}, /* mode 5: fifteen 4-bit integers */ + {5, 12}, /* mode 6: twelve 5-bit integers */ + {6, 10}, /* mode 7: ten 6-bit integers */ + {7, 8}, /* mode 8: eight 7-bit integers (four bits + * are wasted) */ + {8, 7}, /* mode 9: seven 8-bit integers (four bits + * are wasted) */ + {10, 6}, /* mode 10: six 10-bit integers */ + {12, 5}, /* mode 11: five 12-bit integers */ + {15, 4}, /* mode 12: four 15-bit integers */ + {20, 3}, /* mode 13: three 20-bit integers */ + {30, 2}, /* mode 14: two 30-bit integers */ + {60, 1}, /* mode 15: one 60-bit integer */ + {0, 0} /* sentinel value */ +}; + +/* + * EMPTY_CODEWORD is a special value, used to indicate "no values". + * It is used if the next value is too large to be encoded with Simple-8b. + * + * This value looks like a 0-mode codeword, but we check for it + * specifically. (In a real 0-mode codeword, all the unused bits are zero.) + */ +#define EMPTY_CODEWORD (0xFFFFFFFFFFFFFFF0) + +/* + * Encode a number of integers into a Simple-8b codeword. + * + * Returns the number of integers that were encoded. + */ +static uint64 +simple8b_encode(uint64 *ints, int *num_encoded, uint64 base) +{ + int selector; + int nints; + int bits; + uint64 diff; + uint64 last_val; + uint64 codeword; + uint64 diffs[60]; + int i; + + Assert(ints[0] > base); + + /* + * Select the "mode" to use for the next codeword. + * + * In each iteration, check if the next value can be represented in the + * current mode we're considering. If it's too large, then step up the + * mode to a wider one, and repeat. If it fits, move on to the next + * integer. Repeat until the codeword is full, given the current mode. + * + * Note that we don't have any way to represent unused slots in the + * codeword, so we require each codeword to be "full". + */ + selector = 0; + nints = simple8b_modes[0].num_ints; + bits = simple8b_modes[0].bits_per_int; + diff = ints[0] - base - 1; + last_val = ints[0]; + i = 0; + for (;;) + { + if (diff >= (1L << bits)) + { + /* too large, step up to next mode */ + selector++; + nints = simple8b_modes[selector].num_ints; + bits = simple8b_modes[selector].bits_per_int; + if (i >= nints) + break; + } + else + { + if (i < 60) + diffs[i] = diff; + i++; + if (i >= nints) + break; + + Assert(ints[i] > last_val); + diff = ints[i] - last_val - 1; + last_val = ints[i]; + } + } + + if (nints == 0) + { + /* The next value is too large and be encoded with Simple-8b */ + Assert(i == 0); + *num_encoded = 0; + return EMPTY_CODEWORD; + } + + /* + * Encode the integers using the selected mode. Note that we shift them + * into the codeword in reverse order, so that they will come out in the + * correct order in the decoder. + */ + codeword = 0; + if (bits > 0) + { + for (i = nints - 1; i >= 0; i--) + { + codeword <<= bits; + codeword |= diffs[i]; + } + } + + /* add selector to the codeword, and return */ + codeword <<= 4; + codeword |= selector; + + *num_encoded = nints; + return codeword; +} + +/* + * Decode a codeword into an array of integers. + */ +static int +simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base) +{ + int selector = codeword & 0x0f; + int nints = simple8b_modes[selector].num_ints; + uint64 bits = simple8b_modes[selector].bits_per_int; + uint64 mask = (1L << bits) - 1; + uint64 prev_value; + + if (codeword == EMPTY_CODEWORD) + return 0; + + codeword >>= 4; /* shift out the selector */ + + prev_value = base; + for (int i = 0; i < nints; i++) + { + uint64 diff = codeword & mask; + + decoded[i] = prev_value + 1L + diff; + prev_value = decoded[i]; + codeword >>= bits; + } + return nints; +} + +/* + * This is very similar to simple8b_decode(), but instead of decoding all + * the values to an array, it just checks if the given integer is part of + * the codeword. + */ +static bool +simple8b_contains(uint64 codeword, uint64 key, uint64 base) +{ + int selector = codeword & 0x0f; + int nints = simple8b_modes[selector].num_ints; + int bits = simple8b_modes[selector].bits_per_int; + + if (codeword == EMPTY_CODEWORD) + return false; + + codeword >>= 4; /* shift out the selector */ + + if (bits == 0) + { + /* Special handling for 0-bit cases. */ + return key - base <= nints; + } + else + { + int mask = (1L << bits) - 1; + uint64 prev_value; + + prev_value = base; + for (int i = 0; i < nints; i++) + { + uint64 diff = codeword & mask; + uint64 curr_value; + + curr_value = prev_value + 1L + diff; + + if (curr_value >= key) + { + if (curr_value == key) + return true; + else + return false; + } + + codeword >>= bits; + prev_value = curr_value; + } + } + return false; +} diff --git a/src/include/lib/integerset.h b/src/include/lib/integerset.h new file mode 100644 index 0000000000..a7882c1a26 --- /dev/null +++ b/src/include/lib/integerset.h @@ -0,0 +1,24 @@ +/* + * integerset.h + * In-memory data structure to hold a large set of integers efficiently + * + * Portions Copyright (c) 2012-2019, PostgreSQL Global Development Group + * + * src/include/lib/integerset.h + */ +#ifndef INTEGERSET_H +#define INTEGERSET_H + +typedef struct IntegerSet IntegerSet; + +extern IntegerSet *intset_create(void); +extern void intset_add_member(IntegerSet *intset, uint64 x); +extern bool intset_is_member(IntegerSet *intset, uint64 x); + +extern uint64 intset_num_entries(IntegerSet *intset); +extern uint64 intset_memory_usage(IntegerSet *intset); + +extern void intset_begin_iterate(IntegerSet *intset); +extern bool intset_iterate_next(IntegerSet *intset, uint64 *next); + +#endif /* INTEGERSET_H */ diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 19d60a506e..dfd0956aee 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -12,6 +12,7 @@ SUBDIRS = \ test_bloomfilter \ test_ddl_deparse \ test_extensions \ + test_integerset \ test_parser \ test_pg_dump \ test_predtest \ diff --git a/src/test/modules/test_integerset/.gitignore b/src/test/modules/test_integerset/.gitignore new file mode 100644 index 0000000000..5dcb3ff972 --- /dev/null +++ b/src/test/modules/test_integerset/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_integerset/Makefile b/src/test/modules/test_integerset/Makefile new file mode 100644 index 0000000000..3b7c4999d6 --- /dev/null +++ b/src/test/modules/test_integerset/Makefile @@ -0,0 +1,21 @@ +# src/test/modules/test_integerset/Makefile + +MODULE_big = test_integerset +OBJS = test_integerset.o $(WIN32RES) +PGFILEDESC = "test_integerset - test code for src/backend/lib/integerset.c" + +EXTENSION = test_integerset +DATA = test_integerset--1.0.sql + +REGRESS = test_integerset + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_integerset +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_integerset/README b/src/test/modules/test_integerset/README new file mode 100644 index 0000000000..3e4226adb5 --- /dev/null +++ b/src/test/modules/test_integerset/README @@ -0,0 +1,7 @@ +test_integerset contains unit tests for testing the integer set implementation, +in src/backend/lib/integerset.c + +The tests verify the correctness of the implemention, but they can also be +as a micro-benchmark: If you set the 'intset_tests_stats' flag in +test_integerset.c, the tests will print extra information about execution time +and memory usage. diff --git a/src/test/modules/test_integerset/expected/test_integerset.out b/src/test/modules/test_integerset/expected/test_integerset.out new file mode 100644 index 0000000000..d7c88ded09 --- /dev/null +++ b/src/test/modules/test_integerset/expected/test_integerset.out @@ -0,0 +1,14 @@ +CREATE EXTENSION test_integerset; +-- +-- These tests don't produce any interesting output. We're checking that +-- the operations complete without crashing or hanging and that none of their +-- internal sanity tests fail. They print progress information as INFOs, +-- which are not interesting for automated tests, so suppress those. +-- +SET client_min_messages = 'warning'; +SELECT test_integerset(); + test_integerset +----------------- + +(1 row) + diff --git a/src/test/modules/test_integerset/sql/test_integerset.sql b/src/test/modules/test_integerset/sql/test_integerset.sql new file mode 100644 index 0000000000..34223afa88 --- /dev/null +++ b/src/test/modules/test_integerset/sql/test_integerset.sql @@ -0,0 +1,11 @@ +CREATE EXTENSION test_integerset; + +-- +-- These tests don't produce any interesting output. We're checking that +-- the operations complete without crashing or hanging and that none of their +-- internal sanity tests fail. They print progress information as INFOs, +-- which are not interesting for automated tests, so suppress those. +-- +SET client_min_messages = 'warning'; + +SELECT test_integerset(); diff --git a/src/test/modules/test_integerset/test_integerset--1.0.sql b/src/test/modules/test_integerset/test_integerset--1.0.sql new file mode 100644 index 0000000000..d6d5a3f6cf --- /dev/null +++ b/src/test/modules/test_integerset/test_integerset--1.0.sql @@ -0,0 +1,8 @@ +/* src/test/modules/test_integerset/test_integerset--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_integerset" to load this file. \quit + +CREATE FUNCTION test_integerset() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_integerset/test_integerset.c b/src/test/modules/test_integerset/test_integerset.c new file mode 100644 index 0000000000..f2a5ec42d9 --- /dev/null +++ b/src/test/modules/test_integerset/test_integerset.c @@ -0,0 +1,622 @@ +/*-------------------------------------------------------------------------- + * + * test_integerset.c + * Test integer set data structure. + * + * Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_integerset/test_integerset.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "fmgr.h" +#include "lib/integerset.h" +#include "nodes/bitmapset.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" +#include "storage/block.h" +#include "storage/itemptr.h" +#include "miscadmin.h" + +/* + * If you enable this, the "pattern" tests will print information about + * how long populating, probing, and iterating the test set takes, and + * how much memory the test set consumed. That can be used as + * micro-benchmark of various operations and input patterns (you might + * want to increase the number of values used in each of the test, if + * you do that, to reduce noise) + * + * The information is printed to the server's stderr, mostly because + * that's where MemoryContextStats() output goes. + */ +static const bool intset_test_stats = false; + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(test_integerset); + +/* + * A struct to define a pattern of integers, for use with test_pattern() + * function. + */ +typedef struct +{ + char *test_name; /* short name of the test, for humans */ + char *pattern_str; /* a bit pattern */ + uint64 spacing; /* pattern repeats at this interval */ + uint64 num_values; /* number of integers to set in total */ +} test_spec; + +static const test_spec test_specs[] = { + { + "all ones", "1111111111", + 10, 10000000 + }, + { + "alternating bits", "0101010101", + 10, 10000000 + }, + { + "clusters of ten", "1111111111", + 10000, 10000000 + }, + { + "clusters of hundred", + "1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111", + 10000, 100000000 + }, + { + "one-every-64k", "1", + 65536, 10000000 + }, + { + "sparse", "100000000000000000000000000000001", + 10000000, 10000000 + }, + { + "single values, distance > 2^32", "1", + 10000000000L, 1000000 + }, + { + "clusters, distance > 2^32", "10101010", + 10000000000L, 10000000 + }, + { + "clusters, distance > 2^60", "10101010", + 2000000000000000000L, + 23 /* can't be much higher than this, or we + * overflow uint64 */ + } +}; + +static void test_pattern(const test_spec *spec); +static void test_empty(void); +static void test_single_value(uint64 value); +static void check_with_filler(IntegerSet *intset, uint64 x, uint64 value, uint64 filler_min, uint64 filler_max); +static void test_single_value_and_filler(uint64 value, uint64 filler_min, uint64 filler_max); +static void test_huge_distances(void); + +/* + * SQL-callable entry point to perform all tests. + */ +Datum +test_integerset(PG_FUNCTION_ARGS) +{ + MemoryContext test_ctx; + + test_ctx = AllocSetContextCreate(CurrentMemoryContext, + "test_integerset context", + ALLOCSET_DEFAULT_SIZES); + + /* Tests for various corner cases */ + test_empty(); + test_huge_distances(); + test_single_value(0); + test_single_value(1); + test_single_value(PG_UINT64_MAX - 1); + test_single_value(PG_UINT64_MAX); + test_single_value_and_filler(0, 1000, 2000); + test_single_value_and_filler(1, 1000, 2000); + test_single_value_and_filler(1, 1000, 2000000); + test_single_value_and_filler(PG_UINT64_MAX - 1, 1000, 2000); + test_single_value_and_filler(PG_UINT64_MAX, 1000, 2000); + + /* Test different test patterns, with lots of entries */ + for (int i = 0; i < lengthof(test_specs); i++) + { + MemoryContextReset(test_ctx); + test_pattern(&test_specs[i]); + } + + MemoryContextDelete(test_ctx); + + PG_RETURN_VOID(); +} + +/* + * Test with a repeating pattern, defined by the 'spec'. + */ +static void +test_pattern(const test_spec *spec) +{ + IntegerSet *intset; + MemoryContext intset_ctx; + MemoryContext old_ctx; + TimestampTz starttime; + TimestampTz endtime; + uint64 n; + uint64 last_int; + int patternlen; + uint64 *pattern_values; + uint64 pattern_num_values; + + elog(NOTICE, "testing intset with pattern \"%s\"", spec->test_name); + if (intset_test_stats) + fprintf(stderr, "-----\ntesting intset with pattern \"%s\"\n", spec->test_name); + + /* Pre-process the pattern, creating an array of integers from it. */ + patternlen = strlen(spec->pattern_str); + pattern_values = palloc(patternlen * sizeof(uint64)); + pattern_num_values = 0; + for (int i = 0; i < patternlen; i++) + { + if (spec->pattern_str[i] == '1') + pattern_values[pattern_num_values++] = i; + } + + /* + * Allocate the integer set. + * + * Allocate it in a separate memory context, so that we can print its + * memory usage easily. (intset_create() creates a memory context of its + * own, too, but we don't have direct access to it, so we cannot call + * MemoryContextStats() on it directly). + */ + intset_ctx = AllocSetContextCreate(CurrentMemoryContext, + "intset test", + ALLOCSET_SMALL_SIZES); + MemoryContextSetIdentifier(intset_ctx, spec->test_name); + old_ctx = MemoryContextSwitchTo(intset_ctx); + intset = intset_create(); + MemoryContextSwitchTo(old_ctx); + + /* + * Add values to the set. + */ + starttime = GetCurrentTimestamp(); + + n = 0; + last_int = 0; + while (n < spec->num_values) + { + uint64 x = 0; + + for (int i = 0; i < pattern_num_values && n < spec->num_values; i++) + { + x = last_int + pattern_values[i]; + + intset_add_member(intset, x); + n++; + } + last_int += spec->spacing; + } + + endtime = GetCurrentTimestamp(); + + if (intset_test_stats) + fprintf(stderr, "added %lu values in %lu ms\n", + spec->num_values, (endtime - starttime) / 1000); + + /* + * Print stats on the amount of memory used. + * + * We print the usage reported by intset_memory_usage(), as well as the + * stats from the memory context. They should be in the same ballpark, + * but it's hard to automate testing that, so if you're making changes to + * the implementation, just observe that manually. + */ + if (intset_test_stats) + { + uint64 mem_usage; + + /* + * Also print memory usage as reported by intset_memory_usage(). It + * should be in the same ballpark as the usage reported by + * MemoryContextStats(). + */ + mem_usage = intset_memory_usage(intset); + fprintf(stderr, "intset_memory_usage() reported %lu (%0.2f bytes / integer)\n", + mem_usage, (double) mem_usage / spec->num_values); + + MemoryContextStats(intset_ctx); + } + + /* Check that intset_get_num_entries works */ + n = intset_num_entries(intset); + if (n != spec->num_values) + elog(ERROR, "intset_num_entries returned %lu, expected %lu", n, spec->num_values); + + /* + * Test random-access probes with intset_is_member() + */ + starttime = GetCurrentTimestamp(); + + for (n = 0; n < 100000; n++) + { + bool b; + bool expected; + uint64 x; + + /* + * Pick next value to probe at random. We limit the probes to the + * last integer that we added to the set, plus an arbitrary constant + * (1000). There's no point in probing the whole 0 - 2^64 range, if + * only a small part of the integer space is used. We would very + * rarely hit hit values that are actually in the set. + */ + x = (pg_lrand48() << 31) | pg_lrand48(); + x = x % (last_int + 1000); + + /* Do we expect this value to be present in the set? */ + if (x >= last_int) + expected = false; + else + { + uint64 idx = x % spec->spacing; + + if (idx >= patternlen) + expected = false; + else if (spec->pattern_str[idx] == '1') + expected = true; + else + expected = false; + } + + /* Is it present according to intset_is_member() ? */ + b = intset_is_member(intset, x); + + if (b != expected) + elog(ERROR, "mismatch at %lu: %d vs %d", x, b, expected); + } + endtime = GetCurrentTimestamp(); + if (intset_test_stats) + fprintf(stderr, "probed %lu values in %lu ms\n", n, (endtime - starttime) / 1000); + + /* + * Test iterator + */ + starttime = GetCurrentTimestamp(); + + intset_begin_iterate(intset); + n = 0; + last_int = 0; + while (n < spec->num_values) + { + for (int i = 0; i < pattern_num_values && n < spec->num_values; i++) + { + uint64 expected = last_int + pattern_values[i]; + uint64 x; + + if (!intset_iterate_next(intset, &x)) + break; + + if (x != expected) + elog(ERROR, "iterate returned wrong value; got %lu, expected %lu", x, expected); + n++; + } + last_int += spec->spacing; + } + endtime = GetCurrentTimestamp(); + if (intset_test_stats) + fprintf(stderr, "iterated %lu values in %lu ms\n", n, (endtime - starttime) / 1000); + + if (n < spec->num_values) + elog(ERROR, "iterator stopped short after %lu entries, expected %lu", n, spec->num_values); + if (n > spec->num_values) + elog(ERROR, "iterator returned %lu entries, %lu was expected", n, spec->num_values); + + MemoryContextDelete(intset_ctx); +} + +/* + * Test with a set containing a single integer. + */ +static void +test_single_value(uint64 value) +{ + IntegerSet *intset; + uint64 x; + uint64 num_entries; + bool found; + + elog(NOTICE, "testing intset with single value %lu", value); + + /* Create the set. */ + intset = intset_create(); + intset_add_member(intset, value); + + /* Test intset_get_num_entries() */ + num_entries = intset_num_entries(intset); + if (num_entries != 1) + elog(ERROR, "intset_num_entries returned %lu, expected %lu", num_entries, 1L); + + /* + * Test intset_is_member() at various special values, like 0 and and + * maximum possible 64-bit integer, as well as the value itself. + */ + if (intset_is_member(intset, 0) != (value == 0)) + elog(ERROR, "intset_is_member failed for 0"); + if (intset_is_member(intset, 1) != (value == 1)) + elog(ERROR, "intset_is_member failed for 1"); + if (intset_is_member(intset, PG_UINT64_MAX) != (value == PG_UINT64_MAX)) + elog(ERROR, "intset_is_member failed for PG_UINT64_MAX"); + if (intset_is_member(intset, value) != true) + elog(ERROR, "intset_is_member failed for the tested value"); + + /* + * Test iterator + */ + intset_begin_iterate(intset); + found = intset_iterate_next(intset, &x); + if (!found || x != value) + elog(ERROR, "intset_iterate_next failed for %lu", x); + + found = intset_iterate_next(intset, &x); + if (found) + elog(ERROR, "intset_iterate_next failed %lu", x); +} + +/* + * Test with an integer set that contains: + * + * - a given single 'value', and + * - all integers between 'filler_min' and 'filler_max'. + * + * This exercises different codepaths than testing just with a single value, + * because the implementation buffers newly-added values. If we add just + * single value to the set, we won't test the internal B-tree code at all, + * just the code that deals with the buffer. + */ +static void +test_single_value_and_filler(uint64 value, uint64 filler_min, uint64 filler_max) +{ + IntegerSet *intset; + uint64 x; + bool found; + uint64 *iter_expected; + uint64 n = 0; + uint64 num_entries = 0; + uint64 mem_usage; + + elog(NOTICE, "testing intset with value %lu, and all between %lu and %lu", + value, filler_min, filler_max); + + intset = intset_create(); + + iter_expected = palloc(sizeof(uint64) * (filler_max - filler_min + 1)); + if (value < filler_min) + { + intset_add_member(intset, value); + iter_expected[n++] = value; + } + + for (x = filler_min; x < filler_max; x++) + { + intset_add_member(intset, x); + iter_expected[n++] = x; + } + + if (value >= filler_max) + { + intset_add_member(intset, value); + iter_expected[n++] = value; + } + + /* Test intset_get_num_entries() */ + num_entries = intset_num_entries(intset); + if (num_entries != n) + elog(ERROR, "intset_num_entries returned %lu, expected %lu", num_entries, n); + + /* + * Test intset_is_member() at various spots, at and around the values that + * we expect to be set, as well as 0 and the maximum possible value. + */ + check_with_filler(intset, 0, + value, filler_min, filler_max); + check_with_filler(intset, 1, + value, filler_min, filler_max); + check_with_filler(intset, filler_min - 1, + value, filler_min, filler_max); + check_with_filler(intset, filler_min, + value, filler_min, filler_max); + check_with_filler(intset, filler_min + 1, + value, filler_min, filler_max); + check_with_filler(intset, value - 1, + value, filler_min, filler_max); + check_with_filler(intset, value, + value, filler_min, filler_max); + check_with_filler(intset, value + 1, + value, filler_min, filler_max); + check_with_filler(intset, filler_max - 1, + value, filler_min, filler_max); + check_with_filler(intset, filler_max, + value, filler_min, filler_max); + check_with_filler(intset, filler_max + 1, + value, filler_min, filler_max); + check_with_filler(intset, PG_UINT64_MAX - 1, + value, filler_min, filler_max); + check_with_filler(intset, PG_UINT64_MAX, + value, filler_min, filler_max); + + intset_begin_iterate(intset); + for (uint64 i = 0; i < n; i++) + { + found = intset_iterate_next(intset, &x); + if (!found || x != iter_expected[i]) + elog(ERROR, "intset_iterate_next failed for %lu", x); + } + found = intset_iterate_next(intset, &x); + if (found) + elog(ERROR, "intset_iterate_next failed %lu", x); + + mem_usage = intset_memory_usage(intset); + if (mem_usage < 5000 || mem_usage > 500000000) + elog(ERROR, "intset_memory_usage() reported suspicous value: %lu", mem_usage); +} + +/* + * Helper function for test_single_value_and_filler. + * + * Calls intset_is_member() for value 'x', and checks that the result is what + * we expect. + */ +static void +check_with_filler(IntegerSet *intset, uint64 x, + uint64 value, uint64 filler_min, uint64 filler_max) +{ + bool expected; + bool actual; + + expected = (x == value || (filler_min <= x && x < filler_max)); + + actual = intset_is_member(intset, x); + + if (actual != expected) + elog(ERROR, "intset_is_member failed for %lu", x); +} + +/* + * Test empty set + */ +static void +test_empty(void) +{ + IntegerSet *intset; + uint64 x; + + elog(NOTICE, "testing intset with empty set"); + + intset = intset_create(); + + /* Test intset_is_member() */ + if (intset_is_member(intset, 0) != false) + elog(ERROR, "intset_is_member on empty set returned true"); + if (intset_is_member(intset, 1) != false) + elog(ERROR, "intset_is_member on empty set returned true"); + if (intset_is_member(intset, PG_UINT64_MAX) != false) + elog(ERROR, "intset_is_member on empty set returned true"); + + /* Test iterator */ + intset_begin_iterate(intset); + if (intset_iterate_next(intset, &x)) + elog(ERROR, "intset_iterate_next on empty set returned a value (%lu)", x); +} + +/* + * Test with integers that are more than 2^60 apart. + * + * The Simple-8b encoding used by the set implementation can only encode + * values up to 2^60. That makes large differences like this interesting + * to test. + */ +static void +test_huge_distances(void) +{ + IntegerSet *intset; + uint64 values[1000]; + int num_values = 0; + uint64 val = 0; + bool found; + uint64 x; + + elog(NOTICE, "testing intset with distances > 2^60 between values"); + + val = 0; + values[num_values++] = val; + + val += 1152921504606846976L - 1; /* 2^60 - 1 */ + values[num_values++] = val; + + val += 1152921504606846976L - 1; /* 2^60 - 1 */ + values[num_values++] = val; + + val += 1152921504606846976L; /* 2^60 */ + values[num_values++] = val; + + val += 1152921504606846976L; /* 2^60 */ + values[num_values++] = val; + + val += 1152921504606846976L; /* 2^60 */ + values[num_values++] = val; + + val += 1152921504606846976L + 1; /* 2^60 + 1 */ + values[num_values++] = val; + + val += 1152921504606846976L + 1; /* 2^60 + 1 */ + values[num_values++] = val; + + val += 1152921504606846976L + 1; /* 2^60 + 1 */ + values[num_values++] = val; + + val += 1152921504606846976L; /* 2^60 */ + values[num_values++] = val; + + /* we're now very close to 2^64, so can't add large values anymore */ + + intset = intset_create(); + + /* + * Add many more values to the end, to make sure that all the above values + * get flushed and packed into the tree structure. + */ + while (num_values < 1000) + { + val += pg_lrand48(); + values[num_values++] = val; + } + + /* Add these numbers to the set */ + for (int i = 0; i < num_values; i++) + intset_add_member(intset, values[i]); + + /* + * Test iterset_is_member() around each of these values + */ + for (int i = 0; i < num_values; i++) + { + uint64 x = values[i]; + bool result; + + if (x > 0) + { + result = intset_is_member(intset, x - 1); + if (result != false) + elog(ERROR, "intset_is_member failed for %lu", x - 1); + } + + result = intset_is_member(intset, x); + if (result != true) + elog(ERROR, "intset_is_member failed for %lu", x); + + result = intset_is_member(intset, x + 1); + if (result != false) + elog(ERROR, "intset_is_member failed for %lu", x + 1); + } + + /* + * Test iterator + */ + intset_begin_iterate(intset); + for (int i = 0; i < num_values; i++) + { + found = intset_iterate_next(intset, &x); + if (!found || x != values[i]) + elog(ERROR, "intset_iterate_next failed for %lu", x); + } + found = intset_iterate_next(intset, &x); + if (found) + elog(ERROR, "intset_iterate_next failed %lu", x); +} diff --git a/src/test/modules/test_integerset/test_integerset.control b/src/test/modules/test_integerset/test_integerset.control new file mode 100644 index 0000000000..7d20c2d7b8 --- /dev/null +++ b/src/test/modules/test_integerset/test_integerset.control @@ -0,0 +1,4 @@ +comment = 'Test code for integerset' +default_version = '1.0' +module_pathname = '$libdir/test_integerset' +relocatable = true