2005-04-18 00:24:02 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* tidbitmap.c
|
|
|
|
* PostgreSQL tuple-id (TID) bitmap package
|
|
|
|
*
|
|
|
|
* This module provides bitmap data structures that are spiritually
|
|
|
|
* similar to Bitmapsets, but are specially adapted to store sets of
|
|
|
|
* tuple identifiers (TIDs), or ItemPointers. In particular, the division
|
|
|
|
* of an ItemPointer into BlockNumber and OffsetNumber is catered for.
|
|
|
|
* Also, since we wish to be able to store very large tuple sets in
|
|
|
|
* memory with this data structure, we support "lossy" storage, in which
|
|
|
|
* we no longer remember individual tuple offsets on a page but only the
|
|
|
|
* fact that a particular page needs to be visited.
|
|
|
|
*
|
|
|
|
* The "lossy" storage uses one bit per disk page, so at the standard 8K
|
|
|
|
* BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
|
|
|
|
* of memory. People pushing around tables of that size should have a
|
|
|
|
* couple of Mb to spare, so we don't worry about providing a second level
|
|
|
|
* of lossiness. In theory we could fall back to page ranges at some
|
|
|
|
* point, but for now that seems useless complexity.
|
|
|
|
*
|
2014-05-06 18:12:18 +02:00
|
|
|
* We also support the notion of candidate matches, or rechecking. This
|
2008-04-11 00:25:26 +02:00
|
|
|
* means we know that a search need visit only some tuples on a page,
|
|
|
|
* but we are not certain that all of those tuples are real matches.
|
|
|
|
* So the eventual heap scan must recheck the quals for these tuples only,
|
|
|
|
* rather than rechecking the quals for all tuples on the page as in the
|
|
|
|
* lossy-bitmap case. Rechecking can be specified when TIDs are inserted
|
|
|
|
* into a bitmap, and it can also happen internally when we AND a lossy
|
|
|
|
* and a non-lossy page.
|
|
|
|
*
|
2005-04-18 00:24:02 +02:00
|
|
|
*
|
2017-01-03 19:48:53 +01:00
|
|
|
* Copyright (c) 2003-2017, PostgreSQL Global Development Group
|
2005-04-18 00:24:02 +02:00
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/nodes/tidbitmap.c
|
2005-04-18 00:24:02 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
#include <limits.h>
|
|
|
|
|
2012-08-30 22:15:44 +02:00
|
|
|
#include "access/htup_details.h"
|
2007-09-20 19:56:33 +02:00
|
|
|
#include "nodes/bitmapset.h"
|
2005-04-18 00:24:02 +02:00
|
|
|
#include "nodes/tidbitmap.h"
|
|
|
|
|
|
|
|
/*
 * Size of the per-page tuple bitmap.
 *
 * A heap page cannot hold very many tuples (typically 256 with 8K pages,
 * or 1024 with 32K pages), so making the per-page bitmaps variable-size
 * would buy little.  Every exact page simply gets a bitmap of this size:
 */
#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
/*
 * Number of pages summarized by one lossy chunk.
 *
 * Once we switch to lossy storage we keep just one bit per page.  All
 * pages whose block numbers agree after integer division by
 * PAGES_PER_CHUNK are aggregated into a single chunk.  When a chunk exists
 * and has the bit for a given page set, the page table must not also hold
 * a per-page (exact) entry for that page.
 *
 * Exact pages and lossy chunks are stored in the same hash table, using
 * identical data structures, because the memory management for hashtables
 * does not efficiently allow space to be transferred from one hashtable to
 * another.  It is therefore best if PAGES_PER_CHUNK equals
 * MAX_TUPLES_PER_PAGE, or at least is not too different from it.  We also
 * want PAGES_PER_CHUNK to be a power of 2, to avoid expensive integer
 * remainder operations.  Hence this definition:
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)
|
|
|
|
|
2007-09-20 19:56:33 +02:00
|
|
|
/*
 * Bitmap word arithmetic.  BITS_PER_BITMAPWORD and the bitmapword typedef
 * are borrowed from nodes/bitmapset.h.
 */

/* index of the word holding bit x, and position of bit x inside that word */
#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* how many words are in use for an exact page (rounded up) */
#define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* how many words are in use for a lossy chunk (rounded up) */
#define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hashtable entries are represented by this data structure. For
|
|
|
|
* an exact page, blockno is the page number and bit k of the bitmap
|
|
|
|
* represents tuple offset k+1. For a lossy chunk, blockno is the first
|
|
|
|
* page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
|
|
|
|
* bit k represents page blockno+k. Note that it is not possible to
|
|
|
|
* have exact storage for the first page of a chunk if we are using
|
|
|
|
* lossy storage for any page in the chunk's range, since the same
|
|
|
|
* hashtable entry has to serve both purposes.
|
2008-04-11 00:25:26 +02:00
|
|
|
*
|
|
|
|
* recheck is used only on exact pages --- it indicates that although
|
|
|
|
* only the stated tuples need be checked, the full index qual condition
|
|
|
|
* must be checked for each (ie, these are candidate matches).
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
|
|
|
typedef struct PagetableEntry
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
BlockNumber blockno; /* page number (hashtable key) */
|
2016-10-15 01:05:30 +02:00
|
|
|
char status; /* hash entry status */
|
2005-04-18 00:24:02 +02:00
|
|
|
bool ischunk; /* T = lossy storage, F = exact */
|
2008-04-11 00:25:26 +02:00
|
|
|
bool recheck; /* should the tuples be rechecked? */
|
2005-04-18 00:24:02 +02:00
|
|
|
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
|
|
|
|
} PagetableEntry;
|
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
/*
 * TBMStatus - which storage representation a TIDBitmap currently uses.
 *
 * Creating the hashtable is comparatively expensive, so we put it off
 * until it is really needed.  This matters most for a bitmap scan on the
 * inside of a nestloop join, where a bitmap may live only long enough to
 * accumulate a single entry.  The hashtable is therefore not built until
 * a second pagetable entry is required; a lone entry is kept in a fixed
 * field of the TIDBitmap instead.
 *
 * NOTE: the hashtable is not discarded if the bitmap later shrinks back to
 * zero or one page, so the status can be TBM_HASH even when nentries is
 * zero or one.
 */
typedef enum
{
	TBM_EMPTY,					/* nentries == 0, no hashtable exists */
	TBM_ONE_PAGE,				/* the only entry is stored in entry1 */
	TBM_HASH					/* pagetable is valid, entry1 is not */
} TBMStatus;
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
|
|
|
* Here is the representation for a whole TIDBitMap:
|
|
|
|
*/
|
|
|
|
struct TIDBitmap
|
|
|
|
{
|
|
|
|
NodeTag type; /* to make it a valid Node */
|
|
|
|
MemoryContext mcxt; /* memory context containing me */
|
2005-05-17 02:43:47 +02:00
|
|
|
TBMStatus status; /* see codes above */
|
2016-10-15 01:05:30 +02:00
|
|
|
struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
|
2005-04-18 00:24:02 +02:00
|
|
|
int nentries; /* number of entries in pagetable */
|
|
|
|
int maxentries; /* limit on same to meet maxbytes */
|
|
|
|
int npages; /* number of exact entries in pagetable */
|
|
|
|
int nchunks; /* number of lossy entries in pagetable */
|
|
|
|
bool iterating; /* tbm_begin_iterate called? */
|
2016-10-15 01:05:30 +02:00
|
|
|
uint32 lossify_start; /* offset to start lossifying hashtable at */
|
2005-05-17 02:43:47 +02:00
|
|
|
PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
|
2009-01-10 22:08:36 +01:00
|
|
|
/* these are valid when iterating is true: */
|
2005-04-18 00:24:02 +02:00
|
|
|
PagetableEntry **spages; /* sorted exact-page list, or NULL */
|
|
|
|
PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
|
2009-01-10 22:08:36 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When iterating over a bitmap in sorted order, a TBMIterator is used to
|
2014-05-06 18:12:18 +02:00
|
|
|
* track our progress. There can be several iterators scanning the same
|
2009-01-10 22:08:36 +01:00
|
|
|
* bitmap concurrently. Note that the bitmap becomes read-only as soon as
|
|
|
|
* any iterator is created.
|
|
|
|
*/
|
|
|
|
struct TBMIterator
|
|
|
|
{
|
|
|
|
TIDBitmap *tbm; /* TIDBitmap we're iterating over */
|
2005-04-18 00:24:02 +02:00
|
|
|
int spageptr; /* next spages index */
|
|
|
|
int schunkptr; /* next schunks index */
|
|
|
|
int schunkbit; /* next bit to check in current schunk */
|
2005-05-17 02:43:47 +02:00
|
|
|
TBMIterateResult output; /* MUST BE LAST (because variable-size) */
|
2005-04-18 00:24:02 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/* Local function prototypes */
|
2005-05-17 02:43:47 +02:00
|
|
|
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
|
2005-07-24 04:25:26 +02:00
|
|
|
static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
|
2005-10-15 04:49:52 +02:00
|
|
|
const TIDBitmap *b);
|
2005-05-17 02:43:47 +02:00
|
|
|
static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
|
2005-10-15 04:49:52 +02:00
|
|
|
BlockNumber pageno);
|
2005-04-18 00:24:02 +02:00
|
|
|
static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
|
|
|
|
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
|
|
|
|
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
|
|
|
|
static void tbm_lossify(TIDBitmap *tbm);
|
|
|
|
static int tbm_comparator(const void *left, const void *right);
|
|
|
|
|
2016-10-15 01:05:30 +02:00
|
|
|
/*
|
|
|
|
* Simple inline murmur hash implementation for the exact width required, for
|
|
|
|
* performance.
|
|
|
|
*/
|
|
|
|
static inline uint32
|
|
|
|
hash_blockno(BlockNumber b)
|
|
|
|
{
|
|
|
|
uint32 h = b;
|
|
|
|
|
|
|
|
h ^= h >> 16;
|
|
|
|
h *= 0x85ebca6b;
|
|
|
|
h ^= h >> 13;
|
|
|
|
h *= 0xc2b2ae35;
|
|
|
|
h ^= h >> 16;
|
|
|
|
return h;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Instantiate the simplehash.h template as "pagetable": a hash table of
 * PagetableEntry elements keyed by the blockno field (a BlockNumber) and
 * hashed with hash_blockno().  SH_DEFINE and SH_DECLARE are both set, so
 * the include emits the declarations as well as the definitions, all with
 * "static inline" scope.
 *
 * SH_EQUAL is fully parenthesized so that the comparison keeps its meaning
 * whatever argument expressions the template substitutes and whatever
 * larger expression it embeds the result in.
 */
#define SH_PREFIX pagetable
#define SH_ELEMENT_TYPE PagetableEntry
#define SH_KEY_TYPE BlockNumber
#define SH_KEY blockno
#define SH_HASH_KEY(tb, key) hash_blockno(key)
#define SH_EQUAL(tb, a, b) ((a) == (b))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_create - create an initially-empty bitmap
|
|
|
|
*
|
|
|
|
* The bitmap will live in the memory context that is CurrentMemoryContext
|
|
|
|
* at the time of this call. It will be limited to (approximately) maxbytes
|
|
|
|
* total memory consumption.
|
|
|
|
*/
|
|
|
|
TIDBitmap *
|
|
|
|
tbm_create(long maxbytes)
|
|
|
|
{
|
|
|
|
TIDBitmap *tbm;
|
|
|
|
long nbuckets;
|
|
|
|
|
2009-01-10 22:08:36 +01:00
|
|
|
/* Create the TIDBitmap struct and zero all its fields */
|
|
|
|
tbm = makeNode(TIDBitmap);
|
2005-05-17 02:43:47 +02:00
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
tbm->mcxt = CurrentMemoryContext;
|
2005-05-17 02:43:47 +02:00
|
|
|
tbm->status = TBM_EMPTY;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Estimate number of hashtable entries we can have within maxbytes. This
|
2016-10-15 01:05:30 +02:00
|
|
|
* estimates the hash cost as sizeof(PagetableEntry), which is good enough
|
|
|
|
* for our purpose. Also count an extra Pointer per entry for the arrays
|
|
|
|
* created during iteration readout.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
|
|
|
nbuckets = maxbytes /
|
2016-10-15 01:05:30 +02:00
|
|
|
(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
|
2005-10-15 04:49:52 +02:00
|
|
|
nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
|
|
|
|
nbuckets = Max(nbuckets, 16); /* sanity limit */
|
2005-04-18 00:24:02 +02:00
|
|
|
tbm->maxentries = (int) nbuckets;
|
2016-10-15 01:05:30 +02:00
|
|
|
tbm->lossify_start = 0;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
return tbm;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Actually create the hashtable. Since this is a moderately expensive
|
|
|
|
* proposition, we don't do it until we have to.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
tbm_create_pagetable(TIDBitmap *tbm)
|
|
|
|
{
|
|
|
|
Assert(tbm->status != TBM_HASH);
|
|
|
|
Assert(tbm->pagetable == NULL);
|
|
|
|
|
2017-02-07 23:01:40 +01:00
|
|
|
tbm->pagetable = pagetable_create(tbm->mcxt, 128);
|
2005-04-18 00:24:02 +02:00
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
/* If entry1 is valid, push it into the hashtable */
|
|
|
|
if (tbm->status == TBM_ONE_PAGE)
|
|
|
|
{
|
|
|
|
PagetableEntry *page;
|
|
|
|
bool found;
|
2016-10-15 01:05:30 +02:00
|
|
|
char oldstatus;
|
2005-05-17 02:43:47 +02:00
|
|
|
|
2016-10-15 01:05:30 +02:00
|
|
|
page = pagetable_insert(tbm->pagetable,
|
|
|
|
tbm->entry1.blockno,
|
|
|
|
&found);
|
2005-05-17 02:43:47 +02:00
|
|
|
Assert(!found);
|
2016-10-15 01:05:30 +02:00
|
|
|
oldstatus = page->status;
|
2005-05-17 02:43:47 +02:00
|
|
|
memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
|
2016-10-15 01:05:30 +02:00
|
|
|
page->status = oldstatus;
|
2005-05-17 02:43:47 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
tbm->status = TBM_HASH;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_free - free a TIDBitmap
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
tbm_free(TIDBitmap *tbm)
|
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
if (tbm->pagetable)
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_destroy(tbm->pagetable);
|
2005-04-18 00:24:02 +02:00
|
|
|
if (tbm->spages)
|
|
|
|
pfree(tbm->spages);
|
|
|
|
if (tbm->schunks)
|
|
|
|
pfree(tbm->schunks);
|
|
|
|
pfree(tbm);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_add_tuples - add some tuple IDs to a TIDBitmap
|
2008-04-11 00:25:26 +02:00
|
|
|
*
|
|
|
|
* If recheck is true, then the recheck flag will be set in the
|
|
|
|
* TBMIterateResult when any of these tuples are reported out.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
|
|
|
void
|
2008-04-11 00:25:26 +02:00
|
|
|
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
|
|
|
|
bool recheck)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2015-01-16 19:28:30 +01:00
|
|
|
BlockNumber currblk = InvalidBlockNumber;
|
|
|
|
PagetableEntry *page = NULL; /* only valid when currblk is valid */
|
|
|
|
int i;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
Assert(!tbm->iterating);
|
|
|
|
for (i = 0; i < ntids; i++)
|
|
|
|
{
|
|
|
|
BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
|
|
|
|
OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
|
|
|
|
int wordnum,
|
|
|
|
bitnum;
|
|
|
|
|
|
|
|
/* safety check to ensure we don't overrun bit array bounds */
|
|
|
|
if (off < 1 || off > MAX_TUPLES_PER_PAGE)
|
|
|
|
elog(ERROR, "tuple offset out of range: %u", off);
|
|
|
|
|
2015-01-16 19:28:30 +01:00
|
|
|
/*
|
|
|
|
* Look up target page unless we already did. This saves cycles when
|
|
|
|
* the input includes consecutive tuples on the same page, which is
|
|
|
|
* common enough to justify an extra test here.
|
|
|
|
*/
|
|
|
|
if (blk != currblk)
|
2015-01-16 17:47:59 +01:00
|
|
|
{
|
|
|
|
if (tbm_page_is_lossy(tbm, blk))
|
2015-01-16 19:28:30 +01:00
|
|
|
page = NULL; /* remember page is lossy */
|
|
|
|
else
|
|
|
|
page = tbm_get_pageentry(tbm, blk);
|
|
|
|
currblk = blk;
|
2015-01-16 17:47:59 +01:00
|
|
|
}
|
2005-04-18 00:24:02 +02:00
|
|
|
|
2015-01-16 19:28:30 +01:00
|
|
|
if (page == NULL)
|
|
|
|
continue; /* whole page is already marked */
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
if (page->ischunk)
|
|
|
|
{
|
|
|
|
/* The page is a lossy chunk header, set bit for itself */
|
|
|
|
wordnum = bitnum = 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* Page is exact, so set bit for individual tuple */
|
|
|
|
wordnum = WORDNUM(off - 1);
|
|
|
|
bitnum = BITNUM(off - 1);
|
|
|
|
}
|
|
|
|
page->words[wordnum] |= ((bitmapword) 1 << bitnum);
|
2008-04-11 00:25:26 +02:00
|
|
|
page->recheck |= recheck;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
if (tbm->nentries > tbm->maxentries)
|
2015-01-16 17:47:59 +01:00
|
|
|
{
|
2005-04-18 00:24:02 +02:00
|
|
|
tbm_lossify(tbm);
|
2015-01-16 19:28:30 +01:00
|
|
|
/* Page could have been converted to lossy, so force new lookup */
|
|
|
|
currblk = InvalidBlockNumber;
|
2015-01-16 17:47:59 +01:00
|
|
|
}
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-03-24 21:17:18 +01:00
|
|
|
/*
|
|
|
|
* tbm_add_page - add a whole page to a TIDBitmap
|
|
|
|
*
|
|
|
|
* This causes the whole page to be reported (with the recheck flag)
|
|
|
|
* when the TIDBitmap is scanned.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
|
|
|
|
{
|
|
|
|
/* Enter the page in the bitmap, or mark it lossy if already present */
|
|
|
|
tbm_mark_page_lossy(tbm, pageno);
|
|
|
|
/* If we went over the memory limit, lossify some more pages */
|
|
|
|
if (tbm->nentries > tbm->maxentries)
|
|
|
|
tbm_lossify(tbm);
|
|
|
|
}
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
|
|
|
* tbm_union - set union
|
|
|
|
*
|
|
|
|
* a is modified in-place, b is not changed
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
tbm_union(TIDBitmap *a, const TIDBitmap *b)
|
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
Assert(!a->iterating);
|
|
|
|
/* Nothing to do if b is empty */
|
|
|
|
if (b->nentries == 0)
|
|
|
|
return;
|
|
|
|
/* Scan through chunks and pages in b, merge into a */
|
|
|
|
if (b->status == TBM_ONE_PAGE)
|
|
|
|
tbm_union_page(a, &b->entry1);
|
|
|
|
else
|
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_iterator i;
|
2005-05-17 02:43:47 +02:00
|
|
|
PagetableEntry *bpage;
|
|
|
|
|
|
|
|
Assert(b->status == TBM_HASH);
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_start_iterate(b->pagetable, &i);
|
|
|
|
while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
|
2005-05-17 02:43:47 +02:00
|
|
|
tbm_union_page(a, bpage);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process one page of b during a union op */
|
|
|
|
static void
|
|
|
|
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
|
|
|
|
{
|
2005-04-18 00:24:02 +02:00
|
|
|
PagetableEntry *apage;
|
2005-10-15 04:49:52 +02:00
|
|
|
int wordnum;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
if (bpage->ischunk)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
/* Scan b's chunk, mark each indicated page lossy in a */
|
2013-11-16 00:34:14 +01:00
|
|
|
for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
bitmapword w = bpage->words[wordnum];
|
|
|
|
|
|
|
|
if (w != 0)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
BlockNumber pg;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
|
|
|
|
while (w != 0)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
if (w & 1)
|
|
|
|
tbm_mark_page_lossy(a, pg);
|
|
|
|
pg++;
|
|
|
|
w >>= 1;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2005-05-17 02:43:47 +02:00
|
|
|
}
|
|
|
|
else if (tbm_page_is_lossy(a, bpage->blockno))
|
|
|
|
{
|
|
|
|
/* page is already lossy in a, nothing to do */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
apage = tbm_get_pageentry(a, bpage->blockno);
|
|
|
|
if (apage->ischunk)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
/* The page is a lossy chunk header, set bit for itself */
|
|
|
|
apage->words[0] |= ((bitmapword) 1 << 0);
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
/* Both pages are exact, merge at the bit level */
|
|
|
|
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
|
|
|
|
apage->words[wordnum] |= bpage->words[wordnum];
|
2008-04-11 00:25:26 +02:00
|
|
|
apage->recheck |= bpage->recheck;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
}
|
2005-05-17 02:43:47 +02:00
|
|
|
|
|
|
|
if (a->nentries > a->maxentries)
|
|
|
|
tbm_lossify(a);
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_intersect - set intersection
|
|
|
|
*
|
|
|
|
* a is modified in-place, b is not changed
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
|
|
|
|
{
|
|
|
|
Assert(!a->iterating);
|
2005-05-17 02:43:47 +02:00
|
|
|
/* Nothing to do if a is empty */
|
|
|
|
if (a->nentries == 0)
|
|
|
|
return;
|
2005-04-18 00:24:02 +02:00
|
|
|
/* Scan through chunks and pages in a, try to match to b */
|
2005-05-17 02:43:47 +02:00
|
|
|
if (a->status == TBM_ONE_PAGE)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-07-24 04:25:26 +02:00
|
|
|
if (tbm_intersect_page(a, &a->entry1, b))
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
/* Page is now empty, remove it from a */
|
|
|
|
Assert(!a->entry1.ischunk);
|
|
|
|
a->npages--;
|
|
|
|
a->nentries--;
|
|
|
|
Assert(a->nentries == 0);
|
|
|
|
a->status = TBM_EMPTY;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_iterator i;
|
2005-05-17 02:43:47 +02:00
|
|
|
PagetableEntry *apage;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
Assert(a->status == TBM_HASH);
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_start_iterate(a->pagetable, &i);
|
|
|
|
while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
|
2005-05-17 02:43:47 +02:00
|
|
|
{
|
2005-07-24 04:25:26 +02:00
|
|
|
if (tbm_intersect_page(a, apage, b))
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
/* Page or chunk is now empty, remove it from a */
|
|
|
|
if (apage->ischunk)
|
|
|
|
a->nchunks--;
|
|
|
|
else
|
|
|
|
a->npages--;
|
|
|
|
a->nentries--;
|
2016-10-15 01:05:30 +02:00
|
|
|
if (!pagetable_delete(a->pagetable, apage->blockno))
|
2005-04-18 00:24:02 +02:00
|
|
|
elog(ERROR, "hash table corrupted");
|
|
|
|
}
|
|
|
|
}
|
2005-05-17 02:43:47 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Process one page of a during an intersection op
|
|
|
|
*
|
|
|
|
* Returns TRUE if apage is now empty and should be deleted from a
|
|
|
|
*/
|
|
|
|
static bool
|
2005-07-24 04:25:26 +02:00
|
|
|
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
|
2005-05-17 02:43:47 +02:00
|
|
|
{
|
|
|
|
const PagetableEntry *bpage;
|
2005-10-15 04:49:52 +02:00
|
|
|
int wordnum;
|
2005-05-17 02:43:47 +02:00
|
|
|
|
|
|
|
if (apage->ischunk)
|
|
|
|
{
|
|
|
|
/* Scan each bit in chunk, try to clear */
|
2005-10-15 04:49:52 +02:00
|
|
|
bool candelete = true;
|
2005-05-17 02:43:47 +02:00
|
|
|
|
2013-11-16 00:34:14 +01:00
|
|
|
for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
bitmapword w = apage->words[wordnum];
|
2005-04-18 00:24:02 +02:00
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
if (w != 0)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
bitmapword neww = w;
|
2005-10-15 04:49:52 +02:00
|
|
|
BlockNumber pg;
|
|
|
|
int bitnum;
|
2005-05-17 02:43:47 +02:00
|
|
|
|
|
|
|
pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
|
|
|
|
bitnum = 0;
|
|
|
|
while (w != 0)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
if (w & 1)
|
|
|
|
{
|
|
|
|
if (!tbm_page_is_lossy(b, pg) &&
|
|
|
|
tbm_find_pageentry(b, pg) == NULL)
|
|
|
|
{
|
|
|
|
/* Page is not in b at all, lose lossy bit */
|
|
|
|
neww &= ~((bitmapword) 1 << bitnum);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pg++;
|
|
|
|
bitnum++;
|
|
|
|
w >>= 1;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
2005-05-17 02:43:47 +02:00
|
|
|
apage->words[wordnum] = neww;
|
|
|
|
if (neww != 0)
|
|
|
|
candelete = false;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
2005-05-17 02:43:47 +02:00
|
|
|
}
|
|
|
|
return candelete;
|
|
|
|
}
|
|
|
|
else if (tbm_page_is_lossy(b, apage->blockno))
|
|
|
|
{
|
2005-07-24 04:25:26 +02:00
|
|
|
/*
|
2009-06-11 16:49:15 +02:00
|
|
|
* Some of the tuples in 'a' might not satisfy the quals for 'b', but
|
|
|
|
* because the page 'b' is lossy, we don't know which ones. Therefore
|
|
|
|
* we mark 'a' as requiring rechecks, to indicate that at most those
|
|
|
|
* tuples set in 'a' are matches.
|
2005-07-24 04:25:26 +02:00
|
|
|
*/
|
2008-04-11 00:25:26 +02:00
|
|
|
apage->recheck = true;
|
2005-05-17 02:43:47 +02:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
bool candelete = true;
|
2005-05-17 02:43:47 +02:00
|
|
|
|
|
|
|
bpage = tbm_find_pageentry(b, apage->blockno);
|
|
|
|
if (bpage != NULL)
|
|
|
|
{
|
|
|
|
/* Both pages are exact, merge at the bit level */
|
|
|
|
Assert(!bpage->ischunk);
|
|
|
|
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
apage->words[wordnum] &= bpage->words[wordnum];
|
|
|
|
if (apage->words[wordnum] != 0)
|
|
|
|
candelete = false;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
2008-04-11 00:25:26 +02:00
|
|
|
apage->recheck |= bpage->recheck;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
2008-04-11 00:25:26 +02:00
|
|
|
/* If there is no matching b page, we can just delete the a page */
|
2005-05-17 02:43:47 +02:00
|
|
|
return candelete;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-08-29 00:47:20 +02:00
|
|
|
/*
|
|
|
|
* tbm_is_empty - is a TIDBitmap completely empty?
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
tbm_is_empty(const TIDBitmap *tbm)
|
|
|
|
{
|
|
|
|
return (tbm->nentries == 0);
|
|
|
|
}
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
|
|
|
* tbm_begin_iterate - prepare to iterate through a TIDBitmap
|
|
|
|
*
|
2009-01-10 22:08:36 +01:00
|
|
|
* The TBMIterator struct is created in the caller's memory context.
|
|
|
|
* For a clean shutdown of the iteration, call tbm_end_iterate; but it's
|
|
|
|
* okay to just allow the memory context to be released, too. It is caller's
|
|
|
|
* responsibility not to touch the TBMIterator anymore once the TIDBitmap
|
|
|
|
* is freed.
|
|
|
|
*
|
2005-04-18 00:24:02 +02:00
|
|
|
* NB: after this is called, it is no longer allowed to modify the contents
|
|
|
|
* of the bitmap. However, you can call this multiple times to scan the
|
2009-01-10 22:08:36 +01:00
|
|
|
* contents repeatedly, including parallel scans.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
2009-01-10 22:08:36 +01:00
|
|
|
TBMIterator *
|
2005-04-18 00:24:02 +02:00
|
|
|
tbm_begin_iterate(TIDBitmap *tbm)
|
|
|
|
{
|
2009-01-10 22:08:36 +01:00
|
|
|
TBMIterator *iterator;
|
2005-10-15 04:49:52 +02:00
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
2009-01-10 22:08:36 +01:00
|
|
|
* Create the TBMIterator struct, with enough trailing space to serve the
|
|
|
|
* needs of the TBMIterateResult sub-struct.
|
2005-05-17 02:43:47 +02:00
|
|
|
*/
|
2009-01-10 22:08:36 +01:00
|
|
|
iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
|
2009-06-11 16:49:15 +02:00
|
|
|
MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
|
2009-01-10 22:08:36 +01:00
|
|
|
iterator->tbm = tbm;
|
2005-10-15 04:49:52 +02:00
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
/*
|
2009-01-10 22:08:36 +01:00
|
|
|
* Initialize iteration pointers.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
2009-01-10 22:08:36 +01:00
|
|
|
iterator->spageptr = 0;
|
|
|
|
iterator->schunkptr = 0;
|
|
|
|
iterator->schunkbit = 0;
|
2005-10-15 04:49:52 +02:00
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
2009-06-11 16:49:15 +02:00
|
|
|
* If we have a hashtable, create and fill the sorted page lists, unless
|
|
|
|
* we already did that for a previous iterator. Note that the lists are
|
|
|
|
* attached to the bitmap not the iterator, so they can be used by more
|
|
|
|
* than one iterator.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
2009-01-10 22:08:36 +01:00
|
|
|
if (tbm->status == TBM_HASH && !tbm->iterating)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_iterator i;
|
2009-01-10 22:08:36 +01:00
|
|
|
PagetableEntry *page;
|
|
|
|
int npages;
|
|
|
|
int nchunks;
|
|
|
|
|
|
|
|
if (!tbm->spages && tbm->npages > 0)
|
|
|
|
tbm->spages = (PagetableEntry **)
|
|
|
|
MemoryContextAlloc(tbm->mcxt,
|
|
|
|
tbm->npages * sizeof(PagetableEntry *));
|
|
|
|
if (!tbm->schunks && tbm->nchunks > 0)
|
|
|
|
tbm->schunks = (PagetableEntry **)
|
|
|
|
MemoryContextAlloc(tbm->mcxt,
|
|
|
|
tbm->nchunks * sizeof(PagetableEntry *));
|
|
|
|
|
|
|
|
npages = nchunks = 0;
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_start_iterate(tbm->pagetable, &i);
|
|
|
|
while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
|
2009-01-10 22:08:36 +01:00
|
|
|
{
|
|
|
|
if (page->ischunk)
|
|
|
|
tbm->schunks[nchunks++] = page;
|
|
|
|
else
|
|
|
|
tbm->spages[npages++] = page;
|
|
|
|
}
|
|
|
|
Assert(npages == tbm->npages);
|
|
|
|
Assert(nchunks == tbm->nchunks);
|
|
|
|
if (npages > 1)
|
|
|
|
qsort(tbm->spages, npages, sizeof(PagetableEntry *),
|
|
|
|
tbm_comparator);
|
|
|
|
if (nchunks > 1)
|
|
|
|
qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
|
|
|
|
tbm_comparator);
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
2009-01-10 22:08:36 +01:00
|
|
|
|
|
|
|
tbm->iterating = true;
|
|
|
|
|
|
|
|
return iterator;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_iterate - scan through next page of a TIDBitmap
|
|
|
|
*
|
|
|
|
* Returns a TBMIterateResult representing one page, or NULL if there are
|
|
|
|
* no more pages to scan. Pages are guaranteed to be delivered in numerical
|
|
|
|
* order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
|
|
|
|
* remember the exact tuples to look at on this page --- the caller must
|
|
|
|
* examine all tuples on the page and check if they meet the intended
|
2008-04-11 00:25:26 +02:00
|
|
|
* condition. If result->recheck is true, only the indicated tuples need
|
|
|
|
* be examined, but the condition must be rechecked anyway. (For ease of
|
|
|
|
* testing, recheck is always set true when ntuples < 0.)
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
|
|
|
TBMIterateResult *
|
2009-01-10 22:08:36 +01:00
|
|
|
tbm_iterate(TBMIterator *iterator)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2009-06-11 16:49:15 +02:00
|
|
|
TIDBitmap *tbm = iterator->tbm;
|
2009-01-10 22:08:36 +01:00
|
|
|
TBMIterateResult *output = &(iterator->output);
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
Assert(tbm->iterating);
|
2005-10-15 04:49:52 +02:00
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
|
|
|
* If lossy chunk pages remain, make sure we've advanced schunkptr/
|
|
|
|
* schunkbit to the next set bit.
|
|
|
|
*/
|
2009-01-10 22:08:36 +01:00
|
|
|
while (iterator->schunkptr < tbm->nchunks)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2009-01-10 22:08:36 +01:00
|
|
|
PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
|
|
|
|
int schunkbit = iterator->schunkbit;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
while (schunkbit < PAGES_PER_CHUNK)
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
int wordnum = WORDNUM(schunkbit);
|
|
|
|
int bitnum = BITNUM(schunkbit);
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
|
|
|
|
break;
|
|
|
|
schunkbit++;
|
|
|
|
}
|
|
|
|
if (schunkbit < PAGES_PER_CHUNK)
|
|
|
|
{
|
2009-01-10 22:08:36 +01:00
|
|
|
iterator->schunkbit = schunkbit;
|
2005-04-18 00:24:02 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* advance to next chunk */
|
2009-01-10 22:08:36 +01:00
|
|
|
iterator->schunkptr++;
|
|
|
|
iterator->schunkbit = 0;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
2005-10-15 04:49:52 +02:00
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
|
|
|
* If both chunk and per-page data remain, must output the numerically
|
|
|
|
* earlier page.
|
|
|
|
*/
|
2009-01-10 22:08:36 +01:00
|
|
|
if (iterator->schunkptr < tbm->nchunks)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2009-01-10 22:08:36 +01:00
|
|
|
PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
|
2005-04-18 00:24:02 +02:00
|
|
|
BlockNumber chunk_blockno;
|
|
|
|
|
2009-01-10 22:08:36 +01:00
|
|
|
chunk_blockno = chunk->blockno + iterator->schunkbit;
|
|
|
|
if (iterator->spageptr >= tbm->npages ||
|
|
|
|
chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
|
|
|
/* Return a lossy page indicator from the chunk */
|
|
|
|
output->blockno = chunk_blockno;
|
|
|
|
output->ntuples = -1;
|
2008-04-11 00:25:26 +02:00
|
|
|
output->recheck = true;
|
2009-01-10 22:08:36 +01:00
|
|
|
iterator->schunkbit++;
|
2005-04-18 00:24:02 +02:00
|
|
|
return output;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-01-10 22:08:36 +01:00
|
|
|
if (iterator->spageptr < tbm->npages)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
PagetableEntry *page;
|
2005-04-18 00:24:02 +02:00
|
|
|
int ntuples;
|
|
|
|
int wordnum;
|
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
/* In ONE_PAGE state, we don't allocate an spages[] array */
|
|
|
|
if (tbm->status == TBM_ONE_PAGE)
|
|
|
|
page = &tbm->entry1;
|
|
|
|
else
|
2009-01-10 22:08:36 +01:00
|
|
|
page = tbm->spages[iterator->spageptr];
|
2005-05-17 02:43:47 +02:00
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/* scan bitmap to extract individual offset numbers */
|
|
|
|
ntuples = 0;
|
|
|
|
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
|
|
|
|
{
|
|
|
|
bitmapword w = page->words[wordnum];
|
|
|
|
|
|
|
|
if (w != 0)
|
|
|
|
{
|
|
|
|
int off = wordnum * BITS_PER_BITMAPWORD + 1;
|
|
|
|
|
|
|
|
while (w != 0)
|
|
|
|
{
|
|
|
|
if (w & 1)
|
|
|
|
output->offsets[ntuples++] = (OffsetNumber) off;
|
|
|
|
off++;
|
|
|
|
w >>= 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
output->blockno = page->blockno;
|
|
|
|
output->ntuples = ntuples;
|
2008-04-11 00:25:26 +02:00
|
|
|
output->recheck = page->recheck;
|
2009-01-10 22:08:36 +01:00
|
|
|
iterator->spageptr++;
|
2005-04-18 00:24:02 +02:00
|
|
|
return output;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Nothing more in the bitmap */
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2009-01-10 22:08:36 +01:00
|
|
|
/*
|
|
|
|
* tbm_end_iterate - finish an iteration over a TIDBitmap
|
|
|
|
*
|
|
|
|
* Currently this is just a pfree, but it might do more someday. (For
|
|
|
|
* instance, it could be useful to count open iterators and allow the
|
|
|
|
* bitmap to return to read/write status when there are no more iterators.)
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
tbm_end_iterate(TBMIterator *iterator)
|
|
|
|
{
|
|
|
|
pfree(iterator);
|
|
|
|
}
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
|
|
|
* tbm_find_pageentry - find a PagetableEntry for the pageno
|
|
|
|
*
|
|
|
|
* Returns NULL if there is no non-lossy entry for the pageno.
|
|
|
|
*/
|
2005-05-17 02:43:47 +02:00
|
|
|
static const PagetableEntry *
|
2005-04-18 00:24:02 +02:00
|
|
|
tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
|
|
|
|
{
|
2005-05-17 02:43:47 +02:00
|
|
|
const PagetableEntry *page;
|
|
|
|
|
|
|
|
if (tbm->nentries == 0) /* in case pagetable doesn't exist */
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (tbm->status == TBM_ONE_PAGE)
|
|
|
|
{
|
|
|
|
page = &tbm->entry1;
|
|
|
|
if (page->blockno != pageno)
|
|
|
|
return NULL;
|
|
|
|
Assert(!page->ischunk);
|
|
|
|
return page;
|
|
|
|
}
|
2005-04-18 00:24:02 +02:00
|
|
|
|
2016-10-15 01:05:30 +02:00
|
|
|
page = pagetable_lookup(tbm->pagetable, pageno);
|
2005-04-18 00:24:02 +02:00
|
|
|
if (page == NULL)
|
|
|
|
return NULL;
|
|
|
|
if (page->ischunk)
|
|
|
|
return NULL; /* don't want a lossy chunk header */
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_get_pageentry - find or create a PagetableEntry for the pageno
|
|
|
|
*
|
|
|
|
* If new, the entry is marked as an exact (non-chunk) entry.
|
|
|
|
*
|
2014-05-06 18:12:18 +02:00
|
|
|
* This may cause the table to exceed the desired memory size. It is
|
2005-04-18 00:24:02 +02:00
|
|
|
* up to the caller to call tbm_lossify() at the next safe point if so.
|
|
|
|
*/
|
|
|
|
static PagetableEntry *
|
|
|
|
tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
|
|
|
|
{
|
|
|
|
PagetableEntry *page;
|
|
|
|
bool found;
|
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
if (tbm->status == TBM_EMPTY)
|
|
|
|
{
|
|
|
|
/* Use the fixed slot */
|
|
|
|
page = &tbm->entry1;
|
|
|
|
found = false;
|
|
|
|
tbm->status = TBM_ONE_PAGE;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (tbm->status == TBM_ONE_PAGE)
|
|
|
|
{
|
|
|
|
page = &tbm->entry1;
|
|
|
|
if (page->blockno == pageno)
|
|
|
|
return page;
|
|
|
|
/* Time to switch from one page to a hashtable */
|
|
|
|
tbm_create_pagetable(tbm);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Look up or create an entry */
|
2016-10-15 01:05:30 +02:00
|
|
|
page = pagetable_insert(tbm->pagetable, pageno, &found);
|
2005-05-17 02:43:47 +02:00
|
|
|
}
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
/* Initialize it if not present before */
|
|
|
|
if (!found)
|
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
char oldstatus = page->status;
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
MemSet(page, 0, sizeof(PagetableEntry));
|
2016-10-15 01:05:30 +02:00
|
|
|
page->status = oldstatus;
|
2005-04-18 00:24:02 +02:00
|
|
|
page->blockno = pageno;
|
|
|
|
/* must count it too */
|
|
|
|
tbm->nentries++;
|
|
|
|
tbm->npages++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_page_is_lossy - is the page marked as lossily stored?
|
|
|
|
*/
|
|
|
|
static bool
|
|
|
|
tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
|
|
|
|
{
|
|
|
|
PagetableEntry *page;
|
|
|
|
BlockNumber chunk_pageno;
|
|
|
|
int bitno;
|
|
|
|
|
|
|
|
/* we can skip the lookup if there are no lossy chunks */
|
|
|
|
if (tbm->nchunks == 0)
|
|
|
|
return false;
|
2005-05-17 02:43:47 +02:00
|
|
|
Assert(tbm->status == TBM_HASH);
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
bitno = pageno % PAGES_PER_CHUNK;
|
|
|
|
chunk_pageno = pageno - bitno;
|
2016-10-15 01:05:30 +02:00
|
|
|
|
|
|
|
page = pagetable_lookup(tbm->pagetable, chunk_pageno);
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
if (page != NULL && page->ischunk)
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
int wordnum = WORDNUM(bitno);
|
|
|
|
int bitnum = BITNUM(bitno);
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_mark_page_lossy - mark the page number as lossily stored
|
|
|
|
*
|
2014-05-06 18:12:18 +02:00
|
|
|
* This may cause the table to exceed the desired memory size. It is
|
2005-04-18 00:24:02 +02:00
|
|
|
* up to the caller to call tbm_lossify() at the next safe point if so.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
|
|
|
|
{
|
|
|
|
PagetableEntry *page;
|
|
|
|
bool found;
|
|
|
|
BlockNumber chunk_pageno;
|
|
|
|
int bitno;
|
|
|
|
int wordnum;
|
|
|
|
int bitnum;
|
|
|
|
|
2005-05-17 02:43:47 +02:00
|
|
|
/* We force the bitmap into hashtable mode whenever it's lossy */
|
|
|
|
if (tbm->status != TBM_HASH)
|
|
|
|
tbm_create_pagetable(tbm);
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
bitno = pageno % PAGES_PER_CHUNK;
|
|
|
|
chunk_pageno = pageno - bitno;
|
|
|
|
|
|
|
|
/*
|
2014-05-06 18:12:18 +02:00
|
|
|
* Remove any extant non-lossy entry for the page. If the page is its own
|
2005-10-15 04:49:52 +02:00
|
|
|
* chunk header, however, we skip this and handle the case below.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
|
|
|
if (bitno != 0)
|
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
if (pagetable_delete(tbm->pagetable, pageno))
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
|
|
|
/* It was present, so adjust counts */
|
|
|
|
tbm->nentries--;
|
|
|
|
tbm->npages--; /* assume it must have been non-lossy */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Look up or create entry for chunk-header page */
|
2016-10-15 01:05:30 +02:00
|
|
|
page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
/* Initialize it if not present before */
|
|
|
|
if (!found)
|
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
char oldstatus = page->status;
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
MemSet(page, 0, sizeof(PagetableEntry));
|
2016-10-15 01:05:30 +02:00
|
|
|
page->status = oldstatus;
|
2005-04-18 00:24:02 +02:00
|
|
|
page->blockno = chunk_pageno;
|
|
|
|
page->ischunk = true;
|
|
|
|
/* must count it too */
|
|
|
|
tbm->nentries++;
|
|
|
|
tbm->nchunks++;
|
|
|
|
}
|
|
|
|
else if (!page->ischunk)
|
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
char oldstatus = page->status;
|
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/* chunk header page was formerly non-lossy, make it lossy */
|
|
|
|
MemSet(page, 0, sizeof(PagetableEntry));
|
2016-10-15 01:05:30 +02:00
|
|
|
page->status = oldstatus;
|
2005-04-18 00:24:02 +02:00
|
|
|
page->blockno = chunk_pageno;
|
|
|
|
page->ischunk = true;
|
|
|
|
/* we assume it had some tuple bit(s) set, so mark it lossy */
|
|
|
|
page->words[0] = ((bitmapword) 1 << 0);
|
|
|
|
/* adjust counts */
|
|
|
|
tbm->nchunks++;
|
|
|
|
tbm->npages--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Now set the original target page's bit */
|
|
|
|
wordnum = WORDNUM(bitno);
|
|
|
|
bitnum = BITNUM(bitno);
|
|
|
|
page->words[wordnum] |= ((bitmapword) 1 << bitnum);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tbm_lossify - lose some information to get back under the memory limit
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
tbm_lossify(TIDBitmap *tbm)
|
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_iterator i;
|
2005-04-18 00:24:02 +02:00
|
|
|
PagetableEntry *page;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX Really stupid implementation: this just lossifies pages in
|
2005-10-15 04:49:52 +02:00
|
|
|
* essentially random order. We should be paying some attention to the
|
2011-08-20 20:51:02 +02:00
|
|
|
* number of bits set in each page, instead.
|
|
|
|
*
|
|
|
|
* Since we are called as soon as nentries exceeds maxentries, we should
|
|
|
|
* push nentries down to significantly less than maxentries, or else we'll
|
2014-05-06 18:12:18 +02:00
|
|
|
* just end up doing this again very soon. We shoot for maxentries/2.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
|
|
|
Assert(!tbm->iterating);
|
2005-05-17 02:43:47 +02:00
|
|
|
Assert(tbm->status == TBM_HASH);
|
|
|
|
|
2016-10-15 01:05:30 +02:00
|
|
|
pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
|
|
|
|
while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
|
2005-04-18 00:24:02 +02:00
|
|
|
{
|
|
|
|
if (page->ischunk)
|
|
|
|
continue; /* already a chunk header */
|
2005-10-15 04:49:52 +02:00
|
|
|
|
2005-04-18 00:24:02 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* If the page would become a chunk header, we won't save anything by
|
|
|
|
* converting it to lossy, so skip it.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
|
|
|
if ((page->blockno % PAGES_PER_CHUNK) == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* This does the dirty work ... */
|
|
|
|
tbm_mark_page_lossy(tbm, page->blockno);
|
|
|
|
|
2011-08-20 20:51:02 +02:00
|
|
|
if (tbm->nentries <= tbm->maxentries / 2)
|
2007-04-27 01:24:46 +02:00
|
|
|
{
|
2016-10-15 01:05:30 +02:00
|
|
|
/*
|
|
|
|
* We have made enough room. Remember where to start lossifying
|
|
|
|
* next round, so we evenly iterate over the hashtable.
|
|
|
|
*/
|
|
|
|
tbm->lossify_start = i.cur;
|
2007-04-27 01:24:46 +02:00
|
|
|
break;
|
|
|
|
}
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
|
2016-10-15 01:05:30 +02:00
|
|
|
* hashtable and may have deleted the non-lossy chunk. We can
|
|
|
|
* continue the same hash table scan, since failure to visit one
|
|
|
|
* element or visiting the newly inserted element, isn't fatal.
|
2005-04-18 00:24:02 +02:00
|
|
|
*/
|
|
|
|
}
|
2011-08-20 20:51:02 +02:00
|
|
|
|
|
|
|
/*
|
2012-06-10 21:20:04 +02:00
|
|
|
* With a big bitmap and small work_mem, it's possible that we cannot get
|
|
|
|
* under maxentries. Again, if that happens, we'd end up uselessly
|
2011-08-20 20:51:02 +02:00
|
|
|
* calling tbm_lossify over and over. To prevent this from becoming a
|
|
|
|
* performance sink, force maxentries up to at least double the current
|
|
|
|
* number of entries. (In essence, we're admitting inability to fit
|
2012-06-10 21:20:04 +02:00
|
|
|
* within work_mem when we do this.) Note that this test will not fire if
|
|
|
|
* we broke out of the loop early; and if we didn't, the current number of
|
|
|
|
* entries is simply not reducible any further.
|
2011-08-20 20:51:02 +02:00
|
|
|
*/
|
|
|
|
if (tbm->nentries > tbm->maxentries / 2)
|
|
|
|
tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
|
2005-04-18 00:24:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* qsort comparator to handle PagetableEntry pointers.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
tbm_comparator(const void *left, const void *right)
|
|
|
|
{
|
2012-06-10 21:20:04 +02:00
|
|
|
BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
|
|
|
|
BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
|
2005-04-18 00:24:02 +02:00
|
|
|
|
|
|
|
if (l < r)
|
|
|
|
return -1;
|
|
|
|
else if (l > r)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|