Revise the TIDBitmap API to support multiple concurrent iterations over a
bitmap.  This is extracted from Greg Stark's posix_fadvise patch; it seems
worth committing separately, since it's potentially useful independently of
posix_fadvise.
This commit is contained in:
Tom Lane 2009-01-10 21:08:36 +00:00
parent 3b34e98242
commit 43a57cf365
7 changed files with 151 additions and 88 deletions

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.21 2009/01/01 17:23:34 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.22 2009/01/10 21:08:36 tgl Exp $
*-------------------------------------------------------------------------
*/
@ -290,6 +290,7 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
entry->list = NULL;
entry->nlist = 0;
entry->partialMatch = NULL;
entry->partialMatchIterator = NULL;
entry->partialMatchResult = NULL;
entry->reduceResult = FALSE;
entry->predictNumberResult = 0;
@ -311,6 +312,9 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
*/
if ( entry->partialMatch )
{
if (entry->partialMatchIterator)
tbm_end_iterate(entry->partialMatchIterator);
entry->partialMatchIterator = NULL;
tbm_free( entry->partialMatch );
entry->partialMatch = NULL;
}
@ -323,7 +327,7 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
if ( entry->partialMatch && !tbm_is_empty(entry->partialMatch) )
{
tbm_begin_iterate(entry->partialMatch);
entry->partialMatchIterator = tbm_begin_iterate(entry->partialMatch);
entry->isFinished = FALSE;
}
}
@ -534,11 +538,13 @@ entryGetItem(Relation index, GinScanEntry entry)
{
if ( entry->partialMatchResult == NULL || entry->offset >= entry->partialMatchResult->ntuples )
{
entry->partialMatchResult = tbm_iterate( entry->partialMatch );
entry->partialMatchResult = tbm_iterate( entry->partialMatchIterator );
if ( entry->partialMatchResult == NULL )
{
ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber);
tbm_end_iterate(entry->partialMatchIterator);
entry->partialMatchIterator = NULL;
entry->isFinished = TRUE;
break;
}

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.20 2009/01/01 17:23:34 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.21 2009/01/10 21:08:36 tgl Exp $
*-------------------------------------------------------------------------
*/
@ -61,6 +61,8 @@ fillScanKey(GinState *ginstate, GinScanKey key, OffsetNumber attnum, Datum query
key->scanEntry[i].offset = InvalidOffsetNumber;
key->scanEntry[i].buffer = InvalidBuffer;
key->scanEntry[i].partialMatch = NULL;
key->scanEntry[i].partialMatchIterator = NULL;
key->scanEntry[i].partialMatchResult = NULL;
key->scanEntry[i].strategy = strategy;
key->scanEntry[i].list = NULL;
key->scanEntry[i].nlist = 0;
@ -107,6 +109,7 @@ resetScanKeys(GinScanKey keys, uint32 nkeys)
key->scanEntry[j].list = NULL;
key->scanEntry[j].nlist = 0;
key->scanEntry[j].partialMatch = NULL;
key->scanEntry[j].partialMatchIterator = NULL;
key->scanEntry[j].partialMatchResult = NULL;
}
}
@ -132,6 +135,8 @@ freeScanKeys(GinScanKey keys, uint32 nkeys)
ReleaseBuffer(key->scanEntry[j].buffer);
if (key->scanEntry[j].list)
pfree(key->scanEntry[j].list);
if (key->scanEntry[j].partialMatchIterator)
tbm_end_iterate(key->scanEntry[j].partialMatchIterator);
if (key->scanEntry[j].partialMatch)
tbm_free(key->scanEntry[j].partialMatch);
}

View File

@ -21,7 +21,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.31 2009/01/01 17:23:41 momjian Exp $
* $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.32 2009/01/10 21:08:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -65,6 +65,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
HeapScanDesc scan;
Index scanrelid;
TIDBitmap *tbm;
TBMIterator *tbmiterator;
TBMIterateResult *tbmres;
OffsetNumber targoffset;
TupleTableSlot *slot;
@ -78,6 +79,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
scan = node->ss.ss_currentScanDesc;
scanrelid = ((BitmapHeapScan *) node->ss.ps.plan)->scan.scanrelid;
tbm = node->tbm;
tbmiterator = node->tbmiterator;
tbmres = node->tbmres;
/*
@ -111,7 +113,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
/*
* If we haven't yet performed the underlying index scan, do it, and
* prepare the bitmap to be iterated over.
* begin the iteration over the bitmap.
*/
if (tbm == NULL)
{
@ -121,9 +123,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
elog(ERROR, "unrecognized result from subplan");
node->tbm = tbm;
node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
node->tbmres = tbmres = NULL;
tbm_begin_iterate(tbm);
}
for (;;)
@ -136,7 +137,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
*/
if (tbmres == NULL)
{
node->tbmres = tbmres = tbm_iterate(tbm);
node->tbmres = tbmres = tbm_iterate(tbmiterator);
if (tbmres == NULL)
{
/* no more entries in the bitmap */
@ -376,9 +377,12 @@ ExecBitmapHeapReScan(BitmapHeapScanState *node, ExprContext *exprCtxt)
/* rescan to release any page pin */
heap_rescan(node->ss.ss_currentScanDesc, NULL);
if (node->tbmiterator)
tbm_end_iterate(node->tbmiterator);
if (node->tbm)
tbm_free(node->tbm);
node->tbm = NULL;
node->tbmiterator = NULL;
node->tbmres = NULL;
/*
@ -423,6 +427,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
/*
* release bitmap if any
*/
if (node->tbmiterator)
tbm_end_iterate(node->tbmiterator);
if (node->tbm)
tbm_free(node->tbm);
@ -466,6 +472,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
scanstate->ss.ps.state = estate;
scanstate->tbm = NULL;
scanstate->tbmiterator = NULL;
scanstate->tbmres = NULL;
/*

View File

@ -32,7 +32,7 @@
* Copyright (c) 2003-2009, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.16 2009/01/01 17:23:43 momjian Exp $
* $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.17 2009/01/10 21:08:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -136,9 +136,20 @@ struct TIDBitmap
int nchunks; /* number of lossy entries in pagetable */
bool iterating; /* tbm_begin_iterate called? */
PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
/* the remaining fields are used while producing sorted output: */
/* these are valid when iterating is true: */
PagetableEntry **spages; /* sorted exact-page list, or NULL */
PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
};
/*
* When iterating over a bitmap in sorted order, a TBMIterator is used to
* track our progress. There can be several iterators scanning the same
* bitmap concurrently. Note that the bitmap becomes read-only as soon as
* any iterator is created.
*/
struct TBMIterator
{
TIDBitmap *tbm; /* TIDBitmap we're iterating over */
int spageptr; /* next spages index */
int schunkptr; /* next schunks index */
int schunkbit; /* next bit to check in current schunk */
@ -172,16 +183,9 @@ tbm_create(long maxbytes)
TIDBitmap *tbm;
long nbuckets;
/*
* Create the TIDBitmap struct, with enough trailing space to serve the
* needs of the TBMIterateResult sub-struct.
*/
tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) +
MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
/* Zero all the fixed fields */
MemSetAligned(tbm, 0, sizeof(TIDBitmap));
/* Create the TIDBitmap struct and zero all its fields */
tbm = makeNode(TIDBitmap);
tbm->type = T_TIDBitmap; /* Set NodeTag */
tbm->mcxt = CurrentMemoryContext;
tbm->status = TBM_EMPTY;
@ -533,60 +537,80 @@ tbm_is_empty(const TIDBitmap *tbm)
/*
* tbm_begin_iterate - prepare to iterate through a TIDBitmap
*
* The TBMIterator struct is created in the caller's memory context.
* For a clean shutdown of the iteration, call tbm_end_iterate; but it's
* okay to just allow the memory context to be released, too. It is caller's
* responsibility not to touch the TBMIterator anymore once the TIDBitmap
* is freed.
*
* NB: after this is called, it is no longer allowed to modify the contents
* of the bitmap. However, you can call this multiple times to scan the
* contents repeatedly.
* contents repeatedly, including parallel scans.
*/
void
TBMIterator *
tbm_begin_iterate(TIDBitmap *tbm)
{
HASH_SEQ_STATUS status;
PagetableEntry *page;
int npages;
int nchunks;
TBMIterator *iterator;
/*
* Create the TBMIterator struct, with enough trailing space to serve the
* needs of the TBMIterateResult sub-struct.
*/
iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
iterator->tbm = tbm;
/*
* Initialize iteration pointers.
*/
iterator->spageptr = 0;
iterator->schunkptr = 0;
iterator->schunkbit = 0;
/*
* If we have a hashtable, create and fill the sorted page lists,
* unless we already did that for a previous iterator. Note that the
* lists are attached to the bitmap not the iterator, so they can be
* used by more than one iterator.
*/
if (tbm->status == TBM_HASH && !tbm->iterating)
{
HASH_SEQ_STATUS status;
PagetableEntry *page;
int npages;
int nchunks;
if (!tbm->spages && tbm->npages > 0)
tbm->spages = (PagetableEntry **)
MemoryContextAlloc(tbm->mcxt,
tbm->npages * sizeof(PagetableEntry *));
if (!tbm->schunks && tbm->nchunks > 0)
tbm->schunks = (PagetableEntry **)
MemoryContextAlloc(tbm->mcxt,
tbm->nchunks * sizeof(PagetableEntry *));
hash_seq_init(&status, tbm->pagetable);
npages = nchunks = 0;
while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
{
if (page->ischunk)
tbm->schunks[nchunks++] = page;
else
tbm->spages[npages++] = page;
}
Assert(npages == tbm->npages);
Assert(nchunks == tbm->nchunks);
if (npages > 1)
qsort(tbm->spages, npages, sizeof(PagetableEntry *),
tbm_comparator);
if (nchunks > 1)
qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
tbm_comparator);
}
tbm->iterating = true;
/*
* Reset iteration pointers.
*/
tbm->spageptr = 0;
tbm->schunkptr = 0;
tbm->schunkbit = 0;
/*
* Nothing else to do if no entries, nor if we don't have a hashtable.
*/
if (tbm->nentries == 0 || tbm->status != TBM_HASH)
return;
/*
* Create and fill the sorted page lists if we didn't already.
*/
if (!tbm->spages && tbm->npages > 0)
tbm->spages = (PagetableEntry **)
MemoryContextAlloc(tbm->mcxt,
tbm->npages * sizeof(PagetableEntry *));
if (!tbm->schunks && tbm->nchunks > 0)
tbm->schunks = (PagetableEntry **)
MemoryContextAlloc(tbm->mcxt,
tbm->nchunks * sizeof(PagetableEntry *));
hash_seq_init(&status, tbm->pagetable);
npages = nchunks = 0;
while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
{
if (page->ischunk)
tbm->schunks[nchunks++] = page;
else
tbm->spages[npages++] = page;
}
Assert(npages == tbm->npages);
Assert(nchunks == tbm->nchunks);
if (npages > 1)
qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
if (nchunks > 1)
qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
return iterator;
}
/*
@ -602,9 +626,10 @@ tbm_begin_iterate(TIDBitmap *tbm)
* testing, recheck is always set true when ntuples < 0.)
*/
TBMIterateResult *
tbm_iterate(TIDBitmap *tbm)
tbm_iterate(TBMIterator *iterator)
{
TBMIterateResult *output = &(tbm->output);
TIDBitmap *tbm = iterator->tbm;
TBMIterateResult *output = &(iterator->output);
Assert(tbm->iterating);
@ -612,10 +637,10 @@ tbm_iterate(TIDBitmap *tbm)
* If lossy chunk pages remain, make sure we've advanced schunkptr/
* schunkbit to the next set bit.
*/
while (tbm->schunkptr < tbm->nchunks)
while (iterator->schunkptr < tbm->nchunks)
{
PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
int schunkbit = tbm->schunkbit;
PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
int schunkbit = iterator->schunkbit;
while (schunkbit < PAGES_PER_CHUNK)
{
@ -628,37 +653,37 @@ tbm_iterate(TIDBitmap *tbm)
}
if (schunkbit < PAGES_PER_CHUNK)
{
tbm->schunkbit = schunkbit;
iterator->schunkbit = schunkbit;
break;
}
/* advance to next chunk */
tbm->schunkptr++;
tbm->schunkbit = 0;
iterator->schunkptr++;
iterator->schunkbit = 0;
}
/*
* If both chunk and per-page data remain, must output the numerically
* earlier page.
*/
if (tbm->schunkptr < tbm->nchunks)
if (iterator->schunkptr < tbm->nchunks)
{
PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
BlockNumber chunk_blockno;
chunk_blockno = chunk->blockno + tbm->schunkbit;
if (tbm->spageptr >= tbm->npages ||
chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
chunk_blockno = chunk->blockno + iterator->schunkbit;
if (iterator->spageptr >= tbm->npages ||
chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
{
/* Return a lossy page indicator from the chunk */
output->blockno = chunk_blockno;
output->ntuples = -1;
output->recheck = true;
tbm->schunkbit++;
iterator->schunkbit++;
return output;
}
}
if (tbm->spageptr < tbm->npages)
if (iterator->spageptr < tbm->npages)
{
PagetableEntry *page;
int ntuples;
@ -668,7 +693,7 @@ tbm_iterate(TIDBitmap *tbm)
if (tbm->status == TBM_ONE_PAGE)
page = &tbm->entry1;
else
page = tbm->spages[tbm->spageptr];
page = tbm->spages[iterator->spageptr];
/* scan bitmap to extract individual offset numbers */
ntuples = 0;
@ -692,7 +717,7 @@ tbm_iterate(TIDBitmap *tbm)
output->blockno = page->blockno;
output->ntuples = ntuples;
output->recheck = page->recheck;
tbm->spageptr++;
iterator->spageptr++;
return output;
}
@ -700,6 +725,19 @@ tbm_iterate(TIDBitmap *tbm)
return NULL;
}
/*
* tbm_end_iterate - finish an iteration over a TIDBitmap
*
* Currently this is just a pfree, but it might do more someday. (For
* instance, it could be useful to count open iterators and allow the
* bitmap to return to read/write status when there are no more iterators.)
*
* iterator: a TBMIterator previously returned by tbm_begin_iterate.
*
* Notes for callers:
* - The TBMIterateResult handed back by tbm_iterate is stored inside the
*   iterator itself, so that result pointer must not be used after this
*   call.
* - Only the iterator is released here.  The TIDBitmap it was scanning is
*   left alone and must still be released separately with tbm_free.
* - No NULL check is made here; the callers in this change test the
*   iterator pointer for NULL before calling.
*/
void
tbm_end_iterate(TBMIterator *iterator)
{
/* frees the iterator struct together with its trailing result space */
pfree(iterator);
}
/*
* tbm_find_pageentry - find a PagetableEntry for the pageno
*

View File

@ -4,7 +4,7 @@
*
* Copyright (c) 2006-2009, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/include/access/gin.h,v 1.27 2009/01/01 17:23:55 momjian Exp $
* $PostgreSQL: pgsql/src/include/access/gin.h,v 1.28 2009/01/10 21:08:36 tgl Exp $
*--------------------------------------------------------------------------
*/
@ -380,6 +380,7 @@ typedef struct GinScanEntryData
/* partial match support */
bool isPartialMatch;
TIDBitmap *partialMatch;
TBMIterator *partialMatchIterator;
TBMIterateResult *partialMatchResult;
StrategyNumber strategy;

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.199 2009/01/01 17:23:59 momjian Exp $
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.200 2009/01/10 21:08:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -1152,6 +1152,7 @@ typedef struct BitmapIndexScanState
*
* bitmapqualorig execution state for bitmapqualorig expressions
* tbm bitmap obtained from child index scan(s)
* tbmiterator iterator for scanning current pages
* tbmres current-page data
* ----------------
*/
@ -1160,6 +1161,7 @@ typedef struct BitmapHeapScanState
ScanState ss; /* its first field is NodeTag */
List *bitmapqualorig;
TIDBitmap *tbm;
TBMIterator *tbmiterator;
TBMIterateResult *tbmres;
} BitmapHeapScanState;

View File

@ -15,7 +15,7 @@
*
* Copyright (c) 2003-2009, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.8 2009/01/01 17:24:00 momjian Exp $
* $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.9 2009/01/10 21:08:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -31,6 +31,9 @@
*/
typedef struct TIDBitmap TIDBitmap;
/* Likewise, TBMIterator is private */
typedef struct TBMIterator TBMIterator;
/* Result structure for tbm_iterate */
typedef struct
{
@ -55,7 +58,8 @@ extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
extern bool tbm_is_empty(const TIDBitmap *tbm);
extern void tbm_begin_iterate(TIDBitmap *tbm);
extern TBMIterateResult *tbm_iterate(TIDBitmap *tbm);
extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm);
extern TBMIterateResult *tbm_iterate(TBMIterator *iterator);
extern void tbm_end_iterate(TBMIterator *iterator);
#endif /* TIDBITMAP_H */