Rewrite hashbulkdelete() to make it amenable to new bucket locking scheme.

A pleasant side effect is that it is *much* faster when deleting
a large fraction of the indexed tuples, because of elimination of
redundant hash_step activity induced by hash_adjscans.  Various other
continuing code cleanup.
This commit is contained in:
Tom Lane 2003-09-02 02:18:38 +00:00
parent 5f65345a57
commit 39673ca47b
6 changed files with 233 additions and 78 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.65 2003/08/04 02:39:57 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.66 2003/09/02 02:18:38 tgl Exp $
*
* NOTES
* This file contains only the public interface routines.
@ -449,40 +449,178 @@ hashbulkdelete(PG_FUNCTION_ARGS)
BlockNumber num_pages;
double tuples_removed;
double num_index_tuples;
IndexScanDesc iscan;
tuples_removed = 0;
num_index_tuples = 0;
uint32 deleted_tuples;
uint32 tuples_remaining;
uint32 orig_ntuples;
Bucket orig_maxbucket;
Bucket cur_maxbucket;
Bucket cur_bucket;
Buffer metabuf;
HashMetaPage metap;
HashMetaPageData local_metapage;
/*
* XXX generic implementation --- should be improved!
* keep track of counts in both float form (to return) and integer form
* (to update hashm_ntuples). It'd be better to make hashm_ntuples a
* double, but that will have to wait for an initdb.
*/
tuples_removed = 0;
num_index_tuples = 0;
deleted_tuples = 0;
tuples_remaining = 0;
/* walk through the entire index */
iscan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
/* including killed tuples */
iscan->ignore_killed_tuples = false;
/*
* Read the metapage to fetch original bucket and tuple counts. Also,
* we keep a copy of the last-seen metapage so that we can use its
* hashm_spares[] values to compute bucket page addresses. This is a
* bit hokey but perfectly safe, since the interesting entries in the
* spares array cannot change under us; and it beats rereading the
* metapage for each bucket.
*/
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
metap = (HashMetaPage) BufferGetPage(metabuf);
_hash_checkpage((Page) metap, LH_META_PAGE);
orig_maxbucket = metap->hashm_maxbucket;
orig_ntuples = metap->hashm_ntuples;
memcpy(&local_metapage, metap, sizeof(local_metapage));
_hash_relbuf(rel, metabuf, HASH_READ);
while (index_getnext_indexitem(iscan, ForwardScanDirection))
/* Scan the buckets that we know exist */
cur_bucket = 0;
cur_maxbucket = orig_maxbucket;
loop_top:
while (cur_bucket <= cur_maxbucket)
{
if (callback(&iscan->xs_ctup.t_self, callback_state))
{
ItemPointerData indextup = iscan->currentItemData;
BlockNumber bucket_blkno;
BlockNumber blkno;
bool bucket_dirty = false;
/* adjust any active scans that will be affected by deletion */
/* (namely, my own scan) */
/* Get address of bucket's start page */
bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
/* XXX lock bucket here */
/* Scan each page in bucket */
blkno = bucket_blkno;
while (BlockNumberIsValid(blkno))
{
Buffer buf;
Page page;
HashPageOpaque opaque;
OffsetNumber offno;
OffsetNumber maxoffno;
bool page_dirty = false;
buf = _hash_getbuf(rel, blkno, HASH_WRITE);
page = BufferGetPage(buf);
_hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == cur_bucket);
/* Scan each tuple in page */
offno = FirstOffsetNumber;
maxoffno = PageGetMaxOffsetNumber(page);
while (offno <= maxoffno)
{
HashItem hitem;
ItemPointer htup;
hitem = (HashItem) PageGetItem(page,
PageGetItemId(page, offno));
htup = &(hitem->hash_itup.t_tid);
if (callback(htup, callback_state))
{
ItemPointerData indextup;
/* adjust any active scans that will be affected */
/* (this should be unnecessary) */
ItemPointerSet(&indextup, blkno, offno);
_hash_adjscans(rel, &indextup);
/* delete the data from the page */
_hash_pagedel(rel, &indextup);
/* delete the item from the page */
PageIndexTupleDelete(page, offno);
bucket_dirty = page_dirty = true;
/* don't increment offno, instead decrement maxoffno */
maxoffno = OffsetNumberPrev(maxoffno);
tuples_removed += 1;
deleted_tuples += 1;
}
else
{
offno = OffsetNumberNext(offno);
num_index_tuples += 1;
tuples_remaining += 1;
}
}
index_endscan(iscan);
/*
* Write or free page if needed, advance to next page. We want
* to preserve the invariant that overflow pages are nonempty.
*/
blkno = opaque->hasho_nextblkno;
if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
_hash_freeovflpage(rel, buf);
else if (page_dirty)
_hash_wrtbuf(rel, buf);
else
_hash_relbuf(rel, buf, HASH_WRITE);
}
/* If we deleted anything, try to compact free space */
if (bucket_dirty)
_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
/* XXX unlock bucket here */
/* Advance to next bucket */
cur_bucket++;
}
/* Write-lock metapage and check for split since we started */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
metap = (HashMetaPage) BufferGetPage(metabuf);
_hash_checkpage((Page) metap, LH_META_PAGE);
if (cur_maxbucket != metap->hashm_maxbucket)
{
/* There's been a split, so process the additional bucket(s) */
cur_maxbucket = metap->hashm_maxbucket;
memcpy(&local_metapage, metap, sizeof(local_metapage));
_hash_relbuf(rel, metabuf, HASH_WRITE);
goto loop_top;
}
/* Okay, we're really done. Update tuple count in metapage. */
if (orig_maxbucket == metap->hashm_maxbucket &&
orig_ntuples == metap->hashm_ntuples)
{
/*
* No one has split or inserted anything since start of scan,
* so believe our count as gospel.
*/
metap->hashm_ntuples = tuples_remaining;
}
else
{
/*
* Otherwise, our count is untrustworthy since we may have
* double-scanned tuples in split buckets. Proceed by
* dead-reckoning.
*/
if (metap->hashm_ntuples > deleted_tuples)
metap->hashm_ntuples -= deleted_tuples;
else
metap->hashm_ntuples = 0;
num_index_tuples = metap->hashm_ntuples;
}
_hash_wrtbuf(rel, metabuf);
/* return statistics */
num_pages = RelationGetNumberOfBlocks(rel);

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.38 2003/09/01 20:26:34 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.39 2003/09/02 02:18:38 tgl Exp $
*
* NOTES
* Overflow pages look like ordinary relation pages.
@ -444,11 +444,13 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
* first page in the bucket chain. The read page works backward and
* the write page works forward; the procedure terminates when the
* read page and write page are the same page.
*
* Caller must hold exclusive lock on the target bucket.
*/
void
_hash_squeezebucket(Relation rel,
HashMetaPage metap,
Bucket bucket)
Bucket bucket,
BlockNumber bucket_blkno)
{
Buffer wbuf;
Buffer rbuf = 0;
@ -466,7 +468,7 @@ _hash_squeezebucket(Relation rel,
/*
* start squeezing into the base bucket page.
*/
wblkno = BUCKET_TO_BLKNO(bucket);
wblkno = bucket_blkno;
wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
wpage = BufferGetPage(wbuf);
_hash_checkpage(wpage, LH_BUCKET_PAGE);
@ -484,11 +486,6 @@ _hash_squeezebucket(Relation rel,
/*
* find the last page in the bucket chain by starting at the base
* bucket page and working forward.
*
* XXX if chains tend to be long, we should probably move forward using
* HASH_READ and then _hash_chgbufaccess to HASH_WRITE when we reach
* the end. if they are short we probably don't care very much. if
* the hash function is working at all, they had better be short..
*/
ropaque = wopaque;
do

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.39 2003/09/01 20:26:34 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.40 2003/09/02 02:18:38 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
@ -143,7 +143,7 @@ _hash_metapinit(Relation rel)
*/
for (i = 0; i <= 1; i++)
{
buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
pg = BufferGetPage(buf);
_hash_pageinit(pg, BufferGetPageSize(buf));
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@ -456,6 +456,8 @@ _hash_splitbucket(Relation rel,
Buffer ovflbuf;
BlockNumber oblkno;
BlockNumber nblkno;
BlockNumber start_oblkno;
BlockNumber start_nblkno;
bool null;
Datum datum;
HashItem hitem;
@ -475,8 +477,10 @@ _hash_splitbucket(Relation rel,
_hash_checkpage((Page) metap, LH_META_PAGE);
/* get the buffers & pages */
oblkno = BUCKET_TO_BLKNO(obucket);
nblkno = BUCKET_TO_BLKNO(nbucket);
start_oblkno = BUCKET_TO_BLKNO(metap, obucket);
start_nblkno = BUCKET_TO_BLKNO(metap, nbucket);
oblkno = start_oblkno;
nblkno = start_nblkno;
obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
opage = BufferGetPage(obuf);
@ -571,7 +575,7 @@ _hash_splitbucket(Relation rel,
*/
_hash_wrtbuf(rel, obuf);
_hash_wrtbuf(rel, nbuf);
_hash_squeezebucket(rel, metap, obucket);
_hash_squeezebucket(rel, obucket, start_oblkno);
return;
}
}
@ -639,7 +643,7 @@ _hash_splitbucket(Relation rel,
if (!BlockNumberIsValid(oblkno))
{
_hash_wrtbuf(rel, nbuf);
_hash_squeezebucket(rel, metap, obucket);
_hash_squeezebucket(rel, obucket, start_oblkno);
return;
}

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.31 2003/08/04 02:39:57 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.32 2003/09/02 02:18:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -19,8 +19,10 @@
/*
* _hash_search() -- Finds the page/bucket that the contains the
* scankey and loads it into *bufP. the buffer has a read lock.
* _hash_search() -- Find the bucket that contains the scankey
* and fetch its primary bucket page into *bufP.
*
* the buffer has a read lock.
*/
void
_hash_search(Relation rel,
@ -30,22 +32,23 @@ _hash_search(Relation rel,
HashMetaPage metap)
{
BlockNumber blkno;
Datum keyDatum;
Bucket bucket;
if (scankey == (ScanKey) NULL ||
(keyDatum = scankey[0].sk_argument) == (Datum) NULL)
if (scankey == NULL)
{
/*
* If the scankey argument is NULL, all tuples will satisfy the
* If the scankey is empty, all tuples will satisfy the
* scan so we start the scan at the first bucket (bucket 0).
*/
bucket = 0;
}
else
bucket = _hash_call(rel, metap, keyDatum);
{
Assert(!(scankey[0].sk_flags & SK_ISNULL));
bucket = _hash_call(rel, metap, scankey[0].sk_argument);
}
blkno = BUCKET_TO_BLKNO(bucket);
blkno = BUCKET_TO_BLKNO(metap, bucket);
*bufP = _hash_getbuf(rel, blkno, HASH_READ);
}
@ -330,7 +333,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
if (allbuckets && bucket < metap->hashm_maxbucket)
{
++bucket;
blkno = BUCKET_TO_BLKNO(bucket);
blkno = BUCKET_TO_BLKNO(metap, bucket);
buf = _hash_getbuf(rel, blkno, HASH_READ);
page = BufferGetPage(buf);
_hash_checkpage(page, LH_BUCKET_PAGE);
@ -380,7 +383,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
if (allbuckets && bucket > 0)
{
--bucket;
blkno = BUCKET_TO_BLKNO(bucket);
blkno = BUCKET_TO_BLKNO(metap, bucket);
buf = _hash_getbuf(rel, blkno, HASH_READ);
page = BufferGetPage(buf);
_hash_checkpage(page, LH_BUCKET_PAGE);

View File

@ -8,11 +8,10 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.33 2003/08/04 02:39:57 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.34 2003/09/02 02:18:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
@ -20,20 +19,23 @@
#include "access/iqual.h"
/*
* _hash_mkscankey -- build a scan key matching the given indextuple
*
* Note: this is prepared for multiple index columns, but very little
* else in access/hash is ...
*/
ScanKey
_hash_mkscankey(Relation rel, IndexTuple itup)
{
ScanKey skey;
TupleDesc itupdesc;
int natts;
TupleDesc itupdesc = RelationGetDescr(rel);
int natts = rel->rd_rel->relnatts;
AttrNumber i;
Datum arg;
FmgrInfo *procinfo;
bool isnull;
natts = rel->rd_rel->relnatts;
itupdesc = RelationGetDescr(rel);
skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
for (i = 0; i < natts; i++)
@ -41,7 +43,7 @@ _hash_mkscankey(Relation rel, IndexTuple itup)
arg = index_getattr(itup, i + 1, itupdesc, &isnull);
procinfo = index_getprocinfo(rel, i + 1, HASHPROC);
ScanKeyEntryInitializeWithInfo(&skey[i],
0x0,
isnull ? SK_ISNULL : 0x0,
(AttrNumber) (i + 1),
procinfo,
CurrentMemoryContext,
@ -57,18 +59,19 @@ _hash_freeskey(ScanKey skey)
pfree(skey);
}
/*
* _hash_checkqual -- does the index tuple satisfy the scan conditions?
*/
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
if (scan->numberOfKeys > 0)
return (index_keytest(itup,
RelationGetDescr(scan->indexRelation),
scan->numberOfKeys, scan->keyData));
else
return true;
return index_keytest(itup, RelationGetDescr(scan->indexRelation),
scan->numberOfKeys, scan->keyData);
}
/*
* _hash_formitem -- construct a hash index entry
*/
HashItem
_hash_formitem(IndexTuple itup)
{
@ -82,17 +85,27 @@ _hash_formitem(IndexTuple itup)
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("hash indexes cannot include null keys")));
/* make a copy of the index tuple with room for the sequence number */
/*
* make a copy of the index tuple (XXX do we still need to copy?)
*
* HashItemData used to have more fields than IndexTupleData, but no
* longer...
*/
tuplen = IndexTupleSize(itup);
nbytes_hitem = tuplen +
(sizeof(HashItemData) - sizeof(IndexTupleData));
hitem = (HashItem) palloc(nbytes_hitem);
memmove((char *) &(hitem->hash_itup), (char *) itup, tuplen);
memcpy((char *) &(hitem->hash_itup), (char *) itup, tuplen);
return hitem;
}
/*
* _hash_call -- given a Datum, call the index's hash procedure
*
* Returns the bucket number that the hash key maps to.
*/
Bucket
_hash_call(Relation rel, HashMetaPage metap, Datum key)
{
@ -103,9 +116,11 @@ _hash_call(Relation rel, HashMetaPage metap, Datum key)
/* XXX assumes index has only one attribute */
procinfo = index_getprocinfo(rel, 1, HASHPROC);
n = DatumGetUInt32(FunctionCall1(procinfo, key));
bucket = n & metap->hashm_highmask;
if (bucket > metap->hashm_maxbucket)
bucket = bucket & metap->hashm_lowmask;
return bucket;
}
@ -119,7 +134,7 @@ _hash_log2(uint32 num)
limit;
limit = 1;
for (i = 0; limit < num; limit = limit << 1, i++)
for (i = 0; limit < num; limit <<= 1, i++)
;
return i;
}
@ -130,20 +145,19 @@ _hash_log2(uint32 num)
void
_hash_checkpage(Page page, int flags)
{
HashPageOpaque opaque;
#ifdef USE_ASSERT_CHECKING
Assert(page);
Assert(((PageHeader) (page))->pd_lower >= SizeOfPageHeaderData);
#if 1
Assert(((PageHeader) (page))->pd_upper <=
(BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData))));
Assert(((PageHeader) (page))->pd_special ==
(BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData))));
Assert(PageGetPageSize(page) == BLCKSZ);
#endif
if (flags)
{
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_flag & flags);
}
#endif /* USE_ASSERT_CHECKING */
}

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $
* $Id: hash.h,v 1.51 2003/09/02 02:18:38 tgl Exp $
*
* NOTES
* modeled after Margo Seltzer's hash implementation for unix.
@ -25,13 +25,12 @@
/*
* Mapping from hash bucket number to physical block number of bucket's
* starting page. Beware of multiple evaluations of argument! Also notice
* macro's implicit dependency on "metap".
* starting page. Beware of multiple evaluations of argument!
*/
typedef uint32 Bucket;
#define BUCKET_TO_BLKNO(B) \
((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
#define BUCKET_TO_BLKNO(metap,B) \
((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
/*
* Special space for hash index pages.
@ -243,8 +242,8 @@ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf);
extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
BlockNumber blkno);
extern void _hash_squeezebucket(Relation rel, HashMetaPage metap,
Bucket bucket);
extern void _hash_squeezebucket(Relation rel,
Bucket bucket, BlockNumber bucket_blkno);
/* hashpage.c */
extern void _hash_metapinit(Relation rel);