diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index fbdf0dc04c..0d2f8b6199 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.65 2003/08/04 02:39:57 momjian Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.66 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *    This file contains only the public interface routines.
@@ -449,40 +449,178 @@ hashbulkdelete(PG_FUNCTION_ARGS)
     BlockNumber num_pages;
     double      tuples_removed;
     double      num_index_tuples;
-    IndexScanDesc iscan;
-
-    tuples_removed = 0;
-    num_index_tuples = 0;
+    uint32      deleted_tuples;
+    uint32      tuples_remaining;
+    uint32      orig_ntuples;
+    Bucket      orig_maxbucket;
+    Bucket      cur_maxbucket;
+    Bucket      cur_bucket;
+    Buffer      metabuf;
+    HashMetaPage metap;
+    HashMetaPageData local_metapage;
 
     /*
-     * XXX generic implementation --- should be improved!
+     * keep track of counts in both float form (to return) and integer form
+     * (to update hashm_ntuples).  It'd be better to make hashm_ntuples a
+     * double, but that will have to wait for an initdb.
      */
+    tuples_removed = 0;
+    num_index_tuples = 0;
+    deleted_tuples = 0;
+    tuples_remaining = 0;
 
-    /* walk through the entire index */
-    iscan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
-    /* including killed tuples */
-    iscan->ignore_killed_tuples = false;
+    /*
+     * Read the metapage to fetch original bucket and tuple counts.  Also,
+     * we keep a copy of the last-seen metapage so that we can use its
+     * hashm_spares[] values to compute bucket page addresses.  This is a
+     * bit hokey but perfectly safe, since the interesting entries in the
+     * spares array cannot change under us; and it beats rereading the
+     * metapage for each bucket.
+     */
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+    orig_maxbucket = metap->hashm_maxbucket;
+    orig_ntuples = metap->hashm_ntuples;
+    memcpy(&local_metapage, metap, sizeof(local_metapage));
+    _hash_relbuf(rel, metabuf, HASH_READ);
 
-    while (index_getnext_indexitem(iscan, ForwardScanDirection))
+    /* Scan the buckets that we know exist */
+    cur_bucket = 0;
+    cur_maxbucket = orig_maxbucket;
+
+loop_top:
+    while (cur_bucket <= cur_maxbucket)
     {
-        if (callback(&iscan->xs_ctup.t_self, callback_state))
+        BlockNumber bucket_blkno;
+        BlockNumber blkno;
+        bool        bucket_dirty = false;
+
+        /* Get address of bucket's start page */
+        bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+
+        /* XXX lock bucket here */
+
+        /* Scan each page in bucket */
+        blkno = bucket_blkno;
+        while (BlockNumberIsValid(blkno))
         {
-            ItemPointerData indextup = iscan->currentItemData;
+            Buffer      buf;
+            Page        page;
+            HashPageOpaque opaque;
+            OffsetNumber offno;
+            OffsetNumber maxoffno;
+            bool        page_dirty = false;
 
-            /* adjust any active scans that will be affected by deletion */
-            /* (namely, my own scan) */
-            _hash_adjscans(rel, &indextup);
+            buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+            page = BufferGetPage(buf);
+            _hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+            opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+            Assert(opaque->hasho_bucket == cur_bucket);
 
-            /* delete the data from the page */
-            _hash_pagedel(rel, &indextup);
+            /* Scan each tuple in page */
+            offno = FirstOffsetNumber;
+            maxoffno = PageGetMaxOffsetNumber(page);
+            while (offno <= maxoffno)
+            {
+                HashItem    hitem;
+                ItemPointer htup;
 
-            tuples_removed += 1;
+                hitem = (HashItem) PageGetItem(page,
+                                               PageGetItemId(page, offno));
+                htup = &(hitem->hash_itup.t_tid);
+                if (callback(htup, callback_state))
+                {
+                    ItemPointerData indextup;
+
+                    /* adjust any active scans that will be affected */
+                    /* (this should be unnecessary) */
+                    ItemPointerSet(&indextup, blkno, offno);
+                    _hash_adjscans(rel, &indextup);
+
+                    /* delete the item from the page */
+                    PageIndexTupleDelete(page, offno);
+                    bucket_dirty = page_dirty = true;
+
+                    /* don't increment offno, instead decrement maxoffno */
+                    maxoffno = OffsetNumberPrev(maxoffno);
+
+                    tuples_removed += 1;
+                    deleted_tuples += 1;
+                }
+                else
+                {
+                    offno = OffsetNumberNext(offno);
+
+                    num_index_tuples += 1;
+                    tuples_remaining += 1;
+                }
+            }
+
+            /*
+             * Write or free page if needed, advance to next page.  We want
+             * to preserve the invariant that overflow pages are nonempty.
+             */
+            blkno = opaque->hasho_nextblkno;
+
+            if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
+                _hash_freeovflpage(rel, buf);
+            else if (page_dirty)
+                _hash_wrtbuf(rel, buf);
+            else
+                _hash_relbuf(rel, buf, HASH_WRITE);
         }
-        else
-            num_index_tuples += 1;
+
+        /* If we deleted anything, try to compact free space */
+        if (bucket_dirty)
+            _hash_squeezebucket(rel, cur_bucket, bucket_blkno);
+
+        /* XXX unlock bucket here */
+
+        /* Advance to next bucket */
+        cur_bucket++;
     }
 
-    index_endscan(iscan);
+    /* Write-lock metapage and check for split since we started */
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+
+    if (cur_maxbucket != metap->hashm_maxbucket)
+    {
+        /* There's been a split, so process the additional bucket(s) */
+        cur_maxbucket = metap->hashm_maxbucket;
+        memcpy(&local_metapage, metap, sizeof(local_metapage));
+        _hash_relbuf(rel, metabuf, HASH_WRITE);
+        goto loop_top;
+    }
+
+    /* Okay, we're really done.  Update tuple count in metapage. */
+
+    if (orig_maxbucket == metap->hashm_maxbucket &&
+        orig_ntuples == metap->hashm_ntuples)
+    {
+        /*
+         * No one has split or inserted anything since start of scan,
+         * so believe our count as gospel.
+         */
+        metap->hashm_ntuples = tuples_remaining;
+    }
+    else
+    {
+        /*
+         * Otherwise, our count is untrustworthy since we may have
+         * double-scanned tuples in split buckets.  Proceed by
+         * dead-reckoning.
+         */
+        if (metap->hashm_ntuples > deleted_tuples)
+            metap->hashm_ntuples -= deleted_tuples;
+        else
+            metap->hashm_ntuples = 0;
+        num_index_tuples = metap->hashm_ntuples;
+    }
+
+    _hash_wrtbuf(rel, metabuf);
 
     /* return statistics */
     num_pages = RelationGetNumberOfBlocks(rel);
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index aa74c547da..f3f120e47f 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.38 2003/09/01 20:26:34 tgl Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.39 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *    Overflow pages look like ordinary relation pages.
@@ -444,11 +444,13 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
  *    first page in the bucket chain.  The read page works backward and
  *    the write page works forward; the procedure terminates when the
  *    read page and write page are the same page.
+ *
+ *    Caller must hold exclusive lock on the target bucket.
  */
 void
 _hash_squeezebucket(Relation rel,
-                    HashMetaPage metap,
-                    Bucket bucket)
+                    Bucket bucket,
+                    BlockNumber bucket_blkno)
 {
     Buffer      wbuf;
     Buffer      rbuf = 0;
@@ -466,7 +468,7 @@ _hash_squeezebucket(Relation rel,
     /*
      * start squeezing into the base bucket page.
      */
-    wblkno = BUCKET_TO_BLKNO(bucket);
+    wblkno = bucket_blkno;
     wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
     wpage = BufferGetPage(wbuf);
     _hash_checkpage(wpage, LH_BUCKET_PAGE);
@@ -484,11 +486,6 @@ _hash_squeezebucket(Relation rel,
     /*
      * find the last page in the bucket chain by starting at the base
      * bucket page and working forward.
-     *
-     * XXX if chains tend to be long, we should probably move forward using
-     * HASH_READ and then _hash_chgbufaccess to HASH_WRITE when we reach
-     * the end.  if they are short we probably don't care very much.  if
-     * the hash function is working at all, they had better be short..
      */
     ropaque = wopaque;
     do
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index e5e77c94b6..23d8e0bdf6 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.39 2003/09/01 20:26:34 tgl Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.40 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *    Postgres hash pages look like ordinary relation pages.  The opaque
@@ -143,7 +143,7 @@ _hash_metapinit(Relation rel)
      */
     for (i = 0; i <= 1; i++)
     {
-        buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+        buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
         pg = BufferGetPage(buf);
         _hash_pageinit(pg, BufferGetPageSize(buf));
         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -456,6 +456,8 @@ _hash_splitbucket(Relation rel,
     Buffer      ovflbuf;
     BlockNumber oblkno;
     BlockNumber nblkno;
+    BlockNumber start_oblkno;
+    BlockNumber start_nblkno;
     bool        null;
     Datum       datum;
     HashItem    hitem;
@@ -475,8 +477,10 @@ _hash_splitbucket(Relation rel,
     _hash_checkpage((Page) metap, LH_META_PAGE);
 
     /* get the buffers & pages */
-    oblkno = BUCKET_TO_BLKNO(obucket);
-    nblkno = BUCKET_TO_BLKNO(nbucket);
+    start_oblkno = BUCKET_TO_BLKNO(metap, obucket);
+    start_nblkno = BUCKET_TO_BLKNO(metap, nbucket);
+    oblkno = start_oblkno;
+    nblkno = start_nblkno;
     obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
     nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
     opage = BufferGetPage(obuf);
@@ -571,7 +575,7 @@ _hash_splitbucket(Relation rel,
              */
             _hash_wrtbuf(rel, obuf);
             _hash_wrtbuf(rel, nbuf);
-            _hash_squeezebucket(rel, metap, obucket);
+            _hash_squeezebucket(rel, obucket, start_oblkno);
             return;
         }
     }
@@ -639,7 +643,7 @@
         if (!BlockNumberIsValid(oblkno))
         {
             _hash_wrtbuf(rel, nbuf);
-            _hash_squeezebucket(rel, metap, obucket);
+            _hash_squeezebucket(rel, obucket, start_oblkno);
             return;
         }
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 53796c7e76..3237e7a8fd 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.31 2003/08/04 02:39:57 momjian Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.32 2003/09/02 02:18:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,8 +19,10 @@
 
 
 /*
- *    _hash_search() -- Finds the page/bucket that the contains the
- *    scankey and loads it into *bufP.  the buffer has a read lock.
+ *    _hash_search() -- Find the bucket that contains the scankey
+ *            and fetch its primary bucket page into *bufP.
+ *
+ *        the buffer has a read lock.
  */
 void
 _hash_search(Relation rel,
@@ -30,22 +32,23 @@ _hash_search(Relation rel,
              HashMetaPage metap)
 {
     BlockNumber blkno;
-    Datum       keyDatum;
     Bucket      bucket;
 
-    if (scankey == (ScanKey) NULL ||
-        (keyDatum = scankey[0].sk_argument) == (Datum) NULL)
+    if (scankey == NULL)
     {
         /*
-         * If the scankey argument is NULL, all tuples will satisfy the
+         * If the scankey is empty, all tuples will satisfy the
          * scan so we start the scan at the first bucket (bucket 0).
          */
         bucket = 0;
     }
     else
-        bucket = _hash_call(rel, metap, keyDatum);
+    {
+        Assert(!(scankey[0].sk_flags & SK_ISNULL));
+        bucket = _hash_call(rel, metap, scankey[0].sk_argument);
+    }
 
-    blkno = BUCKET_TO_BLKNO(bucket);
+    blkno = BUCKET_TO_BLKNO(metap, bucket);
 
     *bufP = _hash_getbuf(rel, blkno, HASH_READ);
 }
@@ -330,7 +333,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
                 if (allbuckets && bucket < metap->hashm_maxbucket)
                 {
                     ++bucket;
-                    blkno = BUCKET_TO_BLKNO(bucket);
+                    blkno = BUCKET_TO_BLKNO(metap, bucket);
                     buf = _hash_getbuf(rel, blkno, HASH_READ);
                     page = BufferGetPage(buf);
                     _hash_checkpage(page, LH_BUCKET_PAGE);
@@ -380,7 +383,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
                 if (allbuckets && bucket > 0)
                 {
                     --bucket;
-                    blkno = BUCKET_TO_BLKNO(bucket);
+                    blkno = BUCKET_TO_BLKNO(metap, bucket);
                     buf = _hash_getbuf(rel, blkno, HASH_READ);
                     page = BufferGetPage(buf);
                     _hash_checkpage(page, LH_BUCKET_PAGE);
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 83126fccb4..76d9bc5f4e 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -8,11 +8,10 @@
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.33 2003/08/04 02:39:57 momjian Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.34 2003/09/02 02:18:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
-
 #include "postgres.h"
 
 #include "access/genam.h"
@@ -20,20 +19,23 @@
 #include "access/iqual.h"
 
 
+/*
+ * _hash_mkscankey -- build a scan key matching the given indextuple
+ *
+ * Note: this is prepared for multiple index columns, but very little
+ * else in access/hash is ...
+ */
 ScanKey
 _hash_mkscankey(Relation rel, IndexTuple itup)
 {
     ScanKey     skey;
-    TupleDesc   itupdesc;
-    int         natts;
+    TupleDesc   itupdesc = RelationGetDescr(rel);
+    int         natts = rel->rd_rel->relnatts;
     AttrNumber  i;
     Datum       arg;
     FmgrInfo   *procinfo;
     bool        isnull;
 
-    natts = rel->rd_rel->relnatts;
-    itupdesc = RelationGetDescr(rel);
-
     skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
 
     for (i = 0; i < natts; i++)
@@ -41,7 +43,7 @@ _hash_mkscankey(Relation rel, IndexTuple itup)
         arg = index_getattr(itup, i + 1, itupdesc, &isnull);
         procinfo = index_getprocinfo(rel, i + 1, HASHPROC);
         ScanKeyEntryInitializeWithInfo(&skey[i],
-                                       0x0,
+                                       isnull ? SK_ISNULL : 0x0,
                                        (AttrNumber) (i + 1),
                                        procinfo,
                                        CurrentMemoryContext,
@@ -57,18 +59,19 @@ _hash_freeskey(ScanKey skey)
     pfree(skey);
 }
 
-
+/*
+ * _hash_checkqual -- does the index tuple satisfy the scan conditions?
+ */
 bool
 _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 {
-    if (scan->numberOfKeys > 0)
-        return (index_keytest(itup,
-                              RelationGetDescr(scan->indexRelation),
-                              scan->numberOfKeys, scan->keyData));
-    else
-        return true;
+    return index_keytest(itup, RelationGetDescr(scan->indexRelation),
+                         scan->numberOfKeys, scan->keyData);
 }
 
+/*
+ * _hash_formitem -- construct a hash index entry
+ */
 HashItem
 _hash_formitem(IndexTuple itup)
 {
@@ -82,17 +85,27 @@ _hash_formitem(IndexTuple itup)
                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                  errmsg("hash indexes cannot include null keys")));
 
-    /* make a copy of the index tuple with room for the sequence number */
+    /*
+     * make a copy of the index tuple (XXX do we still need to copy?)
+     *
+     * HashItemData used to have more fields than IndexTupleData, but no
+     * longer...
+     */
     tuplen = IndexTupleSize(itup);
     nbytes_hitem = tuplen +
         (sizeof(HashItemData) - sizeof(IndexTupleData));
 
     hitem = (HashItem) palloc(nbytes_hitem);
-    memmove((char *) &(hitem->hash_itup), (char *) itup, tuplen);
+    memcpy((char *) &(hitem->hash_itup), (char *) itup, tuplen);
 
     return hitem;
 }
 
+/*
+ * _hash_call -- given a Datum, call the index's hash procedure
+ *
+ * Returns the bucket number that the hash key maps to.
+ */
 Bucket
 _hash_call(Relation rel, HashMetaPage metap, Datum key)
 {
@@ -103,9 +116,11 @@ _hash_call(Relation rel, HashMetaPage metap, Datum key)
     /* XXX assumes index has only one attribute */
     procinfo = index_getprocinfo(rel, 1, HASHPROC);
     n = DatumGetUInt32(FunctionCall1(procinfo, key));
+
     bucket = n & metap->hashm_highmask;
     if (bucket > metap->hashm_maxbucket)
         bucket = bucket & metap->hashm_lowmask;
+
     return bucket;
 }
 
@@ -119,7 +134,7 @@ _hash_log2(uint32 num)
                 limit;
 
     limit = 1;
-    for (i = 0; limit < num; limit = limit << 1, i++)
+    for (i = 0; limit < num; limit <<= 1, i++)
         ;
     return i;
 }
@@ -130,20 +145,19 @@ _hash_log2(uint32 num)
 void
 _hash_checkpage(Page page, int flags)
 {
-    HashPageOpaque opaque;
-
+#ifdef USE_ASSERT_CHECKING
     Assert(page);
     Assert(((PageHeader) (page))->pd_lower >= SizeOfPageHeaderData);
-#if 1
     Assert(((PageHeader) (page))->pd_upper <=
            (BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData))));
     Assert(((PageHeader) (page))->pd_special ==
            (BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData))));
     Assert(PageGetPageSize(page) == BLCKSZ);
-#endif
     if (flags)
     {
-        opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+        HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
         Assert(opaque->hasho_flag & flags);
     }
+#endif   /* USE_ASSERT_CHECKING */
 }
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 83aae20c1c..045fb40c40 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $
+ * $Id: hash.h,v 1.51 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *        modeled after Margo Seltzer's hash implementation for unix.
@@ -25,13 +25,12 @@
 
 /*
  * Mapping from hash bucket number to physical block number of bucket's
- * starting page.  Beware of multiple evaluations of argument!  Also notice
- * macro's implicit dependency on "metap".
+ * starting page.  Beware of multiple evaluations of argument!
  */
 typedef uint32 Bucket;
 
-#define BUCKET_TO_BLKNO(B) \
-    ((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
+#define BUCKET_TO_BLKNO(metap,B) \
+    ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
 
 /*
  * Special space for hash index pages.
@@ -243,8 +242,8 @@ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
 extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf);
 extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
                  BlockNumber blkno);
-extern void _hash_squeezebucket(Relation rel, HashMetaPage metap,
-            Bucket bucket);
+extern void _hash_squeezebucket(Relation rel,
+                Bucket bucket, BlockNumber bucket_blkno);
 
 /* hashpage.c */
 extern void _hash_metapinit(Relation rel);
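
Note for reviewers: the BUCKET_TO_BLKNO(metap, B) change above is mechanical but easy
to misread. The payoff is that callers such as hashbulkdelete() can now evaluate the
macro against a saved copy of the metapage (local_metapage) instead of the implicit
"metap" variable, so they need not hold the metapage pinned while walking buckets.
The standalone sketch below just restates the arithmetic that the macro and
_hash_log2() in this patch perform; it is not PostgreSQL code. FakeMetaPage,
fake_hash_log2, bucket_to_blkno, and the spares values are invented for illustration
(a pared-down stand-in for HashMetaPageData and its hashm_spares[] array).

#include <stdio.h>
#include <stdint.h>

typedef struct
{
    /* cumulative count of overflow/bitmap pages allocated before each
     * splitpoint; stands in for HashMetaPageData.hashm_spares[] */
    uint32_t    hashm_spares[32];
} FakeMetaPage;

/* same loop as _hash_log2() in hashutil.c: smallest i such that 2^i >= num */
static uint32_t
fake_hash_log2(uint32_t num)
{
    uint32_t    i,
                limit;

    limit = 1;
    for (i = 0; limit < num; limit <<= 1, i++)
        ;
    return i;
}

/* the BUCKET_TO_BLKNO(metap, B) arithmetic, written out as a function */
static uint32_t
bucket_to_blkno(const FakeMetaPage *metap, uint32_t bucket)
{
    uint32_t    spares = bucket ?
        metap->hashm_spares[fake_hash_log2(bucket + 1) - 1] : 0;

    /* the trailing +1 skips the metapage, which is always block 0 */
    return bucket + spares + 1;
}

int
main(void)
{
    /* pretend each of the first three splitpoints allocated one overflow
     * page, so the cumulative spares counts are 1, 2, 3 */
    FakeMetaPage meta = {{1, 2, 3, 0}};
    uint32_t    b;

    for (b = 0; b < 8; b++)
        printf("bucket %u -> block %u\n", b, bucket_to_blkno(&meta, b));
    return 0;
}

With those toy spares values, bucket 0 maps to block 1, bucket 1 to block 3, and
bucket 2 to block 5: each bucket's primary page is displaced past the metapage and
past however many overflow pages were allocated at earlier splitpoints. That is why
the entries consulted here cannot change underneath a scan (spares for completed
splitpoints are append-only), which is what makes the local_metapage copy in
hashbulkdelete() safe.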