Rewrite hashbulkdelete() to make it amenable to new bucket locking
scheme.  A pleasant side effect is that it is *much* faster when deleting
a large fraction of the indexed tuples, because of elimination of
redundant hash_step activity induced by hash_adjscans.  Various other
continuing code cleanup.
Tom Lane 2003-09-02 02:18:38 +00:00
parent 5f65345a57
commit 39673ca47b
6 changed files with 233 additions and 78 deletions
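The core of the change: hashbulkdelete() no longer runs a generic index scan over its own index (which forced hash_adjscans, and hence redundant hash_step work, after every deletion) but instead walks each bucket's page chain directly, then rechecks the metapage in case a concurrent split created buckets it has not yet visited. A minimal self-contained sketch of that retry-on-split control flow, with hypothetical names and an in-memory stand-in for buckets rather than the real PostgreSQL structures:

    /* Toy model of the new scan-known-buckets-then-recheck loop. */
    #include <stdbool.h>

    #define MAX_BUCKETS 8
    #define BUCKET_SIZE 16

    static int nbuckets = 4;    /* stands in for hashm_maxbucket + 1 */
    static int bucket_data[MAX_BUCKETS][BUCKET_SIZE];
    static int bucket_count[MAX_BUCKETS];

    static bool should_delete(int tuple)    /* stands in for the callback */
    {
        return tuple % 2 == 0;
    }

    static void bulk_delete(void)
    {
        int known = nbuckets;    /* snapshot, like orig_maxbucket */
        int cur = 0;

    loop_top:
        for (; cur < known; cur++)
        {
            /* compact the bucket in place, dropping condemned tuples */
            int keep = 0;

            for (int i = 0; i < bucket_count[cur]; i++)
                if (!should_delete(bucket_data[cur][i]))
                    bucket_data[cur][keep++] = bucket_data[cur][i];
            bucket_count[cur] = keep;
        }

        /*
         * In the real code a concurrent split may have added buckets while
         * we scanned; re-read the count and continue from where we stopped.
         */
        if (known != nbuckets)
        {
            known = nbuckets;
            goto loop_top;
        }
    }

    int main(void)
    {
        for (int b = 0; b < nbuckets; b++)    /* load a little test data */
            for (int i = 0; i < 6; i++)
                bucket_data[b][bucket_count[b]++] = b * 10 + i;
        bulk_delete();
        return 0;
    }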

src/backend/access/hash/hash.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.65 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.66 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  This file contains only the public interface routines.
@@ -449,40 +449,178 @@ hashbulkdelete(PG_FUNCTION_ARGS)
 	BlockNumber num_pages;
 	double		tuples_removed;
 	double		num_index_tuples;
-	IndexScanDesc iscan;
-
-	tuples_removed = 0;
-	num_index_tuples = 0;
+	uint32		deleted_tuples;
+	uint32		tuples_remaining;
+	uint32		orig_ntuples;
+	Bucket		orig_maxbucket;
+	Bucket		cur_maxbucket;
+	Bucket		cur_bucket;
+	Buffer		metabuf;
+	HashMetaPage metap;
+	HashMetaPageData local_metapage;
 
 	/*
-	 * XXX generic implementation --- should be improved!
+	 * keep track of counts in both float form (to return) and integer form
+	 * (to update hashm_ntuples).  It'd be better to make hashm_ntuples a
+	 * double, but that will have to wait for an initdb.
 	 */
+	tuples_removed = 0;
+	num_index_tuples = 0;
+	deleted_tuples = 0;
+	tuples_remaining = 0;
 
-	/* walk through the entire index */
-	iscan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
-	/* including killed tuples */
-	iscan->ignore_killed_tuples = false;
+	/*
+	 * Read the metapage to fetch original bucket and tuple counts.  Also,
+	 * we keep a copy of the last-seen metapage so that we can use its
+	 * hashm_spares[] values to compute bucket page addresses.  This is a
+	 * bit hokey but perfectly safe, since the interesting entries in the
+	 * spares array cannot change under us; and it beats rereading the
+	 * metapage for each bucket.
+	 */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage((Page) metap, LH_META_PAGE);
+	orig_maxbucket = metap->hashm_maxbucket;
+	orig_ntuples = metap->hashm_ntuples;
+	memcpy(&local_metapage, metap, sizeof(local_metapage));
+	_hash_relbuf(rel, metabuf, HASH_READ);
 
-	while (index_getnext_indexitem(iscan, ForwardScanDirection))
+	/* Scan the buckets that we know exist */
+	cur_bucket = 0;
+	cur_maxbucket = orig_maxbucket;
+
+loop_top:
+	while (cur_bucket <= cur_maxbucket)
 	{
-		if (callback(&iscan->xs_ctup.t_self, callback_state))
+		BlockNumber bucket_blkno;
+		BlockNumber blkno;
+		bool		bucket_dirty = false;
+
+		/* Get address of bucket's start page */
+		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+
+		/* XXX lock bucket here */
+
+		/* Scan each page in bucket */
+		blkno = bucket_blkno;
+		while (BlockNumberIsValid(blkno))
 		{
-			ItemPointerData indextup = iscan->currentItemData;
+			Buffer		buf;
+			Page		page;
+			HashPageOpaque opaque;
+			OffsetNumber offno;
+			OffsetNumber maxoffno;
+			bool		page_dirty = false;
 
-			/* adjust any active scans that will be affected by deletion */
-			/* (namely, my own scan) */
-			_hash_adjscans(rel, &indextup);
+			buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+			page = BufferGetPage(buf);
+			_hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+			Assert(opaque->hasho_bucket == cur_bucket);
 
-			/* delete the data from the page */
-			_hash_pagedel(rel, &indextup);
+			/* Scan each tuple in page */
+			offno = FirstOffsetNumber;
+			maxoffno = PageGetMaxOffsetNumber(page);
+			while (offno <= maxoffno)
+			{
+				HashItem	hitem;
+				ItemPointer htup;
 
-			tuples_removed += 1;
+				hitem = (HashItem) PageGetItem(page,
+											   PageGetItemId(page, offno));
+				htup = &(hitem->hash_itup.t_tid);
+				if (callback(htup, callback_state))
+				{
+					ItemPointerData indextup;
+
+					/* adjust any active scans that will be affected */
+					/* (this should be unnecessary) */
+					ItemPointerSet(&indextup, blkno, offno);
+					_hash_adjscans(rel, &indextup);
+
+					/* delete the item from the page */
+					PageIndexTupleDelete(page, offno);
+					bucket_dirty = page_dirty = true;
+
+					/* don't increment offno, instead decrement maxoffno */
+					maxoffno = OffsetNumberPrev(maxoffno);
+
+					tuples_removed += 1;
+					deleted_tuples += 1;
+				}
+				else
+				{
+					offno = OffsetNumberNext(offno);
+
+					num_index_tuples += 1;
+					tuples_remaining += 1;
+				}
+			}
+
+			/*
+			 * Write or free page if needed, advance to next page.  We want
+			 * to preserve the invariant that overflow pages are nonempty.
+			 */
+			blkno = opaque->hasho_nextblkno;
+
+			if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
+				_hash_freeovflpage(rel, buf);
+			else if (page_dirty)
+				_hash_wrtbuf(rel, buf);
+			else
+				_hash_relbuf(rel, buf, HASH_WRITE);
 		}
-		else
-			num_index_tuples += 1;
+
+		/* If we deleted anything, try to compact free space */
+		if (bucket_dirty)
+			_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
+
+		/* XXX unlock bucket here */
+
+		/* Advance to next bucket */
+		cur_bucket++;
 	}
 
-	index_endscan(iscan);
+	/* Write-lock metapage and check for split since we started */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage((Page) metap, LH_META_PAGE);
+
+	if (cur_maxbucket != metap->hashm_maxbucket)
+	{
+		/* There's been a split, so process the additional bucket(s) */
+		cur_maxbucket = metap->hashm_maxbucket;
+		memcpy(&local_metapage, metap, sizeof(local_metapage));
+		_hash_relbuf(rel, metabuf, HASH_WRITE);
+		goto loop_top;
+	}
+
+	/* Okay, we're really done.  Update tuple count in metapage. */
+
+	if (orig_maxbucket == metap->hashm_maxbucket &&
+		orig_ntuples == metap->hashm_ntuples)
+	{
+		/*
+		 * No one has split or inserted anything since start of scan,
+		 * so believe our count as gospel.
+		 */
+		metap->hashm_ntuples = tuples_remaining;
+	}
+	else
+	{
+		/*
+		 * Otherwise, our count is untrustworthy since we may have
+		 * double-scanned tuples in split buckets.  Proceed by
+		 * dead-reckoning.
+		 */
+		if (metap->hashm_ntuples > deleted_tuples)
+			metap->hashm_ntuples -= deleted_tuples;
+		else
+			metap->hashm_ntuples = 0;
+		num_index_tuples = metap->hashm_ntuples;
+	}
+
+	_hash_wrtbuf(rel, metabuf);
 
 	/* return statistics */
 	num_pages = RelationGetNumberOfBlocks(rel);

src/backend/access/hash/hashovfl.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.38 2003/09/01 20:26:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.39 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  Overflow pages look like ordinary relation pages.
@@ -444,11 +444,13 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
  *	first page in the bucket chain.  The read page works backward and
  *	the write page works forward; the procedure terminates when the
  *	read page and write page are the same page.
+ *
+ *	Caller must hold exclusive lock on the target bucket.
  */
 void
 _hash_squeezebucket(Relation rel,
-					HashMetaPage metap,
-					Bucket bucket)
+					Bucket bucket,
+					BlockNumber bucket_blkno)
 {
 	Buffer		wbuf;
 	Buffer		rbuf = 0;
@@ -466,7 +468,7 @@ _hash_squeezebucket(Relation rel,
 	/*
 	 * start squeezing into the base bucket page.
 	 */
-	wblkno = BUCKET_TO_BLKNO(bucket);
+	wblkno = bucket_blkno;
 	wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
 	wpage = BufferGetPage(wbuf);
 	_hash_checkpage(wpage, LH_BUCKET_PAGE);
@@ -484,11 +486,6 @@ _hash_squeezebucket(Relation rel,
 	/*
 	 * find the last page in the bucket chain by starting at the base
 	 * bucket page and working forward.
-	 *
-	 * XXX if chains tend to be long, we should probably move forward using
-	 * HASH_READ and then _hash_chgbufaccess to HASH_WRITE when we reach
-	 * the end.  if they are short we probably don't care very much.  if
-	 * the hash function is working at all, they had better be short..
 	 */
 	ropaque = wopaque;
 	do
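Apart from the signature change (callers now pass the bucket's start block, so the metapage is no longer needed), the squeeze algorithm is as the header comment describes: a read cursor walks backward from the last page of the chain while a write cursor fills forward from the first, until the two meet. A self-contained toy illustration of that two-pointer compaction over a doubly linked chain of fixed-capacity pages (hypothetical names, not the PostgreSQL code, which also frees overflow pages as they empty):

    #include <stdio.h>

    #define PAGE_CAPACITY 4

    typedef struct ToyPage
    {
        int		items[PAGE_CAPACITY];
        int		nitems;
        struct ToyPage *next;    /* like hasho_nextblkno */
        struct ToyPage *prev;    /* like hasho_prevblkno */
    } ToyPage;

    /*
     * Move items from the tail of the chain into free space near the head.
     * wpage works forward, rpage works backward; stop when they meet.
     */
    static void squeeze_chain(ToyPage *head)
    {
        ToyPage *wpage = head;
        ToyPage *rpage = head;

        while (rpage->next != NULL)    /* find the last page in the chain */
            rpage = rpage->next;

        while (wpage != rpage)
        {
            if (wpage->nitems == PAGE_CAPACITY)
            {
                wpage = wpage->next;    /* write page is full: move forward */
                continue;
            }
            if (rpage->nitems == 0)
            {
                rpage = rpage->prev;    /* read page drained: step backward */
                continue;
            }
            /* move one item from the read page to the write page */
            wpage->items[wpage->nitems++] = rpage->items[--rpage->nitems];
        }
    }

    int main(void)
    {
        ToyPage a = {{1, 2, 3, 0}, 3, NULL, NULL};
        ToyPage b = {{4, 0, 0, 0}, 1, NULL, &a};
        ToyPage c = {{5, 6, 0, 0}, 2, NULL, &b};

        a.next = &b;
        b.next = &c;
        squeeze_chain(&a);
        printf("%d %d %d\n", a.nitems, b.nitems, c.nitems);    /* 4 2 0 */
        return 0;
    }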

src/backend/access/hash/hashpage.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.39 2003/09/01 20:26:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.40 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -143,7 +143,7 @@ _hash_metapinit(Relation rel)
 	 */
 	for (i = 0; i <= 1; i++)
 	{
-		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
 		pg = BufferGetPage(buf);
 		_hash_pageinit(pg, BufferGetPageSize(buf));
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -456,6 +456,8 @@ _hash_splitbucket(Relation rel,
 	Buffer		ovflbuf;
 	BlockNumber oblkno;
 	BlockNumber nblkno;
+	BlockNumber start_oblkno;
+	BlockNumber start_nblkno;
 	bool		null;
 	Datum		datum;
 	HashItem	hitem;
@@ -475,8 +477,10 @@ _hash_splitbucket(Relation rel,
 	_hash_checkpage((Page) metap, LH_META_PAGE);
 
 	/* get the buffers & pages */
-	oblkno = BUCKET_TO_BLKNO(obucket);
-	nblkno = BUCKET_TO_BLKNO(nbucket);
+	start_oblkno = BUCKET_TO_BLKNO(metap, obucket);
+	start_nblkno = BUCKET_TO_BLKNO(metap, nbucket);
+	oblkno = start_oblkno;
+	nblkno = start_nblkno;
 	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
 	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
 	opage = BufferGetPage(obuf);
@@ -571,7 +575,7 @@ _hash_splitbucket(Relation rel,
 			 */
 			_hash_wrtbuf(rel, obuf);
 			_hash_wrtbuf(rel, nbuf);
-			_hash_squeezebucket(rel, metap, obucket);
+			_hash_squeezebucket(rel, obucket, start_oblkno);
 			return;
 		}
 	}
@@ -639,7 +643,7 @@ _hash_splitbucket(Relation rel,
 		if (!BlockNumberIsValid(oblkno))
 		{
 			_hash_wrtbuf(rel, nbuf);
-			_hash_squeezebucket(rel, metap, obucket);
+			_hash_squeezebucket(rel, obucket, start_oblkno);
 			return;
 		}

src/backend/access/hash/hashsearch.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.31 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.32 2003/09/02 02:18:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,8 +19,10 @@
 /*
- *	_hash_search() -- Finds the page/bucket that the contains the
- *	scankey and loads it into *bufP.  the buffer has a read lock.
+ *	_hash_search() -- Find the bucket that contains the scankey
+ *	and fetch its primary bucket page into *bufP.
+ *
+ *	the buffer has a read lock.
  */
 void
 _hash_search(Relation rel,
@@ -30,22 +32,23 @@ _hash_search(Relation rel,
 			 HashMetaPage metap)
 {
 	BlockNumber blkno;
-	Datum		keyDatum;
 	Bucket		bucket;
 
-	if (scankey == (ScanKey) NULL ||
-		(keyDatum = scankey[0].sk_argument) == (Datum) NULL)
+	if (scankey == NULL)
 	{
 		/*
-		 * If the scankey argument is NULL, all tuples will satisfy the
+		 * If the scankey is empty, all tuples will satisfy the
 		 * scan so we start the scan at the first bucket (bucket 0).
 		 */
 		bucket = 0;
 	}
 	else
-		bucket = _hash_call(rel, metap, keyDatum);
+	{
+		Assert(!(scankey[0].sk_flags & SK_ISNULL));
+		bucket = _hash_call(rel, metap, scankey[0].sk_argument);
+	}
 
-	blkno = BUCKET_TO_BLKNO(bucket);
+	blkno = BUCKET_TO_BLKNO(metap, bucket);
 
 	*bufP = _hash_getbuf(rel, blkno, HASH_READ);
 }
@@ -330,7 +333,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
 				if (allbuckets && bucket < metap->hashm_maxbucket)
 				{
 					++bucket;
-					blkno = BUCKET_TO_BLKNO(bucket);
+					blkno = BUCKET_TO_BLKNO(metap, bucket);
 					buf = _hash_getbuf(rel, blkno, HASH_READ);
 					page = BufferGetPage(buf);
 					_hash_checkpage(page, LH_BUCKET_PAGE);
@@ -380,7 +383,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
 				if (allbuckets && bucket > 0)
 				{
 					--bucket;
-					blkno = BUCKET_TO_BLKNO(bucket);
+					blkno = BUCKET_TO_BLKNO(metap, bucket);
 					buf = _hash_getbuf(rel, blkno, HASH_READ);
 					page = BufferGetPage(buf);
 					_hash_checkpage(page, LH_BUCKET_PAGE);

src/backend/access/hash/hashutil.c

@@ -8,11 +8,10 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.33 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.34 2003/09/02 02:18:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
-
 #include "postgres.h"
 
 #include "access/genam.h"
@@ -20,20 +19,23 @@
 #include "access/iqual.h"
 
+/*
+ * _hash_mkscankey -- build a scan key matching the given indextuple
+ *
+ * Note: this is prepared for multiple index columns, but very little
+ * else in access/hash is ...
+ */
 ScanKey
 _hash_mkscankey(Relation rel, IndexTuple itup)
 {
 	ScanKey		skey;
-	TupleDesc	itupdesc;
-	int			natts;
+	TupleDesc	itupdesc = RelationGetDescr(rel);
+	int			natts = rel->rd_rel->relnatts;
 	AttrNumber	i;
 	Datum		arg;
 	FmgrInfo   *procinfo;
 	bool		isnull;
 
-	natts = rel->rd_rel->relnatts;
-	itupdesc = RelationGetDescr(rel);
-
 	skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
 
 	for (i = 0; i < natts; i++)
@@ -41,7 +43,7 @@ _hash_mkscankey(Relation rel, IndexTuple itup)
 		arg = index_getattr(itup, i + 1, itupdesc, &isnull);
 		procinfo = index_getprocinfo(rel, i + 1, HASHPROC);
 		ScanKeyEntryInitializeWithInfo(&skey[i],
-									   0x0,
+									   isnull ? SK_ISNULL : 0x0,
 									   (AttrNumber) (i + 1),
 									   procinfo,
 									   CurrentMemoryContext,
@@ -57,18 +59,19 @@ _hash_freeskey(ScanKey skey)
 	pfree(skey);
 }
 
+/*
+ * _hash_checkqual -- does the index tuple satisfy the scan conditions?
+ */
 bool
 _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 {
-	if (scan->numberOfKeys > 0)
-		return (index_keytest(itup,
-							  RelationGetDescr(scan->indexRelation),
-							  scan->numberOfKeys, scan->keyData));
-	else
-		return true;
+	return index_keytest(itup, RelationGetDescr(scan->indexRelation),
+						 scan->numberOfKeys, scan->keyData);
 }
 
+/*
+ * _hash_formitem -- construct a hash index entry
+ */
 HashItem
 _hash_formitem(IndexTuple itup)
 {
@@ -82,17 +85,27 @@ _hash_formitem(IndexTuple itup)
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("hash indexes cannot include null keys")));
 
-	/* make a copy of the index tuple with room for the sequence number */
+	/*
+	 * make a copy of the index tuple (XXX do we still need to copy?)
+	 *
+	 * HashItemData used to have more fields than IndexTupleData, but no
+	 * longer...
+	 */
 	tuplen = IndexTupleSize(itup);
 	nbytes_hitem = tuplen +
 		(sizeof(HashItemData) - sizeof(IndexTupleData));
 
 	hitem = (HashItem) palloc(nbytes_hitem);
-	memmove((char *) &(hitem->hash_itup), (char *) itup, tuplen);
+	memcpy((char *) &(hitem->hash_itup), (char *) itup, tuplen);
 
 	return hitem;
 }
 
+/*
+ * _hash_call -- given a Datum, call the index's hash procedure
+ *
+ * Returns the bucket number that the hash key maps to.
+ */
 Bucket
 _hash_call(Relation rel, HashMetaPage metap, Datum key)
 {
@@ -103,9 +116,11 @@ _hash_call(Relation rel, HashMetaPage metap, Datum key)
 
 	/* XXX assumes index has only one attribute */
 	procinfo = index_getprocinfo(rel, 1, HASHPROC);
 	n = DatumGetUInt32(FunctionCall1(procinfo, key));
+
 	bucket = n & metap->hashm_highmask;
+
 	if (bucket > metap->hashm_maxbucket)
 		bucket = bucket & metap->hashm_lowmask;
 	return bucket;
 }
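_hash_call() is linear hashing in miniature: hash the key, mask with hashm_highmask (the table size rounded up to a power of two, minus one), and if the result names a bucket that has not been created yet, fall back to hashm_lowmask (the next smaller power of two, minus one). A standalone worked example with illustrative mask values (six buckets, so highmask = 7 and lowmask = 3):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t maxbucket = 5;    /* buckets 0..5 exist, like hashm_maxbucket */
        uint32_t highmask = 7;     /* like hashm_highmask */
        uint32_t lowmask = 3;      /* like hashm_lowmask */

        for (uint32_t n = 0; n < 8; n++)    /* pretend n is the hash value */
        {
            uint32_t bucket = n & highmask;

            if (bucket > maxbucket)         /* bucket not split off yet */
                bucket = bucket & lowmask;
            printf("hash %u -> bucket %u\n", n, bucket);
        }
        return 0;    /* hashes 6 and 7 land in buckets 2 and 3 */
    }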
@@ -119,7 +134,7 @@
 				limit;
 
 	limit = 1;
-	for (i = 0; limit < num; limit = limit << 1, i++)
+	for (i = 0; limit < num; limit <<= 1, i++)
 		;
 	return i;
 }
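For reference, _hash_log2(num) returns the smallest i such that 2^i >= num, a ceiling log base 2 (the <<= change above is purely cosmetic). A standalone restatement with a few sample values:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t toy_log2(uint32_t num)    /* same contract as _hash_log2 */
    {
        uint32_t i, limit;

        for (i = 0, limit = 1; limit < num; limit <<= 1, i++)
            ;
        return i;
    }

    int main(void)
    {
        /* 2^0 >= 1, 2^3 = 8 >= 6, 2^3 = 8 >= 8 */
        printf("%u %u %u\n", toy_log2(1), toy_log2(6), toy_log2(8));    /* 0 3 3 */
        return 0;
    }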
@@ -130,20 +145,19 @@ _hash_log2(uint32 num)
 void
 _hash_checkpage(Page page, int flags)
 {
-	HashPageOpaque opaque;
-
+#ifdef USE_ASSERT_CHECKING
 	Assert(page);
 	Assert(((PageHeader) (page))->pd_lower >= SizeOfPageHeaderData);
-#if 1
 	Assert(((PageHeader) (page))->pd_upper <=
 		   (BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData))));
 	Assert(((PageHeader) (page))->pd_special ==
 		   (BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData))));
 	Assert(PageGetPageSize(page) == BLCKSZ);
-#endif
 	if (flags)
 	{
-		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
 		Assert(opaque->hasho_flag & flags);
 	}
+#endif   /* USE_ASSERT_CHECKING */
 }

src/include/access/hash.h

@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $
+ * $Id: hash.h,v 1.51 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *		modeled after Margo Seltzer's hash implementation for unix.
@@ -25,13 +25,12 @@
 /*
  * Mapping from hash bucket number to physical block number of bucket's
- * starting page.  Beware of multiple evaluations of argument!  Also notice
- * macro's implicit dependency on "metap".
+ * starting page.  Beware of multiple evaluations of argument!
  */
 typedef uint32 Bucket;
 
-#define BUCKET_TO_BLKNO(B) \
-	((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
+#define BUCKET_TO_BLKNO(metap,B) \
+	((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
 
 /*
  * Special space for hash index pages.
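The macro now names the metapage explicitly instead of silently depending on a metap variable in the caller's scope; the arithmetic is unchanged. Bucket B starts at block B, plus the overflow pages allocated below B's splitpoint (taken from hashm_spares), plus one to skip the metapage at block 0. A standalone demonstration of that arithmetic with made-up spares values:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t BlockNumber;

    static uint32_t toy_log2(uint32_t num)    /* same contract as _hash_log2 */
    {
        uint32_t i, limit;

        for (i = 0, limit = 1; limit < num; limit <<= 1, i++)
            ;
        return i;
    }

    int main(void)
    {
        /* roughly: spares[k] = overflow pages allocated below splitpoint k+1;
         * these values are illustrative, not from a real index */
        uint32_t spares[8] = {0, 1, 3, 3, 3, 3, 3, 3};

        for (uint32_t b = 0; b < 6; b++)
        {
            BlockNumber blkno = (BlockNumber)
                (b + (b ? spares[toy_log2(b + 1) - 1] : 0)) + 1;

            printf("bucket %u -> block %u\n", b, blkno);
        }
        return 0;    /* buckets 0..5 -> blocks 1 2 4 5 8 9 */
    }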
@@ -243,8 +242,8 @@ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
 extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf);
 extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
 				 BlockNumber blkno);
-extern void _hash_squeezebucket(Relation rel, HashMetaPage metap,
-				Bucket bucket);
+extern void _hash_squeezebucket(Relation rel,
+				Bucket bucket, BlockNumber bucket_blkno);
 
 /* hashpage.c */
 extern void _hash_metapinit(Relation rel);