Rewrite hashbulkdelete() to make it amenable to new bucket locking

scheme. A pleasant side effect is that it is *much* faster when deleting a large fraction of the indexed tuples, because of elimination of redundant hash_step activity induced by hash_adjscans. Various other continuing code cleanup.
2003-09-02 02:18:38 +00:00 · 2003-09-02 02:18:38 +00:00 · 39673ca47b
parent 5f65345a57
commit 39673ca47b
6 changed files with 233 additions and 78 deletions
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.65 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.66 2003/09/02 02:18:38 tgl Exp $
 *
 * NOTES
 *	  This file contains only the public interface routines.
@ -449,40 +449,178 @@ hashbulkdelete(PG_FUNCTION_ARGS)
 	BlockNumber num_pages;
 	double		tuples_removed;
 	double		num_index_tuples;
-	IndexScanDesc iscan;
-
-	tuples_removed = 0;
-	num_index_tuples = 0;
+	uint32		deleted_tuples;
+	uint32		tuples_remaining;
+	uint32		orig_ntuples;
+	Bucket		orig_maxbucket;
+	Bucket		cur_maxbucket;
+	Bucket		cur_bucket;
+	Buffer		metabuf;
+	HashMetaPage metap;
+	HashMetaPageData local_metapage;

 	/*
-	 * XXX generic implementation --- should be improved!
+	 * keep track of counts in both float form (to return) and integer form
+	 * (to update hashm_ntuples).  It'd be better to make hashm_ntuples a
+	 * double, but that will have to wait for an initdb.
 	 */
+	tuples_removed = 0;
+	num_index_tuples = 0;
+	deleted_tuples = 0;
+	tuples_remaining = 0;

-	/* walk through the entire index */
-	iscan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
-	/* including killed tuples */
-	iscan->ignore_killed_tuples = false;
+	/*
+	 * Read the metapage to fetch original bucket and tuple counts.  Also,
+	 * we keep a copy of the last-seen metapage so that we can use its
+	 * hashm_spares[] values to compute bucket page addresses.  This is a
+	 * bit hokey but perfectly safe, since the interesting entries in the
+	 * spares array cannot change under us; and it beats rereading the
+	 * metapage for each bucket.
+	 */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage((Page) metap, LH_META_PAGE);
+	orig_maxbucket = metap->hashm_maxbucket;
+	orig_ntuples = metap->hashm_ntuples;
+	memcpy(&local_metapage, metap, sizeof(local_metapage));
+	_hash_relbuf(rel, metabuf, HASH_READ);

-	while (index_getnext_indexitem(iscan, ForwardScanDirection))
+	/* Scan the buckets that we know exist */
+	cur_bucket = 0;
+	cur_maxbucket = orig_maxbucket;
+
+loop_top:
+	while (cur_bucket <= cur_maxbucket)
 	{
-		if (callback(&iscan->xs_ctup.t_self, callback_state))
+		BlockNumber bucket_blkno;
+		BlockNumber blkno;
+		bool		bucket_dirty = false;
+
+		/* Get address of bucket's start page */
+		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+
+		/* XXX lock bucket here */
+
+		/* Scan each page in bucket */
+		blkno = bucket_blkno;
+		while (BlockNumberIsValid(blkno))
 		{
-			ItemPointerData indextup = iscan->currentItemData;
+			Buffer		buf;
+			Page		page;
+			HashPageOpaque opaque;
+			OffsetNumber offno;
+			OffsetNumber maxoffno;
+			bool		page_dirty = false;

-			/* adjust any active scans that will be affected by deletion */
-			/* (namely, my own scan) */
-			_hash_adjscans(rel, &indextup);
+			buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+			page = BufferGetPage(buf);
+			_hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+			Assert(opaque->hasho_bucket == cur_bucket);

-			/* delete the data from the page */
-			_hash_pagedel(rel, &indextup);
+			/* Scan each tuple in page */
+			offno = FirstOffsetNumber;
+			maxoffno = PageGetMaxOffsetNumber(page);
+			while (offno <= maxoffno)
+			{
+				HashItem	hitem;
+				ItemPointer htup;

-			tuples_removed += 1;
+				hitem = (HashItem) PageGetItem(page,
+											   PageGetItemId(page, offno));
+				htup = &(hitem->hash_itup.t_tid);
+				if (callback(htup, callback_state))
+				{
+					ItemPointerData indextup;
+
+					/* adjust any active scans that will be affected */
+					/* (this should be unnecessary) */
+					ItemPointerSet(&indextup, blkno, offno);
+					_hash_adjscans(rel, &indextup);
+
+					/* delete the item from the page */
+					PageIndexTupleDelete(page, offno);
+					bucket_dirty = page_dirty = true;
+
+					/* don't increment offno, instead decrement maxoffno */
+					maxoffno = OffsetNumberPrev(maxoffno);
+
+					tuples_removed += 1;
+					deleted_tuples += 1;
+				}
+				else
+				{
+					offno = OffsetNumberNext(offno);
+
+					num_index_tuples += 1;
+					tuples_remaining += 1;
+				}
+			}
+
+			/*
+			 * Write or free page if needed, advance to next page.  We want
+			 * to preserve the invariant that overflow pages are nonempty.
+			 */
+			blkno = opaque->hasho_nextblkno;
+
+			if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
+				_hash_freeovflpage(rel, buf);
+			else if (page_dirty)
+				_hash_wrtbuf(rel, buf);
+			else
+				_hash_relbuf(rel, buf, HASH_WRITE);
 		}
-		else
-			num_index_tuples += 1;
+
+		/* If we deleted anything, try to compact free space */
+		if (bucket_dirty)
+			_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
+
+		/* XXX unlock bucket here */
+
+		/* Advance to next bucket */
+		cur_bucket++;
 	}

-	index_endscan(iscan);
+	/* Write-lock metapage and check for split since we started */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage((Page) metap, LH_META_PAGE);
+
+	if (cur_maxbucket != metap->hashm_maxbucket)
+	{
+		/* There's been a split, so process the additional bucket(s) */
+		cur_maxbucket = metap->hashm_maxbucket;
+		memcpy(&local_metapage, metap, sizeof(local_metapage));
+		_hash_relbuf(rel, metabuf, HASH_WRITE);
+		goto loop_top;
+	}
+
+	/* Okay, we're really done.  Update tuple count in metapage. */
+
+	if (orig_maxbucket == metap->hashm_maxbucket &&
+		orig_ntuples == metap->hashm_ntuples)
+	{
+		/*
+		 * No one has split or inserted anything since start of scan,
+		 * so believe our count as gospel.
+		 */
+		metap->hashm_ntuples = tuples_remaining;
+	}
+	else
+	{
+		/*
+		 * Otherwise, our count is untrustworthy since we may have
+		 * double-scanned tuples in split buckets.  Proceed by
+		 * dead-reckoning.
+		 */
+		if (metap->hashm_ntuples > deleted_tuples)
+			metap->hashm_ntuples -= deleted_tuples;
+		else
+			metap->hashm_ntuples = 0;
+		num_index_tuples = metap->hashm_ntuples;
+	}
+
+	_hash_wrtbuf(rel, metabuf);

 	/* return statistics */
 	num_pages = RelationGetNumberOfBlocks(rel);
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.38 2003/09/01 20:26:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.39 2003/09/02 02:18:38 tgl Exp $
 *
 * NOTES
 *	  Overflow pages look like ordinary relation pages.
@ -444,11 +444,13 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
+ *
+ *	Caller must hold exclusive lock on the target bucket.
 */
 void
 _hash_squeezebucket(Relation rel,
-					HashMetaPage metap,
-					Bucket bucket)
+					Bucket bucket,
+					BlockNumber bucket_blkno)
 {
 	Buffer		wbuf;
 	Buffer		rbuf = 0;
@ -466,7 +468,7 @@ _hash_squeezebucket(Relation rel,
 	/*
 	 * start squeezing into the base bucket page.
 	 */
-	wblkno = BUCKET_TO_BLKNO(bucket);
+	wblkno = bucket_blkno;
 	wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
 	wpage = BufferGetPage(wbuf);
 	_hash_checkpage(wpage, LH_BUCKET_PAGE);
@ -484,11 +486,6 @@ _hash_squeezebucket(Relation rel,
 	/*
 	 * find the last page in the bucket chain by starting at the base
 	 * bucket page and working forward.
-	 *
-	 * XXX if chains tend to be long, we should probably move forward using
-	 * HASH_READ and then _hash_chgbufaccess to HASH_WRITE when we reach
-	 * the end.  if they are short we probably don't care very much.  if
-	 * the hash function is working at all, they had better be short..
 	 */
 	ropaque = wopaque;
 	do
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.39 2003/09/01 20:26:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.40 2003/09/02 02:18:38 tgl Exp $
 *
 * NOTES
 *	  Postgres hash pages look like ordinary relation pages.  The opaque
@ -143,7 +143,7 @@ _hash_metapinit(Relation rel)
 	 */
 	for (i = 0; i <= 1; i++)
 	{
-		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
 		pg = BufferGetPage(buf);
 		_hash_pageinit(pg, BufferGetPageSize(buf));
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@ -456,6 +456,8 @@ _hash_splitbucket(Relation rel,
 	Buffer		ovflbuf;
 	BlockNumber oblkno;
 	BlockNumber nblkno;
+	BlockNumber start_oblkno;
+	BlockNumber start_nblkno;
 	bool		null;
 	Datum		datum;
 	HashItem	hitem;
@ -475,8 +477,10 @@ _hash_splitbucket(Relation rel,
 	_hash_checkpage((Page) metap, LH_META_PAGE);

 	/* get the buffers & pages */
-	oblkno = BUCKET_TO_BLKNO(obucket);
-	nblkno = BUCKET_TO_BLKNO(nbucket);
+	start_oblkno = BUCKET_TO_BLKNO(metap, obucket);
+	start_nblkno = BUCKET_TO_BLKNO(metap, nbucket);
+	oblkno = start_oblkno;
+	nblkno = start_nblkno;
 	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
 	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
 	opage = BufferGetPage(obuf);
@ -571,7 +575,7 @@ _hash_splitbucket(Relation rel,
 				 */
 				_hash_wrtbuf(rel, obuf);
 				_hash_wrtbuf(rel, nbuf);
-				_hash_squeezebucket(rel, metap, obucket);
+				_hash_squeezebucket(rel, obucket, start_oblkno);
 				return;
 			}
 		}
@ -639,7 +643,7 @@ _hash_splitbucket(Relation rel,
 				if (!BlockNumberIsValid(oblkno))
 				{
 					_hash_wrtbuf(rel, nbuf);
-					_hash_squeezebucket(rel, metap, obucket);
+					_hash_squeezebucket(rel, obucket, start_oblkno);
 					return;
 				}

--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.31 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.32 2003/09/02 02:18:38 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -19,8 +19,10 @@


 /*
- *	_hash_search() -- Finds the page/bucket that the contains the
- *	scankey and loads it into *bufP.  the buffer has a read lock.
+ *	_hash_search() -- Find the bucket that contains the scankey
+ *		and fetch its primary bucket page into *bufP.
+ *
+ * the buffer has a read lock.
 */
 void
 _hash_search(Relation rel,
@ -30,22 +32,23 @@ _hash_search(Relation rel,
 			 HashMetaPage metap)
 {
 	BlockNumber blkno;
-	Datum		keyDatum;
 	Bucket		bucket;

-	if (scankey == (ScanKey) NULL ||
-		(keyDatum = scankey[0].sk_argument) == (Datum) NULL)
+	if (scankey == NULL)
 	{
 		/*
-		 * If the scankey argument is NULL, all tuples will satisfy the
+		 * If the scankey is empty, all tuples will satisfy the
 		 * scan so we start the scan at the first bucket (bucket 0).
 		 */
 		bucket = 0;
 	}
 	else
-		bucket = _hash_call(rel, metap, keyDatum);
+	{
+		Assert(!(scankey[0].sk_flags & SK_ISNULL));
+		bucket = _hash_call(rel, metap, scankey[0].sk_argument);
+	}

-	blkno = BUCKET_TO_BLKNO(bucket);
+	blkno = BUCKET_TO_BLKNO(metap, bucket);

 	*bufP = _hash_getbuf(rel, blkno, HASH_READ);
 }
@ -330,7 +333,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
 						if (allbuckets && bucket < metap->hashm_maxbucket)
 						{
 							++bucket;
-							blkno = BUCKET_TO_BLKNO(bucket);
+							blkno = BUCKET_TO_BLKNO(metap, bucket);
 							buf = _hash_getbuf(rel, blkno, HASH_READ);
 							page = BufferGetPage(buf);
 							_hash_checkpage(page, LH_BUCKET_PAGE);
@ -380,7 +383,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
 						if (allbuckets && bucket > 0)
 						{
 							--bucket;
-							blkno = BUCKET_TO_BLKNO(bucket);
+							blkno = BUCKET_TO_BLKNO(metap, bucket);
 							buf = _hash_getbuf(rel, blkno, HASH_READ);
 							page = BufferGetPage(buf);
 							_hash_checkpage(page, LH_BUCKET_PAGE);
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@ -8,11 +8,10 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.33 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.34 2003/09/02 02:18:38 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
-
 #include "postgres.h"

 #include "access/genam.h"
@ -20,20 +19,23 @@
 #include "access/iqual.h"


+/*
+ * _hash_mkscankey -- build a scan key matching the given indextuple
+ *
+ * Note: this is prepared for multiple index columns, but very little
+ * else in access/hash is ...
+ */
 ScanKey
 _hash_mkscankey(Relation rel, IndexTuple itup)
 {
 	ScanKey		skey;
-	TupleDesc	itupdesc;
-	int			natts;
+	TupleDesc	itupdesc = RelationGetDescr(rel);
+	int			natts = rel->rd_rel->relnatts;
 	AttrNumber	i;
 	Datum		arg;
 	FmgrInfo   *procinfo;
 	bool		isnull;

-	natts = rel->rd_rel->relnatts;
-	itupdesc = RelationGetDescr(rel);
-
 	skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

 	for (i = 0; i < natts; i++)
@ -41,7 +43,7 @@ _hash_mkscankey(Relation rel, IndexTuple itup)
 		arg = index_getattr(itup, i + 1, itupdesc, &isnull);
 		procinfo = index_getprocinfo(rel, i + 1, HASHPROC);
 		ScanKeyEntryInitializeWithInfo(&skey[i],
-									   0x0,
+									   isnull ? SK_ISNULL : 0x0,
 									   (AttrNumber) (i + 1),
 									   procinfo,
 									   CurrentMemoryContext,
@ -57,18 +59,19 @@ _hash_freeskey(ScanKey skey)
 	pfree(skey);
 }

-
+/*
+ * _hash_checkqual -- does the index tuple satisfy the scan conditions?
+ */
 bool
 _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 {
-	if (scan->numberOfKeys > 0)
-		return (index_keytest(itup,
-							  RelationGetDescr(scan->indexRelation),
-							  scan->numberOfKeys, scan->keyData));
-	else
-		return true;
+	return index_keytest(itup, RelationGetDescr(scan->indexRelation),
+						 scan->numberOfKeys, scan->keyData);
 }

+/*
+ * _hash_formitem -- construct a hash index entry
+ */
 HashItem
 _hash_formitem(IndexTuple itup)
 {
@ -82,17 +85,27 @@ _hash_formitem(IndexTuple itup)
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("hash indexes cannot include null keys")));

-	/* make a copy of the index tuple with room for the sequence number */
+	/*
+	 * make a copy of the index tuple (XXX do we still need to copy?)
+	 *
+	 * HashItemData used to have more fields than IndexTupleData, but no
+	 * longer...
+	 */
 	tuplen = IndexTupleSize(itup);
 	nbytes_hitem = tuplen +
 		(sizeof(HashItemData) - sizeof(IndexTupleData));

 	hitem = (HashItem) palloc(nbytes_hitem);
-	memmove((char *) &(hitem->hash_itup), (char *) itup, tuplen);
+	memcpy((char *) &(hitem->hash_itup), (char *) itup, tuplen);

 	return hitem;
 }

+/*
+ * _hash_call -- given a Datum, call the index's hash procedure
+ *
+ * Returns the bucket number that the hash key maps to.
+ */
 Bucket
 _hash_call(Relation rel, HashMetaPage metap, Datum key)
 {
@ -103,9 +116,11 @@ _hash_call(Relation rel, HashMetaPage metap, Datum key)
 	/* XXX assumes index has only one attribute */
 	procinfo = index_getprocinfo(rel, 1, HASHPROC);
 	n = DatumGetUInt32(FunctionCall1(procinfo, key));
+
 	bucket = n & metap->hashm_highmask;
 	if (bucket > metap->hashm_maxbucket)
 		bucket = bucket & metap->hashm_lowmask;
+
 	return bucket;
 }

@ -119,7 +134,7 @@ _hash_log2(uint32 num)
 				limit;

 	limit = 1;
-	for (i = 0; limit < num; limit = limit << 1, i++)
+	for (i = 0; limit < num; limit <<= 1, i++)
 		;
 	return i;
 }
@ -130,20 +145,19 @@ _hash_log2(uint32 num)
 void
 _hash_checkpage(Page page, int flags)
 {
-	HashPageOpaque opaque;
-
+#ifdef USE_ASSERT_CHECKING
 	Assert(page);
 	Assert(((PageHeader) (page))->pd_lower >= SizeOfPageHeaderData);
-#if 1
 	Assert(((PageHeader) (page))->pd_upper <=
 		   (BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData))));
 	Assert(((PageHeader) (page))->pd_special ==
 		   (BLCKSZ - MAXALIGN(sizeof(HashPageOpaqueData))));
 	Assert(PageGetPageSize(page) == BLCKSZ);
-#endif
 	if (flags)
 	{
-		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
 		Assert(opaque->hasho_flag & flags);
 	}
+#endif   /* USE_ASSERT_CHECKING */
 }
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $
+ * $Id: hash.h,v 1.51 2003/09/02 02:18:38 tgl Exp $
 *
 * NOTES
 *		modeled after Margo Seltzer's hash implementation for unix.
@ -25,13 +25,12 @@

 /*
 * Mapping from hash bucket number to physical block number of bucket's
- * starting page.  Beware of multiple evaluations of argument!  Also notice
- * macro's implicit dependency on "metap".
+ * starting page.  Beware of multiple evaluations of argument!
 */
 typedef uint32 Bucket;

-#define BUCKET_TO_BLKNO(B) \
-		((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
+#define BUCKET_TO_BLKNO(metap,B) \
+		((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)

 /*
 * Special space for hash index pages.
@ -243,8 +242,8 @@ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
 extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf);
 extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
 							 BlockNumber blkno);
-extern void _hash_squeezebucket(Relation rel, HashMetaPage metap,
-					Bucket bucket);
+extern void _hash_squeezebucket(Relation rel,
+								Bucket bucket, BlockNumber bucket_blkno);

 /* hashpage.c */
 extern void _hash_metapinit(Relation rel);