diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 828c527986..4fe0301c75 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.52 2006/03/31 23:32:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.53 2006/11/19 21:33:22 tgl Exp $
  *
  * NOTES
  *	  Overflow pages look like ordinary relation pages.
@@ -20,7 +20,7 @@
 
 #include "access/hash.h"
 
 
-static BlockNumber _hash_getovflpage(Relation rel, Buffer metabuf);
+static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
 static uint32 _hash_firstfreebit(uint32 map);
 
@@ -99,18 +99,14 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
 Buffer
 _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
 {
-	BlockNumber ovflblkno;
 	Buffer		ovflbuf;
 	Page		page;
 	Page		ovflpage;
 	HashPageOpaque pageopaque;
 	HashPageOpaque ovflopaque;
 
-	/* allocate an empty overflow page */
-	ovflblkno = _hash_getovflpage(rel, metabuf);
-
-	/* lock the overflow page */
-	ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+	/* allocate and lock an empty overflow page */
+	ovflbuf = _hash_getovflpage(rel, metabuf);
 	ovflpage = BufferGetPage(ovflbuf);
 
 	/*
@@ -150,7 +146,7 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
 	MarkBufferDirty(ovflbuf);
 
 	/* logically chain overflow page to previous page */
-	pageopaque->hasho_nextblkno = ovflblkno;
+	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
 	_hash_wrtbuf(rel, buf);
 
 	return ovflbuf;
 }
@@ -159,16 +155,18 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
 /*
  *	_hash_getovflpage()
  *
- *	Find an available overflow page and return its block number.
+ *	Find an available overflow page and return it.  The returned buffer
+ *	is pinned and write-locked, but its contents are not initialized.
  *
  *	The caller must hold a pin, but no lock, on the metapage buffer.
- *	The buffer is returned in the same state.
+ *	That buffer is left in the same state at exit.
  */
-static BlockNumber
+static Buffer
 _hash_getovflpage(Relation rel, Buffer metabuf)
 {
 	HashMetaPage metap;
 	Buffer		mapbuf = 0;
+	Buffer		newbuf;
 	BlockNumber blkno;
 	uint32		orig_firstfree;
 	uint32		splitnum;
@@ -243,11 +241,10 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 	}
 
-	/* No Free Page Found - have to allocate a new page */
-	bit = metap->hashm_spares[splitnum];
-	metap->hashm_spares[splitnum]++;
-
-	/* Check if we need to allocate a new bitmap page */
+	/*
+	 * No free pages --- have to extend the relation to add an overflow page.
+	 * First, check to see if we have to add a new bitmap page too.
+	 */
 	if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
 	{
 		/*
@@ -258,22 +255,39 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 		 * marked "in use".  Subsequent pages do not exist yet, but it is
 		 * convenient to pre-mark them as "in use" too.
 		 */
-		_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
-
 		bit = metap->hashm_spares[splitnum];
+		_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
 		metap->hashm_spares[splitnum]++;
 	}
 	else
 	{
 		/*
-		 * Nothing to do here; since the page was past the last used page, we
-		 * know its bitmap bit was preinitialized to "in use".
+		 * Nothing to do here; since the page will be past the last used page,
+		 * we know its bitmap bit was preinitialized to "in use".
 		 */
 	}
 
 	/* Calculate address of the new overflow page */
+	bit = metap->hashm_spares[splitnum];
 	blkno = bitno_to_blkno(metap, bit);
 
+	/*
+	 * We have to fetch the page with P_NEW to ensure smgr's idea of the
+	 * relation length stays in sync with ours.  XXX It's annoying to do this
+	 * with metapage write lock held; would be better to use a lock that
+	 * doesn't block incoming searches.  Best way to fix it would be to stop
+	 * maintaining hashm_spares[hashm_ovflpoint] and rely entirely on the
+	 * smgr relation length to track where new overflow pages come from;
+	 * then we could release the metapage before we do the smgrextend.
+	 * FIXME later (not in beta...)
+	 */
+	newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+	if (BufferGetBlockNumber(newbuf) != blkno)
+		elog(ERROR, "unexpected hash relation size: %u, should be %u",
+			 BufferGetBlockNumber(newbuf), blkno);
+
+	metap->hashm_spares[splitnum]++;
+
 	/*
 	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
 	 * changing it if someone moved it while we were searching bitmap pages.
@@ -284,7 +298,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 	/* Write updated metapage and release lock, but not pin */
 	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
 
-	return blkno;
+	return newbuf;
 
 found:
 	/* convert bit to bit number within page */
@@ -300,7 +314,7 @@ found:
 	/* convert bit to absolute bit number */
 	bit += (i << BMPG_SHIFT(metap));
 
-	/* Calculate address of the new overflow page */
+	/* Calculate address of the recycled overflow page */
 	blkno = bitno_to_blkno(metap, bit);
 
 	/*
@@ -320,7 +334,8 @@ found:
 		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 	}
 
-	return blkno;
+	/* Fetch and return the recycled page */
+	return _hash_getbuf(rel, blkno, HASH_WRITE);
 }
 
 /*
@@ -388,7 +403,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
 	prevblkno = ovflopaque->hasho_prevblkno;
 	bucket = ovflopaque->hasho_bucket;
 
-	/* Zero the page for debugging's sake; then write and release it */
+	/*
+	 * Zero the page for debugging's sake; then write and release it.
+	 * (Note: if we failed to zero the page here, we'd have problems
+	 * with the Assert in _hash_pageinit() when the page is reused.)
+	 */
 	MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
 	_hash_wrtbuf(rel, ovflbuf);
 
@@ -488,12 +507,19 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
 	/*
 	 * It is okay to write-lock the new bitmap page while holding metapage
 	 * write lock, because no one else could be contending for the new page.
+	 * Also, the metapage lock makes it safe to extend the index using P_NEW,
+	 * which we want to do to ensure the smgr's idea of the relation size
+	 * stays in step with ours.
 	 *
 	 * There is some loss of concurrency in possibly doing I/O for the new
 	 * page while holding the metapage lock, but this path is taken so seldom
 	 * that it's not worth worrying about.
 	 */
-	buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+	buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+	if (BufferGetBlockNumber(buf) != blkno)
+		elog(ERROR, "unexpected hash relation size: %u, should be %u",
+			 BufferGetBlockNumber(buf), blkno);
+
 	pg = BufferGetPage(buf);
 
 	/* initialize the page */
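The heart of the hashovfl.c change is that _hash_getovflpage() now extends the index itself with P_NEW and immediately cross-checks that the block number the buffer manager hands back matches the block number implied by the metapage's hashm_spares[] bookkeeping. The standalone sketch below is not part of the patch and uses invented names (extend_relation, relation_nblocks, expected_blkno); it only models the invariant that the new elog(ERROR) enforces: the physical end-of-file known to smgr and the logical allocation point tracked in the metapage must advance in lockstep.

/* Illustrative model only -- not PostgreSQL code. */
#include <stdio.h>
#include <stdint.h>

typedef uint32_t BlockNumber;

/* Stand-in for "read with P_NEW": append one page, return its block number. */
static BlockNumber
extend_relation(BlockNumber *relation_nblocks)
{
	return (*relation_nblocks)++;		/* the old EOF becomes the new page */
}

int
main(void)
{
	BlockNumber relation_nblocks = 10;	/* physical length, as smgr sees it */
	BlockNumber expected_blkno = 10;	/* length implied by metapage bookkeeping */
	BlockNumber got = extend_relation(&relation_nblocks);

	if (got != expected_blkno)
		printf("unexpected hash relation size: %u, should be %u\n",
			   (unsigned) got, (unsigned) expected_blkno);
	else
		printf("metapage bookkeeping and physical EOF agree at block %u\n",
			   (unsigned) got);
	return 0;
}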
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 696d4bf616..0f643836a1 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.60 2006/10/04 00:29:48 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61 2006/11/19 21:33:23 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -32,9 +32,11 @@
 #include "access/hash.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"
+#include "storage/smgr.h"
 #include "utils/lsyscache.h"
 
 
+static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
 static void _hash_splitbucket(Relation rel, Buffer metabuf,
 				  Bucket obucket, Bucket nbucket,
 				  BlockNumber start_oblkno,
@@ -102,21 +104,18 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
  *		requested buffer and its reference count has been incremented
  *		(ie, the buffer is "locked and pinned").
  *
- *		XXX P_NEW is not used because, unlike the tree structures, we
- *		need the bucket blocks to be at certain block numbers.
+ *		blkno == P_NEW is allowed, but it is caller's responsibility to
+ *		ensure that only one process can extend the index at a time.
  *
- *		All call sites should call either _hash_pageinit or _hash_checkpage
+ *		All call sites should call either _hash_checkpage or _hash_pageinit
  *		on the returned page, depending on whether the block is expected
- *		to be new or not.
+ *		to be valid or not.
  */
 Buffer
 _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 {
 	Buffer		buf;
 
-	if (blkno == P_NEW)
-		elog(ERROR, "hash AM does not use P_NEW");
-
 	buf = ReadBuffer(rel, blkno);
 
 	if (access != HASH_NOLOCK)
@@ -237,7 +236,14 @@ _hash_metapinit(Relation rel)
 	if (ffactor < 10)
 		ffactor = 10;
 
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+	/*
+	 * We initialize the metapage, the first two bucket pages, and the
+	 * first bitmap page in sequence, using P_NEW to cause smgrextend()
+	 * calls to occur.  This ensures that the smgr level has the right
+	 * idea of the physical index length.
+	 */
+	metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+	Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
 	pg = BufferGetPage(metabuf);
 	_hash_pageinit(pg, BufferGetPageSize(metabuf));
 
@@ -290,7 +296,8 @@ _hash_metapinit(Relation rel)
 	 */
 	for (i = 0; i <= 1; i++)
 	{
-		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
+		buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+		Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
 		pg = BufferGetPage(buf);
 		_hash_pageinit(pg, BufferGetPageSize(buf));
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -303,8 +310,7 @@ _hash_metapinit(Relation rel)
 	}
 
 	/*
-	 * Initialize first bitmap page.  Can't do this until we create the first
-	 * two buckets, else smgr will complain.
+	 * Initialize first bitmap page
 	 */
 	_hash_initbitmap(rel, metap, 3);
 
@@ -339,6 +345,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	Bucket		old_bucket;
 	Bucket		new_bucket;
 	uint32		spare_ndx;
+	BlockNumber firstblock = InvalidBlockNumber;
 	BlockNumber start_oblkno;
 	BlockNumber start_nblkno;
 	uint32		maxbucket;
@@ -376,6 +383,40 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
 		goto fail;
 
+	/*
+	 * Can't split anymore if maxbucket has reached its maximum possible value.
+	 *
+	 * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
+	 * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
+	 * to half that because of overflow looping in _hash_log2() and
+	 * insufficient space in hashm_spares[].  It's moot anyway because an
+	 * index with 2^32 buckets would certainly overflow BlockNumber and
+	 * hence _hash_alloc_buckets() would fail, but if we supported buckets
+	 * smaller than a disk block then this would be an independent constraint.
+	 */
+	if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
+		goto fail;
+
+	/*
+	 * If the split point is increasing (hashm_maxbucket's log base 2
+	 * increases), we need to allocate a new batch of bucket pages.
+	 */
+	new_bucket = metap->hashm_maxbucket + 1;
+	spare_ndx = _hash_log2(new_bucket + 1);
+	if (spare_ndx > metap->hashm_ovflpoint)
+	{
+		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+		/*
+		 * The number of buckets in the new splitpoint is equal to the
+		 * total number already in existence, i.e. new_bucket.  Currently
+		 * this maps one-to-one to blocks required, but someday we may need
+		 * a more complicated calculation here.
+		 */
+		firstblock = _hash_alloc_buckets(rel, new_bucket);
+		if (firstblock == InvalidBlockNumber)
+			goto fail;			/* can't split due to BlockNumber overflow */
+	}
+
 	/*
 	 * Determine which bucket is to be split, and attempt to lock the old
 	 * bucket.  If we can't get the lock, give up.
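To make the arithmetic in the new allocation step concrete: a splitpoint boundary is crossed exactly when new_bucket + 1 passes a power of two, and the batch of bucket pages reserved at that point equals the number of buckets that already exist. The throwaway program below is not part of the patch; log2_ceil is a simplified stand-in for _hash_log2() (smallest i such that 2^i >= num), and the starting ovflpoint value of 1 is an assumption about the state left behind by _hash_metapinit(). It prints the doubling sequence of splitpoint allocations.

/* Illustrative sketch only -- not PostgreSQL code. */
#include <stdio.h>
#include <stdint.h>

/* simplified stand-in for _hash_log2(): smallest i such that 2^i >= num */
static uint32_t
log2_ceil(uint32_t num)
{
	uint32_t	i = 0;
	uint32_t	limit = 1;

	while (limit < num)
	{
		limit <<= 1;
		i++;
	}
	return i;
}

int
main(void)
{
	uint32_t	maxbucket;
	uint32_t	ovflpoint = 1;	/* assumed initial value after metapage init */

	for (maxbucket = 1; maxbucket < 40; maxbucket++)
	{
		uint32_t	new_bucket = maxbucket + 1;
		uint32_t	spare_ndx = log2_ceil(new_bucket + 1);

		if (spare_ndx > ovflpoint)
		{
			/* a new splitpoint: pre-allocate new_bucket more bucket pages */
			printf("split to bucket %u opens splitpoint %u: allocate %u pages\n",
				   (unsigned) new_bucket, (unsigned) spare_ndx,
				   (unsigned) new_bucket);
			ovflpoint = spare_ndx;
		}
	}
	return 0;
}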
@@ -389,7 +430,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	 * lock.  This should be okay because no one else should be trying to lock
 	 * the new bucket yet...
 	 */
-	new_bucket = metap->hashm_maxbucket + 1;
 	old_bucket = (new_bucket & metap->hashm_lowmask);
 
 	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
@@ -425,14 +465,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	 * increases), we need to adjust the hashm_spares[] array and
 	 * hashm_ovflpoint so that future overflow pages will be created beyond
 	 * this new batch of bucket pages.
-	 *
-	 * XXX should initialize new bucket pages to prevent out-of-order page
-	 * creation?  Don't wanna do it right here though.
 	 */
-	spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
 	if (spare_ndx > metap->hashm_ovflpoint)
 	{
-		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
 		metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
 		metap->hashm_ovflpoint = spare_ndx;
 	}
@@ -440,6 +475,12 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	/* now we can compute the new bucket's primary block number */
 	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
 
+	/* if we added a splitpoint, should match result of _hash_alloc_buckets */
+	if (firstblock != InvalidBlockNumber &&
+		firstblock != start_nblkno)
+		elog(PANIC, "unexpected hash relation size: %u, should be %u",
+			 firstblock, start_nblkno);
+
 	Assert(!_hash_has_active_scan(rel, new_bucket));
 
 	if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
@@ -487,6 +528,79 @@ fail:
 }
 
 
+/*
+ * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
+ *
+ * This does not need to initialize the new bucket pages; we'll do that as
+ * each one is used by _hash_expandtable().  But we have to extend the logical
+ * EOF to the end of the splitpoint; otherwise the first overflow page
+ * allocated beyond the splitpoint will represent a noncontiguous access,
+ * which can confuse md.c (and will probably be forbidden by future changes
+ * to md.c).
+ *
+ * We do this by writing a page of zeroes at the end of the splitpoint range.
+ * We expect that the filesystem will ensure that the intervening pages read
+ * as zeroes too.  On many filesystems this "hole" will not be allocated
+ * immediately, which means that the index file may end up more fragmented
+ * than if we forced it all to be allocated now; but since we don't scan
+ * hash indexes sequentially anyway, that probably doesn't matter.
+ *
+ * XXX It's annoying that this code is executed with the metapage lock held.
+ * We need to interlock against _hash_getovflpage() adding a new overflow page
+ * concurrently, but it'd likely be better to use LockRelationForExtension
+ * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
+ * so it may not be worth worrying about.
+ *
+ * Returns the first block number in the new splitpoint's range, or
+ * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ */
+static BlockNumber
+_hash_alloc_buckets(Relation rel, uint32 nblocks)
+{
+	BlockNumber firstblock;
+	BlockNumber lastblock;
+	BlockNumber endblock;
+	char		zerobuf[BLCKSZ];
+
+	/*
+	 * Since we hold metapage lock, no one else is either splitting or
+	 * allocating a new page in _hash_getovflpage(); hence it's safe to
+	 * assume that the relation length isn't changing under us.
+	 */
+	firstblock = RelationGetNumberOfBlocks(rel);
+	lastblock = firstblock + nblocks - 1;
+
+	/*
+	 * Check for overflow in block number calculation; if so, we cannot
+	 * extend the index anymore.
+	 */
+	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
+		return InvalidBlockNumber;
+
+	/* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+
+	MemSet(zerobuf, 0, sizeof(zerobuf));
+
+	/*
+	 * XXX If the extension results in creation of new segment files,
+	 * we have to make sure that each non-last file is correctly filled out to
+	 * RELSEG_SIZE blocks.  This ought to be done inside mdextend, but
+	 * changing the smgr API seems best left for development cycle not late
+	 * beta.  Temporary fix for bug #2737.
+	 */
+#ifndef LET_OS_MANAGE_FILESIZE
+	for (endblock = firstblock | (RELSEG_SIZE - 1);
+		 endblock < lastblock;
+		 endblock += RELSEG_SIZE)
+		smgrextend(rel->rd_smgr, endblock, zerobuf, rel->rd_istemp);
+#endif
+
+	smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
+
+	return firstblock;
+}
+
+
 /*
  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
  *
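The segment-boundary loop in _hash_alloc_buckets() is the operative part of the bug #2737 fix: before writing the zero page at the new logical EOF, it also writes one at the last block of every intervening RELSEG_SIZE-block segment, so md.c never sees a segment being created out of order. The toy program below is not part of the patch; RELSEG_SIZE, firstblock and nblocks are made-up values chosen so the extension crosses several segment boundaries. It reproduces the loop's arithmetic and prints the blocks that would be handed to smgrextend().

/* Illustrative sketch only -- not PostgreSQL code. */
#include <stdio.h>
#include <stdint.h>

#define RELSEG_SIZE 16			/* toy value; the real one is set at configure time */

typedef uint32_t BlockNumber;

int
main(void)
{
	BlockNumber firstblock = 10;	/* current EOF: first block of the new splitpoint */
	uint32_t	nblocks = 40;		/* bucket pages the new splitpoint needs */
	BlockNumber lastblock = firstblock + nblocks - 1;
	BlockNumber endblock;

	/* last block of each non-last segment touched by the extension */
	for (endblock = firstblock | (RELSEG_SIZE - 1);
		 endblock < lastblock;
		 endblock += RELSEG_SIZE)
		printf("write zero page at block %u (fills out a segment)\n",
			   (unsigned) endblock);

	printf("write zero page at block %u (new logical EOF)\n",
		   (unsigned) lastblock);
	return 0;
}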