Change hash indexes to store only the hash code rather than the whole indexed value.

This means that hash index lookups are always lossy and have to be rechecked when the heap is visited; however, the gain in index compactness outweighs this when the indexed values are wide. Also, we only need to perform datatype comparisons when the hash codes match exactly, rather than for every entry in the hash bucket; so it could also win for datatypes that have expensive comparison functions. A small additional win is gained by keeping hash index pages sorted by hash code and using binary search to reduce the number of index tuples we have to look at.

Xiao Meng

This commit also incorporates Zdenek Kotala's patch to isolate hash metapages and hash bitmaps a bit better from the page header data structures.
2008-09-15 18:43:41 +00:00 · 2008-09-15 18:43:41 +00:00 · 4adc2f72a4
parent 440b3384b0
commit 4adc2f72a4
13 changed files with 313 additions and 129 deletions
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.173 2008/09/10 18:09:19 alvherre Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.174 2008/09/15 18:43:41 tgl Exp $ -->
 <!--
 Documentation of the system catalogs, directed toward PostgreSQL developers
 -->
@ -451,6 +451,13 @@
      <entry>Can an index of this type be clustered on?</entry>
     </row>

+     <row>
+      <entry><structfield>amkeytype</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-type"><structname>pg_type</structname></link>.oid</literal></entry>
+      <entry>Type of data stored in index, or zero if not a fixed type</entry>
+     </row>
+
     <row>
      <entry><structfield>aminsert</structfield></entry>
      <entry><type>regproc</type></entry>
@ -6424,7 +6431,7 @@
     <row>
      <entry><structfield>sourceline</structfield></entry>
      <entry><type>text</type></entry>
-      <entry>Line number within the sourcefile the current value was set 
+      <entry>Line number within the sourcefile the current value was set
      from (NULL for values set in sources other than configuration files)
      </entry>
     </row>
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.104 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.105 2008/09/15 18:43:41 tgl Exp $
 *
 * NOTES
 *	  This file contains only the public interface routines.
@ -79,12 +79,12 @@ hashbuild(PG_FUNCTION_ARGS)
 	 * then we'll thrash horribly.  To prevent that scenario, we can sort the
 	 * tuples by (expected) bucket number.  However, such a sort is useless
 	 * overhead when the index does fit in RAM.  We choose to sort if the
-	 * initial index size exceeds effective_cache_size.
+	 * initial index size exceeds NBuffers.
 	 *
 	 * NOTE: this test will need adjustment if a bucket is ever different
 	 * from one page.
 	 */
-	if (num_buckets >= (uint32) effective_cache_size)
+	if (num_buckets >= (uint32) NBuffers)
 		buildstate.spool = _h_spoolinit(index, num_buckets);
 	else
 		buildstate.spool = NULL;
@ -129,7 +129,7 @@ hashbuildCallback(Relation index,
 	IndexTuple	itup;

 	/* form an index tuple and point it at the heap tuple */
-	itup = index_form_tuple(RelationGetDescr(index), values, isnull);
+	itup = _hash_form_tuple(index, values, isnull);
 	itup->t_tid = htup->t_self;

 	/* Hash indexes don't index nulls, see notes in hashinsert */
@ -153,8 +153,8 @@ hashbuildCallback(Relation index,
 /*
 *	hashinsert() -- insert an index tuple into a hash table.
 *
- *	Hash on the index tuple's key, find the appropriate location
- *	for the new tuple, and put it there.
+ *	Hash on the heap tuple's key, form an index tuple with hash code.
+ *	Find the appropriate location for the new tuple, and put it there.
 */
 Datum
 hashinsert(PG_FUNCTION_ARGS)
@ -171,7 +171,7 @@ hashinsert(PG_FUNCTION_ARGS)
 	IndexTuple	itup;

 	/* generate an index tuple */
-	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
+	itup = _hash_form_tuple(rel, values, isnull);
 	itup->t_tid = *ht_ctid;

 	/*
@ -211,8 +211,8 @@ hashgettuple(PG_FUNCTION_ARGS)
 	OffsetNumber offnum;
 	bool		res;

-	/* Hash indexes are never lossy (at the moment anyway) */
-	scan->xs_recheck = false;
+	/* Hash indexes are always lossy since we store only the hash code */
+	scan->xs_recheck = true;

 	/*
 	 * We hold pin but not lock on current buffer while outside the hash AM.
@ -317,7 +317,8 @@ hashgetbitmap(PG_FUNCTION_ARGS)
 		/* Save tuple ID, and continue scanning */
 		if (add_tuple) 
 		{
-			tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, false);
+			/* Note we mark the tuple ID as requiring recheck */
+			tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, true);
 			ntids++;
 		}

@ -527,7 +528,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
 	 * each bucket.
 	 */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap =  HashPageGetMeta(BufferGetPage(metabuf));
 	orig_maxbucket = metap->hashm_maxbucket;
 	orig_ntuples = metap->hashm_ntuples;
 	memcpy(&local_metapage, metap, sizeof(local_metapage));
@ -629,7 +630,7 @@ loop_top:

 	/* Write-lock metapage and check for split since we started */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));

 	if (cur_maxbucket != metap->hashm_maxbucket)
 	{
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.50 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.51 2008/09/15 18:43:41 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -43,18 +43,11 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	bool		do_expand;
 	uint32		hashkey;
 	Bucket		bucket;
-	Datum		datum;
-	bool		isnull;

 	/*
-	 * Compute the hash key for the item.  We do this first so as not to need
-	 * to hold any locks while running the hash function.
+	 * Get the hash key for the item (it's stored in the index tuple itself).
 	 */
-	if (rel->rd_rel->relnatts != 1)
-		elog(ERROR, "hash indexes support only one index key");
-	datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
-	Assert(!isnull);
-	hashkey = _hash_datum2hashkey(rel, datum);
+	hashkey = _hash_get_indextuple_hashkey(itup);

 	/* compute item size too */
 	itemsz = IndexTupleDSize(*itup);
@ -69,12 +62,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)

 	/* Read the metapage */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));

 	/*
 	 * Check whether the item can fit on a hash page at all. (Eventually, we
 	 * ought to try to apply TOAST methods if not.)  Note that at this point,
 	 * itemsz doesn't include the ItemId.
+	 *
+	 * XXX this is useless code if we are only storing hash keys.
 	 */
 	if (itemsz > HashMaxItemSize((Page) metap))
 		ereport(ERROR,
@ -197,11 +192,15 @@ _hash_pgaddtup(Relation rel,
 {
 	OffsetNumber itup_off;
 	Page		page;
+	uint32		hashkey;

 	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 	page = BufferGetPage(buf);

-	itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+	/* Find where to insert the tuple (preserving page's hashkey ordering) */
+	hashkey = _hash_get_indextuple_hashkey(itup);
+	itup_off = _hash_binsearch(page, hashkey);
+
 	if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
 		== InvalidOffsetNumber)
 		elog(ERROR, "failed to add index item to \"%s\"",
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.64 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.65 2008/09/15 18:43:41 tgl Exp $
 *
 * NOTES
 *	  Overflow pages look like ordinary relation pages.
@ -187,7 +187,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

 	_hash_checkpage(rel, metabuf, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));

 	/* start search at hashm_firstfree */
 	orig_firstfree = metap->hashm_firstfree;
@ -450,7 +450,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,

 	/* Read the metapage so we can determine which bitmap page to use */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));

 	/* Identify which bit to set */
 	ovflbitno = blkno_to_bitno(metap, ovflblkno);
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.76 2008/08/11 11:05:10 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.77 2008/09/15 18:43:41 tgl Exp $
 *
 * NOTES
 *	  Postgres hash pages look like ordinary relation pages.  The opaque
@ -348,11 +348,9 @@ _hash_metapinit(Relation rel, double num_tuples)
 	 * Determine the target fill factor (in tuples per bucket) for this index.
 	 * The idea is to make the fill factor correspond to pages about as full
 	 * as the user-settable fillfactor parameter says.	We can compute it
-	 * exactly if the index datatype is fixed-width, but for var-width there's
-	 * some guessing involved.
+	 * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
 	 */
-	data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
-								 RelationGetDescr(rel)->attrs[0]->atttypmod);
+	data_width = sizeof(uint32);
 	item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
 		sizeof(ItemIdData);		/* include the line pointer */
 	ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
@ -395,20 +393,18 @@ _hash_metapinit(Relation rel, double num_tuples)
 	pageopaque->hasho_flag = LH_META_PAGE;
 	pageopaque->hasho_page_id = HASHO_PAGE_ID;

-	metap = (HashMetaPage) pg;
+	metap = HashPageGetMeta(pg);

 	metap->hashm_magic = HASH_MAGIC;
 	metap->hashm_version = HASH_VERSION;
 	metap->hashm_ntuples = 0;
 	metap->hashm_nmaps = 0;
 	metap->hashm_ffactor = ffactor;
-	metap->hashm_bsize = BufferGetPageSize(metabuf);
+	metap->hashm_bsize = HashGetMaxBitmapSize(pg);
 	/* find largest bitmap array size that will fit in page size */
 	for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
 	{
-		if ((1 << i) <= (metap->hashm_bsize -
-						 (MAXALIGN(sizeof(PageHeaderData)) +
-						  MAXALIGN(sizeof(HashPageOpaqueData)))))
+		if ((1 << i) <= metap->hashm_bsize)
 			break;
 	}
 	Assert(i > 0);
@ -532,7 +528,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

 	_hash_checkpage(rel, metabuf, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));

 	/*
 	 * Check to see if split is still needed; someone else might have already
@ -774,8 +770,6 @@ _hash_splitbucket(Relation rel,
 	Buffer		nbuf;
 	BlockNumber oblkno;
 	BlockNumber nblkno;
-	bool		null;
-	Datum		datum;
 	HashPageOpaque oopaque;
 	HashPageOpaque nopaque;
 	IndexTuple	itup;
@ -785,7 +779,6 @@ _hash_splitbucket(Relation rel,
 	OffsetNumber omaxoffnum;
 	Page		opage;
 	Page		npage;
-	TupleDesc	itupdesc = RelationGetDescr(rel);

 	/*
 	 * It should be okay to simultaneously write-lock pages from each bucket,
@ -846,16 +839,11 @@ _hash_splitbucket(Relation rel,
 		}

 		/*
-		 * Re-hash the tuple to determine which bucket it now belongs in.
-		 *
-		 * It is annoying to call the hash function while holding locks, but
-		 * releasing and relocking the page for each tuple is unappealing too.
+		 * Fetch the item's hash key (conveniently stored in the item)
+		 * and determine which bucket it now belongs in.
 		 */
 		itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
-		datum = index_getattr(itup, 1, itupdesc, &null);
-		Assert(!null);
-
-		bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
+		bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
 									  maxbucket, highmask, lowmask);

 		if (bucket == nbucket)
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.53 2008/06/19 00:46:03 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.54 2008/09/15 18:43:41 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -178,6 +178,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
 										   cur->sk_subtype);

+	so->hashso_sk_hash = hashkey;
+
 	/*
 	 * Acquire shared split lock so we can compute the target bucket safely
 	 * (see README).
@ -186,7 +188,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)

 	/* Read the metapage */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = (HashMetaPage) BufferGetPage(metabuf);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));

 	/*
 	 * Compute the target bucket number, and convert to block number.
@ -284,7 +286,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 		offnum = InvalidOffsetNumber;

 	/*
-	 * 'offnum' now points to the last tuple we have seen (if any).
+	 * 'offnum' now points to the last tuple we examined (if any).
 	 *
 	 * continue to step through tuples until: 1) we get to the end of the
 	 * bucket chain or 2) we find a valid tuple.
@ -297,25 +299,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 				if (offnum != InvalidOffsetNumber)
 					offnum = OffsetNumberNext(offnum);	/* move forward */
 				else
-					offnum = FirstOffsetNumber; /* new page */
+				{
+					/* new page, locate starting position by binary search */
+					offnum = _hash_binsearch(page, so->hashso_sk_hash);
+				}

-				while (offnum > maxoff)
+				for (;;)
 				{
 					/*
-					 * either this page is empty (maxoff ==
-					 * InvalidOffsetNumber) or we ran off the end.
+					 * check if we're still in the range of items with
+					 * the target hash key
+					 */
+					if (offnum <= maxoff)
+					{
+						Assert(offnum >= FirstOffsetNumber);
+						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+							break;				/* yes, so exit for-loop */
+					}
+
+					/*
+					 * ran off the end of this page, try the next
 					 */
 					_hash_readnext(rel, &buf, &page, &opaque);
 					if (BufferIsValid(buf))
 					{
 						maxoff = PageGetMaxOffsetNumber(page);
-						offnum = FirstOffsetNumber;
+						offnum = _hash_binsearch(page, so->hashso_sk_hash);
 					}
 					else
 					{
 						/* end of bucket */
-						maxoff = offnum = InvalidOffsetNumber;
-						break;	/* exit while */
+						itup = NULL;
+						break;	/* exit for-loop */
 					}
 				}
 				break;
@ -324,22 +340,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 				if (offnum != InvalidOffsetNumber)
 					offnum = OffsetNumberPrev(offnum);	/* move back */
 				else
-					offnum = maxoff;	/* new page */
+				{
+					/* new page, locate starting position by binary search */
+					offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+				}

-				while (offnum < FirstOffsetNumber)
+				for (;;)
 				{
 					/*
-					 * either this page is empty (offnum ==
-					 * InvalidOffsetNumber) or we ran off the end.
+					 * check if we're still in the range of items with
+					 * the target hash key
+					 */
+					if (offnum >= FirstOffsetNumber)
+					{
+						Assert(offnum <= maxoff);
+						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+							break;				/* yes, so exit for-loop */
+					}
+
+					/*
+					 * ran off the end of this page, try the next
 					 */
 					_hash_readprev(rel, &buf, &page, &opaque);
 					if (BufferIsValid(buf))
-						maxoff = offnum = PageGetMaxOffsetNumber(page);
+					{
+						maxoff = PageGetMaxOffsetNumber(page);
+						offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+					}
 					else
 					{
 						/* end of bucket */
-						maxoff = offnum = InvalidOffsetNumber;
-						break;	/* exit while */
+						itup = NULL;
+						break;	/* exit for-loop */
 					}
 				}
 				break;
@ -347,19 +380,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 			default:
 				/* NoMovementScanDirection */
 				/* this should not be reached */
+				itup = NULL;
 				break;
 		}

-		/* we ran off the end of the world without finding a match */
-		if (offnum == InvalidOffsetNumber)
+		if (itup == NULL)
 		{
+			/* we ran off the end of the bucket without finding a match */
 			*bufP = so->hashso_curbuf = InvalidBuffer;
 			ItemPointerSetInvalid(current);
 			return false;
 		}

-		/* get ready to check this tuple */
-		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+		/* check the tuple quals, loop around if not met */
 	} while (!_hash_checkqual(scan, itup));

 	/* if we made it to here, we've found a valid tuple */
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.56 2008/07/13 20:45:47 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.57 2008/09/15 18:43:41 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -28,12 +28,21 @@
 bool
 _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 {
+	/*
+	 * Currently, we can't check any of the scan conditions since we do
+	 * not have the original index entry value to supply to the sk_func.
+	 * Always return true; we expect that hashgettuple already set the
+	 * recheck flag to make the main indexscan code do it.
+	 */
+#ifdef NOT_USED
 	TupleDesc	tupdesc = RelationGetDescr(scan->indexRelation);
 	ScanKey		key = scan->keyData;
 	int			scanKeySize = scan->numberOfKeys;
+#endif

 	IncrIndexProcessed();

+#ifdef NOT_USED
 	while (scanKeySize > 0)
 	{
 		Datum		datum;
@ -59,6 +68,7 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 		key++;
 		scanKeySize--;
 	}
+#endif

 	return true;
 }
@ -190,7 +200,7 @@ _hash_checkpage(Relation rel, Buffer buf, int flags)
 	 */
 	if (flags == LH_META_PAGE)
 	{
-		HashMetaPage metap = (HashMetaPage) page;
+		HashMetaPage metap = HashPageGetMeta(page);

 		if (metap->hashm_magic != HASH_MAGIC)
 			ereport(ERROR,
@ -221,3 +231,123 @@ hashoptions(PG_FUNCTION_ARGS)
 		PG_RETURN_BYTEA_P(result);
 	PG_RETURN_NULL();
 }
+
+/*
+ * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
+ */
+uint32
+_hash_get_indextuple_hashkey(IndexTuple itup)
+{
+	char	   *attp;
+
+	/*
+	 * We assume the hash key is the first attribute and can't be null,
+	 * so this can be done crudely but very very cheaply ...
+	 */
+	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
+	return *((uint32 *) attp);
+}
+
+/*
+ * _hash_form_tuple - form an index tuple containing hash code only
+ */
+IndexTuple
+_hash_form_tuple(Relation index, Datum *values, bool *isnull)
+{
+	IndexTuple		itup;
+	uint32			hashkey;
+	Datum			hashkeydatum;
+	TupleDesc		hashdesc;
+
+	if (isnull[0])
+		hashkeydatum = (Datum) 0;
+	else
+	{
+		hashkey = _hash_datum2hashkey(index, values[0]);
+		hashkeydatum = UInt32GetDatum(hashkey);
+	}
+	hashdesc = RelationGetDescr(index);
+	Assert(hashdesc->natts == 1);
+	itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
+	return itup;
+}
+
+/*
+ * _hash_binsearch - Return the offset number in the page where the
+ *					 specified hash value should be sought or inserted.
+ *
+ * We use binary search, relying on the assumption that the existing entries
+ * are ordered by hash key.
+ *
+ * Returns the offset of the first index entry having hashkey >= hash_value,
+ * or the page's max offset plus one if hash_value is greater than all
+ * existing hash keys in the page.  This is the appropriate place to start
+ * a search, or to insert a new item.
+ */
+OffsetNumber
+_hash_binsearch(Page page, uint32 hash_value)
+{
+	OffsetNumber	upper;
+	OffsetNumber	lower;
+
+	/* Loop invariant: lower <= desired place <= upper */
+	upper = PageGetMaxOffsetNumber(page) + 1;
+	lower = FirstOffsetNumber;
+
+	while (upper > lower)
+	{
+		OffsetNumber	off;
+		IndexTuple		itup;
+		uint32			hashkey;
+
+		off = (upper + lower) / 2;
+		Assert(OffsetNumberIsValid(off));
+
+		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+		hashkey = _hash_get_indextuple_hashkey(itup);
+		if (hashkey < hash_value)
+			lower = off + 1;
+		else
+			upper = off;
+	}
+
+	return lower;
+}
+
+/*
+ * _hash_binsearch_last
+ *
+ * Same as above, except that if there are multiple matching items in the
+ * page, we return the offset of the last one instead of the first one,
+ * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
+ * This is handy for starting a new page in a backwards scan.
+ */
+OffsetNumber
+_hash_binsearch_last(Page page, uint32 hash_value)
+{
+	OffsetNumber	upper;
+	OffsetNumber	lower;
+
+	/* Loop invariant: lower <= desired place <= upper */
+	upper = PageGetMaxOffsetNumber(page);
+	lower = FirstOffsetNumber - 1;
+
+	while (upper > lower)
+	{
+		IndexTuple		itup;
+		OffsetNumber	off;
+		uint32			hashkey;
+
+		off = (upper + lower + 1) / 2;
+		Assert(OffsetNumberIsValid(off));
+
+		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+		hashkey = _hash_get_indextuple_hashkey(itup);
+		if (hashkey > hash_value)
+			upper = off - 1;
+		else
+			lower = off;
+	}
+
+	return lower;
+}
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.303 2008/08/25 22:42:32 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.304 2008/09/15 18:43:41 tgl Exp $
 *
 *
 * INTERFACE ROUTINES
@ -76,6 +76,7 @@ typedef struct
 /* non-export function prototypes */
 static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
 						 IndexInfo *indexInfo,
+						 Oid accessMethodObjectId,
 						 Oid *classObjectId);
 static void InitializeAttributeOids(Relation indexRelation,
 						int numatts, Oid indexoid);
@ -105,15 +106,28 @@ static Oid	IndexGetRelation(Oid indexId);
 static TupleDesc
 ConstructTupleDescriptor(Relation heapRelation,
 						 IndexInfo *indexInfo,
+						 Oid accessMethodObjectId,
 						 Oid *classObjectId)
 {
 	int			numatts = indexInfo->ii_NumIndexAttrs;
 	ListCell   *indexpr_item = list_head(indexInfo->ii_Expressions);
+	HeapTuple	amtuple;
+	Form_pg_am	amform;
 	TupleDesc	heapTupDesc;
 	TupleDesc	indexTupDesc;
 	int			natts;			/* #atts in heap rel --- for error checks */
 	int			i;

+	/* We need access to the index AM's pg_am tuple */
+	amtuple = SearchSysCache(AMOID,
+							 ObjectIdGetDatum(accessMethodObjectId),
+							 0, 0, 0);
+	if (!HeapTupleIsValid(amtuple))
+		elog(ERROR, "cache lookup failed for access method %u",
+			 accessMethodObjectId);
+	amform = (Form_pg_am) GETSTRUCT(amtuple);
+
+	/* ... and to the table's tuple descriptor */
 	heapTupDesc = RelationGetDescr(heapRelation);
 	natts = RelationGetForm(heapRelation)->relnatts;

@ -133,6 +147,7 @@ ConstructTupleDescriptor(Relation heapRelation,
 		Form_pg_attribute to = indexTupDesc->attrs[i];
 		HeapTuple	tuple;
 		Form_pg_type typeTup;
+		Form_pg_opclass opclassTup;
 		Oid			keyType;

 		if (atnum != 0)
@ -231,8 +246,8 @@ ConstructTupleDescriptor(Relation heapRelation,
 		to->attrelid = InvalidOid;

 		/*
-		 * Check the opclass to see if it provides a keytype (overriding the
-		 * attribute type).
+		 * Check the opclass and index AM to see if either provides a keytype
+		 * (overriding the attribute type).  Opclass takes precedence.
 		 */
 		tuple = SearchSysCache(CLAOID,
 							   ObjectIdGetDatum(classObjectId[i]),
@ -240,7 +255,11 @@ ConstructTupleDescriptor(Relation heapRelation,
 		if (!HeapTupleIsValid(tuple))
 			elog(ERROR, "cache lookup failed for opclass %u",
 				 classObjectId[i]);
-		keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype;
+		opclassTup = (Form_pg_opclass) GETSTRUCT(tuple);
+		if (OidIsValid(opclassTup->opckeytype))
+			keyType = opclassTup->opckeytype;
+		else
+			keyType = amform->amkeytype;
 		ReleaseSysCache(tuple);

 		if (OidIsValid(keyType) && keyType != to->atttypid)
@ -264,6 +283,8 @@ ConstructTupleDescriptor(Relation heapRelation,
 		}
 	}

+	ReleaseSysCache(amtuple);
+
 	return indexTupDesc;
 }

@ -577,6 +598,7 @@ index_create(Oid heapRelationId,
 	 */
 	indexTupDesc = ConstructTupleDescriptor(heapRelation,
 											indexInfo,
+											accessMethodObjectId,
 											classObjectId);

 	/*
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@ -91,7 +91,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.86 2008/08/01 13:16:09 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.87 2008/09/15 18:43:41 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -101,7 +101,6 @@
 #include <limits.h>

 #include "access/genam.h"
-#include "access/hash.h"
 #include "access/nbtree.h"
 #include "catalog/pg_amop.h"
 #include "catalog/pg_operator.h"
@ -353,7 +352,6 @@ struct Tuplesortstate
 	bool		enforceUnique;	/* complain if we find duplicate tuples */

 	/* These are specific to the index_hash subcase: */
-	FmgrInfo   *hash_proc;		/* call info for the hash function */
 	uint32		hash_mask;		/* mask for sortable part of hash code */

 	/*
@ -689,13 +687,6 @@ tuplesort_begin_index_hash(Relation indexRel,

 	state->indexRel = indexRel;

-	/*
-	 * We look up the index column's hash function just once, to avoid
-	 * chewing lots of cycles in repeated index_getprocinfo calls.  This
-	 * assumes that our caller holds the index relation open throughout the
-	 * sort, else the pointer obtained here might cease to be valid.
-	 */
-	state->hash_proc = index_getprocinfo(indexRel, 1, HASHPROC);
 	state->hash_mask = hash_mask;

 	MemoryContextSwitchTo(oldcontext);
@ -2821,11 +2812,6 @@ static int
 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 					  Tuplesortstate *state)
 {
-	/*
-	 * It's slightly annoying to redo the hash function each time, although
-	 * most hash functions ought to be cheap.  Is it worth having a variant
-	 * tuple storage format so we can store the hash code?
-	 */
 	uint32		hash1;
 	uint32		hash2;
 	IndexTuple	tuple1;
@ -2834,13 +2820,14 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 	/* Allow interrupting long sorts */
 	CHECK_FOR_INTERRUPTS();

-	/* Compute hash codes and mask off bits we don't want to sort by */
+	/*
+	 * Fetch hash keys and mask off bits we don't want to sort by.
+	 * We know that the first column of the index tuple is the hash key.
+	 */
 	Assert(!a->isnull1);
-	hash1 = DatumGetUInt32(FunctionCall1(state->hash_proc, a->datum1))
-		& state->hash_mask;
+	hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
 	Assert(!b->isnull1);
-	hash2 = DatumGetUInt32(FunctionCall1(state->hash_proc, b->datum1))
-		& state->hash_mask;
+	hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;

 	if (hash1 > hash2)
 		return 1;
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.89 2008/07/13 20:45:47 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.90 2008/09/15 18:43:41 tgl Exp $
 *
 * NOTES
 *		modeled after Margo Seltzer's hash implementation for unix.
@ -75,6 +75,9 @@ typedef HashPageOpaqueData *HashPageOpaque;
 */
 typedef struct HashScanOpaqueData
 {
+	/* Hash value of the scan key, ie, the hash key we seek */
+	uint32		hashso_sk_hash;
+
 	/*
 	 * By definition, a hash scan should be examining only one bucket. We
 	 * record the bucket number here as soon as it is known.
@ -111,7 +114,7 @@ typedef HashScanOpaqueData *HashScanOpaque;
 #define HASH_METAPAGE	0		/* metapage is always block 0 */

 #define HASH_MAGIC		0x6440640
-#define HASH_VERSION	1		/* new for Pg 7.4 */
+#define HASH_VERSION	2		/* 2 signifies only hash key value is stored */

 /*
 * Spares[] holds the number of overflow pages currently allocated at or
@ -138,7 +141,6 @@ typedef HashScanOpaqueData *HashScanOpaque;

 typedef struct HashMetaPageData
 {
-	PageHeaderData hashm_phdr;	/* pad for page header (do not use) */
 	uint32		hashm_magic;	/* magic no. for hash tables */
 	uint32		hashm_version;	/* version ID */
 	double		hashm_ntuples;	/* number of tuples stored in the table */
@ -191,8 +193,16 @@ typedef HashMetaPageData *HashMetaPage;
 #define BMPGSZ_BIT(metap)		((metap)->hashm_bmsize << BYTE_TO_BIT)
 #define BMPG_SHIFT(metap)		((metap)->hashm_bmshift)
 #define BMPG_MASK(metap)		(BMPGSZ_BIT(metap) - 1)
-#define HashPageGetBitmap(pg) \
-	((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData))))
+
+#define HashPageGetBitmap(page) \
+	((uint32 *) PageGetContents(page))
+
+#define HashGetMaxBitmapSize(page) \
+	(PageGetPageSize((Page) page) - \
+	 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
+
+#define HashPageGetMeta(page) \
+	((HashMetaPage) PageGetContents(page))

 /*
 * The number of bits in an ovflpage bitmap word.
@ -330,6 +340,11 @@ extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
 					 uint32 highmask, uint32 lowmask);
 extern uint32 _hash_log2(uint32 num);
 extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
+extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
+extern IndexTuple _hash_form_tuple(Relation index,
+								   Datum *values, bool *isnull);
+extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
+extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);

 /* hash.c */
 extern void hash_redo(XLogRecPtr lsn, XLogRecord *record);
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@ -37,7 +37,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.485 2008/09/10 18:09:20 alvherre Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.486 2008/09/15 18:43:41 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -53,6 +53,6 @@
 */

 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	200809101
+#define CATALOG_VERSION_NO	200809151

 #endif
--- a/src/include/catalog/pg_am.h
+++ b/src/include/catalog/pg_am.h
@ -8,7 +8,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.57 2008/07/11 21:06:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.58 2008/09/15 18:43:41 tgl Exp $
 *
 * NOTES
 *		the genbki.sh script reads this file and generates .bki
@ -48,6 +48,7 @@ CATALOG(pg_am,2601)
 	bool		amsearchnulls;	/* can AM search for NULL index entries? */
 	bool		amstorage;		/* can storage type differ from column type? */
 	bool		amclusterable;	/* does AM support cluster command? */
+	Oid			amkeytype;		/* type of data in index, or InvalidOid */
 	regproc		aminsert;		/* "insert this tuple" function */
 	regproc		ambeginscan;	/* "start new scan" function */
 	regproc		amgettuple;		/* "next valid tuple" function */
@ -74,7 +75,7 @@ typedef FormData_pg_am *Form_pg_am;
 *		compiler constants for pg_am
 * ----------------
 */
-#define Natts_pg_am						24
+#define Natts_pg_am						25
 #define Anum_pg_am_amname				1
 #define Anum_pg_am_amstrategies			2
 #define Anum_pg_am_amsupport			3
@ -86,35 +87,36 @@ typedef FormData_pg_am *Form_pg_am;
 #define Anum_pg_am_amsearchnulls		9
 #define Anum_pg_am_amstorage			10
 #define Anum_pg_am_amclusterable		11
-#define Anum_pg_am_aminsert				12
-#define Anum_pg_am_ambeginscan			13
-#define Anum_pg_am_amgettuple			14
-#define Anum_pg_am_amgetbitmap			15
-#define Anum_pg_am_amrescan				16
-#define Anum_pg_am_amendscan			17
-#define Anum_pg_am_ammarkpos			18
-#define Anum_pg_am_amrestrpos			19
-#define Anum_pg_am_ambuild				20
-#define Anum_pg_am_ambulkdelete			21
-#define Anum_pg_am_amvacuumcleanup		22
-#define Anum_pg_am_amcostestimate		23
-#define Anum_pg_am_amoptions			24
+#define Anum_pg_am_amkeytype			12
+#define Anum_pg_am_aminsert				13
+#define Anum_pg_am_ambeginscan			14
+#define Anum_pg_am_amgettuple			15
+#define Anum_pg_am_amgetbitmap			16
+#define Anum_pg_am_amrescan				17
+#define Anum_pg_am_amendscan			18
+#define Anum_pg_am_ammarkpos			19
+#define Anum_pg_am_amrestrpos			20
+#define Anum_pg_am_ambuild				21
+#define Anum_pg_am_ambulkdelete			22
+#define Anum_pg_am_amvacuumcleanup		23
+#define Anum_pg_am_amcostestimate		24
+#define Anum_pg_am_amoptions			25

 /* ----------------
 *		initial contents of pg_am
 * ----------------
 */

-DATA(insert OID = 403 (  btree	5 1 t t t t t t f t btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
+DATA(insert OID = 403 (  btree	5 1 t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
 DESCR("b-tree index access method");
 #define BTREE_AM_OID 403
-DATA(insert OID = 405 (  hash	1 1 f f f f f f f f hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
+DATA(insert OID = 405 (  hash	1 1 f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
 DESCR("hash index access method");
 #define HASH_AM_OID 405
-DATA(insert OID = 783 (  gist	0 7 f f t t t t t t gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
+DATA(insert OID = 783 (  gist	0 7 f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
 DESCR("GiST index access method");
 #define GIST_AM_OID 783
-DATA(insert OID = 2742 (  gin	0 5 f f t t f f t f gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
+DATA(insert OID = 2742 (  gin	0 5 f f t t f f t f 0 gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
 DESCR("GIN index access method");
 #define GIN_AM_OID 2742

--- a/src/include/catalog/pg_opclass.h
+++ b/src/include/catalog/pg_opclass.h
@ -28,7 +28,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/pg_opclass.h,v 1.82 2008/06/24 17:58:27 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_opclass.h,v 1.83 2008/09/15 18:43:41 tgl Exp $
 *
 * NOTES
 *	  the genbki.sh script reads this file and generates .bki
@ -123,13 +123,13 @@ DATA(insert (	403		macaddr_ops			PGNSP PGUID 1984  829 t 0 ));
 DATA(insert (	405		macaddr_ops			PGNSP PGUID 1985  829 t 0 ));
 /*
 * Here's an ugly little hack to save space in the system catalog indexes.
- * btree and hash don't ordinarily allow a storage type different from input
- * type; but cstring and name are the same thing except for trailing padding,
+ * btree doesn't ordinarily allow a storage type different from input type;
+ * but cstring and name are the same thing except for trailing padding,
 * and we can safely omit that within an index entry.  So we declare the
- * opclasses for name as using cstring storage type.
+ * btree opclass for name as using cstring storage type.
 */
 DATA(insert (	403		name_ops			PGNSP PGUID 1986   19 t 2275 ));
-DATA(insert (	405		name_ops			PGNSP PGUID 1987   19 t 2275 ));
+DATA(insert (	405		name_ops			PGNSP PGUID 1987   19 t 0 ));
 DATA(insert (	403		numeric_ops			PGNSP PGUID 1988 1700 t 0 ));
 DATA(insert (	405		numeric_ops			PGNSP PGUID 1998 1700 t 0 ));
 DATA(insert OID = 1981 ( 403	oid_ops		PGNSP PGUID 1989   26 t 0 ));