From 4adc2f72a4ccd6e55e594aca837f09130a6af62b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 15 Sep 2008 18:43:41 +0000 Subject: [PATCH] Change hash indexes to store only the hash code rather than the whole indexed value. This means that hash index lookups are always lossy and have to be rechecked when the heap is visited; however, the gain in index compactness outweighs this when the indexed values are wide. Also, we only need to perform datatype comparisons when the hash codes match exactly, rather than for every entry in the hash bucket; so it could also win for datatypes that have expensive comparison functions. A small additional win is gained by keeping hash index pages sorted by hash code and using binary search to reduce the number of index tuples we have to look at. Xiao Meng This commit also incorporates Zdenek Kotala's patch to isolate hash metapages and hash bitmaps a bit better from the page header datastructures. --- doc/src/sgml/catalogs.sgml | 11 ++- src/backend/access/hash/hash.c | 25 ++--- src/backend/access/hash/hashinsert.c | 23 +++-- src/backend/access/hash/hashovfl.c | 6 +- src/backend/access/hash/hashpage.c | 32 ++----- src/backend/access/hash/hashsearch.c | 75 ++++++++++----- src/backend/access/hash/hashutil.c | 134 ++++++++++++++++++++++++++- src/backend/catalog/index.c | 30 +++++- src/backend/utils/sort/tuplesort.c | 27 ++---- src/include/access/hash.h | 25 ++++- src/include/catalog/catversion.h | 4 +- src/include/catalog/pg_am.h | 40 ++++---- src/include/catalog/pg_opclass.h | 10 +- 13 files changed, 313 insertions(+), 129 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 97a624f453..646e37d79f 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1,4 +1,4 @@ - + @@ -451,6 +451,13 @@ Can an index of this type be clustered on? 
+ + amkeytype + oid + pg_type.oid + Type of data stored in index, or zero if not a fixed type + + aminsert regproc @@ -6424,7 +6431,7 @@ sourceline text - Line number within the sourcefile the current value was set + Line number within the sourcefile the current value was set from (NULL for values set in sources other than configuration files) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 41607c54dc..af4c4c058f 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.104 2008/06/19 00:46:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.105 2008/09/15 18:43:41 tgl Exp $ * * NOTES * This file contains only the public interface routines. @@ -79,12 +79,12 @@ hashbuild(PG_FUNCTION_ARGS) * then we'll thrash horribly. To prevent that scenario, we can sort the * tuples by (expected) bucket number. However, such a sort is useless * overhead when the index does fit in RAM. We choose to sort if the - * initial index size exceeds effective_cache_size. + * initial index size exceeds NBuffers. * * NOTE: this test will need adjustment if a bucket is ever different * from one page. */ - if (num_buckets >= (uint32) effective_cache_size) + if (num_buckets >= (uint32) NBuffers) buildstate.spool = _h_spoolinit(index, num_buckets); else buildstate.spool = NULL; @@ -129,7 +129,7 @@ hashbuildCallback(Relation index, IndexTuple itup; /* form an index tuple and point it at the heap tuple */ - itup = index_form_tuple(RelationGetDescr(index), values, isnull); + itup = _hash_form_tuple(index, values, isnull); itup->t_tid = htup->t_self; /* Hash indexes don't index nulls, see notes in hashinsert */ @@ -153,8 +153,8 @@ hashbuildCallback(Relation index, /* * hashinsert() -- insert an index tuple into a hash table. 
* - * Hash on the index tuple's key, find the appropriate location - * for the new tuple, and put it there. + * Hash on the heap tuple's key, form an index tuple with hash code. + * Find the appropriate location for the new tuple, and put it there. */ Datum hashinsert(PG_FUNCTION_ARGS) @@ -171,7 +171,7 @@ hashinsert(PG_FUNCTION_ARGS) IndexTuple itup; /* generate an index tuple */ - itup = index_form_tuple(RelationGetDescr(rel), values, isnull); + itup = _hash_form_tuple(rel, values, isnull); itup->t_tid = *ht_ctid; /* @@ -211,8 +211,8 @@ hashgettuple(PG_FUNCTION_ARGS) OffsetNumber offnum; bool res; - /* Hash indexes are never lossy (at the moment anyway) */ - scan->xs_recheck = false; + /* Hash indexes are always lossy since we store only the hash code */ + scan->xs_recheck = true; /* * We hold pin but not lock on current buffer while outside the hash AM. @@ -317,7 +317,8 @@ hashgetbitmap(PG_FUNCTION_ARGS) /* Save tuple ID, and continue scanning */ if (add_tuple) { - tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, false); + /* Note we mark the tuple ID as requiring recheck */ + tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, true); ntids++; } @@ -527,7 +528,7 @@ hashbulkdelete(PG_FUNCTION_ARGS) * each bucket. 
*/ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = (HashMetaPage) BufferGetPage(metabuf); + metap = HashPageGetMeta(BufferGetPage(metabuf)); orig_maxbucket = metap->hashm_maxbucket; orig_ntuples = metap->hashm_ntuples; memcpy(&local_metapage, metap, sizeof(local_metapage)); @@ -629,7 +630,7 @@ loop_top: /* Write-lock metapage and check for split since we started */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); - metap = (HashMetaPage) BufferGetPage(metabuf); + metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 7f68318f1a..6195c8a2ac 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.50 2008/06/19 00:46:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.51 2008/09/15 18:43:41 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -43,18 +43,11 @@ _hash_doinsert(Relation rel, IndexTuple itup) bool do_expand; uint32 hashkey; Bucket bucket; - Datum datum; - bool isnull; /* - * Compute the hash key for the item. We do this first so as not to need - * to hold any locks while running the hash function. + * Get the hash key for the item (it's stored in the index tuple itself). 
*/ - if (rel->rd_rel->relnatts != 1) - elog(ERROR, "hash indexes support only one index key"); - datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull); - Assert(!isnull); - hashkey = _hash_datum2hashkey(rel, datum); + hashkey = _hash_get_indextuple_hashkey(itup); /* compute item size too */ itemsz = IndexTupleDSize(*itup); @@ -69,12 +62,14 @@ _hash_doinsert(Relation rel, IndexTuple itup) /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = (HashMetaPage) BufferGetPage(metabuf); + metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check whether the item can fit on a hash page at all. (Eventually, we * ought to try to apply TOAST methods if not.) Note that at this point, * itemsz doesn't include the ItemId. + * + * XXX this is useless code if we are only storing hash keys. */ if (itemsz > HashMaxItemSize((Page) metap)) ereport(ERROR, @@ -197,11 +192,15 @@ _hash_pgaddtup(Relation rel, { OffsetNumber itup_off; Page page; + uint32 hashkey; _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); - itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + /* Find where to insert the tuple (preserving page's hashkey ordering) */ + hashkey = _hash_get_indextuple_hashkey(itup); + itup_off = _hash_binsearch(page, hashkey); + if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 06958ec865..37315dbf37 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.64 2008/06/19 00:46:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.65 2008/09/15 18:43:41 tgl Exp $ * * NOTES * Overflow pages look like ordinary relation pages. 
@@ -187,7 +187,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf) _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); _hash_checkpage(rel, metabuf, LH_META_PAGE); - metap = (HashMetaPage) BufferGetPage(metabuf); + metap = HashPageGetMeta(BufferGetPage(metabuf)); /* start search at hashm_firstfree */ orig_firstfree = metap->hashm_firstfree; @@ -450,7 +450,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = (HashMetaPage) BufferGetPage(metabuf); + metap = HashPageGetMeta(BufferGetPage(metabuf)); /* Identify which bit to set */ ovflbitno = blkno_to_bitno(metap, ovflblkno); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 43ec69cab3..c5edf6dcfb 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.76 2008/08/11 11:05:10 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.77 2008/09/15 18:43:41 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -348,11 +348,9 @@ _hash_metapinit(Relation rel, double num_tuples) * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it - * exactly if the index datatype is fixed-width, but for var-width there's - * some guessing involved. + * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. 
*/ - data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid, - RelationGetDescr(rel)->attrs[0]->atttypmod); + data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; @@ -395,20 +393,18 @@ _hash_metapinit(Relation rel, double num_tuples) pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; - metap = (HashMetaPage) pg; + metap = HashPageGetMeta(pg); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; - metap->hashm_bsize = BufferGetPageSize(metabuf); + metap->hashm_bsize = HashGetMaxBitmapSize(pg); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { - if ((1 << i) <= (metap->hashm_bsize - - (MAXALIGN(sizeof(PageHeaderData)) + - MAXALIGN(sizeof(HashPageOpaqueData))))) + if ((1 << i) <= metap->hashm_bsize) break; } Assert(i > 0); @@ -532,7 +528,7 @@ _hash_expandtable(Relation rel, Buffer metabuf) _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); _hash_checkpage(rel, metabuf, LH_META_PAGE); - metap = (HashMetaPage) BufferGetPage(metabuf); + metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check to see if split is still needed; someone else might have already @@ -774,8 +770,6 @@ _hash_splitbucket(Relation rel, Buffer nbuf; BlockNumber oblkno; BlockNumber nblkno; - bool null; - Datum datum; HashPageOpaque oopaque; HashPageOpaque nopaque; IndexTuple itup; @@ -785,7 +779,6 @@ _hash_splitbucket(Relation rel, OffsetNumber omaxoffnum; Page opage; Page npage; - TupleDesc itupdesc = RelationGetDescr(rel); /* * It should be okay to simultaneously write-lock pages from each bucket, @@ -846,16 +839,11 @@ _hash_splitbucket(Relation rel, } /* - * Re-hash the tuple to determine which bucket 
it now belongs in. - * - * It is annoying to call the hash function while holding locks, but - * releasing and relocking the page for each tuple is unappealing too. + * Fetch the item's hash key (conveniently stored in the item) + * and determine which bucket it now belongs in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); - datum = index_getattr(itup, 1, itupdesc, &null); - Assert(!null); - - bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum), + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 1e05558523..8536839342 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.53 2008/06/19 00:46:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.54 2008/09/15 18:43:41 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -178,6 +178,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument, cur->sk_subtype); + so->hashso_sk_hash = hashkey; + /* * Acquire shared split lock so we can compute the target bucket safely * (see README). @@ -186,7 +188,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = (HashMetaPage) BufferGetPage(metabuf); + metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Compute the target bucket number, and convert to block number. @@ -284,7 +286,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) offnum = InvalidOffsetNumber; /* - * 'offnum' now points to the last tuple we have seen (if any). + * 'offnum' now points to the last tuple we examined (if any). 
* * continue to step through tuples until: 1) we get to the end of the * bucket chain or 2) we find a valid tuple. @@ -297,25 +299,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) if (offnum != InvalidOffsetNumber) offnum = OffsetNumberNext(offnum); /* move forward */ else - offnum = FirstOffsetNumber; /* new page */ + { + /* new page, locate starting position by binary search */ + offnum = _hash_binsearch(page, so->hashso_sk_hash); + } - while (offnum > maxoff) + for (;;) { /* - * either this page is empty (maxoff == - * InvalidOffsetNumber) or we ran off the end. + * check if we're still in the range of items with + * the target hash key + */ + if (offnum <= maxoff) + { + Assert(offnum >= FirstOffsetNumber); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) + break; /* yes, so exit for-loop */ + } + + /* + * ran off the end of this page, try the next */ _hash_readnext(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); - offnum = FirstOffsetNumber; + offnum = _hash_binsearch(page, so->hashso_sk_hash); } else { /* end of bucket */ - maxoff = offnum = InvalidOffsetNumber; - break; /* exit while */ + itup = NULL; + break; /* exit for-loop */ } } break; @@ -324,22 +340,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) if (offnum != InvalidOffsetNumber) offnum = OffsetNumberPrev(offnum); /* move back */ else - offnum = maxoff; /* new page */ + { + /* new page, locate starting position by binary search */ + offnum = _hash_binsearch_last(page, so->hashso_sk_hash); + } - while (offnum < FirstOffsetNumber) + for (;;) { /* - * either this page is empty (offnum == - * InvalidOffsetNumber) or we ran off the end. 
+ * check if we're still in the range of items with + * the target hash key + */ + if (offnum >= FirstOffsetNumber) + { + Assert(offnum <= maxoff); + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) + break; /* yes, so exit for-loop */ + } + + /* + * ran off the end of this page, try the next */ _hash_readprev(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) - maxoff = offnum = PageGetMaxOffsetNumber(page); + { + maxoff = PageGetMaxOffsetNumber(page); + offnum = _hash_binsearch_last(page, so->hashso_sk_hash); + } else { /* end of bucket */ - maxoff = offnum = InvalidOffsetNumber; - break; /* exit while */ + itup = NULL; + break; /* exit for-loop */ } } break; @@ -347,19 +380,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) default: /* NoMovementScanDirection */ /* this should not be reached */ + itup = NULL; break; } - /* we ran off the end of the world without finding a match */ - if (offnum == InvalidOffsetNumber) + if (itup == NULL) { + /* we ran off the end of the bucket without finding a match */ *bufP = so->hashso_curbuf = InvalidBuffer; ItemPointerSetInvalid(current); return false; } - /* get ready to check this tuple */ - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + /* check the tuple quals, loop around if not met */ } while (!_hash_checkqual(scan, itup)); /* if we made it to here, we've found a valid tuple */ diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 29cdf24529..7a1e3a8ad0 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.56 2008/07/13 20:45:47 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.57 2008/09/15 18:43:41 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,12 +28,21 
@@ bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup) { + /* + * Currently, we can't check any of the scan conditions since we do + * not have the original index entry value to supply to the sk_func. + * Always return true; we expect that hashgettuple already set the + * recheck flag to make the main indexscan code do it. + */ +#ifdef NOT_USED TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); ScanKey key = scan->keyData; int scanKeySize = scan->numberOfKeys; +#endif IncrIndexProcessed(); +#ifdef NOT_USED while (scanKeySize > 0) { Datum datum; @@ -59,6 +68,7 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup) key++; scanKeySize--; } +#endif return true; } @@ -190,7 +200,7 @@ _hash_checkpage(Relation rel, Buffer buf, int flags) */ if (flags == LH_META_PAGE) { - HashMetaPage metap = (HashMetaPage) page; + HashMetaPage metap = HashPageGetMeta(page); if (metap->hashm_magic != HASH_MAGIC) ereport(ERROR, @@ -221,3 +231,123 @@ hashoptions(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(result); PG_RETURN_NULL(); } + +/* + * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value + */ +uint32 +_hash_get_indextuple_hashkey(IndexTuple itup) +{ + char *attp; + + /* + * We assume the hash key is the first attribute and can't be null, + * so this can be done crudely but very very cheaply ... 
+ */ + attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info); + return *((uint32 *) attp); +} + +/* + * _hash_form_tuple - form an index tuple containing hash code only + */ +IndexTuple +_hash_form_tuple(Relation index, Datum *values, bool *isnull) +{ + IndexTuple itup; + uint32 hashkey; + Datum hashkeydatum; + TupleDesc hashdesc; + + if (isnull[0]) + hashkeydatum = (Datum) 0; + else + { + hashkey = _hash_datum2hashkey(index, values[0]); + hashkeydatum = UInt32GetDatum(hashkey); + } + hashdesc = RelationGetDescr(index); + Assert(hashdesc->natts == 1); + itup = index_form_tuple(hashdesc, &hashkeydatum, isnull); + return itup; +} + +/* + * _hash_binsearch - Return the offset number in the page where the + * specified hash value should be sought or inserted. + * + * We use binary search, relying on the assumption that the existing entries + * are ordered by hash key. + * + * Returns the offset of the first index entry having hashkey >= hash_value, + * or the page's max offset plus one if hash_value is greater than all + * existing hash keys in the page. This is the appropriate place to start + * a search, or to insert a new item. 
+ */ +OffsetNumber +_hash_binsearch(Page page, uint32 hash_value) +{ + OffsetNumber upper; + OffsetNumber lower; + + /* Loop invariant: lower <= desired place <= upper */ + upper = PageGetMaxOffsetNumber(page) + 1; + lower = FirstOffsetNumber; + + while (upper > lower) + { + OffsetNumber off; + IndexTuple itup; + uint32 hashkey; + + off = (upper + lower) / 2; + Assert(OffsetNumberIsValid(off)); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + hashkey = _hash_get_indextuple_hashkey(itup); + if (hashkey < hash_value) + lower = off + 1; + else + upper = off; + } + + return lower; +} + +/* + * _hash_binsearch_last + * + * Same as above, except that if there are multiple matching items in the + * page, we return the offset of the last one instead of the first one, + * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1. + * This is handy for starting a new page in a backwards scan. + */ +OffsetNumber +_hash_binsearch_last(Page page, uint32 hash_value) +{ + OffsetNumber upper; + OffsetNumber lower; + + /* Loop invariant: lower <= desired place <= upper */ + upper = PageGetMaxOffsetNumber(page); + lower = FirstOffsetNumber - 1; + + while (upper > lower) + { + IndexTuple itup; + OffsetNumber off; + uint32 hashkey; + + off = (upper + lower + 1) / 2; + Assert(OffsetNumberIsValid(off)); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + hashkey = _hash_get_indextuple_hashkey(itup); + if (hashkey > hash_value) + upper = off - 1; + else + lower = off; + } + + return lower; +} diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 1847f023e4..301e7d1f2d 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.303 2008/08/25 22:42:32 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.304 2008/09/15 18:43:41 tgl Exp $ * * * INTERFACE ROUTINES @@ -76,6 +76,7 @@ typedef struct /* 
non-export function prototypes */ static TupleDesc ConstructTupleDescriptor(Relation heapRelation, IndexInfo *indexInfo, + Oid accessMethodObjectId, Oid *classObjectId); static void InitializeAttributeOids(Relation indexRelation, int numatts, Oid indexoid); @@ -105,15 +106,28 @@ static Oid IndexGetRelation(Oid indexId); static TupleDesc ConstructTupleDescriptor(Relation heapRelation, IndexInfo *indexInfo, + Oid accessMethodObjectId, Oid *classObjectId) { int numatts = indexInfo->ii_NumIndexAttrs; ListCell *indexpr_item = list_head(indexInfo->ii_Expressions); + HeapTuple amtuple; + Form_pg_am amform; TupleDesc heapTupDesc; TupleDesc indexTupDesc; int natts; /* #atts in heap rel --- for error checks */ int i; + /* We need access to the index AM's pg_am tuple */ + amtuple = SearchSysCache(AMOID, + ObjectIdGetDatum(accessMethodObjectId), + 0, 0, 0); + if (!HeapTupleIsValid(amtuple)) + elog(ERROR, "cache lookup failed for access method %u", + accessMethodObjectId); + amform = (Form_pg_am) GETSTRUCT(amtuple); + + /* ... and to the table's tuple descriptor */ heapTupDesc = RelationGetDescr(heapRelation); natts = RelationGetForm(heapRelation)->relnatts; @@ -133,6 +147,7 @@ ConstructTupleDescriptor(Relation heapRelation, Form_pg_attribute to = indexTupDesc->attrs[i]; HeapTuple tuple; Form_pg_type typeTup; + Form_pg_opclass opclassTup; Oid keyType; if (atnum != 0) @@ -231,8 +246,8 @@ ConstructTupleDescriptor(Relation heapRelation, to->attrelid = InvalidOid; /* - * Check the opclass to see if it provides a keytype (overriding the - * attribute type). + * Check the opclass and index AM to see if either provides a keytype + * (overriding the attribute type). Opclass takes precedence. 
*/ tuple = SearchSysCache(CLAOID, ObjectIdGetDatum(classObjectId[i]), @@ -240,7 +255,11 @@ ConstructTupleDescriptor(Relation heapRelation, if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for opclass %u", classObjectId[i]); - keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype; + opclassTup = (Form_pg_opclass) GETSTRUCT(tuple); + if (OidIsValid(opclassTup->opckeytype)) + keyType = opclassTup->opckeytype; + else + keyType = amform->amkeytype; ReleaseSysCache(tuple); if (OidIsValid(keyType) && keyType != to->atttypid) @@ -264,6 +283,8 @@ ConstructTupleDescriptor(Relation heapRelation, } } + ReleaseSysCache(amtuple); + return indexTupDesc; } @@ -577,6 +598,7 @@ index_create(Oid heapRelationId, */ indexTupDesc = ConstructTupleDescriptor(heapRelation, indexInfo, + accessMethodObjectId, classObjectId); /* diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 775840da18..29a076e138 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -91,7 +91,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.86 2008/08/01 13:16:09 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.87 2008/09/15 18:43:41 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -101,7 +101,6 @@ #include <limits.h> #include "access/genam.h" -#include "access/hash.h" #include "access/nbtree.h" #include "catalog/pg_amop.h" #include "catalog/pg_operator.h" @@ -353,7 +352,6 @@ struct Tuplesortstate bool enforceUnique; /* complain if we find duplicate tuples */ /* These are specific to the index_hash subcase: */ - FmgrInfo *hash_proc; /* call info for the hash function */ uint32 hash_mask; /* mask for sortable part of hash code */ /* @@ -689,13 +687,6 @@ tuplesort_begin_index_hash(Relation indexRel, state->indexRel = indexRel; - /* - * We look up the index 
column's hash function just once, to avoid - * chewing lots of cycles in repeated index_getprocinfo calls. This - * assumes that our caller holds the index relation open throughout the - * sort, else the pointer obtained here might cease to be valid. - */ - state->hash_proc = index_getprocinfo(indexRel, 1, HASHPROC); state->hash_mask = hash_mask; MemoryContextSwitchTo(oldcontext); @@ -2821,11 +2812,6 @@ static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) { - /* - * It's slightly annoying to redo the hash function each time, although - * most hash functions ought to be cheap. Is it worth having a variant - * tuple storage format so we can store the hash code? - */ uint32 hash1; uint32 hash2; IndexTuple tuple1; @@ -2834,13 +2820,14 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b, /* Allow interrupting long sorts */ CHECK_FOR_INTERRUPTS(); - /* Compute hash codes and mask off bits we don't want to sort by */ + /* + * Fetch hash keys and mask off bits we don't want to sort by. + * We know that the first column of the index tuple is the hash key. 
+ */ Assert(!a->isnull1); - hash1 = DatumGetUInt32(FunctionCall1(state->hash_proc, a->datum1)) - & state->hash_mask; + hash1 = DatumGetUInt32(a->datum1) & state->hash_mask; Assert(!b->isnull1); - hash2 = DatumGetUInt32(FunctionCall1(state->hash_proc, b->datum1)) - & state->hash_mask; + hash2 = DatumGetUInt32(b->datum1) & state->hash_mask; if (hash1 > hash2) return 1; diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 0dab2b6ae9..e00176d451 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.89 2008/07/13 20:45:47 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.90 2008/09/15 18:43:41 tgl Exp $ * * NOTES * modeled after Margo Seltzer's hash implementation for unix. @@ -75,6 +75,9 @@ typedef HashPageOpaqueData *HashPageOpaque; */ typedef struct HashScanOpaqueData { + /* Hash value of the scan key, ie, the hash key we seek */ + uint32 hashso_sk_hash; + /* * By definition, a hash scan should be examining only one bucket. We * record the bucket number here as soon as it is known. @@ -111,7 +114,7 @@ typedef HashScanOpaqueData *HashScanOpaque; #define HASH_METAPAGE 0 /* metapage is always block 0 */ #define HASH_MAGIC 0x6440640 -#define HASH_VERSION 1 /* new for Pg 7.4 */ +#define HASH_VERSION 2 /* 2 signifies only hash key value is stored */ /* * Spares[] holds the number of overflow pages currently allocated at or @@ -138,7 +141,6 @@ typedef HashScanOpaqueData *HashScanOpaque; typedef struct HashMetaPageData { - PageHeaderData hashm_phdr; /* pad for page header (do not use) */ uint32 hashm_magic; /* magic no. 
for hash tables */ uint32 hashm_version; /* version ID */ double hashm_ntuples; /* number of tuples stored in the table */ @@ -191,8 +193,16 @@ typedef HashMetaPageData *HashMetaPage; #define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT) #define BMPG_SHIFT(metap) ((metap)->hashm_bmshift) #define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1) -#define HashPageGetBitmap(pg) \ - ((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData)))) + +#define HashPageGetBitmap(page) \ + ((uint32 *) PageGetContents(page)) + +#define HashGetMaxBitmapSize(page) \ + (PageGetPageSize((Page) page) - \ + (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData)))) + +#define HashPageGetMeta(page) \ + ((HashMetaPage) PageGetContents(page)) /* * The number of bits in an ovflpage bitmap word. @@ -330,6 +340,11 @@ extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask); extern uint32 _hash_log2(uint32 num); extern void _hash_checkpage(Relation rel, Buffer buf, int flags); +extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup); +extern IndexTuple _hash_form_tuple(Relation index, + Datum *values, bool *isnull); +extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); +extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); /* hash.c */ extern void hash_redo(XLogRecPtr lsn, XLogRecord *record); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 6e4b4d40f9..bd08779e71 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.485 2008/09/10 18:09:20 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.486 2008/09/15 18:43:41 tgl Exp $ * 
*------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200809101 +#define CATALOG_VERSION_NO 200809151 #endif diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index 712a409633..a7a638e083 100644 --- a/src/include/catalog/pg_am.h +++ b/src/include/catalog/pg_am.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.57 2008/07/11 21:06:29 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.58 2008/09/15 18:43:41 tgl Exp $ * * NOTES * the genbki.sh script reads this file and generates .bki @@ -48,6 +48,7 @@ CATALOG(pg_am,2601) bool amsearchnulls; /* can AM search for NULL index entries? */ bool amstorage; /* can storage type differ from column type? */ bool amclusterable; /* does AM support cluster command? */ + Oid amkeytype; /* type of data in index, or InvalidOid */ regproc aminsert; /* "insert this tuple" function */ regproc ambeginscan; /* "start new scan" function */ regproc amgettuple; /* "next valid tuple" function */ @@ -74,7 +75,7 @@ typedef FormData_pg_am *Form_pg_am; * compiler constants for pg_am * ---------------- */ -#define Natts_pg_am 24 +#define Natts_pg_am 25 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 @@ -86,35 +87,36 @@ typedef FormData_pg_am *Form_pg_am; #define Anum_pg_am_amsearchnulls 9 #define Anum_pg_am_amstorage 10 #define Anum_pg_am_amclusterable 11 -#define Anum_pg_am_aminsert 12 -#define Anum_pg_am_ambeginscan 13 -#define Anum_pg_am_amgettuple 14 -#define Anum_pg_am_amgetbitmap 15 -#define Anum_pg_am_amrescan 16 -#define Anum_pg_am_amendscan 17 -#define Anum_pg_am_ammarkpos 18 -#define Anum_pg_am_amrestrpos 19 -#define Anum_pg_am_ambuild 20 -#define Anum_pg_am_ambulkdelete 21 -#define 
Anum_pg_am_amvacuumcleanup 22 -#define Anum_pg_am_amcostestimate 23 -#define Anum_pg_am_amoptions 24 +#define Anum_pg_am_amkeytype 12 +#define Anum_pg_am_aminsert 13 +#define Anum_pg_am_ambeginscan 14 +#define Anum_pg_am_amgettuple 15 +#define Anum_pg_am_amgetbitmap 16 +#define Anum_pg_am_amrescan 17 +#define Anum_pg_am_amendscan 18 +#define Anum_pg_am_ammarkpos 19 +#define Anum_pg_am_amrestrpos 20 +#define Anum_pg_am_ambuild 21 +#define Anum_pg_am_ambulkdelete 22 +#define Anum_pg_am_amvacuumcleanup 23 +#define Anum_pg_am_amcostestimate 24 +#define Anum_pg_am_amoptions 25 /* ---------------- * initial contents of pg_am * ---------------- */ -DATA(insert OID = 403 ( btree 5 1 t t t t t t f t btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions )); +DATA(insert OID = 403 ( btree 5 1 t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 -DATA(insert OID = 405 ( hash 1 1 f f f f f f f f hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); +DATA(insert OID = 405 ( hash 1 1 f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); DESCR("hash index access method"); #define HASH_AM_OID 405 -DATA(insert OID = 783 ( gist 0 7 f f t t t t t t gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); +DATA(insert OID = 783 ( gist 0 7 f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos 
gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 -DATA(insert OID = 2742 ( gin 0 5 f f t t f f t f gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); +DATA(insert OID = 2742 ( gin 0 5 f f t t f f t f 0 gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 diff --git a/src/include/catalog/pg_opclass.h b/src/include/catalog/pg_opclass.h index f0cb23e270..7c4d95003c 100644 --- a/src/include/catalog/pg_opclass.h +++ b/src/include/catalog/pg_opclass.h @@ -28,7 +28,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_opclass.h,v 1.82 2008/06/24 17:58:27 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_opclass.h,v 1.83 2008/09/15 18:43:41 tgl Exp $ * * NOTES * the genbki.sh script reads this file and generates .bki @@ -123,13 +123,13 @@ DATA(insert ( 403 macaddr_ops PGNSP PGUID 1984 829 t 0 )); DATA(insert ( 405 macaddr_ops PGNSP PGUID 1985 829 t 0 )); /* * Here's an ugly little hack to save space in the system catalog indexes. - * btree and hash don't ordinarily allow a storage type different from input - * type; but cstring and name are the same thing except for trailing padding, + * btree doesn't ordinarily allow a storage type different from input type; + * but cstring and name are the same thing except for trailing padding, * and we can safely omit that within an index entry. So we declare the - * opclasses for name as using cstring storage type. + * btree opclass for name as using cstring storage type. 
*/ DATA(insert ( 403 name_ops PGNSP PGUID 1986 19 t 2275 )); -DATA(insert ( 405 name_ops PGNSP PGUID 1987 19 t 2275 )); +DATA(insert ( 405 name_ops PGNSP PGUID 1987 19 t 0 )); DATA(insert ( 403 numeric_ops PGNSP PGUID 1988 1700 t 0 )); DATA(insert ( 405 numeric_ops PGNSP PGUID 1998 1700 t 0 )); DATA(insert OID = 1981 ( 403 oid_ops PGNSP PGUID 1989 26 t 0 ));