Change hash indexes to store only the hash code rather than the whole indexed
value.  This means that hash index lookups are always lossy and have to be
rechecked when the heap is visited; however, the gain in index compactness
outweighs this when the indexed values are wide.  Also, we only need to
perform datatype comparisons when the hash codes match exactly, rather than
for every entry in the hash bucket; so it could also win for datatypes that
have expensive comparison functions.  A small additional win is gained by
keeping hash index pages sorted by hash code and using binary search to reduce
the number of index tuples we have to look at.

Xiao Meng

This commit also incorporates Zdenek Kotala's patch to isolate hash metapages
and hash bitmaps a bit better from the page header data structures.
This commit is contained in:
Tom Lane 2008-09-15 18:43:41 +00:00
parent 440b3384b0
commit 4adc2f72a4
13 changed files with 313 additions and 129 deletions

View File

@ -1,4 +1,4 @@
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.173 2008/09/10 18:09:19 alvherre Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.174 2008/09/15 18:43:41 tgl Exp $ -->
<!--
Documentation of the system catalogs, directed toward PostgreSQL developers
-->
@ -451,6 +451,13 @@
<entry>Can an index of this type be clustered on?</entry>
</row>
<row>
<entry><structfield>amkeytype</structfield></entry>
<entry><type>oid</type></entry>
<entry><literal><link linkend="catalog-pg-type"><structname>pg_type</structname></link>.oid</literal></entry>
<entry>Type of data stored in index, or zero if not a fixed type</entry>
</row>
<row>
<entry><structfield>aminsert</structfield></entry>
<entry><type>regproc</type></entry>
@ -6424,7 +6431,7 @@
<row>
<entry><structfield>sourceline</structfield></entry>
<entry><type>text</type></entry>
<entry>Line number within the sourcefile the current value was set
<entry>Line number within the sourcefile the current value was set
from (NULL for values set in sources other than configuration files)
</entry>
</row>

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.104 2008/06/19 00:46:03 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.105 2008/09/15 18:43:41 tgl Exp $
*
* NOTES
* This file contains only the public interface routines.
@ -79,12 +79,12 @@ hashbuild(PG_FUNCTION_ARGS)
* then we'll thrash horribly. To prevent that scenario, we can sort the
* tuples by (expected) bucket number. However, such a sort is useless
* overhead when the index does fit in RAM. We choose to sort if the
* initial index size exceeds effective_cache_size.
* initial index size exceeds NBuffers.
*
* NOTE: this test will need adjustment if a bucket is ever different
* from one page.
*/
if (num_buckets >= (uint32) effective_cache_size)
if (num_buckets >= (uint32) NBuffers)
buildstate.spool = _h_spoolinit(index, num_buckets);
else
buildstate.spool = NULL;
@ -129,7 +129,7 @@ hashbuildCallback(Relation index,
IndexTuple itup;
/* form an index tuple and point it at the heap tuple */
itup = index_form_tuple(RelationGetDescr(index), values, isnull);
itup = _hash_form_tuple(index, values, isnull);
itup->t_tid = htup->t_self;
/* Hash indexes don't index nulls, see notes in hashinsert */
@ -153,8 +153,8 @@ hashbuildCallback(Relation index,
/*
* hashinsert() -- insert an index tuple into a hash table.
*
* Hash on the index tuple's key, find the appropriate location
* for the new tuple, and put it there.
* Hash on the heap tuple's key, form an index tuple with hash code.
* Find the appropriate location for the new tuple, and put it there.
*/
Datum
hashinsert(PG_FUNCTION_ARGS)
@ -171,7 +171,7 @@ hashinsert(PG_FUNCTION_ARGS)
IndexTuple itup;
/* generate an index tuple */
itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
itup = _hash_form_tuple(rel, values, isnull);
itup->t_tid = *ht_ctid;
/*
@ -211,8 +211,8 @@ hashgettuple(PG_FUNCTION_ARGS)
OffsetNumber offnum;
bool res;
/* Hash indexes are never lossy (at the moment anyway) */
scan->xs_recheck = false;
/* Hash indexes are always lossy since we store only the hash code */
scan->xs_recheck = true;
/*
* We hold pin but not lock on current buffer while outside the hash AM.
@ -317,7 +317,8 @@ hashgetbitmap(PG_FUNCTION_ARGS)
/* Save tuple ID, and continue scanning */
if (add_tuple)
{
tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, false);
/* Note we mark the tuple ID as requiring recheck */
tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, true);
ntids++;
}
@ -527,7 +528,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
* each bucket.
*/
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
metap = HashPageGetMeta(BufferGetPage(metabuf));
orig_maxbucket = metap->hashm_maxbucket;
orig_ntuples = metap->hashm_ntuples;
memcpy(&local_metapage, metap, sizeof(local_metapage));
@ -629,7 +630,7 @@ loop_top:
/* Write-lock metapage and check for split since we started */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
metap = HashPageGetMeta(BufferGetPage(metabuf));
if (cur_maxbucket != metap->hashm_maxbucket)
{

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.50 2008/06/19 00:46:03 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.51 2008/09/15 18:43:41 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -43,18 +43,11 @@ _hash_doinsert(Relation rel, IndexTuple itup)
bool do_expand;
uint32 hashkey;
Bucket bucket;
Datum datum;
bool isnull;
/*
* Compute the hash key for the item. We do this first so as not to need
* to hold any locks while running the hash function.
* Get the hash key for the item (it's stored in the index tuple itself).
*/
if (rel->rd_rel->relnatts != 1)
elog(ERROR, "hash indexes support only one index key");
datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
Assert(!isnull);
hashkey = _hash_datum2hashkey(rel, datum);
hashkey = _hash_get_indextuple_hashkey(itup);
/* compute item size too */
itemsz = IndexTupleDSize(*itup);
@ -69,12 +62,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
metap = HashPageGetMeta(BufferGetPage(metabuf));
/*
* Check whether the item can fit on a hash page at all. (Eventually, we
* ought to try to apply TOAST methods if not.) Note that at this point,
* itemsz doesn't include the ItemId.
*
* XXX this is useless code if we are only storing hash keys.
*/
if (itemsz > HashMaxItemSize((Page) metap))
ereport(ERROR,
@ -197,11 +192,15 @@ _hash_pgaddtup(Relation rel,
{
OffsetNumber itup_off;
Page page;
uint32 hashkey;
_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
page = BufferGetPage(buf);
itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
/* Find where to insert the tuple (preserving page's hashkey ordering) */
hashkey = _hash_get_indextuple_hashkey(itup);
itup_off = _hash_binsearch(page, hashkey);
if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
== InvalidOffsetNumber)
elog(ERROR, "failed to add index item to \"%s\"",

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.64 2008/06/19 00:46:03 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.65 2008/09/15 18:43:41 tgl Exp $
*
* NOTES
* Overflow pages look like ordinary relation pages.
@ -187,7 +187,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
_hash_checkpage(rel, metabuf, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
metap = HashPageGetMeta(BufferGetPage(metabuf));
/* start search at hashm_firstfree */
orig_firstfree = metap->hashm_firstfree;
@ -450,7 +450,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
/* Read the metapage so we can determine which bitmap page to use */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
metap = HashPageGetMeta(BufferGetPage(metabuf));
/* Identify which bit to set */
ovflbitno = blkno_to_bitno(metap, ovflblkno);

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.76 2008/08/11 11:05:10 heikki Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.77 2008/09/15 18:43:41 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
@ -348,11 +348,9 @@ _hash_metapinit(Relation rel, double num_tuples)
* Determine the target fill factor (in tuples per bucket) for this index.
* The idea is to make the fill factor correspond to pages about as full
* as the user-settable fillfactor parameter says. We can compute it
* exactly if the index datatype is fixed-width, but for var-width there's
* some guessing involved.
* exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
*/
data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
RelationGetDescr(rel)->attrs[0]->atttypmod);
data_width = sizeof(uint32);
item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
sizeof(ItemIdData); /* include the line pointer */
ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
@ -395,20 +393,18 @@ _hash_metapinit(Relation rel, double num_tuples)
pageopaque->hasho_flag = LH_META_PAGE;
pageopaque->hasho_page_id = HASHO_PAGE_ID;
metap = (HashMetaPage) pg;
metap = HashPageGetMeta(pg);
metap->hashm_magic = HASH_MAGIC;
metap->hashm_version = HASH_VERSION;
metap->hashm_ntuples = 0;
metap->hashm_nmaps = 0;
metap->hashm_ffactor = ffactor;
metap->hashm_bsize = BufferGetPageSize(metabuf);
metap->hashm_bsize = HashGetMaxBitmapSize(pg);
/* find largest bitmap array size that will fit in page size */
for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
{
if ((1 << i) <= (metap->hashm_bsize -
(MAXALIGN(sizeof(PageHeaderData)) +
MAXALIGN(sizeof(HashPageOpaqueData)))))
if ((1 << i) <= metap->hashm_bsize)
break;
}
Assert(i > 0);
@ -532,7 +528,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
_hash_checkpage(rel, metabuf, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
metap = HashPageGetMeta(BufferGetPage(metabuf));
/*
* Check to see if split is still needed; someone else might have already
@ -774,8 +770,6 @@ _hash_splitbucket(Relation rel,
Buffer nbuf;
BlockNumber oblkno;
BlockNumber nblkno;
bool null;
Datum datum;
HashPageOpaque oopaque;
HashPageOpaque nopaque;
IndexTuple itup;
@ -785,7 +779,6 @@ _hash_splitbucket(Relation rel,
OffsetNumber omaxoffnum;
Page opage;
Page npage;
TupleDesc itupdesc = RelationGetDescr(rel);
/*
* It should be okay to simultaneously write-lock pages from each bucket,
@ -846,16 +839,11 @@ _hash_splitbucket(Relation rel,
}
/*
* Re-hash the tuple to determine which bucket it now belongs in.
*
* It is annoying to call the hash function while holding locks, but
* releasing and relocking the page for each tuple is unappealing too.
* Fetch the item's hash key (conveniently stored in the item)
* and determine which bucket it now belongs in.
*/
itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
datum = index_getattr(itup, 1, itupdesc, &null);
Assert(!null);
bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
maxbucket, highmask, lowmask);
if (bucket == nbucket)

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.53 2008/06/19 00:46:03 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.54 2008/09/15 18:43:41 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -178,6 +178,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
cur->sk_subtype);
so->hashso_sk_hash = hashkey;
/*
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
@ -186,7 +188,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
metap = HashPageGetMeta(BufferGetPage(metabuf));
/*
* Compute the target bucket number, and convert to block number.
@ -284,7 +286,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
offnum = InvalidOffsetNumber;
/*
* 'offnum' now points to the last tuple we have seen (if any).
* 'offnum' now points to the last tuple we examined (if any).
*
* continue to step through tuples until: 1) we get to the end of the
* bucket chain or 2) we find a valid tuple.
@ -297,25 +299,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
if (offnum != InvalidOffsetNumber)
offnum = OffsetNumberNext(offnum); /* move forward */
else
offnum = FirstOffsetNumber; /* new page */
{
/* new page, locate starting position by binary search */
offnum = _hash_binsearch(page, so->hashso_sk_hash);
}
while (offnum > maxoff)
for (;;)
{
/*
* either this page is empty (maxoff ==
* InvalidOffsetNumber) or we ran off the end.
* check if we're still in the range of items with
* the target hash key
*/
if (offnum <= maxoff)
{
Assert(offnum >= FirstOffsetNumber);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
break; /* yes, so exit for-loop */
}
/*
* ran off the end of this page, try the next
*/
_hash_readnext(rel, &buf, &page, &opaque);
if (BufferIsValid(buf))
{
maxoff = PageGetMaxOffsetNumber(page);
offnum = FirstOffsetNumber;
offnum = _hash_binsearch(page, so->hashso_sk_hash);
}
else
{
/* end of bucket */
maxoff = offnum = InvalidOffsetNumber;
break; /* exit while */
itup = NULL;
break; /* exit for-loop */
}
}
break;
@ -324,22 +340,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
if (offnum != InvalidOffsetNumber)
offnum = OffsetNumberPrev(offnum); /* move back */
else
offnum = maxoff; /* new page */
{
/* new page, locate starting position by binary search */
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
}
while (offnum < FirstOffsetNumber)
for (;;)
{
/*
* either this page is empty (offnum ==
* InvalidOffsetNumber) or we ran off the end.
* check if we're still in the range of items with
* the target hash key
*/
if (offnum >= FirstOffsetNumber)
{
Assert(offnum <= maxoff);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
break; /* yes, so exit for-loop */
}
/*
* ran off the end of this page, try the next
*/
_hash_readprev(rel, &buf, &page, &opaque);
if (BufferIsValid(buf))
maxoff = offnum = PageGetMaxOffsetNumber(page);
{
maxoff = PageGetMaxOffsetNumber(page);
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
}
else
{
/* end of bucket */
maxoff = offnum = InvalidOffsetNumber;
break; /* exit while */
itup = NULL;
break; /* exit for-loop */
}
}
break;
@ -347,19 +380,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
default:
/* NoMovementScanDirection */
/* this should not be reached */
itup = NULL;
break;
}
/* we ran off the end of the world without finding a match */
if (offnum == InvalidOffsetNumber)
if (itup == NULL)
{
/* we ran off the end of the bucket without finding a match */
*bufP = so->hashso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(current);
return false;
}
/* get ready to check this tuple */
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
/* check the tuple quals, loop around if not met */
} while (!_hash_checkqual(scan, itup));
/* if we made it to here, we've found a valid tuple */

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.56 2008/07/13 20:45:47 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.57 2008/09/15 18:43:41 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -28,12 +28,21 @@
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
/*
* Currently, we can't check any of the scan conditions since we do
* not have the original index entry value to supply to the sk_func.
* Always return true; we expect that hashgettuple already set the
* recheck flag to make the main indexscan code do it.
*/
#ifdef NOT_USED
TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
ScanKey key = scan->keyData;
int scanKeySize = scan->numberOfKeys;
#endif
IncrIndexProcessed();
#ifdef NOT_USED
while (scanKeySize > 0)
{
Datum datum;
@ -59,6 +68,7 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
key++;
scanKeySize--;
}
#endif
return true;
}
@ -190,7 +200,7 @@ _hash_checkpage(Relation rel, Buffer buf, int flags)
*/
if (flags == LH_META_PAGE)
{
HashMetaPage metap = (HashMetaPage) page;
HashMetaPage metap = HashPageGetMeta(page);
if (metap->hashm_magic != HASH_MAGIC)
ereport(ERROR,
@ -221,3 +231,123 @@ hashoptions(PG_FUNCTION_ARGS)
PG_RETURN_BYTEA_P(result);
PG_RETURN_NULL();
}
/*
 * _hash_get_indextuple_hashkey - fetch the hash code stored in a hash
 *		index tuple
 *
 * Locates the tuple's data area with IndexInfoFindDataOffset() and returns
 * the uint32 found at its start.
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
	/*
	 * The hash key is assumed to be the first attribute and never null, so
	 * the generic attribute-fetch machinery is bypassed and the value is
	 * read directly from the start of the data area.  Crude, but cheap.
	 */
	uint32	   *keyp;

	keyp = (uint32 *) ((char *) itup + IndexInfoFindDataOffset(itup->t_info));

	return *keyp;
}
/*
 * _hash_form_tuple - build an index tuple that holds only the hash code
 *
 * index:  the hash index relation; supplies the tuple descriptor and is
 *		   passed to _hash_datum2hashkey()
 * values: indexed column values; only values[0] is examined
 * isnull: null flags; only isnull[0] is examined here, and the array is
 *		   handed on unchanged to index_form_tuple()
 *
 * When the key is null no hash code is computed; a zero Datum is passed
 * along together with the caller's null flag.
 */
IndexTuple
_hash_form_tuple(Relation index, Datum *values, bool *isnull)
{
	Datum		hashkeydatum = (Datum) 0;
	TupleDesc	hashdesc;

	/* Replace the indexed value by its hash code, unless it is null */
	if (!isnull[0])
		hashkeydatum = UInt32GetDatum(_hash_datum2hashkey(index, values[0]));

	hashdesc = RelationGetDescr(index);
	Assert(hashdesc->natts == 1);

	return index_form_tuple(hashdesc, &hashkeydatum, isnull);
}
/*
 * _hash_binsearch - find the offset in the page at which the given hash
 *		value should be looked for, or inserted.
 *
 * This is a binary search, so it depends on the page's existing entries
 * being ordered by hash key.
 *
 * The result is the offset of the first index entry whose hash key is
 * >= hash_value.  If hash_value exceeds every hash key on the page, the
 * result is the page's max offset plus one.  Either way, that is the right
 * spot to begin a search or to place a new item.
 */
OffsetNumber
_hash_binsearch(Page page, uint32 hash_value)
{
	OffsetNumber lo = FirstOffsetNumber;
	OffsetNumber hi = PageGetMaxOffsetNumber(page) + 1;

	/* Loop invariant: lo <= desired place <= hi */
	while (lo < hi)
	{
		OffsetNumber mid = (hi + lo) / 2;
		IndexTuple	midtup;

		Assert(OffsetNumberIsValid(mid));

		midtup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));

		if (_hash_get_indextuple_hashkey(midtup) >= hash_value)
			hi = mid;			/* answer is at mid or to its left */
		else
			lo = mid + 1;		/* answer is strictly right of mid */
	}

	return lo;
}
/*
 * _hash_binsearch_last
 *
 * Counterpart of _hash_binsearch: when several items on the page match
 * hash_value, the offset of the last of them is returned rather than the
 * first.  More generally the result is the offset of the last entry whose
 * hash key is <= hash_value, so outputs range over 0..maxoffset instead of
 * 1..maxoffset+1.  Useful when entering a new page during a backwards scan.
 */
OffsetNumber
_hash_binsearch_last(Page page, uint32 hash_value)
{
	OffsetNumber lo = FirstOffsetNumber - 1;
	OffsetNumber hi = PageGetMaxOffsetNumber(page);

	/* Loop invariant: lo <= desired place <= hi */
	while (lo < hi)
	{
		/* round the midpoint upward so that "lo = mid" always makes progress */
		OffsetNumber mid = (hi + lo + 1) / 2;
		IndexTuple	midtup;

		Assert(OffsetNumberIsValid(mid));

		midtup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));

		if (_hash_get_indextuple_hashkey(midtup) <= hash_value)
			lo = mid;			/* answer is at mid or to its right */
		else
			hi = mid - 1;		/* answer is strictly left of mid */
	}

	return lo;
}

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.303 2008/08/25 22:42:32 tgl Exp $
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.304 2008/09/15 18:43:41 tgl Exp $
*
*
* INTERFACE ROUTINES
@ -76,6 +76,7 @@ typedef struct
/* non-export function prototypes */
static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
IndexInfo *indexInfo,
Oid accessMethodObjectId,
Oid *classObjectId);
static void InitializeAttributeOids(Relation indexRelation,
int numatts, Oid indexoid);
@ -105,15 +106,28 @@ static Oid IndexGetRelation(Oid indexId);
static TupleDesc
ConstructTupleDescriptor(Relation heapRelation,
IndexInfo *indexInfo,
Oid accessMethodObjectId,
Oid *classObjectId)
{
int numatts = indexInfo->ii_NumIndexAttrs;
ListCell *indexpr_item = list_head(indexInfo->ii_Expressions);
HeapTuple amtuple;
Form_pg_am amform;
TupleDesc heapTupDesc;
TupleDesc indexTupDesc;
int natts; /* #atts in heap rel --- for error checks */
int i;
/* We need access to the index AM's pg_am tuple */
amtuple = SearchSysCache(AMOID,
ObjectIdGetDatum(accessMethodObjectId),
0, 0, 0);
if (!HeapTupleIsValid(amtuple))
elog(ERROR, "cache lookup failed for access method %u",
accessMethodObjectId);
amform = (Form_pg_am) GETSTRUCT(amtuple);
/* ... and to the table's tuple descriptor */
heapTupDesc = RelationGetDescr(heapRelation);
natts = RelationGetForm(heapRelation)->relnatts;
@ -133,6 +147,7 @@ ConstructTupleDescriptor(Relation heapRelation,
Form_pg_attribute to = indexTupDesc->attrs[i];
HeapTuple tuple;
Form_pg_type typeTup;
Form_pg_opclass opclassTup;
Oid keyType;
if (atnum != 0)
@ -231,8 +246,8 @@ ConstructTupleDescriptor(Relation heapRelation,
to->attrelid = InvalidOid;
/*
* Check the opclass to see if it provides a keytype (overriding the
* attribute type).
* Check the opclass and index AM to see if either provides a keytype
* (overriding the attribute type). Opclass takes precedence.
*/
tuple = SearchSysCache(CLAOID,
ObjectIdGetDatum(classObjectId[i]),
@ -240,7 +255,11 @@ ConstructTupleDescriptor(Relation heapRelation,
if (!HeapTupleIsValid(tuple))
elog(ERROR, "cache lookup failed for opclass %u",
classObjectId[i]);
keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype;
opclassTup = (Form_pg_opclass) GETSTRUCT(tuple);
if (OidIsValid(opclassTup->opckeytype))
keyType = opclassTup->opckeytype;
else
keyType = amform->amkeytype;
ReleaseSysCache(tuple);
if (OidIsValid(keyType) && keyType != to->atttypid)
@ -264,6 +283,8 @@ ConstructTupleDescriptor(Relation heapRelation,
}
}
ReleaseSysCache(amtuple);
return indexTupDesc;
}
@ -577,6 +598,7 @@ index_create(Oid heapRelationId,
*/
indexTupDesc = ConstructTupleDescriptor(heapRelation,
indexInfo,
accessMethodObjectId,
classObjectId);
/*

View File

@ -91,7 +91,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.86 2008/08/01 13:16:09 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.87 2008/09/15 18:43:41 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -101,7 +101,6 @@
#include <limits.h>
#include "access/genam.h"
#include "access/hash.h"
#include "access/nbtree.h"
#include "catalog/pg_amop.h"
#include "catalog/pg_operator.h"
@ -353,7 +352,6 @@ struct Tuplesortstate
bool enforceUnique; /* complain if we find duplicate tuples */
/* These are specific to the index_hash subcase: */
FmgrInfo *hash_proc; /* call info for the hash function */
uint32 hash_mask; /* mask for sortable part of hash code */
/*
@ -689,13 +687,6 @@ tuplesort_begin_index_hash(Relation indexRel,
state->indexRel = indexRel;
/*
* We look up the index column's hash function just once, to avoid
* chewing lots of cycles in repeated index_getprocinfo calls. This
* assumes that our caller holds the index relation open throughout the
* sort, else the pointer obtained here might cease to be valid.
*/
state->hash_proc = index_getprocinfo(indexRel, 1, HASHPROC);
state->hash_mask = hash_mask;
MemoryContextSwitchTo(oldcontext);
@ -2821,11 +2812,6 @@ static int
comparetup_index_hash(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
/*
* It's slightly annoying to redo the hash function each time, although
* most hash functions ought to be cheap. Is it worth having a variant
* tuple storage format so we can store the hash code?
*/
uint32 hash1;
uint32 hash2;
IndexTuple tuple1;
@ -2834,13 +2820,14 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
/* Allow interrupting long sorts */
CHECK_FOR_INTERRUPTS();
/* Compute hash codes and mask off bits we don't want to sort by */
/*
* Fetch hash keys and mask off bits we don't want to sort by.
* We know that the first column of the index tuple is the hash key.
*/
Assert(!a->isnull1);
hash1 = DatumGetUInt32(FunctionCall1(state->hash_proc, a->datum1))
& state->hash_mask;
hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
Assert(!b->isnull1);
hash2 = DatumGetUInt32(FunctionCall1(state->hash_proc, b->datum1))
& state->hash_mask;
hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
if (hash1 > hash2)
return 1;

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/hash.h,v 1.89 2008/07/13 20:45:47 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/hash.h,v 1.90 2008/09/15 18:43:41 tgl Exp $
*
* NOTES
* modeled after Margo Seltzer's hash implementation for unix.
@ -75,6 +75,9 @@ typedef HashPageOpaqueData *HashPageOpaque;
*/
typedef struct HashScanOpaqueData
{
/* Hash value of the scan key, ie, the hash key we seek */
uint32 hashso_sk_hash;
/*
* By definition, a hash scan should be examining only one bucket. We
* record the bucket number here as soon as it is known.
@ -111,7 +114,7 @@ typedef HashScanOpaqueData *HashScanOpaque;
#define HASH_METAPAGE 0 /* metapage is always block 0 */
#define HASH_MAGIC 0x6440640
#define HASH_VERSION 1 /* new for Pg 7.4 */
#define HASH_VERSION 2 /* 2 signifies only hash key value is stored */
/*
* Spares[] holds the number of overflow pages currently allocated at or
@ -138,7 +141,6 @@ typedef HashScanOpaqueData *HashScanOpaque;
typedef struct HashMetaPageData
{
PageHeaderData hashm_phdr; /* pad for page header (do not use) */
uint32 hashm_magic; /* magic no. for hash tables */
uint32 hashm_version; /* version ID */
double hashm_ntuples; /* number of tuples stored in the table */
@ -191,8 +193,16 @@ typedef HashMetaPageData *HashMetaPage;
#define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT)
#define BMPG_SHIFT(metap) ((metap)->hashm_bmshift)
#define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1)
#define HashPageGetBitmap(pg) \
((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData))))
/*
 * HashPageGetBitmap
 *		View the page's contents area, as located by PageGetContents(),
 *		as an array of uint32 bitmap words.
 */
#define HashPageGetBitmap(page) \
((uint32 *) PageGetContents(page))
/*
 * HashGetMaxBitmapSize
 *		Number of bytes of a page that remain once the MAXALIGN'd page
 *		header and the MAXALIGN'd HashPageOpaqueData are subtracted from
 *		the page size.
 *
 * The argument is parenthesized before the cast so that an expression
 * argument (for example pointer arithmetic) is cast as a whole rather than
 * having the cast bind only to its first operand.
 */
#define HashGetMaxBitmapSize(page) \
	(PageGetPageSize((Page) (page)) - \
	 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
/*
 * HashPageGetMeta
 *		Return the page's contents area, as located by PageGetContents(),
 *		typed as a HashMetaPage pointer.
 */
#define HashPageGetMeta(page) \
((HashMetaPage) PageGetContents(page))
/*
* The number of bits in an ovflpage bitmap word.
@ -330,6 +340,11 @@ extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
uint32 highmask, uint32 lowmask);
extern uint32 _hash_log2(uint32 num);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern IndexTuple _hash_form_tuple(Relation index,
Datum *values, bool *isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
/* hash.c */
extern void hash_redo(XLogRecPtr lsn, XLogRecord *record);

View File

@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.485 2008/09/10 18:09:20 alvherre Exp $
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.486 2008/09/15 18:43:41 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 200809101
#define CATALOG_VERSION_NO 200809151
#endif

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.57 2008/07/11 21:06:29 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.58 2008/09/15 18:43:41 tgl Exp $
*
* NOTES
* the genbki.sh script reads this file and generates .bki
@ -48,6 +48,7 @@ CATALOG(pg_am,2601)
bool amsearchnulls; /* can AM search for NULL index entries? */
bool amstorage; /* can storage type differ from column type? */
bool amclusterable; /* does AM support cluster command? */
Oid amkeytype; /* type of data in index, or InvalidOid */
regproc aminsert; /* "insert this tuple" function */
regproc ambeginscan; /* "start new scan" function */
regproc amgettuple; /* "next valid tuple" function */
@ -74,7 +75,7 @@ typedef FormData_pg_am *Form_pg_am;
* compiler constants for pg_am
* ----------------
*/
#define Natts_pg_am 24
#define Natts_pg_am 25
#define Anum_pg_am_amname 1
#define Anum_pg_am_amstrategies 2
#define Anum_pg_am_amsupport 3
@ -86,35 +87,36 @@ typedef FormData_pg_am *Form_pg_am;
#define Anum_pg_am_amsearchnulls 9
#define Anum_pg_am_amstorage 10
#define Anum_pg_am_amclusterable 11
#define Anum_pg_am_aminsert 12
#define Anum_pg_am_ambeginscan 13
#define Anum_pg_am_amgettuple 14
#define Anum_pg_am_amgetbitmap 15
#define Anum_pg_am_amrescan 16
#define Anum_pg_am_amendscan 17
#define Anum_pg_am_ammarkpos 18
#define Anum_pg_am_amrestrpos 19
#define Anum_pg_am_ambuild 20
#define Anum_pg_am_ambulkdelete 21
#define Anum_pg_am_amvacuumcleanup 22
#define Anum_pg_am_amcostestimate 23
#define Anum_pg_am_amoptions 24
#define Anum_pg_am_amkeytype 12
#define Anum_pg_am_aminsert 13
#define Anum_pg_am_ambeginscan 14
#define Anum_pg_am_amgettuple 15
#define Anum_pg_am_amgetbitmap 16
#define Anum_pg_am_amrescan 17
#define Anum_pg_am_amendscan 18
#define Anum_pg_am_ammarkpos 19
#define Anum_pg_am_amrestrpos 20
#define Anum_pg_am_ambuild 21
#define Anum_pg_am_ambulkdelete 22
#define Anum_pg_am_amvacuumcleanup 23
#define Anum_pg_am_amcostestimate 24
#define Anum_pg_am_amoptions 25
/* ----------------
* initial contents of pg_am
* ----------------
*/
DATA(insert OID = 403 ( btree 5 1 t t t t t t f t btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
DATA(insert OID = 403 ( btree 5 1 t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
DATA(insert OID = 405 ( hash 1 1 f f f f f f f f hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
DATA(insert OID = 405 ( hash 1 1 f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
DESCR("hash index access method");
#define HASH_AM_OID 405
DATA(insert OID = 783 ( gist 0 7 f f t t t t t t gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
DATA(insert OID = 783 ( gist 0 7 f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
DATA(insert OID = 2742 ( gin 0 5 f f t t f f t f gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
DATA(insert OID = 2742 ( gin 0 5 f f t t f f t f 0 gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
DESCR("GIN index access method");
#define GIN_AM_OID 2742

View File

@ -28,7 +28,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/pg_opclass.h,v 1.82 2008/06/24 17:58:27 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/pg_opclass.h,v 1.83 2008/09/15 18:43:41 tgl Exp $
*
* NOTES
* the genbki.sh script reads this file and generates .bki
@ -123,13 +123,13 @@ DATA(insert ( 403 macaddr_ops PGNSP PGUID 1984 829 t 0 ));
DATA(insert ( 405 macaddr_ops PGNSP PGUID 1985 829 t 0 ));
/*
* Here's an ugly little hack to save space in the system catalog indexes.
* btree and hash don't ordinarily allow a storage type different from input
* type; but cstring and name are the same thing except for trailing padding,
* btree doesn't ordinarily allow a storage type different from input type;
* but cstring and name are the same thing except for trailing padding,
* and we can safely omit that within an index entry. So we declare the
* opclasses for name as using cstring storage type.
* btree opclass for name as using cstring storage type.
*/
DATA(insert ( 403 name_ops PGNSP PGUID 1986 19 t 2275 ));
DATA(insert ( 405 name_ops PGNSP PGUID 1987 19 t 2275 ));
DATA(insert ( 405 name_ops PGNSP PGUID 1987 19 t 0 ));
DATA(insert ( 403 numeric_ops PGNSP PGUID 1988 1700 t 0 ));
DATA(insert ( 405 numeric_ops PGNSP PGUID 1998 1700 t 0 ));
DATA(insert OID = 1981 ( 403 oid_ops PGNSP PGUID 1989 26 t 0 ));