diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 828c527986..4fe0301c75 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.52 2006/03/31 23:32:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.53 2006/11/19 21:33:22 tgl Exp $
  *
  * NOTES
  *	  Overflow pages look like ordinary relation pages.
@@ -20,7 +20,7 @@
 
 #include "access/hash.h"
 
 
-static BlockNumber _hash_getovflpage(Relation rel, Buffer metabuf);
+static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
 static uint32 _hash_firstfreebit(uint32 map);
 
@@ -99,18 +99,14 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
 Buffer
 _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
 {
-	BlockNumber ovflblkno;
 	Buffer		ovflbuf;
 	Page		page;
 	Page		ovflpage;
 	HashPageOpaque pageopaque;
 	HashPageOpaque ovflopaque;
 
-	/* allocate an empty overflow page */
-	ovflblkno = _hash_getovflpage(rel, metabuf);
-
-	/* lock the overflow page */
-	ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+	/* allocate and lock an empty overflow page */
+	ovflbuf = _hash_getovflpage(rel, metabuf);
 	ovflpage = BufferGetPage(ovflbuf);
 
 	/*
@@ -150,7 +146,7 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
 	MarkBufferDirty(ovflbuf);
 
 	/* logically chain overflow page to previous page */
-	pageopaque->hasho_nextblkno = ovflblkno;
+	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
 	_hash_wrtbuf(rel, buf);
 
 	return ovflbuf;
 }
@@ -159,16 +155,18 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
 /*
  *	_hash_getovflpage()
  *
- *	Find an available overflow page and return its block number.
+ *	Find an available overflow page and return it.  The returned buffer
+ *	is pinned and write-locked, but its contents are not initialized.
  *
  *	The caller must hold a pin, but no lock, on the metapage buffer.
- *	The buffer is returned in the same state.
+ *	That buffer is left in the same state at exit.
  */
-static BlockNumber
+static Buffer
 _hash_getovflpage(Relation rel, Buffer metabuf)
 {
 	HashMetaPage metap;
 	Buffer		mapbuf = 0;
+	Buffer		newbuf;
 	BlockNumber blkno;
 	uint32		orig_firstfree;
 	uint32		splitnum;
@@ -243,11 +241,10 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 	}
 
-	/* No Free Page Found - have to allocate a new page */
-	bit = metap->hashm_spares[splitnum];
-	metap->hashm_spares[splitnum]++;
-
-	/* Check if we need to allocate a new bitmap page */
+	/*
+	 * No free pages --- have to extend the relation to add an overflow page.
+	 * First, check to see if we have to add a new bitmap page too.
+	 */
 	if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
 	{
 		/*
@@ -258,22 +255,39 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 		 * marked "in use".  Subsequent pages do not exist yet, but it is
 		 * convenient to pre-mark them as "in use" too.
 		 */
-		_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
-
 		bit = metap->hashm_spares[splitnum];
+		_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
 		metap->hashm_spares[splitnum]++;
 	}
 	else
 	{
 		/*
-		 * Nothing to do here; since the page was past the last used page, we
-		 * know its bitmap bit was preinitialized to "in use".
+		 * Nothing to do here; since the page will be past the last used page,
+		 * we know its bitmap bit was preinitialized to "in use".
 		 */
 	}
 
 	/* Calculate address of the new overflow page */
+	bit = metap->hashm_spares[splitnum];
 	blkno = bitno_to_blkno(metap, bit);
 
+	/*
+	 * We have to fetch the page with P_NEW to ensure smgr's idea of the
+	 * relation length stays in sync with ours.  XXX It's annoying to do this
+	 * with metapage write lock held; would be better to use a lock that
+	 * doesn't block incoming searches.  Best way to fix it would be to stop
+	 * maintaining hashm_spares[hashm_ovflpoint] and rely entirely on the
+	 * smgr relation length to track where new overflow pages come from;
+	 * then we could release the metapage before we do the smgrextend.
+	 * FIXME later (not in beta...)
+	 */
+	newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+	if (BufferGetBlockNumber(newbuf) != blkno)
+		elog(ERROR, "unexpected hash relation size: %u, should be %u",
+			 BufferGetBlockNumber(newbuf), blkno);
+
+	metap->hashm_spares[splitnum]++;
+
 	/*
 	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
 	 * changing it if someone moved it while we were searching bitmap pages.
@@ -284,7 +298,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
 	/* Write updated metapage and release lock, but not pin */
 	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
 
-	return blkno;
+	return newbuf;
 
 found:
 	/* convert bit to bit number within page */
@@ -300,7 +314,7 @@ found:
 	/* convert bit to absolute bit number */
 	bit += (i << BMPG_SHIFT(metap));
 
-	/* Calculate address of the new overflow page */
+	/* Calculate address of the recycled overflow page */
 	blkno = bitno_to_blkno(metap, bit);
 
 	/*
@@ -320,7 +334,8 @@ found:
 		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 	}
 
-	return blkno;
+	/* Fetch and return the recycled page */
+	return _hash_getbuf(rel, blkno, HASH_WRITE);
 }
 
 /*
@@ -388,7 +403,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
 	prevblkno = ovflopaque->hasho_prevblkno;
 	bucket = ovflopaque->hasho_bucket;
 
-	/* Zero the page for debugging's sake; then write and release it */
+	/*
+	 * Zero the page for debugging's sake; then write and release it.
+	 * (Note: if we failed to zero the page here, we'd have problems
+	 * with the Assert in _hash_pageinit() when the page is reused.)
+	 */
 	MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
 	_hash_wrtbuf(rel, ovflbuf);
 
@@ -488,12 +507,19 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
 	/*
 	 * It is okay to write-lock the new bitmap page while holding metapage
 	 * write lock, because no one else could be contending for the new page.
+	 * Also, the metapage lock makes it safe to extend the index using P_NEW,
+	 * which we want to do to ensure the smgr's idea of the relation size
+	 * stays in step with ours.
 	 *
 	 * There is some loss of concurrency in possibly doing I/O for the new
 	 * page while holding the metapage lock, but this path is taken so seldom
 	 * that it's not worth worrying about.
 	 */
-	buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+	buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+	if (BufferGetBlockNumber(buf) != blkno)
+		elog(ERROR, "unexpected hash relation size: %u, should be %u",
+			 BufferGetBlockNumber(buf), blkno);
+
 	pg = BufferGetPage(buf);
 
 	/* initialize the page */
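The heart of the hashovfl.c change is that _hash_getovflpage() now extends the index itself with P_NEW and immediately cross-checks that the block number the buffer manager hands back matches the block number implied by the metapage's hashm_spares[] bookkeeping. The standalone sketch below is not part of the patch and uses invented names (extend_relation, relation_nblocks, expected_blkno); it only models the invariant that the new elog(ERROR) enforces: the physical end-of-file known to smgr and the logical allocation point tracked in the metapage must advance in lockstep.

/* Illustrative model only -- not PostgreSQL code. */
#include <stdio.h>
#include <stdint.h>

typedef uint32_t BlockNumber;

/* Stand-in for "read with P_NEW": append one page, return its block number. */
static BlockNumber
extend_relation(BlockNumber *relation_nblocks)
{
	return (*relation_nblocks)++;		/* the old EOF becomes the new page */
}

int
main(void)
{
	BlockNumber relation_nblocks = 10;	/* physical length, as smgr sees it */
	BlockNumber expected_blkno = 10;	/* length implied by metapage bookkeeping */
	BlockNumber got = extend_relation(&relation_nblocks);

	if (got != expected_blkno)
		printf("unexpected hash relation size: %u, should be %u\n",
			   (unsigned) got, (unsigned) expected_blkno);
	else
		printf("metapage bookkeeping and physical EOF agree at block %u\n",
			   (unsigned) got);
	return 0;
}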
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 696d4bf616..0f643836a1 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.60 2006/10/04 00:29:48 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61 2006/11/19 21:33:23 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -32,9 +32,11 @@
 #include "access/hash.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"
+#include "storage/smgr.h"
 #include "utils/lsyscache.h"
 
 
+static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
 static void _hash_splitbucket(Relation rel, Buffer metabuf,
 				  Bucket obucket, Bucket nbucket,
 				  BlockNumber start_oblkno,
@@ -102,21 +104,18 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
  *		requested buffer and its reference count has been incremented
  *		(ie, the buffer is "locked and pinned").
  *
- *		XXX P_NEW is not used because, unlike the tree structures, we
- *		need the bucket blocks to be at certain block numbers.
+ *		blkno == P_NEW is allowed, but it is caller's responsibility to
+ *		ensure that only one process can extend the index at a time.
  *
- *		All call sites should call either _hash_pageinit or _hash_checkpage
+ *		All call sites should call either _hash_checkpage or _hash_pageinit
  *		on the returned page, depending on whether the block is expected
- *		to be new or not.
+ *		to be valid or not.
  */
 Buffer
 _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 {
 	Buffer		buf;
 
-	if (blkno == P_NEW)
-		elog(ERROR, "hash AM does not use P_NEW");
-
 	buf = ReadBuffer(rel, blkno);
 
 	if (access != HASH_NOLOCK)
@@ -237,7 +236,14 @@ _hash_metapinit(Relation rel)
 	if (ffactor < 10)
 		ffactor = 10;
 
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+	/*
+	 * We initialize the metapage, the first two bucket pages, and the
+	 * first bitmap page in sequence, using P_NEW to cause smgrextend()
+	 * calls to occur.  This ensures that the smgr level has the right
+	 * idea of the physical index length.
+	 */
+	metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+	Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
 	pg = BufferGetPage(metabuf);
 	_hash_pageinit(pg, BufferGetPageSize(metabuf));
 
@@ -290,7 +296,8 @@ _hash_metapinit(Relation rel)
 	 */
 	for (i = 0; i <= 1; i++)
 	{
-		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
+		buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+		Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
 		pg = BufferGetPage(buf);
 		_hash_pageinit(pg, BufferGetPageSize(buf));
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -303,8 +310,7 @@ _hash_metapinit(Relation rel)
 	}
 
 	/*
-	 * Initialize first bitmap page.  Can't do this until we create the first
-	 * two buckets, else smgr will complain.
+	 * Initialize first bitmap page
 	 */
 	_hash_initbitmap(rel, metap, 3);
 
@@ -339,6 +345,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	Bucket		old_bucket;
 	Bucket		new_bucket;
 	uint32		spare_ndx;
+	BlockNumber firstblock = InvalidBlockNumber;
 	BlockNumber start_oblkno;
 	BlockNumber start_nblkno;
 	uint32		maxbucket;
@@ -376,6 +383,40 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
 		goto fail;
 
+	/*
+	 * Can't split anymore if maxbucket has reached its maximum possible value.
+	 *
+	 * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
+	 * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
+	 * to half that because of overflow looping in _hash_log2() and
+	 * insufficient space in hashm_spares[].  It's moot anyway because an
+	 * index with 2^32 buckets would certainly overflow BlockNumber and
+	 * hence _hash_alloc_buckets() would fail, but if we supported buckets
+	 * smaller than a disk block then this would be an independent constraint.
+	 */
+	if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
+		goto fail;
+
+	/*
+	 * If the split point is increasing (hashm_maxbucket's log base 2
+	 * increases), we need to allocate a new batch of bucket pages.
+	 */
+	new_bucket = metap->hashm_maxbucket + 1;
+	spare_ndx = _hash_log2(new_bucket + 1);
+	if (spare_ndx > metap->hashm_ovflpoint)
+	{
+		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+		/*
+		 * The number of buckets in the new splitpoint is equal to the
+		 * total number already in existence, i.e. new_bucket.  Currently
+		 * this maps one-to-one to blocks required, but someday we may need
+		 * a more complicated calculation here.
+		 */
+		firstblock = _hash_alloc_buckets(rel, new_bucket);
+		if (firstblock == InvalidBlockNumber)
+			goto fail;			/* can't split due to BlockNumber overflow */
+	}
+
 	/*
 	 * Determine which bucket is to be split, and attempt to lock the old
 	 * bucket.  If we can't get the lock, give up.
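To make the arithmetic in the new allocation step concrete: a splitpoint boundary is crossed exactly when new_bucket + 1 passes a power of two, and the batch of bucket pages reserved at that point equals the number of buckets that already exist. The throwaway program below is not part of the patch; log2_ceil is a simplified stand-in for _hash_log2() (smallest i such that 2^i >= num), and the starting ovflpoint value of 1 is an assumption about the state left behind by _hash_metapinit(). It prints the doubling sequence of splitpoint allocations.

/* Illustrative sketch only -- not PostgreSQL code. */
#include <stdio.h>
#include <stdint.h>

/* simplified stand-in for _hash_log2(): smallest i such that 2^i >= num */
static uint32_t
log2_ceil(uint32_t num)
{
	uint32_t	i = 0;
	uint32_t	limit = 1;

	while (limit < num)
	{
		limit <<= 1;
		i++;
	}
	return i;
}

int
main(void)
{
	uint32_t	maxbucket;
	uint32_t	ovflpoint = 1;	/* assumed initial value after metapage init */

	for (maxbucket = 1; maxbucket < 40; maxbucket++)
	{
		uint32_t	new_bucket = maxbucket + 1;
		uint32_t	spare_ndx = log2_ceil(new_bucket + 1);

		if (spare_ndx > ovflpoint)
		{
			/* a new splitpoint: pre-allocate new_bucket more bucket pages */
			printf("split to bucket %u opens splitpoint %u: allocate %u pages\n",
				   (unsigned) new_bucket, (unsigned) spare_ndx,
				   (unsigned) new_bucket);
			ovflpoint = spare_ndx;
		}
	}
	return 0;
}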
@@ -389,7 +430,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	 * lock.  This should be okay because no one else should be trying to lock
 	 * the new bucket yet...
 	 */
-	new_bucket = metap->hashm_maxbucket + 1;
 	old_bucket = (new_bucket & metap->hashm_lowmask);
 
 	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
@@ -425,14 +465,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	 * increases), we need to adjust the hashm_spares[] array and
 	 * hashm_ovflpoint so that future overflow pages will be created beyond
 	 * this new batch of bucket pages.
-	 *
-	 * XXX should initialize new bucket pages to prevent out-of-order page
-	 * creation?  Don't wanna do it right here though.
 	 */
-	spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
 	if (spare_ndx > metap->hashm_ovflpoint)
 	{
-		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
 		metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
 		metap->hashm_ovflpoint = spare_ndx;
 	}
@@ -440,6 +475,12 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	/* now we can compute the new bucket's primary block number */
 	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
 
+	/* if we added a splitpoint, should match result of _hash_alloc_buckets */
+	if (firstblock != InvalidBlockNumber &&
+		firstblock != start_nblkno)
+		elog(PANIC, "unexpected hash relation size: %u, should be %u",
+			 firstblock, start_nblkno);
+
 	Assert(!_hash_has_active_scan(rel, new_bucket));
 
 	if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
@@ -487,6 +528,79 @@ fail:
 }
 
 
+/*
+ * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
+ *
+ * This does not need to initialize the new bucket pages; we'll do that as
+ * each one is used by _hash_expandtable().  But we have to extend the logical
+ * EOF to the end of the splitpoint; otherwise the first overflow page
+ * allocated beyond the splitpoint will represent a noncontiguous access,
+ * which can confuse md.c (and will probably be forbidden by future changes
+ * to md.c).
+ *
+ * We do this by writing a page of zeroes at the end of the splitpoint range.
+ * We expect that the filesystem will ensure that the intervening pages read
+ * as zeroes too.  On many filesystems this "hole" will not be allocated
+ * immediately, which means that the index file may end up more fragmented
+ * than if we forced it all to be allocated now; but since we don't scan
+ * hash indexes sequentially anyway, that probably doesn't matter.
+ *
+ * XXX It's annoying that this code is executed with the metapage lock held.
+ * We need to interlock against _hash_getovflpage() adding a new overflow page
+ * concurrently, but it'd likely be better to use LockRelationForExtension
+ * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
+ * so it may not be worth worrying about.
+ *
+ * Returns the first block number in the new splitpoint's range, or
+ * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ */
+static BlockNumber
+_hash_alloc_buckets(Relation rel, uint32 nblocks)
+{
+	BlockNumber firstblock;
+	BlockNumber lastblock;
+	BlockNumber endblock;
+	char		zerobuf[BLCKSZ];
+
+	/*
+	 * Since we hold metapage lock, no one else is either splitting or
+	 * allocating a new page in _hash_getovflpage(); hence it's safe to
+	 * assume that the relation length isn't changing under us.
+	 */
+	firstblock = RelationGetNumberOfBlocks(rel);
+	lastblock = firstblock + nblocks - 1;
+
+	/*
+	 * Check for overflow in block number calculation; if so, we cannot
+	 * extend the index anymore.
+	 */
+	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
+		return InvalidBlockNumber;
+
+	/* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+
+	MemSet(zerobuf, 0, sizeof(zerobuf));
+
+	/*
+	 * XXX If the extension results in creation of new segment files,
+	 * we have to make sure that each non-last file is correctly filled out to
+	 * RELSEG_SIZE blocks.  This ought to be done inside mdextend, but
+	 * changing the smgr API seems best left for development cycle not late
+	 * beta.  Temporary fix for bug #2737.
+	 */
+#ifndef LET_OS_MANAGE_FILESIZE
+	for (endblock = firstblock | (RELSEG_SIZE - 1);
+		 endblock < lastblock;
+		 endblock += RELSEG_SIZE)
+		smgrextend(rel->rd_smgr, endblock, zerobuf, rel->rd_istemp);
+#endif
+
+	smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
+
+	return firstblock;
+}
+
+
 /*
  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
  *
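The segment-boundary loop in _hash_alloc_buckets() is the operative part of the bug #2737 fix: before writing the zero page at the new logical EOF, it also writes one at the last block of every intervening RELSEG_SIZE-block segment, so md.c never sees a segment being created out of order. The toy program below is not part of the patch; RELSEG_SIZE, firstblock and nblocks are made-up values chosen so the extension crosses several segment boundaries. It reproduces the loop's arithmetic and prints the blocks that would be handed to smgrextend().

/* Illustrative sketch only -- not PostgreSQL code. */
#include <stdio.h>
#include <stdint.h>

#define RELSEG_SIZE 16			/* toy value; the real one is set at configure time */

typedef uint32_t BlockNumber;

int
main(void)
{
	BlockNumber firstblock = 10;	/* current EOF: first block of the new splitpoint */
	uint32_t	nblocks = 40;		/* bucket pages the new splitpoint needs */
	BlockNumber lastblock = firstblock + nblocks - 1;
	BlockNumber endblock;

	/* last block of each non-last segment touched by the extension */
	for (endblock = firstblock | (RELSEG_SIZE - 1);
		 endblock < lastblock;
		 endblock += RELSEG_SIZE)
		printf("write zero page at block %u (fills out a segment)\n",
			   (unsigned) endblock);

	printf("write zero page at block %u (new logical EOF)\n",
		   (unsigned) lastblock);
	return 0;
}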