diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 8140418c39..6b850920c7 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -168,6 +168,75 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
     }
 }
 
+/*
+ * Extend a relation by multiple blocks to avoid future contention on the
+ * relation extension lock. Our goal is to pre-extend the relation by an
+ * amount which ramps up as the degree of contention ramps up, but limiting
+ * the result to some sane overall value.
+ */
+static void
+RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
+{
+    Page        page;
+    BlockNumber blockNum = InvalidBlockNumber,
+                firstBlock = InvalidBlockNumber;
+    int         extraBlocks = 0;
+    int         lockWaiters = 0;
+    Size        freespace = 0;
+    Buffer      buffer;
+
+    /* Use the length of the lock wait queue to judge how much to extend. */
+    lockWaiters = RelationExtensionLockWaiterCount(relation);
+    if (lockWaiters <= 0)
+        return;
+
+    /*
+     * It might seem like multiplying the number of lock waiters by as much
+     * as 20 is too aggressive, but benchmarking revealed that smaller numbers
+     * were insufficient. 512 is just an arbitrary cap to prevent pathological
+     * results.
+     */
+    extraBlocks = Min(512, lockWaiters * 20);
+
+    while (extraBlocks-- >= 0)
+    {
+        /* Ouch - an unnecessary lseek() each time through the loop! */
+        buffer = ReadBufferBI(relation, P_NEW, bistate);
+
+        /* Extend by one page. */
+        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+        page = BufferGetPage(buffer);
+        PageInit(page, BufferGetPageSize(buffer), 0);
+        MarkBufferDirty(buffer);
+        blockNum = BufferGetBlockNumber(buffer);
+        freespace = PageGetHeapFreeSpace(page);
+        UnlockReleaseBuffer(buffer);
+
+        /* Remember first block number thus added. */
+        if (firstBlock == InvalidBlockNumber)
+            firstBlock = blockNum;
+
+        /*
+         * Immediately update the bottom level of the FSM. This has a good
+         * chance of making this page visible to other concurrently inserting
+         * backends, and we want that to happen without delay.
+         */
+        RecordPageWithFreeSpace(relation, blockNum, freespace);
+    }
+
+    /*
+     * Updating the upper levels of the free space map is too expensive
+     * to do for every block, but it's worth doing once at the end to make
+     * sure that subsequent insertion activity sees all of those nifty free
+     * pages we just inserted.
+     *
+     * Note that we're using the freespace value that was reported for the
+     * last block we added as if it were the freespace value for every block
+     * we added. That's actually true, because they're all equally empty.
+     */
+    UpdateFreeSpaceMap(relation, firstBlock, blockNum, freespace);
+}
+
 /*
  * RelationGetBufferForTuple
  *
@@ -233,8 +302,8 @@ RelationGetBufferForTuple(Relation relation, Size len,
     bool        use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
     Buffer      buffer = InvalidBuffer;
     Page        page;
-    Size        pageFreeSpace,
-                saveFreeSpace;
+    Size        pageFreeSpace = 0,
+                saveFreeSpace = 0;
     BlockNumber targetBlock,
                 otherBlock;
     bool        needLock;
@@ -308,6 +377,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
         }
     }
 
+loop:
     while (targetBlock != InvalidBlockNumber)
     {
         /*
@@ -440,10 +510,46 @@ RelationGetBufferForTuple(Relation relation, Size len,
      */
     needLock = !RELATION_IS_LOCAL(relation);
 
+    /*
+     * If we need the lock but are not able to acquire it immediately, we'll
+     * consider extending the relation by multiple blocks at a time to manage
+     * contention on the relation extension lock. However, this only makes
+     * sense if we're using the FSM; otherwise, there's no point.
+     */
     if (needLock)
-        LockRelationForExtension(relation, ExclusiveLock);
+    {
+        if (!use_fsm)
+            LockRelationForExtension(relation, ExclusiveLock);
+        else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock))
+        {
+            /* Couldn't get the lock immediately; wait for it. */
+            LockRelationForExtension(relation, ExclusiveLock);
+
+            /*
+             * Check if some other backend has extended a block for us while
+             * we were waiting on the lock.
+             */
+            targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
+
+            /*
+             * If some other waiter has already extended the relation, we
+             * don't need to do so; just use the existing freespace.
+             */
+            if (targetBlock != InvalidBlockNumber)
+            {
+                UnlockRelationForExtension(relation, ExclusiveLock);
+                goto loop;
+            }
+
+            /* Time to bulk-extend. */
+            RelationAddExtraBlocks(relation, bistate);
+        }
+    }
 
     /*
+     * In addition to whatever extension we performed above, we always add
+     * at least one block to satisfy our own request.
+     *
      * XXX This does an lseek - rather expensive - but at the moment it is the
      * only way to accurately determine how many blocks are in a relation. Is
      * it worth keeping an accurate file length in shared memory someplace,
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 813990ea70..2ffa8ff24d 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -109,6 +109,8 @@ static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
                    uint8 newValue, uint8 minValue);
 static BlockNumber fsm_search(Relation rel, uint8 min_cat);
 static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof);
+static BlockNumber fsm_get_lastblckno(Relation rel, FSMAddress addr);
+static void fsm_update_recursive(Relation rel, FSMAddress addr, uint8 new_cat);
 
 /******** Public API ********/
 
@@ -188,6 +190,46 @@ RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
     fsm_set_and_search(rel, addr, slot, new_cat, 0);
 }
 
+/*
+ * Update the upper levels of the free space map all the way up to the root
+ * to make sure we don't lose track of new blocks we just inserted. This is
+ * intended to be used after adding many new blocks to the relation; we judge
+ * it not worth updating the upper levels of the tree every time data for
+ * a single page changes, but for a bulk-extend it's worth it.
+ */
+void
+UpdateFreeSpaceMap(Relation rel, BlockNumber startBlkNum,
+                   BlockNumber endBlkNum, Size freespace)
+{
+    int         new_cat = fsm_space_avail_to_cat(freespace);
+    FSMAddress  addr;
+    uint16      slot;
+    BlockNumber blockNum;
+    BlockNumber lastBlkOnPage;
+
+    blockNum = startBlkNum;
+
+    while (blockNum <= endBlkNum)
+    {
+        /*
+         * Find FSM address for this block; update tree all the way to the
+         * root.
+         */
+        addr = fsm_get_location(blockNum, &slot);
+        fsm_update_recursive(rel, addr, new_cat);
+
+        /*
+         * Get the last block number on this FSM page. If that's greater
+         * than or equal to our endBlkNum, we're done. Otherwise, advance
+         * to the first block on the next page.
+         */
+        lastBlkOnPage = fsm_get_lastblckno(rel, addr);
+        if (lastBlkOnPage >= endBlkNum)
+            break;
+        blockNum = lastBlkOnPage + 1;
+    }
+}
+
 /*
  * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
  * WAL replay
@@ -788,3 +830,42 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
 
     return max_avail;
 }
+
+/*
+ * This function will return the last block number stored on the given
+ * FSM page address.
+ */
+static BlockNumber
+fsm_get_lastblckno(Relation rel, FSMAddress addr)
+{
+    int         slot;
+
+    /*
+     * Get the last slot number on the given address and convert that to a
+     * block number.
+     */
+    slot = SlotsPerFSMPage - 1;
+    return fsm_get_heap_blk(addr, slot);
+}
+
+/*
+ * Recursively update the FSM tree from the given address
+ * all the way up to the root.
+ */
+static void
+fsm_update_recursive(Relation rel, FSMAddress addr, uint8 new_cat)
+{
+    uint16      parentslot;
+    FSMAddress  parent;
+
+    if (addr.level == FSM_ROOT_LEVEL)
+        return;
+
+    /*
+     * Get the parent page and our slot in the parent page, and
+     * update the information in that.
+     */
+    parent = fsm_get_parent(addr, &parentslot);
+    fsm_set_and_search(rel, parent, parentslot, new_cat, 0);
+    fsm_update_recursive(rel, parent, new_cat);
+}
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 0632fc009e..7b08555b07 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -340,6 +340,41 @@ LockRelationForExtension(Relation relation, LOCKMODE lockmode)
     (void) LockAcquire(&tag, lockmode, false, false);
 }
 
+/*
+ * ConditionalLockRelationForExtension
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns TRUE iff the lock was acquired.
+ */
+bool
+ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+    LOCKTAG     tag;
+
+    SET_LOCKTAG_RELATION_EXTEND(tag,
+                                relation->rd_lockInfo.lockRelId.dbId,
+                                relation->rd_lockInfo.lockRelId.relId);
+
+    return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * RelationExtensionLockWaiterCount
+ *
+ * Count the number of processes waiting for the given relation extension lock.
+ */
+int
+RelationExtensionLockWaiterCount(Relation relation)
+{
+    LOCKTAG     tag;
+
+    SET_LOCKTAG_RELATION_EXTEND(tag,
+                                relation->rd_lockInfo.lockRelId.dbId,
+                                relation->rd_lockInfo.lockRelId.relId);
+
+    return LockWaiterCount(&tag);
+}
+
 /*
  * UnlockRelationForExtension
  */
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index b30b7b1009..41f6930645 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -4380,3 +4380,40 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait)
     LockRelease(&tag, ShareLock, false);
     return true;
 }
+
+/*
+ * LockWaiterCount
+ *
+ * Find the number of lock requesters on this locktag
+ */
+int
+LockWaiterCount(const LOCKTAG *locktag)
+{
+    LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+    LOCK       *lock;
+    bool        found;
+    uint32      hashcode;
+    LWLock     *partitionLock;
+    int         waiters = 0;
+
+    if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+        elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+    hashcode = LockTagHashCode(locktag);
+    partitionLock = LockHashPartitionLock(hashcode);
+    LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+    lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+                                                (const void *) locktag,
+                                                hashcode,
+                                                HASH_FIND,
+                                                &found);
+    if (found)
+    {
+        Assert(lock != NULL);
+        waiters = lock->nRequested;
+    }
+    LWLockRelease(partitionLock);
+
+    return waiters;
+}
diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h
index 19dcb8d964..16c052b8bc 100644
--- a/src/include/storage/freespace.h
+++ b/src/include/storage/freespace.h
@@ -32,5 +32,9 @@ extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
                             Size spaceAvail);
 extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks);
 extern void FreeSpaceMapVacuum(Relation rel);
+extern void UpdateFreeSpaceMap(Relation rel,
+                               BlockNumber startBlkNum,
+                               BlockNumber endBlkNum,
+                               Size freespace);
 
 #endif /* FREESPACE_H_ */
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
index 975b6f8155..8288e7d505 100644
--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -53,6 +53,9 @@ extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
 /* Lock a relation for extension */
 extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode);
 extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode);
+extern bool ConditionalLockRelationForExtension(Relation relation,
+                                                LOCKMODE lockmode);
+extern int RelationExtensionLockWaiterCount(Relation relation);
 
 /* Lock a page (currently only used within indexes) */
 extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index b26427dea3..efa75ecca9 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -574,6 +574,8 @@ extern void RememberSimpleDeadLock(PGPROC *proc1, PGPROC *proc2);
 extern void InitDeadLockChecking(void);
 
+extern int LockWaiterCount(const LOCKTAG *locktag);
+
 #ifdef LOCK_DEBUG
 extern void DumpLocks(PGPROC *proc);
 extern void DumpAllLocks(void);
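
Illustration, not part of the patch: to give a quick sense of how aggressively
RelationAddExtraBlocks() pre-extends, the standalone program below computes how
many extra pages get initialized for a given extension-lock wait-queue length.
The helper name extra_blocks_for_waiters() and the sample waiter counts are
invented for this sketch; the "* 20" multiplier, the cap of 512, and the fact
that "while (extraBlocks-- >= 0)" runs its body extraBlocks + 1 times are taken
from the hio.c change above.

#include <stdio.h>

/* Local stand-in for PostgreSQL's Min() macro. */
#define Min(x, y) ((x) < (y) ? (x) : (y))

/*
 * Hypothetical helper, for illustration only: given the number of backends
 * waiting on the relation extension lock, return how many pages the loop in
 * RelationAddExtraBlocks() would initialize.
 */
static int
extra_blocks_for_waiters(int lockWaiters)
{
    int     extraBlocks;

    if (lockWaiters <= 0)
        return 0;               /* no contention, so no pre-extension */

    extraBlocks = Min(512, lockWaiters * 20);

    /* "while (extraBlocks-- >= 0)" executes its body extraBlocks + 1 times. */
    return extraBlocks + 1;
}

int
main(void)
{
    int     waiters[] = {0, 1, 4, 16, 64};
    int     i;

    for (i = 0; i < (int) (sizeof(waiters) / sizeof(waiters[0])); i++)
        printf("%3d waiter(s) -> %3d extra block(s)\n",
               waiters[i], extra_blocks_for_waiters(waiters[i]));
    return 0;
}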
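
A second illustration, again not part of the patch and not PostgreSQL code: the
hio.c change follows a "try the lock without blocking; if you had to wait,
recheck before doing the work" pattern. The sketch below reproduces that
control flow with a plain pthread mutex standing in for the relation extension
lock and a counter standing in for pages with recorded free space; the names
and the bulk-extend amount of 8 are made up for the example.

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4

static pthread_mutex_t extension_lock = PTHREAD_MUTEX_INITIALIZER;
static int  free_slots = 0;     /* stand-in for "pages with free space" */

static void *
worker(void *arg)
{
    long    id = (long) arg;

    if (pthread_mutex_trylock(&extension_lock) != 0)
    {
        /* Couldn't get the lock immediately; wait for it. */
        pthread_mutex_lock(&extension_lock);

        /* Some other waiter may already have extended; recheck first. */
        if (free_slots > 0)
        {
            free_slots--;
            printf("thread %ld: reused a slot created by another waiter\n", id);
            pthread_mutex_unlock(&extension_lock);
            return NULL;
        }

        /* Still nothing free: bulk-extend on behalf of everyone waiting. */
        free_slots += 8;
        printf("thread %ld: bulk-extended by 8 slots\n", id);
    }

    /* Always extend by one more slot for our own use, as the patch does. */
    printf("thread %ld: extended by 1 slot for itself\n", id);
    pthread_mutex_unlock(&extension_lock);
    return NULL;
}

int
main(void)
{
    pthread_t   threads[NTHREADS];
    long        i;

    for (i = 0; i < NTHREADS; i++)
        pthread_create(&threads[i], NULL, worker, (void *) i);
    for (i = 0; i < NTHREADS; i++)
        pthread_join(threads[i], NULL);

    printf("slots left over: %d\n", free_slots);
    return 0;
}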