Allow Pin/UnpinBuffer to operate in a lockfree manner.

Pinning/Unpinning a buffer is a very frequent operation; especially in read-mostly cache resident workloads. Benchmarking shows that in various scenarios the spinlock protecting a buffer header's state becomes a significant bottleneck. The problem can be reproduced with pgbench -S on larger machines, but can be considerably worse for queries which touch the same buffers over and over at a high frequency (e.g. nested loops over a small inner table). To allow atomic operations to be used, cram BufferDesc's flags, usage_count, buf_hdr_lock, refcount into a single 32bit atomic variable; that allows to manipulate them together using 32bit compare-and-swap operations. This requires reducing MAX_BACKENDS to 2^18-1 (which could be lifted by using a 64bit field, but it's not a realistic configuration atm). As not all operations can easily implemented in a lockfree manner, implement the previous buf_hdr_lock via a flag bit in the atomic variable. That way we can continue to lock the header in places where it's needed, but can get away without acquiring it in the more frequent hot-paths. There's some additional operations which can be done without the lock, but aren't in this patch; but the most important places are covered. As bufmgr.c now essentially re-implements spinlocks, abstract the delay logic from s_lock.c into something more generic. It now has already two users, and more are coming up; there's a follupw patch for lwlock.c at least. This patch is based on a proof-of-concept written by me, which Alexander Korotkov made into a fully working patch; the committed version is again revised by me. Benchmarking and testing has, amongst others, been provided by Dilip Kumar, Alexander Korotkov, Robert Haas. On a large x86 system improvements for readonly pgbench, with a high client count, of a factor of 8 have been observed. Author: Alexander Korotkov and Andres Freund Discussion: 2400449.GjM57CE0Yg@dinodell
2016-04-10 20:12:32 -07:00 · 2016-04-10 20:12:32 -07:00 · 48354581a4
parent cf223c3bf5
commit 48354581a4
10 changed files with 628 additions and 363 deletions
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@ -148,11 +148,12 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 */
 		for (i = 0; i < NBuffers; i++)
 		{
-			volatile BufferDesc *bufHdr;
+			BufferDesc *bufHdr;
+			uint32		buf_state;

 			bufHdr = GetBufferDescriptor(i);
 			/* Lock each buffer header before inspecting. */
-			LockBufHdr(bufHdr);
+			buf_state = LockBufHdr(bufHdr);

 			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
 			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
@ -160,21 +161,21 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
 			fctx->record[i].forknum = bufHdr->tag.forkNum;
 			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = bufHdr->usage_count;
-			fctx->record[i].pinning_backends = bufHdr->refcount;
+			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

-			if (bufHdr->flags & BM_DIRTY)
+			if (buf_state & BM_DIRTY)
 				fctx->record[i].isdirty = true;
 			else
 				fctx->record[i].isdirty = false;

 			/* Note if the buffer is valid, and has storage created */
-			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
+			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
 				fctx->record[i].isvalid = true;
 			else
 				fctx->record[i].isvalid = false;

-			UnlockBufHdr(bufHdr);
+			UnlockBufHdr(bufHdr, buf_state);
 		}

 		/*
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@ -135,12 +135,9 @@ InitBufferPool(void)
 			BufferDesc *buf = GetBufferDescriptor(i);

 			CLEAR_BUFFERTAG(buf->tag);
-			buf->flags = 0;
-			buf->usage_count = 0;
-			buf->refcount = 0;
-			buf->wait_backend_pid = 0;

-			SpinLockInit(&buf->buf_hdr_lock);
+			pg_atomic_init_u32(&buf->state, 0);
+			buf->wait_backend_pid = 0;

 			buf->buf_id = i;

--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@ -98,7 +98,8 @@ typedef struct BufferAccessStrategyData


 /* Prototypes for internal functions */
-static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
+static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
+				  uint32 *buf_state);
 static void AddBufferToRing(BufferAccessStrategy strategy,
 				BufferDesc *buf);

@ -180,11 +181,12 @@ ClockSweepTick(void)
 *	return the buffer with the buffer header spinlock still held.
 */
 BufferDesc *
-StrategyGetBuffer(BufferAccessStrategy strategy)
+StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
 {
 	BufferDesc *buf;
 	int			bgwprocno;
 	int			trycounter;
+	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

 	/*
 	 * If given a strategy object, see whether it can select a buffer. We
@ -192,7 +194,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	 */
 	if (strategy != NULL)
 	{
-		buf = GetBufferFromRing(strategy);
+		buf = GetBufferFromRing(strategy, buf_state);
 		if (buf != NULL)
 			return buf;
 	}
@ -279,14 +281,16 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			 * it before we got to it.  It's probably impossible altogether as
 			 * of 8.3, but we'd better check anyway.)
 			 */
-			LockBufHdr(buf);
-			if (buf->refcount == 0 && buf->usage_count == 0)
+			local_buf_state = LockBufHdr(buf);
+			if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
+				&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
 			{
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
+				*buf_state = local_buf_state;
 				return buf;
 			}
-			UnlockBufHdr(buf);
+			UnlockBufHdr(buf, local_buf_state);

 		}
 	}
@ -295,19 +299,20 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 	trycounter = NBuffers;
 	for (;;)
 	{
-
 		buf = GetBufferDescriptor(ClockSweepTick());

 		/*
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
 		 */
-		LockBufHdr(buf);
-		if (buf->refcount == 0)
+		local_buf_state = LockBufHdr(buf);
+
+		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
 		{
-			if (buf->usage_count > 0)
+			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
 			{
-				buf->usage_count--;
+				local_buf_state -= BUF_USAGECOUNT_ONE;
+
 				trycounter = NBuffers;
 			}
 			else
@ -315,6 +320,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 				/* Found a usable buffer */
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
+				*buf_state = local_buf_state;
 				return buf;
 			}
 		}
@ -327,10 +333,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			 * probably better to fail than to risk getting stuck in an
 			 * infinite loop.
 			 */
-			UnlockBufHdr(buf);
+			UnlockBufHdr(buf, local_buf_state);
 			elog(ERROR, "no unpinned buffers available");
 		}
-		UnlockBufHdr(buf);
+		UnlockBufHdr(buf, local_buf_state);
 	}
 }

@ -585,10 +591,12 @@ FreeAccessStrategy(BufferAccessStrategy strategy)
 * The bufhdr spin lock is held on the returned buffer.
 */
 static BufferDesc *
-GetBufferFromRing(BufferAccessStrategy strategy)
+GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 {
 	BufferDesc *buf;
 	Buffer		bufnum;
+	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */
+

 	/* Advance to next ring slot */
 	if (++strategy->current >= strategy->ring_size)
@ -616,13 +624,15 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 	 * shouldn't re-use it.
 	 */
 	buf = GetBufferDescriptor(bufnum - 1);
-	LockBufHdr(buf);
-	if (buf->refcount == 0 && buf->usage_count <= 1)
+	local_buf_state = LockBufHdr(buf);
+	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
+		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
 	{
 		strategy->current_was_in_ring = true;
+		*buf_state = local_buf_state;
 		return buf;
 	}
-	UnlockBufHdr(buf);
+	UnlockBufHdr(buf, local_buf_state);

 	/*
 	 * Tell caller to allocate a new buffer with the normal allocation
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@ -108,6 +108,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	int			b;
 	int			trycounter;
 	bool		found;
+	uint32		buf_state;

 	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);

@ -128,16 +129,21 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
 				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
 #endif
+		buf_state = pg_atomic_read_u32(&bufHdr->state);
+
 		/* this part is equivalent to PinBuffer for a shared buffer */
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
-				bufHdr->usage_count++;
+			if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+			{
+				buf_state += BUF_USAGECOUNT_ONE;
+				pg_atomic_write_u32(&bufHdr->state, buf_state);
+			}
 		}
 		LocalRefCount[b]++;
 		ResourceOwnerRememberBuffer(CurrentResourceOwner,
 									BufferDescriptorGetBuffer(bufHdr));
-		if (bufHdr->flags & BM_VALID)
+		if (buf_state & BM_VALID)
 			*foundPtr = TRUE;
 		else
 		{
@ -169,9 +175,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,

 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count > 0)
+			buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+			if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0)
 			{
-				bufHdr->usage_count--;
+				buf_state -= BUF_USAGECOUNT_ONE;
+				pg_atomic_write_u32(&bufHdr->state, buf_state);
 				trycounter = NLocBuffer;
 			}
 			else
@ -193,7 +202,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * this buffer is not referenced but it might still be dirty. if that's
 	 * the case, write it out before reusing it!
 	 */
-	if (bufHdr->flags & BM_DIRTY)
+	if (buf_state & BM_DIRTY)
 	{
 		SMgrRelation oreln;
 		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
@ -211,7 +220,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 				  false);

 		/* Mark not-dirty now in case we error out below */
-		bufHdr->flags &= ~BM_DIRTY;
+		buf_state &= ~BM_DIRTY;
+		pg_atomic_write_u32(&bufHdr->state, buf_state);

 		pgBufferUsage.local_blks_written++;
 	}
@ -228,7 +238,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	/*
 	 * Update the hash table: remove old entry, if any, and make new one.
 	 */
-	if (bufHdr->flags & BM_TAG_VALID)
+	if (buf_state & BM_TAG_VALID)
 	{
 		hresult = (LocalBufferLookupEnt *)
 			hash_search(LocalBufHash, (void *) &bufHdr->tag,
@ -237,7 +247,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 			elog(ERROR, "local buffer hash table corrupted");
 		/* mark buffer invalid just in case hash insert fails */
 		CLEAR_BUFFERTAG(bufHdr->tag);
-		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
+		buf_state &= ~(BM_VALID | BM_TAG_VALID);
+		pg_atomic_write_u32(&bufHdr->state, buf_state);
 	}

 	hresult = (LocalBufferLookupEnt *)
@ -250,9 +261,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	 * it's all ours now.
 	 */
 	bufHdr->tag = newTag;
-	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
-	bufHdr->flags |= BM_TAG_VALID;
-	bufHdr->usage_count = 1;
+	buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
+	buf_state |= BM_TAG_VALID;
+	buf_state &= ~BUF_USAGECOUNT_MASK;
+	buf_state += BUF_USAGECOUNT_ONE;
+	pg_atomic_write_u32(&bufHdr->state, buf_state);

 	*foundPtr = FALSE;
 	return bufHdr;
@ -267,6 +280,7 @@ MarkLocalBufferDirty(Buffer buffer)
 {
 	int			bufid;
 	BufferDesc *bufHdr;
+	uint32		buf_state;

 	Assert(BufferIsLocal(buffer));

@ -280,10 +294,10 @@ MarkLocalBufferDirty(Buffer buffer)

 	bufHdr = GetLocalBufferDescriptor(bufid);

-	if (!(bufHdr->flags & BM_DIRTY))
-		pgBufferUsage.local_blks_dirtied++;
+	buf_state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);

-	bufHdr->flags |= BM_DIRTY;
+	if (!(buf_state & BM_DIRTY))
+		pgBufferUsage.local_blks_dirtied++;
 }

 /*
@ -307,8 +321,11 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 	{
 		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
 		LocalBufferLookupEnt *hresult;
+		uint32		buf_state;

-		if ((bufHdr->flags & BM_TAG_VALID) &&
+		buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+		if ((buf_state & BM_TAG_VALID) &&
 			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
 			bufHdr->tag.forkNum == forkNum &&
 			bufHdr->tag.blockNum >= firstDelBlock)
@ -327,8 +344,9 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 				elog(ERROR, "local buffer hash table corrupted");
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
-			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			buf_state &= ~BUF_FLAG_MASK;
+			buf_state &= ~BUF_USAGECOUNT_MASK;
+			pg_atomic_write_u32(&bufHdr->state, buf_state);
 		}
 	}
 }
@ -349,8 +367,11 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 	{
 		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
 		LocalBufferLookupEnt *hresult;
+		uint32		buf_state;

-		if ((bufHdr->flags & BM_TAG_VALID) &&
+		buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+		if ((buf_state & BM_TAG_VALID) &&
 			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
 		{
 			if (LocalRefCount[i] != 0)
@ -367,8 +388,9 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 				elog(ERROR, "local buffer hash table corrupted");
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
-			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			buf_state &= ~BUF_FLAG_MASK;
+			buf_state &= ~BUF_USAGECOUNT_MASK;
+			pg_atomic_write_u32(&bufHdr->state, buf_state);
 		}
 	}
 }
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@ -3,6 +3,38 @@
 * s_lock.c
 *	   Hardware-dependent implementation of spinlocks.
 *
+ * When waiting for a contended spinlock we loop tightly for awhile, then
+ * delay using pg_usleep() and try again.  Preferably, "awhile" should be a
+ * small multiple of the maximum time we expect a spinlock to be held.  100
+ * iterations seems about right as an initial guess.  However, on a
+ * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario
+ * it's usually better to spin a bit longer than to call the kernel, so we try
+ * to adapt the spin loop count depending on whether we seem to be in a
+ * uniprocessor or multiprocessor.
+ *
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure.  This has been seen particularly on Alphas; it seems
+ * that the first TAS after returning from kernel space will always fail
+ * on that hardware.
+ *
+ * Once we do decide to block, we use randomly increasing pg_usleep()
+ * delays. The first delay is 1 msec, then the delay randomly increases to
+ * about one second, after which we reset to 1 msec and start again.  The
+ * idea here is that in the presence of heavy contention we need to
+ * increase the delay, else the spinlock holder may never get to run and
+ * release the lock.  (Consider situation where spinlock holder has been
+ * nice'd down in priority by the scheduler --- it will not get scheduled
+ * until all would-be acquirers are sleeping, so if we always use a 1-msec
+ * sleep, there is a real possibility of starvation.)  But we can't just
+ * clamp the delay to an upper bound, else it would take a long time to
+ * make a reasonable number of tries.
+ *
+ * We time out and declare error after NUM_DELAYS delays (thus, exactly
+ * that many tries).  With the given settings, this will usually take 2 or
+ * so minutes.  It seems better to fix the total number of tries (and thus
+ * the probability of unintended failure) than to fix the total time
+ * spent.
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@ -21,6 +53,14 @@
 #include "storage/s_lock.h"
 #include "storage/barrier.h"

+
+#define MIN_SPINS_PER_DELAY 10
+#define MAX_SPINS_PER_DELAY 1000
+#define NUM_DELAYS			1000
+#define MIN_DELAY_USEC		1000L
+#define MAX_DELAY_USEC		1000000L
+
+
 slock_t		dummy_spinlock;

 static int	spins_per_delay = DEFAULT_SPINS_PER_DELAY;
@ -30,128 +70,35 @@ static int	spins_per_delay = DEFAULT_SPINS_PER_DELAY;
 * s_lock_stuck() - complain about a stuck spinlock
 */
 static void
-s_lock_stuck(volatile slock_t *lock, const char *file, int line)
+s_lock_stuck(void *p, const char *file, int line)
 {
 #if defined(S_LOCK_TEST)
 	fprintf(stderr,
 			"\nStuck spinlock (%p) detected at %s:%d.\n",
-			lock, file, line);
+			p, file, line);
 	exit(1);
 #else
 	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
-		 lock, file, line);
+		 p, file, line);
 #endif
 }

-
 /*
 * s_lock(lock) - platform-independent portion of waiting for a spinlock.
 */
 int
 s_lock(volatile slock_t *lock, const char *file, int line)
 {
-	/*
-	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
-	 * Preferably, "awhile" should be a small multiple of the maximum time we
-	 * expect a spinlock to be held.  100 iterations seems about right as an
-	 * initial guess.  However, on a uniprocessor the loop is a waste of
-	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
-	 * longer than to call the kernel, so we try to adapt the spin loop count
-	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
-	 *
-	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
-	 * be wrong; there are platforms where that can result in a "stuck
-	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
-	 * that the first TAS after returning from kernel space will always fail
-	 * on that hardware.
-	 *
-	 * Once we do decide to block, we use randomly increasing pg_usleep()
-	 * delays. The first delay is 1 msec, then the delay randomly increases to
-	 * about one second, after which we reset to 1 msec and start again.  The
-	 * idea here is that in the presence of heavy contention we need to
-	 * increase the delay, else the spinlock holder may never get to run and
-	 * release the lock.  (Consider situation where spinlock holder has been
-	 * nice'd down in priority by the scheduler --- it will not get scheduled
-	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
-	 * sleep, there is a real possibility of starvation.)  But we can't just
-	 * clamp the delay to an upper bound, else it would take a long time to
-	 * make a reasonable number of tries.
-	 *
-	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
-	 * that many tries).  With the given settings, this will usually take 2 or
-	 * so minutes.  It seems better to fix the total number of tries (and thus
-	 * the probability of unintended failure) than to fix the total time
-	 * spent.
-	 */
-#define MIN_SPINS_PER_DELAY 10
-#define MAX_SPINS_PER_DELAY 1000
-#define NUM_DELAYS			1000
-#define MIN_DELAY_USEC		1000L
-#define MAX_DELAY_USEC		1000000L
-
-	int			spins = 0;
-	int			delays = 0;
-	int			cur_delay = 0;
+	SpinDelayStatus delayStatus = init_spin_delay((void *) lock);

 	while (TAS_SPIN(lock))
 	{
-		/* CPU-specific delay each time through the loop */
-		SPIN_DELAY();
-
-		/* Block the process every spins_per_delay tries */
-		if (++spins >= spins_per_delay)
-		{
-			if (++delays > NUM_DELAYS)
-				s_lock_stuck(lock, file, line);
-
-			if (cur_delay == 0) /* first time to delay? */
-				cur_delay = MIN_DELAY_USEC;
-
-			pg_usleep(cur_delay);
-
-#if defined(S_LOCK_TEST)
-			fprintf(stdout, "*");
-			fflush(stdout);
-#endif
-
-			/* increase delay by a random fraction between 1X and 2X */
-			cur_delay += (int) (cur_delay *
-					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
-			/* wrap back to minimum delay when max is exceeded */
-			if (cur_delay > MAX_DELAY_USEC)
-				cur_delay = MIN_DELAY_USEC;
-
-			spins = 0;
-		}
+		perform_spin_delay(&delayStatus);
 	}

-	/*
-	 * If we were able to acquire the lock without delaying, it's a good
-	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
-	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
-	 * decrement spins_per_delay slowly when we had to delay, and increase it
-	 * rapidly when we didn't.  It's expected that spins_per_delay will
-	 * converge to the minimum value on a uniprocessor and to the maximum
-	 * value on a multiprocessor.
-	 *
-	 * Note: spins_per_delay is local within our current process. We want to
-	 * average these observations across multiple backends, since it's
-	 * relatively rare for this function to even get entered, and so a single
-	 * backend might not live long enough to converge on a good value.  That
-	 * is handled by the two routines below.
-	 */
-	if (cur_delay == 0)
-	{
-		/* we never had to delay */
-		if (spins_per_delay < MAX_SPINS_PER_DELAY)
-			spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
-	}
-	else
-	{
-		if (spins_per_delay > MIN_SPINS_PER_DELAY)
-			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
-	}
-	return delays;
+	finish_spin_delay(&delayStatus);
+
+	return delayStatus.delays;
 }

 #ifdef USE_DEFAULT_S_UNLOCK
@ -167,6 +114,75 @@ s_unlock(volatile slock_t *lock)
 }
 #endif

+/*
+ * Wait while spinning on a contended spinlock.
+ */
+void
+perform_spin_delay(SpinDelayStatus *status)
+{
+	/* CPU-specific delay each time through the loop */
+	SPIN_DELAY();
+
+	/* Block the process every spins_per_delay tries */
+	if (++(status->spins) >= spins_per_delay)
+	{
+		if (++(status->delays) > NUM_DELAYS)
+			s_lock_stuck(status->ptr, status->file, status->line);
+
+		if (status->cur_delay == 0)		/* first time to delay? */
+			status->cur_delay = MIN_DELAY_USEC;
+
+		pg_usleep(status->cur_delay);
+
+#if defined(S_LOCK_TEST)
+		fprintf(stdout, "*");
+		fflush(stdout);
+#endif
+
+		/* increase delay by a random fraction between 1X and 2X */
+		status->cur_delay += (int) (status->cur_delay *
+					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
+		/* wrap back to minimum delay when max is exceeded */
+		if (status->cur_delay > MAX_DELAY_USEC)
+			status->cur_delay = MIN_DELAY_USEC;
+
+		status->spins = 0;
+	}
+}
+
+/*
+ * After acquiring a spinlock, update estimates about how long to loop.
+ *
+ * If we were able to acquire the lock without delaying, it's a good
+ * indication we are in a multiprocessor.  If we had to delay, it's a sign
+ * (but not a sure thing) that we are in a uniprocessor. Hence, we
+ * decrement spins_per_delay slowly when we had to delay, and increase it
+ * rapidly when we didn't.  It's expected that spins_per_delay will
+ * converge to the minimum value on a uniprocessor and to the maximum
+ * value on a multiprocessor.
+ *
+ * Note: spins_per_delay is local within our current process. We want to
+ * average these observations across multiple backends, since it's
+ * relatively rare for this function to even get entered, and so a single
+ * backend might not live long enough to converge on a good value.  That
+ * is handled by the two routines below.
+ */
+void
+finish_spin_delay(SpinDelayStatus *status)
+{
+	if (status->cur_delay == 0)
+	{
+		/* we never had to delay */
+		if (spins_per_delay < MAX_SPINS_PER_DELAY)
+			spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
+	}
+	else
+	{
+		if (spins_per_delay > MIN_SPINS_PER_DELAY)
+			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
+	}
+}
+
 /*
 * Set local copy of spins_per_delay during backend startup.
 *
--- a/src/include/postmaster/postmaster.h
+++ b/src/include/postmaster/postmaster.h
@ -63,12 +63,15 @@ extern void ShmemBackendArrayAllocation(void);
 #endif

 /*
- * Note: MAX_BACKENDS is limited to 2^23-1 because inval.c stores the
- * backend ID as a 3-byte signed integer.  Even if that limitation were
- * removed, we still could not exceed INT_MAX/4 because some places compute
- * 4*MaxBackends without any overflow check.  This is rechecked in the relevant
- * GUC check hooks and in RegisterBackgroundWorker().
+ * Note: MAX_BACKENDS is limited to 2^18-1 because that's the width reserved
+ * for buffer references in buf_internals.h.  This limitation could be lifted
+ * by using a 64bit state; but it's unlikely to be worthwhile as 2^18-1
+ * backends exceed currently realistic configurations. Even if that limitation
+ * were removed, we still could not a) exceed 2^23-1 because inval.c stores
+ * the backend ID as a 3-byte signed integer, b) INT_MAX/4 because some places
+ * compute 4*MaxBackends without any overflow check.  This is rechecked in the
+ * relevant GUC check hooks and in RegisterBackgroundWorker().
 */
-#define MAX_BACKENDS	0x7fffff
+#define MAX_BACKENDS	0x3FFFF

 #endif   /* _POSTMASTER_H */
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@ -21,29 +21,51 @@
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
 #include "storage/smgr.h"
+#include "port/atomics.h"
 #include "storage/spin.h"
 #include "utils/relcache.h"


+/*
+ * Buffer state is a single 32-bit variable where following data is combined.
+ *
+ * - 18 bits refcount
+ * - 4 bits usage count
+ * - 10 bits of flags
+ *
+ * Combining these values allows to perform some operations without locking
+ * the buffer header, by modifying them together with a CAS loop.
+ *
+ * The definition of buffer state components is below.
+ */
+#define BUF_REFCOUNT_ONE 1
+#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+#define BUF_USAGECOUNT_MASK 0x003C0000U
+#define BUF_USAGECOUNT_ONE (1U << 18)
+#define BUF_USAGECOUNT_SHIFT 18
+#define BUF_FLAG_MASK 0xFFC00000U
+
+/* Get refcount and usagecount from buffer state */
+#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+
 /*
 * Flags for buffer descriptors
 *
 * Note: TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
-#define BM_DIRTY				(1 << 0)		/* data needs writing */
-#define BM_VALID				(1 << 1)		/* data is valid */
-#define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
-#define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
-#define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
-#define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
-#define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
-#define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
-#define BM_PERMANENT			(1 << 8)		/* permanent relation (not
+#define BM_LOCKED				(1U << 22)		/* buffer header is locked */
+#define BM_DIRTY				(1U << 23)		/* data needs writing */
+#define BM_VALID				(1U << 24)		/* data is valid */
+#define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
+#define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
+#define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
+#define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
+#define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
+#define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
+#define BM_PERMANENT			(1U << 31)		/* permanent relation (not
 												 * unlogged) */
-
-typedef bits16 BufFlags;
-
 /*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm.  A
@ -113,18 +135,29 @@ typedef struct buftag
 /*
 *	BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
- * Note: buf_hdr_lock must be held to examine or change the tag, flags,
- * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
- * changes after initialization, so does not need locking.  freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
- * take care of itself.  The buf_hdr_lock is *not* used to control access to
- * the data in the buffer!
+ * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
+ * the tag, state or wait_backend_pid fields.  In general, buffer header lock
+ * is a spinlock which is combined with flags, refcount and usagecount into
+ * single atomic variable.  This layout allow us to do some operations in a
+ * single atomic operation, without actually acquiring and releasing spinlock;
+ * for instance, increase or decrease refcount.  buf_id field never changes
+ * after initialization, so does not need locking.  freeNext is protected by
+ * the buffer_strategy_lock not buffer header lock.  The LWLock can take care
+ * of itself.  The buffer header lock is *not* used to control access to the
+ * data in the buffer!
+ *
+ * It's assumed that nobody changes the state field while buffer header lock
+ * is held.  Thus buffer header lock holder can do complex updates of the
+ * state variable in single write, simultaneously with lock release (cleaning
+ * BM_LOCKED flag).  On the other hand, updating of state without holding
+ * buffer header lock is restricted to CAS, which insure that BM_LOCKED flag
+ * is not set.  Atomic increment/decrement, OR/AND etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
- * underneath us, so we can examine the tag without locking the spinlock.
+ * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
- * lock the spinlock; this is generally for situations where we don't expect
- * the flag bit being tested to be changing.
+ * lock the buffer header; this is generally for situations where we don't
+ * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
@ -142,13 +175,12 @@ typedef struct buftag
 typedef struct BufferDesc
 {
 	BufferTag	tag;			/* ID of page contained in buffer */
-	BufFlags	flags;			/* see bit definitions above */
-	uint8		usage_count;	/* usage counter for clock sweep code */
-	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
-	unsigned	refcount;		/* # of backends holding pins on buffer */
-	int			wait_backend_pid;		/* backend PID of pin-count waiter */
-
 	int			buf_id;			/* buffer's index number (from 0) */
+
+	/* state of the tag, containing flags, refcount and usagecount */
+	pg_atomic_uint32 state;
+
+	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 	int			freeNext;		/* link in freelist chain */

 	LWLock		content_lock;	/* to lock access to buffer contents */
@ -202,11 +234,15 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
 #define FREENEXT_NOT_IN_LIST	(-2)

 /*
- * Macros for acquiring/releasing a shared buffer header's spinlock.
- * Do not apply these to local buffers!
+ * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
+ * not apply these to local buffers!
 */
-#define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
-#define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
+extern uint32 LockBufHdr(BufferDesc *desc);
+#define UnlockBufHdr(desc, s)	\
+	do {	\
+		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
+		pg_write_barrier(); \
+	} while (0)


 /*
@ -267,7 +303,8 @@ extern void IssuePendingWritebacks(WritebackContext *context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);

 /* freelist.c */
-extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
+extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+				  uint32 *buf_state);
 extern void StrategyFreeBuffer(BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 					 BufferDesc *buf);
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@ -991,4 +991,22 @@ extern int s_lock(volatile slock_t *lock, const char *file, int line);
 extern void set_spins_per_delay(int shared_spins_per_delay);
 extern int	update_spins_per_delay(int shared_spins_per_delay);

+/*
+ * Support for spin delay which is useful in various places where
+ * spinlock-like procedures take place.
+ */
+typedef struct
+{
+	int			spins;
+	int			delays;
+	int			cur_delay;
+	void	   *ptr;
+	const char *file;
+	int			line;
+} SpinDelayStatus;
+
+#define init_spin_delay(ptr) {0, 0, 0, (ptr), __FILE__, __LINE__}
+void perform_spin_delay(SpinDelayStatus *status);
+void finish_spin_delay(SpinDelayStatus *status);
+
 #endif	 /* S_LOCK_H */
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@ -1859,6 +1859,7 @@ SpGistScanOpaqueData
 SpGistState
 SpGistTypeDesc
 SpecialJoinInfo
+SpinDelayStatus
 SplitInterval
 SplitLR
 SplitVar