/*-------------------------------------------------------------------------
 *
 * slab.c
 *	  SLAB allocator definitions.
 *
 * SLAB is a MemoryContext implementation designed for cases where large
 * numbers of equally-sized objects are allocated (and freed).
 *
 *
 * Portions Copyright (c) 2017, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/utils/mmgr/slab.c
 *
 *
 * NOTE:
 * The constant allocation size allows significant simplification and various
 * optimizations over more general purpose allocators. The blocks are carved
 * into chunks of exactly the right size (plus alignment), not wasting any
 * memory.
 *
 * The information about free chunks is maintained both at the block level and
 * global (context) level. This is possible as the chunk size (and thus also
 * the number of chunks per block) is fixed.
 *
 * On each block, free chunks are tracked in a simple linked list. The contents
 * of free chunks are replaced with an index of the next free chunk, forming
 * a very simple linked list. Each block also contains a counter of free
 * chunks. Combined with the local block-level freelist, it makes it trivial
 * to eventually free the whole block.
 *
 * At the context level, we use 'freelist' to track blocks ordered by number
 * of free chunks, starting with blocks having a single allocated chunk, and
 * with completely full blocks on the tail.
 *
 * This also allows various optimizations - for example when searching for
 * free chunk, the allocator reuses space from the fullest blocks first, in
 * the hope that some of the less full blocks will get completely empty (and
 * returned back to the OS).
 *
 * For each block, we maintain pointer to the first free chunk - this is quite
 * cheap and allows us to skip all the preceding used chunks, eliminating
 * a significant number of lookups in many common usage patterns. In the worst
 * case this performs as if the pointer was not maintained.
 *
 * We cache the freelist index for the blocks with the fewest free chunks
 * (minFreeChunks), so that we don't have to search the freelist on every
 * SlabAlloc() call, which is quite expensive.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "utils/memdebug.h"
#include "utils/memutils.h"
#include "lib/ilist.h"


/*
 * SlabContext is a specialized implementation of MemoryContext.
 */
typedef struct SlabContext
{
	MemoryContextData header;	/* Standard memory-context fields */
	/* Allocation parameters for this context: */
	Size		chunkSize;		/* chunk size */
	Size		fullChunkSize;	/* chunk size including header and alignment */
	Size		blockSize;		/* block size */
	int			chunksPerBlock; /* number of chunks per block */
	int			minFreeChunks;	/* min number of free chunks in any block */
	int			nblocks;		/* number of blocks allocated */
	/* blocks with free space, grouped by number of free chunks: */
	/* freelist[k] holds blocks with exactly k free chunks, 0 <= k <= chunksPerBlock */
	dlist_head	freelist[FLEXIBLE_ARRAY_MEMBER];
} SlabContext;

/*
 * SlabBlock
 *		Structure of a single block in SLAB allocator.
 *
 * node: doubly-linked list of blocks in global freelist
 * nfree: number of free chunks in this block
 * firstFreeChunk: index of the first free chunk
 */
typedef struct SlabBlock
{
	dlist_node	node;			/* doubly-linked list */
	int			nfree;			/* number of free chunks */
	int			firstFreeChunk; /* index of the first free chunk in the block */
} SlabBlock;

/*
 * SlabChunk
 *		The prefix of each piece of memory in an SlabBlock
 *
 * The user-visible pointer returned by SlabAlloc points just past this
 * header; SlabPointerGetChunk/SlabChunkGetPointer convert between the two.
 */
typedef struct SlabChunk
{
	/* block owning this chunk */
	void	   *block;
	SlabContext *slab;			/* owning context */
	/* there must not be any padding to reach a MAXALIGN boundary here!
	 */
} SlabChunk;


/* Convert a user data pointer to its SlabChunk header, and back. */
#define SlabPointerGetChunk(ptr)	\
	((SlabChunk *)(((char *)(ptr)) - sizeof(SlabChunk)))
#define SlabChunkGetPointer(chk)	\
	((void *)(((char *)(chk)) + sizeof(SlabChunk)))
/* Address of the idx'th chunk in a block (chunks are laid out contiguously
 * after the SlabBlock header, each fullChunkSize bytes apart). */
#define SlabBlockGetChunk(slab, block, idx) \
	((SlabChunk *) ((char *) (block) + sizeof(SlabBlock)	\
					+ (idx * slab->fullChunkSize)))
/* First byte of chunk storage within a block. */
#define SlabBlockStart(block)	\
	((char *) block + sizeof(SlabBlock))
/* Inverse of SlabBlockGetChunk: index of a chunk within its block. */
#define SlabChunkIndex(slab, block, chunk)	\
	(((char *) chunk - SlabBlockStart(block)) / slab->fullChunkSize)

/*
 * These functions implement the MemoryContext API for Slab contexts.
 */
static void *SlabAlloc(MemoryContext context, Size size);
static void SlabFree(MemoryContext context, void *pointer);
static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
static void SlabInit(MemoryContext context);
static void SlabReset(MemoryContext context);
static void SlabDelete(MemoryContext context);
static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
static bool SlabIsEmpty(MemoryContext context);
static void SlabStats(MemoryContext context, int level, bool print,
		  MemoryContextCounters *totals);

#ifdef MEMORY_CONTEXT_CHECKING
static void SlabCheck(MemoryContext context);
#endif

/*
 * This is the virtual function table for Slab contexts.
 */
static MemoryContextMethods SlabMethods = {
	SlabAlloc,
	SlabFree,
	SlabRealloc,
	SlabInit,
	SlabReset,
	SlabDelete,
	SlabGetChunkSpace,
	SlabIsEmpty,
	SlabStats
#ifdef MEMORY_CONTEXT_CHECKING
	,SlabCheck
#endif
};

/* ----------
 * Debug macros
 * ----------
 */
#ifdef HAVE_ALLOCINFO
#define SlabFreeInfo(_cxt, _chunk) \
			fprintf(stderr, "SlabFree: %s: %p, %zu\n", \
				(_cxt)->header.name, (_chunk), (_chunk)->header.size)
#define SlabAllocInfo(_cxt, _chunk) \
			fprintf(stderr, "SlabAlloc: %s: %p, %zu\n", \
				(_cxt)->header.name, (_chunk), (_chunk)->header.size)
#else
#define SlabFreeInfo(_cxt, _chunk)
#define SlabAllocInfo(_cxt, _chunk)
#endif


/*
 * SlabContextCreate
 *		Create a new Slab context.
* * parent: parent context, or NULL if top-level context * name: name of context (for debugging --- string will be copied) * blockSize: allocation block size * chunkSize: allocation chunk size * * The chunkSize may not exceed: * MAXALIGN_DOWN(SIZE_MAX) - MAXALIGN(sizeof(SlabBlock)) - SLAB_CHUNKHDRSZ * */ MemoryContext SlabContextCreate(MemoryContext parent, const char *name, Size blockSize, Size chunkSize) { int chunksPerBlock; Size fullChunkSize; Size freelistSize; SlabContext *slab; StaticAssertStmt(offsetof(SlabChunk, slab) +sizeof(MemoryContext) == MAXALIGN(sizeof(SlabChunk)), "padding calculation in SlabChunk is wrong"); /* otherwise the linked list inside freed chunk isn't guaranteed to fit */ StaticAssertStmt(MAXIMUM_ALIGNOF >= sizeof(int), "MAXALIGN too small to fit int32"); /* chunk, including SLAB header (both addresses nicely aligned) */ fullChunkSize = MAXALIGN(sizeof(SlabChunk) + MAXALIGN(chunkSize)); /* Make sure the block can store at least one chunk. */ if (blockSize - sizeof(SlabBlock) < fullChunkSize) elog(ERROR, "block size %zu for slab is too small for %zu chunks", blockSize, chunkSize); /* Compute maximum number of chunks per block */ chunksPerBlock = (blockSize - sizeof(SlabBlock)) / fullChunkSize; /* The freelist starts with 0, ends with chunksPerBlock. 
*/ freelistSize = sizeof(dlist_head) * (chunksPerBlock + 1); /* if we can't fit at least one chunk into the block, we're hosed */ Assert(chunksPerBlock > 0); /* make sure the chunks actually fit on the block */ Assert((fullChunkSize * chunksPerBlock) + sizeof(SlabBlock) <= blockSize); /* Do the type-independent part of context creation */ slab = (SlabContext *) MemoryContextCreate(T_SlabContext, (offsetof(SlabContext, freelist) +freelistSize), &SlabMethods, parent, name); slab->blockSize = blockSize; slab->chunkSize = chunkSize; slab->fullChunkSize = fullChunkSize; slab->chunksPerBlock = chunksPerBlock; slab->nblocks = 0; slab->minFreeChunks = 0; return (MemoryContext) slab; } /* * SlabInit * Context-type-specific initialization routine. */ static void SlabInit(MemoryContext context) { int i; SlabContext *slab = castNode(SlabContext, context); Assert(slab); /* initialize the freelist slots */ for (i = 0; i < (slab->chunksPerBlock + 1); i++) dlist_init(&slab->freelist[i]); } /* * SlabReset * Frees all memory which is allocated in the given set. * * The code simply frees all the blocks in the context - we don't keep any * keeper blocks or anything like that. */ static void SlabReset(MemoryContext context) { int i; SlabContext *slab = castNode(SlabContext, context); Assert(slab); #ifdef MEMORY_CONTEXT_CHECKING /* Check for corruption and leaks before freeing */ SlabCheck(context); #endif /* walk over freelists and free the blocks */ for (i = 0; i <= slab->chunksPerBlock; i++) { dlist_mutable_iter miter; dlist_foreach_modify(miter, &slab->freelist[i]) { SlabBlock *block = dlist_container(SlabBlock, node, miter.cur); dlist_delete(miter.cur); #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif free(block); slab->nblocks--; } } slab->minFreeChunks = 0; Assert(slab->nblocks == 0); } /* * SlabDelete * Frees all memory which is allocated in the given slab, in preparation * for deletion of the slab. We simply call SlabReset(). 
 */
static void
SlabDelete(MemoryContext context)
{
	/* just reset the context */
	SlabReset(context);
}

/*
 * SlabAlloc
 *		Returns pointer to allocated memory of given size or NULL if
 *		request could not be completed; memory is added to the slab.
 */
static void *
SlabAlloc(MemoryContext context, Size size)
{
	SlabContext *slab = castNode(SlabContext, context);
	SlabBlock  *block;
	SlabChunk  *chunk;
	int			idx;

	Assert(slab);

	/* minFreeChunks == chunksPerBlock is normalized to 0 at function end */
	Assert((slab->minFreeChunks >= 0) &&
		   (slab->minFreeChunks < slab->chunksPerBlock));

	/* make sure we only allow correct request size */
	if (size != slab->chunkSize)
		elog(ERROR, "unexpected alloc chunk size %zu (expected %zu)",
			 size, slab->chunkSize);

	/*
	 * If there are no free chunks in any existing block, create a new block
	 * and put it to the last freelist bucket.
	 *
	 * slab->minFreeChunks == 0 means there are no blocks with free chunks,
	 * thanks to how minFreeChunks is updated at the end of SlabAlloc().
	 */
	if (slab->minFreeChunks == 0)
	{
		block = (SlabBlock *) malloc(slab->blockSize);

		if (block == NULL)
			return NULL;

		block->nfree = slab->chunksPerBlock;
		block->firstFreeChunk = 0;

		/*
		 * Put all the chunks on a freelist. Walk the chunks and point each
		 * one to the next one.  (The last one points to chunksPerBlock,
		 * which serves as the end-of-list marker.)
		 */
		for (idx = 0; idx < slab->chunksPerBlock; idx++)
		{
			chunk = SlabBlockGetChunk(slab, block, idx);
			*(int32 *) SlabChunkGetPointer(chunk) = (idx + 1);
		}

		/*
		 * And add it to the last freelist with all chunks empty.
		 *
		 * We know there are no blocks in the freelist, otherwise we wouldn't
		 * need a new block.
		 */
		Assert(dlist_is_empty(&slab->freelist[slab->chunksPerBlock]));

		dlist_push_head(&slab->freelist[slab->chunksPerBlock], &block->node);

		slab->minFreeChunks = slab->chunksPerBlock;
		slab->nblocks += 1;
	}

	/* grab the block from the freelist (even the new block is there) */
	block = dlist_head_element(SlabBlock, node,
							   &slab->freelist[slab->minFreeChunks]);

	/* make sure we actually got a valid block, with matching nfree */
	Assert(block != NULL);
	Assert(slab->minFreeChunks == block->nfree);
	Assert(block->nfree > 0);

	/* we know index of the first free chunk in the block */
	idx = block->firstFreeChunk;

	/* make sure the chunk index is valid, and that it's marked as empty */
	Assert((idx >= 0) && (idx < slab->chunksPerBlock));

	/* compute the chunk location block start (after the block header) */
	chunk = SlabBlockGetChunk(slab, block, idx);

	/*
	 * Update the block nfree count, and also the minFreeChunks as we've
	 * decreased nfree for a block with the minimum number of free chunks
	 * (because that's how we chose the block).
	 */
	block->nfree--;
	slab->minFreeChunks = block->nfree;

	/*
	 * Remove the chunk from the freelist head. The index of the next free
	 * chunk is stored in the chunk itself.
	 */
	VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(int32));
	block->firstFreeChunk = *(int32 *) SlabChunkGetPointer(chunk);

	Assert(block->firstFreeChunk >= 0);
	Assert(block->firstFreeChunk <= slab->chunksPerBlock);

	/* firstFreeChunk == chunksPerBlock iff the block is now full */
	Assert((block->nfree != 0 &&
			block->firstFreeChunk < slab->chunksPerBlock) ||
		   (block->nfree == 0 &&
			block->firstFreeChunk == slab->chunksPerBlock));

	/* move the whole block to the right place in the freelist */
	dlist_delete(&block->node);
	dlist_push_head(&slab->freelist[block->nfree], &block->node);

	/*
	 * And finally update minFreeChunks, i.e. the index to the block with the
	 * lowest number of free chunks. We only need to do that when the block
	 * got full (otherwise we know the current block is the right one). We'll
	 * simply walk the freelist until we find a non-empty entry.
	 */
	if (slab->minFreeChunks == 0)
	{
		for (idx = 1; idx <= slab->chunksPerBlock; idx++)
		{
			if (dlist_is_empty(&slab->freelist[idx]))
				continue;

			/* found a non-empty freelist */
			slab->minFreeChunks = idx;
			break;
		}
	}

	/*
	 * All blocks being completely full is represented as minFreeChunks == 0,
	 * which forces a new block to be allocated on the next call.
	 */
	if (slab->minFreeChunks == slab->chunksPerBlock)
		slab->minFreeChunks = 0;

	/* Prepare to initialize the chunk header. */
	VALGRIND_MAKE_MEM_UNDEFINED(chunk, sizeof(SlabChunk));

	chunk->block = (void *) block;
	chunk->slab = slab;

#ifdef MEMORY_CONTEXT_CHECKING
	/* slab mark to catch clobber of "unused" space */
	if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
	{
		set_sentinel(SlabChunkGetPointer(chunk), size);
		VALGRIND_MAKE_MEM_NOACCESS(((char *) chunk) +
								   sizeof(SlabChunk) + slab->chunkSize,
								   slab->fullChunkSize -
								   (slab->chunkSize + sizeof(SlabChunk)));
	}
#endif
#ifdef RANDOMIZE_ALLOCATED_MEMORY
	/* fill the allocated space with junk */
	randomize_mem((char *) SlabChunkGetPointer(chunk), size);
#endif

	SlabAllocInfo(slab, chunk);
	return SlabChunkGetPointer(chunk);
}

/*
 * SlabFree
 *		Frees allocated memory; memory is removed from the slab.
 */
static void
SlabFree(MemoryContext context, void *pointer)
{
	int			idx;
	SlabContext *slab = castNode(SlabContext, context);
	SlabChunk  *chunk = SlabPointerGetChunk(pointer);
	SlabBlock  *block = chunk->block;

	SlabFreeInfo(slab, chunk);

#ifdef MEMORY_CONTEXT_CHECKING
	/* Test for someone scribbling on unused space in chunk */
	if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
		if (!sentinel_ok(pointer, slab->chunkSize))
			elog(WARNING, "detected write past chunk end in %s %p",
				 slab->header.name, chunk);
#endif

	/* compute index of the chunk with respect to block start */
	idx = SlabChunkIndex(slab, block, chunk);

	/* add chunk to freelist, and update block nfree count */
	*(int32 *) pointer = block->firstFreeChunk;
	block->firstFreeChunk = idx;
	block->nfree++;

	Assert(block->nfree > 0);
	Assert(block->nfree <= slab->chunksPerBlock);

#ifdef CLOBBER_FREED_MEMORY
	/* XXX don't wipe the int32 index, used for block-level freelist */
	wipe_mem((char *) pointer + sizeof(int32),
			 slab->chunkSize - sizeof(int32));
#endif

	/* remove the block from a freelist (it moves to a different bucket) */
	dlist_delete(&block->node);

	/*
	 * See if we need to update the minFreeChunks field for the slab - we only
	 * need to do that if the block had that number of free chunks
	 * before we freed one. In that case, we check if there still are blocks
	 * in the original freelist and we either keep the current value (if there
	 * still are blocks) or increment it by one (the new block is still the
	 * one with minimum free chunks).
	 *
	 * The one exception is when the block will get completely free - in that
	 * case we will free it, so we can't use it for minFreeChunks. It however
	 * means there are no more blocks with free chunks.
	 */
	if (slab->minFreeChunks == (block->nfree - 1))
	{
		/* Have we removed the last chunk from the freelist? */
		if (dlist_is_empty(&slab->freelist[slab->minFreeChunks]))
		{
			/* but if we made the block entirely free, we'll free it */
			if (block->nfree == slab->chunksPerBlock)
				slab->minFreeChunks = 0;
			else
				slab->minFreeChunks++;
		}
	}

	/* If the block is now completely empty, free it. */
	if (block->nfree == slab->chunksPerBlock)
	{
		free(block);
		slab->nblocks--;
	}
	else
		dlist_push_head(&slab->freelist[block->nfree], &block->node);

	Assert(slab->nblocks >= 0);
}

/*
 * SlabRealloc
 *		Change the allocated size of a chunk.
 *
 * As Slab is designed for allocating equally-sized chunks of memory, it can't
 * do an actual chunk size change. We try to be gentle and allow calls with
 * exactly the same size, as in that case we can simply return the same
 * chunk. When the size differs, we throw an error.
 *
 * We could also allow requests with size < chunkSize. That however seems
 * rather pointless - Slab is meant for chunks of constant size, and moreover
 * realloc is usually used to enlarge the chunk.
 */
static void *
SlabRealloc(MemoryContext context, void *pointer, Size size)
{
	SlabContext *slab = castNode(SlabContext, context);

	Assert(slab);

	/* can't do actual realloc with slab, but let's try to be gentle */
	if (size == slab->chunkSize)
		return pointer;

	elog(ERROR, "slab allocator does not support realloc()");
	return NULL;				/* keep compiler quiet */
}

/*
 * SlabGetChunkSpace
 *		Given a currently-allocated chunk, determine the total space
 *		it occupies (including all memory-allocation overhead).
 */
static Size
SlabGetChunkSpace(MemoryContext context, void *pointer)
{
	SlabContext *slab = castNode(SlabContext, context);

	Assert(slab);

	/* all chunks in a slab have the same footprint */
	return slab->fullChunkSize;
}

/*
 * SlabIsEmpty
 *		Is a Slab empty of any allocated space?
 */
static bool
SlabIsEmpty(MemoryContext context)
{
	SlabContext *slab = castNode(SlabContext, context);

	Assert(slab);

	/* empty blocks are freed immediately, so nblocks == 0 means no chunks */
	return (slab->nblocks == 0);
}

/*
 * SlabStats
 *		Compute stats about memory consumption of a Slab.
* * level: recursion level (0 at top level); used for print indentation. * print: true to print stats to stderr. * totals: if not NULL, add stats about this Slab into *totals. */ static void SlabStats(MemoryContext context, int level, bool print, MemoryContextCounters *totals) { SlabContext *slab = castNode(SlabContext, context); Size nblocks = 0; Size freechunks = 0; Size totalspace = 0; Size freespace = 0; int i; Assert(slab); for (i = 0; i <= slab->chunksPerBlock; i++) { dlist_iter iter; dlist_foreach(iter, &slab->freelist[i]) { SlabBlock *block = dlist_container(SlabBlock, node, iter.cur); nblocks++; totalspace += slab->blockSize; freespace += slab->fullChunkSize * block->nfree; freechunks += block->nfree; } } if (print) { for (i = 0; i < level; i++) fprintf(stderr, " "); fprintf(stderr, "Slab: %s: %zu total in %zd blocks; %zu free (%zd chunks); %zu used\n", slab->header.name, totalspace, nblocks, freespace, freechunks, totalspace - freespace); } if (totals) { totals->nblocks += nblocks; totals->freechunks += freechunks; totals->totalspace += totalspace; totals->freespace += freespace; } } #ifdef MEMORY_CONTEXT_CHECKING /* * SlabCheck * Walk through chunks and check consistency of memory. * * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll * find yourself in an infinite loop when trouble occurs, because this * routine will be entered again when elog cleanup tries to release memory! 
*/ static void SlabCheck(MemoryContext context) { int i; SlabContext *slab = castNode(SlabContext, context); char *name = slab->header.name; char *freechunks; Assert(slab); Assert(slab->chunksPerBlock > 0); /* bitmap of free chunks on a block */ freechunks = palloc(slab->chunksPerBlock * sizeof(bool)); /* walk all the freelists */ for (i = 0; i <= slab->chunksPerBlock; i++) { int j, nfree; dlist_iter iter; /* walk all blocks on this freelist */ dlist_foreach(iter, &slab->freelist[i]) { int idx; SlabBlock *block = dlist_container(SlabBlock, node, iter.cur); /* * Make sure the number of free chunks (in the block header) * matches position in the freelist. */ if (block->nfree != i) elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d", name, block->nfree, block, i); /* reset the bitmap of free chunks for this block */ memset(freechunks, 0, (slab->chunksPerBlock * sizeof(bool))); idx = block->firstFreeChunk; /* * Now walk through the chunks, count the free ones and also * perform some additional checks for the used ones. As the chunk * freelist is stored within the chunks themselves, we have to * walk through the chunks and construct our own bitmap. 
*/ nfree = 0; while (idx < slab->chunksPerBlock) { SlabChunk *chunk; /* count the chunk as free, add it to the bitmap */ nfree++; freechunks[idx] = true; /* read index of the next free chunk */ chunk = SlabBlockGetChunk(slab, block, idx); VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(int32)); idx = *(int32 *) SlabChunkGetPointer(chunk); } for (j = 0; j < slab->chunksPerBlock; j++) { /* non-zero bit in the bitmap means chunk the chunk is used */ if (!freechunks[j]) { SlabChunk *chunk = SlabBlockGetChunk(slab, block, j); /* chunks have both block and slab pointers, so check both */ if (chunk->block != block) elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p", name, block, chunk); if (chunk->slab != slab) elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p", name, block, chunk); /* there might be sentinel (thanks to alignment) */ if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk))) if (!sentinel_ok(chunk, slab->chunkSize)) elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p", name, block, chunk); } } /* * Make sure we got the expected number of free chunks (as tracked * in the block header). */ if (nfree != block->nfree) elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d", name, block->nfree, block, nfree); } } } #endif /* MEMORY_CONTEXT_CHECKING */