postgresql/src/backend/storage/ipc/dsm.c

/*-------------------------------------------------------------------------
 *
 * dsm.c
 *	  manage dynamic shared memory segments
 *
 * This file provides a set of services to make programming with dynamic
 * shared memory segments more convenient.  Unlike the low-level
 * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
 * created using this module will be cleaned up automatically.  Mappings
 * will be removed when the resource owner under which they were created
 * is cleaned up, unless dsm_pin_mapping() is used, in which case they
 * have session lifespan.  Segments will be removed when there are no
 * remaining mappings, or at postmaster shutdown in any case.  After a
 * hard postmaster crash, remaining segments will be removed, if they
 * still exist, at the next postmaster startup.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/dsm.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <fcntl.h>
#include <unistd.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
#include <sys/stat.h>

#include "common/pg_prng.h"
#include "lib/ilist.h"
#include "miscadmin.h"
#include "port/pg_bitutils.h"
#include "storage/dsm.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
#include "utils/freepage.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/resowner.h"

#define PG_DYNSHMEM_CONTROL_MAGIC		0x9a503d32

#define PG_DYNSHMEM_FIXED_SLOTS			64
#define PG_DYNSHMEM_SLOTS_PER_BACKEND	5

#define INVALID_CONTROL_SLOT		((uint32) -1)

/* Backend-local tracking for on-detach callbacks. */
typedef struct dsm_segment_detach_callback
{
	on_dsm_detach_callback function;
	Datum		arg;
	slist_node	node;
} dsm_segment_detach_callback;

/* Backend-local state for a dynamic shared memory segment. */
struct dsm_segment
{
	dlist_node	node;			/* List link in dsm_segment_list. */
	ResourceOwner resowner;		/* Resource owner. */
	dsm_handle	handle;			/* Segment name. */
	uint32		control_slot;	/* Slot in control segment. */
	void	   *impl_private;	/* Implementation-specific private data. */
	void	   *mapped_address; /* Mapping address, or NULL if unmapped. */
	Size		mapped_size;	/* Size of our mapping. */
	slist_head	on_detach;		/* On-detach callbacks. */
};

/* Shared-memory state for a dynamic shared memory segment. */
typedef struct dsm_control_item
{
	dsm_handle	handle;
	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
	size_t		first_page;
	size_t		npages;
	void	   *impl_private_pm_handle; /* only needed on Windows */
	bool		pinned;
} dsm_control_item;

/* Layout of the dynamic shared memory control segment. */
typedef struct dsm_control_header
{
	uint32		magic;
	uint32		nitems;
	uint32		maxitems;
	dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
} dsm_control_header;

static void dsm_cleanup_for_mmap(void);
static void dsm_postmaster_shutdown(int code, Datum arg);
static dsm_segment *dsm_create_descriptor(void);
static bool dsm_control_segment_sane(dsm_control_header *control,
									 Size mapped_size);
static uint64 dsm_control_bytes_needed(uint32 nitems);
static inline dsm_handle make_main_region_dsm_handle(int slot);
static inline bool is_main_region_dsm_handle(dsm_handle handle);

/* Has this backend initialized the dynamic shared memory system yet? */
static bool dsm_init_done = false;

/* Preallocated DSM space in the main shared memory region. */
static void *dsm_main_space_begin = NULL;

/*
 * List of dynamic shared memory segments used by this backend.
 *
 * At process exit time, we must decrement the reference count of each
 * segment we have attached; this list makes it possible to find all such
 * segments.
 *
 * This list should always be empty in the postmaster.  We could probably
 * allow the postmaster to map dynamic shared memory segments before it
 * begins to start child processes, provided that each process adjusted
 * the reference counts for those segments in the control segment at
 * startup time, but there's no obvious need for such a facility, which
 * would also be complex to handle in the EXEC_BACKEND case.  Once the
 * postmaster has begun spawning children, there's an additional problem:
 * each new mapping would require an update to the control segment,
 * which requires locking, in which the postmaster must not be involved.
 */
static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);

/*
 * Control segment information.
 *
 * Unlike ordinary shared memory segments, the control segment is not
 * reference counted; instead, it lasts for the postmaster's entire
 * life cycle.  For simplicity, it doesn't have a dsm_segment object either.
 */
static dsm_handle dsm_control_handle;
static dsm_control_header *dsm_control;
static Size dsm_control_mapped_size = 0;
static void *dsm_control_impl_private = NULL;


/* ResourceOwner callbacks to hold DSM segments */
static void ResOwnerReleaseDSM(Datum res);
static char *ResOwnerPrintDSM(Datum res);

static const ResourceOwnerDesc dsm_resowner_desc =
{
	.name = "dynamic shared memory segment",
	.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
	.release_priority = RELEASE_PRIO_DSMS,
	.ReleaseResource = ResOwnerReleaseDSM,
	.DebugPrint = ResOwnerPrintDSM
};

/* Convenience wrappers over ResourceOwnerRemember/Forget */
static inline void
ResourceOwnerRememberDSM(ResourceOwner owner, dsm_segment *seg)
{
	ResourceOwnerRemember(owner, PointerGetDatum(seg), &dsm_resowner_desc);
}
static inline void
ResourceOwnerForgetDSM(ResourceOwner owner, dsm_segment *seg)
{
	ResourceOwnerForget(owner, PointerGetDatum(seg), &dsm_resowner_desc);
}

/*
 * Start up the dynamic shared memory system.
 *
 * This is called just once during each cluster lifetime, at postmaster
 * startup time.
 */
void
dsm_postmaster_startup(PGShmemHeader *shim)
{
	void	   *dsm_control_address = NULL;
	uint32		maxitems;
	Size		segsize;

	Assert(!IsUnderPostmaster);

	/*
	 * If we're using the mmap implementations, clean up any leftovers.
	 * Cleanup isn't needed on Windows, and happens earlier in startup for
	 * POSIX and System V shared memory, via a direct call to
	 * dsm_cleanup_using_control_segment.
	 */
	if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
		dsm_cleanup_for_mmap();

	/* Determine size for new control segment. */
	maxitems = PG_DYNSHMEM_FIXED_SLOTS
		+ PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
	elog(DEBUG2, "dynamic shared memory system will support %u segments",
		 maxitems);
	segsize = dsm_control_bytes_needed(maxitems);

	/*
	 * Loop until we find an unused identifier for the new control segment. We
	 * sometimes use DSM_HANDLE_INVALID as a sentinel value indicating "no
	 * control segment", so avoid generating that value for a real handle.
	 */
	for (;;)
	{
		Assert(dsm_control_address == NULL);
		Assert(dsm_control_mapped_size == 0);
		/* Use even numbers only */
		dsm_control_handle = pg_prng_uint32(&pg_global_prng_state) << 1;
		if (dsm_control_handle == DSM_HANDLE_INVALID)
			continue;
		if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
						&dsm_control_impl_private, &dsm_control_address,
						&dsm_control_mapped_size, ERROR))
			break;
	}
	dsm_control = dsm_control_address;
	on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
	elog(DEBUG2,
		 "created dynamic shared memory control segment %u (%zu bytes)",
		 dsm_control_handle, segsize);
	shim->dsm_control = dsm_control_handle;

	/* Initialize control segment. */
	dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
	dsm_control->nitems = 0;
	dsm_control->maxitems = maxitems;
}

/*
 * Determine whether the control segment from the previous postmaster
 * invocation still exists.  If so, remove the dynamic shared memory
 * segments to which it refers, and then the control segment itself.
 */
void
dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
{
	void	   *mapped_address = NULL;
	void	   *junk_mapped_address = NULL;
	void	   *impl_private = NULL;
	void	   *junk_impl_private = NULL;
	Size		mapped_size = 0;
	Size		junk_mapped_size = 0;
	uint32		nitems;
	uint32		i;
	dsm_control_header *old_control;

	/*
	 * Try to attach the segment.  If this fails, it probably just means that
	 * the operating system has been rebooted and the segment no longer
	 * exists, or an unrelated process has used the same shm ID.  So just fall
	 * out quietly.
	 */
	if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
					 &mapped_address, &mapped_size, DEBUG1))
		return;

	/*
	 * We've managed to reattach it, but the contents might not be sane. If
	 * they aren't, we disregard the segment after all.
	 */
	old_control = (dsm_control_header *) mapped_address;
	if (!dsm_control_segment_sane(old_control, mapped_size))
	{
		dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
					&mapped_address, &mapped_size, LOG);
		return;
	}

	/*
	 * OK, the control segment looks basically valid, so we can use it to get
	 * a list of segments that need to be removed.
	 */
	nitems = old_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;
		uint32		refcnt;

		/* If the reference count is 0, the slot is actually unused. */
		refcnt = old_control->item[i].refcnt;
		if (refcnt == 0)
			continue;

		/* If it was using the main shmem area, there is nothing to do. */
		handle = old_control->item[i].handle;
		if (is_main_region_dsm_handle(handle))
			continue;

		/* Log debugging information. */
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
			 handle, refcnt);

		/* Destroy the referenced segment. */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Destroy the old control segment, too. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 old_control_handle);
	dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
				&mapped_address, &mapped_size, LOG);
}

/*
 * When we're using the mmap shared memory implementation, "shared memory"
 * segments might even manage to survive an operating system reboot.
 * But there's no guarantee as to exactly what will survive: some segments
 * may survive, and others may not, and the contents of some may be out
 * of date.  In particular, the control segment may be out of date, so we
 * can't rely on it to figure out what to remove.  However, since we know
 * what directory contains the files we used as shared memory, we can simply
 * scan the directory and blow everything away that shouldn't be there.
 */
static void
dsm_cleanup_for_mmap(void)
{
	DIR		   *dir;
	struct dirent *dent;

	/* Scan the directory for something with a name of the correct format. */
	dir = AllocateDir(PG_DYNSHMEM_DIR);

	while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
	{
		if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
					strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
		{
			char		buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];

			snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);

			elog(DEBUG2, "removing file \"%s\"", buf);

			/* We found a matching file; so remove it. */
			if (unlink(buf) != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", buf)));
		}
	}

	/* Cleanup complete. */
	FreeDir(dir);
}

/*
 * At shutdown time, we iterate over the control segment and remove all
 * remaining dynamic shared memory segments.  We avoid throwing errors here;
 * the postmaster is shutting down either way, and this is just non-critical
 * resource cleanup.
 */
static void
dsm_postmaster_shutdown(int code, Datum arg)
{
	uint32		nitems;
	uint32		i;
	void	   *dsm_control_address;
	void	   *junk_mapped_address = NULL;
	void	   *junk_impl_private = NULL;
	Size		junk_mapped_size = 0;
	PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);

	/*
	 * If some other backend exited uncleanly, it might have corrupted the
	 * control segment while it was dying.  In that case, we warn and ignore
	 * the contents of the control segment.  This may end up leaving behind
	 * stray shared memory segments, but there's not much we can do about that
	 * if the metadata is gone.
	 */
	nitems = dsm_control->nitems;
	if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
	{
		ereport(LOG,
				(errmsg("dynamic shared memory control segment is corrupt")));
		return;
	}

	/* Remove any remaining segments. */
	for (i = 0; i < nitems; ++i)
	{
		dsm_handle	handle;

		/* If the reference count is 0, the slot is actually unused. */
		if (dsm_control->item[i].refcnt == 0)
			continue;

		handle = dsm_control->item[i].handle;
		if (is_main_region_dsm_handle(handle))
			continue;

		/* Log debugging information. */
		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
			 handle);

		/* Destroy the segment. */
		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
					&junk_mapped_address, &junk_mapped_size, LOG);
	}

	/* Remove the control segment itself. */
	elog(DEBUG2,
		 "cleaning up dynamic shared memory control segment with ID %u",
		 dsm_control_handle);
	dsm_control_address = dsm_control;
	dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
				&dsm_control_impl_private, &dsm_control_address,
				&dsm_control_mapped_size, LOG);
	dsm_control = dsm_control_address;
	shim->dsm_control = 0;
}

/*
 * Prepare this backend for dynamic shared memory usage.  Under EXEC_BACKEND,
 * we must reread the state file and map the control segment; in other cases,
 * we'll have inherited the postmaster's mapping and global variables.
 */
static void
dsm_backend_startup(void)
{
#ifdef EXEC_BACKEND
	if (IsUnderPostmaster)
	{
		void	   *control_address = NULL;

		/* Attach control segment. */
		Assert(dsm_control_handle != 0);
		dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
		dsm_control = control_address;
		/* If control segment doesn't look sane, something is badly wrong. */
		if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
		{
			dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
						&dsm_control_impl_private, &control_address,
						&dsm_control_mapped_size, WARNING);
			ereport(FATAL,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("dynamic shared memory control segment is not valid")));
		}
	}
#endif

	dsm_init_done = true;
}

#ifdef EXEC_BACKEND
/*
 * When running under EXEC_BACKEND, we get a callback here when the main
 * shared memory segment is re-attached, so that we can record the control
 * handle retrieved from it.
 */
void
dsm_set_control_handle(dsm_handle h)
{
	Assert(dsm_control_handle == 0 && h != 0);
	dsm_control_handle = h;
}
#endif

/*
 * Reserve some space in the main shared memory segment for DSM segments.
 */
size_t
dsm_estimate_size(void)
{
	return 1024 * 1024 * (size_t) min_dynamic_shared_memory;
}

/*
 * Initialize space in the main shared memory segment for DSM segments.
 */
void
dsm_shmem_init(void)
{
	size_t		size = dsm_estimate_size();
	bool		found;

	if (size == 0)
		return;

	dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", size, &found);
	if (!found)
	{
		FreePageManager *fpm = (FreePageManager *) dsm_main_space_begin;
		size_t		first_page = 0;
		size_t		pages;

		/* Reserve space for the FreePageManager. */
		while (first_page * FPM_PAGE_SIZE < sizeof(FreePageManager))
			++first_page;

		/* Initialize it and give it all the rest of the space. */
		FreePageManagerInitialize(fpm, dsm_main_space_begin);
		pages = (size / FPM_PAGE_SIZE) - first_page;
		FreePageManagerPut(fpm, first_page, pages);
	}
}

/*
 * Create a new dynamic shared memory segment.
 *
 * If there is a non-NULL CurrentResourceOwner, the new segment is associated
 * with it and must be detached before the resource owner releases, or a
 * warning will be logged.  If CurrentResourceOwner is NULL, the segment
 * remains attached until explicitly detached or the session ends.
 * Creating with a NULL CurrentResourceOwner is equivalent to creating
 * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
 */
dsm_segment *
dsm_create(Size size, int flags)
{
	dsm_segment *seg;
	uint32		i;
	uint32		nitems;
	size_t		npages = 0;
	size_t		first_page = 0;
	FreePageManager *dsm_main_space_fpm = dsm_main_space_begin;
	bool		using_main_dsm_region = false;

	/*
	 * Unsafe in postmaster. It might seem pointless to allow use of dsm in
	 * single user mode, but otherwise some subsystems will need dedicated
	 * single user mode code paths.
	 */
	Assert(IsUnderPostmaster || !IsPostmasterEnvironment);

	if (!dsm_init_done)
		dsm_backend_startup();

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();

	/*
	 * Lock the control segment while we try to allocate from the main shared
	 * memory area, if configured.
	 */
	if (dsm_main_space_fpm)
	{
		npages = size / FPM_PAGE_SIZE;
		if (size % FPM_PAGE_SIZE > 0)
			++npages;

		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
		if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page))
		{
			/* We can carve out a piece of the main shared memory segment. */
			seg->mapped_address = (char *) dsm_main_space_begin +
				first_page * FPM_PAGE_SIZE;
			seg->mapped_size = npages * FPM_PAGE_SIZE;
			using_main_dsm_region = true;
			/* We'll choose a handle below. */
		}
	}

	if (!using_main_dsm_region)
	{
		/*
		 * We need to create a new memory segment.  Loop until we find an
		 * unused segment identifier.
		 */
		if (dsm_main_space_fpm)
			LWLockRelease(DynamicSharedMemoryControlLock);
		for (;;)
		{
			Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
			/* Use even numbers only */
			seg->handle = pg_prng_uint32(&pg_global_prng_state) << 1;
			if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
				continue;
			if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
							&seg->mapped_address, &seg->mapped_size, ERROR))
				break;
		}
		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	}

	/* Search the control segment for an unused slot. */
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		if (dsm_control->item[i].refcnt == 0)
		{
			if (using_main_dsm_region)
			{
				seg->handle = make_main_region_dsm_handle(i);
				dsm_control->item[i].first_page = first_page;
				dsm_control->item[i].npages = npages;
			}
			else
				Assert(!is_main_region_dsm_handle(seg->handle));
			dsm_control->item[i].handle = seg->handle;
			/* refcnt of 1 triggers destruction, so start at 2 */
			dsm_control->item[i].refcnt = 2;
			dsm_control->item[i].impl_private_pm_handle = NULL;
			dsm_control->item[i].pinned = false;
			seg->control_slot = i;
			LWLockRelease(DynamicSharedMemoryControlLock);
			return seg;
		}
	}

	/* Verify that we can support an additional mapping. */
	if (nitems >= dsm_control->maxitems)
	{
		if (using_main_dsm_region)
			FreePageManagerPut(dsm_main_space_fpm, first_page, npages);
		LWLockRelease(DynamicSharedMemoryControlLock);
		if (!using_main_dsm_region)
			dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, WARNING);
		if (seg->resowner != NULL)
			ResourceOwnerForgetDSM(seg->resowner, seg);
		dlist_delete(&seg->node);
		pfree(seg);

		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
			return NULL;
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("too many dynamic shared memory segments")));
	}

	/* Enter the handle into a new array slot. */
	if (using_main_dsm_region)
	{
		seg->handle = make_main_region_dsm_handle(nitems);
		dsm_control->item[i].first_page = first_page;
		dsm_control->item[i].npages = npages;
	}
	dsm_control->item[nitems].handle = seg->handle;
	/* refcnt of 1 triggers destruction, so start at 2 */
	dsm_control->item[nitems].refcnt = 2;
	dsm_control->item[nitems].impl_private_pm_handle = NULL;
	dsm_control->item[nitems].pinned = false;
	seg->control_slot = nitems;
	dsm_control->nitems++;
	LWLockRelease(DynamicSharedMemoryControlLock);

	return seg;
}

/*
 * Attach a dynamic shared memory segment.
 *
 * See comments for dsm_segment_handle() for an explanation of how this
 * is intended to be used.
 *
 * This function will return NULL if the segment isn't known to the system.
 * This can happen if we're asked to attach the segment, but then everyone
 * else detaches it (causing it to be destroyed) before we get around to
 * attaching it.
 *
 * If there is a non-NULL CurrentResourceOwner, the attached segment is
 * associated with it and must be detached before the resource owner releases,
 * or a warning will be logged.  Otherwise the segment remains attached until
 * explicitly detached or the session ends.  See the note atop dsm_create().
 */
dsm_segment *
dsm_attach(dsm_handle h)
{
	dsm_segment *seg;
	dlist_iter	iter;
	uint32		i;
	uint32		nitems;

	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
	Assert(IsUnderPostmaster);

	if (!dsm_init_done)
		dsm_backend_startup();

	/*
	 * Since this is just a debugging cross-check, we could leave it out
	 * altogether, or include it only in assert-enabled builds.  But since the
	 * list of attached segments should normally be very short, let's include
	 * it always for right now.
	 *
	 * If you're hitting this error, you probably want to attempt to find an
	 * existing mapping via dsm_find_mapping() before calling dsm_attach() to
	 * create a new one.
	 */
	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == h)
			elog(ERROR, "can't attach the same segment more than once");
	}

	/* Create a new segment descriptor. */
	seg = dsm_create_descriptor();
	seg->handle = h;

	/* Bump reference count for this segment in shared memory. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	nitems = dsm_control->nitems;
	for (i = 0; i < nitems; ++i)
	{
		/*
		 * If the reference count is 0, the slot is actually unused.  If the
		 * reference count is 1, the slot is still in use, but the segment is
		 * in the process of going away; even if the handle matches, another
		 * slot may already have started using the same handle value by
		 * coincidence so we have to keep searching.
		 */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If the handle doesn't match, it's not the slot we want. */
		if (dsm_control->item[i].handle != seg->handle)
			continue;

		/* Otherwise we've found a match. */
		dsm_control->item[i].refcnt++;
		seg->control_slot = i;
		if (is_main_region_dsm_handle(seg->handle))
		{
			seg->mapped_address = (char *) dsm_main_space_begin +
				dsm_control->item[i].first_page * FPM_PAGE_SIZE;
			seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE;
		}
		break;
	}
	LWLockRelease(DynamicSharedMemoryControlLock);

	/*
	 * If we didn't find the handle we're looking for in the control segment,
	 * it probably means that everyone else who had it mapped, including the
	 * original creator, died before we got to this point. It's up to the
	 * caller to decide what to do about that.
	 */
	if (seg->control_slot == INVALID_CONTROL_SLOT)
	{
		dsm_detach(seg);
		return NULL;
	}

	/* Here's where we actually try to map the segment. */
	if (!is_main_region_dsm_handle(seg->handle))
		dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
					&seg->mapped_address, &seg->mapped_size, ERROR);

	return seg;
}

/*
 * At backend shutdown time, detach any segments that are still attached.
 * (This is similar to dsm_detach_all, except that there's no reason to
 * unmap the control segment before exiting, so we don't bother.)
 */
void
dsm_backend_shutdown(void)
{
	while (!dlist_is_empty(&dsm_segment_list))
	{
		dsm_segment *seg;

		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
		dsm_detach(seg);
	}
}

/*
 * Detach all shared memory segments, including the control segments.  This
 * should be called, along with PGSharedMemoryDetach, in processes that
 * might inherit mappings but are not intended to be connected to dynamic
 * shared memory.
 */
void
dsm_detach_all(void)
{
	void	   *control_address = dsm_control;

	while (!dlist_is_empty(&dsm_segment_list))
	{
		dsm_segment *seg;

		seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
		dsm_detach(seg);
	}

	if (control_address != NULL)
		dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
					&dsm_control_impl_private, &control_address,
					&dsm_control_mapped_size, ERROR);
}

/*
 * Detach from a shared memory segment, destroying the segment if we
 * remove the last reference.
 *
 * This function should never fail.  It will often be invoked when aborting
 * a transaction, and a further error won't serve any purpose.  It's not a
 * complete disaster if we fail to unmap or destroy the segment; it means a
 * resource leak, but that doesn't necessarily preclude further operations.
 */
void
dsm_detach(dsm_segment *seg)
{
	/*
	 * Invoke registered callbacks.  Just in case one of those callbacks
	 * throws a further error that brings us back here, pop the callback
	 * before invoking it, to avoid infinite error recursion.  Don't allow
	 * interrupts while running the individual callbacks in non-error code
	 * paths, to avoid leaving cleanup work unfinished if we're interrupted by
	 * a statement timeout or similar.
	 */
	HOLD_INTERRUPTS();
	while (!slist_is_empty(&seg->on_detach))
	{
		slist_node *node;
		dsm_segment_detach_callback *cb;
		on_dsm_detach_callback function;
		Datum		arg;

		node = slist_pop_head_node(&seg->on_detach);
		cb = slist_container(dsm_segment_detach_callback, node, node);
		function = cb->function;
		arg = cb->arg;
		pfree(cb);

		function(seg, arg);
	}
	RESUME_INTERRUPTS();

	/*
	 * Try to remove the mapping, if one exists.  Normally, there will be, but
	 * maybe not, if we failed partway through a create or attach operation.
	 * We remove the mapping before decrementing the reference count so that
	 * the process that sees a zero reference count can be certain that no
	 * remaining mappings exist.  Even if this fails, we pretend that it
	 * works, because retrying is likely to fail in the same way.
	 */
	if (seg->mapped_address != NULL)
	{
		if (!is_main_region_dsm_handle(seg->handle))
			dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
						&seg->mapped_address, &seg->mapped_size, WARNING);
		seg->impl_private = NULL;
		seg->mapped_address = NULL;
		seg->mapped_size = 0;
	}

	/* Reduce reference count, if we previously increased it. */
	if (seg->control_slot != INVALID_CONTROL_SLOT)
	{
		uint32		refcnt;
		uint32		control_slot = seg->control_slot;

		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
		Assert(dsm_control->item[control_slot].handle == seg->handle);
		Assert(dsm_control->item[control_slot].refcnt > 1);
		refcnt = --dsm_control->item[control_slot].refcnt;
		seg->control_slot = INVALID_CONTROL_SLOT;
		LWLockRelease(DynamicSharedMemoryControlLock);

		/* If new reference count is 1, try to destroy the segment. */
		if (refcnt == 1)
		{
			/* A pinned segment should never reach 1. */
			Assert(!dsm_control->item[control_slot].pinned);

			/*
			 * If we fail to destroy the segment here, or are killed before we
			 * finish doing so, the reference count will remain at 1, which
			 * will mean that nobody else can attach to the segment.  At
			 * postmaster shutdown time, or when a new postmaster is started
			 * after a hard kill, another attempt will be made to remove the
			 * segment.
			 *
			 * The main case we're worried about here is being killed by a
			 * signal before we can finish removing the segment.  In that
			 * case, it's important to be sure that the segment still gets
			 * removed. If we actually fail to remove the segment for some
			 * other reason, the postmaster may not have any better luck than
			 * we did.  There's not much we can do about that, though.
			 */
			if (is_main_region_dsm_handle(seg->handle) ||
				dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
							&seg->mapped_address, &seg->mapped_size, WARNING))
			{
				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
				if (is_main_region_dsm_handle(seg->handle))
					FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
									   dsm_control->item[control_slot].first_page,
									   dsm_control->item[control_slot].npages);
				Assert(dsm_control->item[control_slot].handle == seg->handle);
				Assert(dsm_control->item[control_slot].refcnt == 1);
				dsm_control->item[control_slot].refcnt = 0;
				LWLockRelease(DynamicSharedMemoryControlLock);
			}
		}
	}

	/* Clean up our remaining backend-private data structures. */
	if (seg->resowner != NULL)
		ResourceOwnerForgetDSM(seg->resowner, seg);
	dlist_delete(&seg->node);
	pfree(seg);
}

/*
 * Keep a dynamic shared memory mapping until end of session.
 *
 * By default, mappings are owned by the current resource owner, which
 * typically means they stick around for the duration of the current query
 * only.
 */
void
dsm_pin_mapping(dsm_segment *seg)
{
	if (seg->resowner != NULL)
	{
		ResourceOwnerForgetDSM(seg->resowner, seg);
		seg->resowner = NULL;
	}
}

/*
 * Arrange to remove a dynamic shared memory mapping at cleanup time.
 *
 * dsm_pin_mapping() can be used to preserve a mapping for the entire
 * lifetime of a process; this function reverses that decision, making
 * the segment owned by the current resource owner.  This may be useful
 * just before performing some operation that will invalidate the segment
 * for future use by this backend.
 */
void
dsm_unpin_mapping(dsm_segment *seg)
{
	Assert(seg->resowner == NULL);
	ResourceOwnerEnlarge(CurrentResourceOwner);
	seg->resowner = CurrentResourceOwner;
	ResourceOwnerRememberDSM(seg->resowner, seg);
}

/*
 * Keep a dynamic shared memory segment until postmaster shutdown, or until
 * dsm_unpin_segment is called.
 *
 * This function should not be called more than once per segment, unless the
 * segment is explicitly unpinned with dsm_unpin_segment in between calls.
 *
 * Note that this function does not arrange for the current process to
 * keep the segment mapped indefinitely; if that behavior is desired,
 * dsm_pin_mapping() should be used from each process that needs to
 * retain the mapping.
 */
void
dsm_pin_segment(dsm_segment *seg)
{
	void	   *handle = NULL;

	/*
	 * Bump reference count for this segment in shared memory. This will
	 * ensure that even if there is no session which is attached to this
	 * segment, it will remain until postmaster shutdown or an explicit call
	 * to unpin.
	 */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	if (dsm_control->item[seg->control_slot].pinned)
		elog(ERROR, "cannot pin a segment that is already pinned");
	if (!is_main_region_dsm_handle(seg->handle))
		dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
	dsm_control->item[seg->control_slot].pinned = true;
	dsm_control->item[seg->control_slot].refcnt++;
	dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
	LWLockRelease(DynamicSharedMemoryControlLock);
}

/*
 * Unpin a dynamic shared memory segment that was previously pinned with
 * dsm_pin_segment.  This function should not be called unless dsm_pin_segment
 * was previously called for this segment.
 *
 * The argument is a dsm_handle rather than a dsm_segment in case you want
 * to unpin a segment to which you haven't attached.  This turns out to be
 * useful if, for example, a reference to one shared memory segment is stored
 * within another shared memory segment.  You might want to unpin the
 * referenced segment before destroying the referencing segment.
 */
void
dsm_unpin_segment(dsm_handle handle)
{
	uint32		control_slot = INVALID_CONTROL_SLOT;
	bool		destroy = false;
	uint32		i;

	/* Find the control slot for the given handle. */
	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
	for (i = 0; i < dsm_control->nitems; ++i)
	{
		/* Skip unused slots and segments that are concurrently going away. */
		if (dsm_control->item[i].refcnt <= 1)
			continue;

		/* If we've found our handle, we can stop searching. */
		if (dsm_control->item[i].handle == handle)
		{
			control_slot = i;
			break;
		}
	}

	/*
	 * We should definitely have found the slot, and it should not already be
	 * in the process of going away, because this function should only be
	 * called on a segment which is pinned.
	 */
	if (control_slot == INVALID_CONTROL_SLOT)
		elog(ERROR, "cannot unpin unknown segment handle");
	if (!dsm_control->item[control_slot].pinned)
		elog(ERROR, "cannot unpin a segment that is not pinned");
	Assert(dsm_control->item[control_slot].refcnt > 1);

	/*
	 * Allow implementation-specific code to run.  We have to do this before
	 * releasing the lock, because impl_private_pm_handle may get modified by
	 * dsm_impl_unpin_segment.
	 */
	if (!is_main_region_dsm_handle(handle))
		dsm_impl_unpin_segment(handle,
							   &dsm_control->item[control_slot].impl_private_pm_handle);

	/* Note that 1 means no references (0 means unused slot). */
	if (--dsm_control->item[control_slot].refcnt == 1)
		destroy = true;
	dsm_control->item[control_slot].pinned = false;

	/* Now we can release the lock. */
	LWLockRelease(DynamicSharedMemoryControlLock);

	/* Clean up resources if that was the last reference. */
	if (destroy)
	{
		void	   *junk_impl_private = NULL;
		void	   *junk_mapped_address = NULL;
		Size		junk_mapped_size = 0;

		/*
		 * For an explanation of how error handling works in this case, see
		 * comments in dsm_detach.  Note that if we reach this point, the
		 * current process certainly does not have the segment mapped, because
		 * if it did, the reference count would have still been greater than 1
		 * even after releasing the reference count held by the pin.  The fact
		 * that there can't be a dsm_segment for this handle makes it OK to
		 * pass the mapped size, mapped address, and private data as NULL
		 * here.
		 */
		if (is_main_region_dsm_handle(handle) ||
			dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
						&junk_mapped_address, &junk_mapped_size, WARNING))
		{
			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
			if (is_main_region_dsm_handle(handle))
				FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
								   dsm_control->item[control_slot].first_page,
								   dsm_control->item[control_slot].npages);
			Assert(dsm_control->item[control_slot].handle == handle);
			Assert(dsm_control->item[control_slot].refcnt == 1);
			dsm_control->item[control_slot].refcnt = 0;
			LWLockRelease(DynamicSharedMemoryControlLock);
		}
	}
}

/*
 * Find an existing mapping for a shared memory segment, if there is one.
 */
dsm_segment *
dsm_find_mapping(dsm_handle handle)
{
	dlist_iter	iter;
	dsm_segment *seg;

	dlist_foreach(iter, &dsm_segment_list)
	{
		seg = dlist_container(dsm_segment, node, iter.cur);
		if (seg->handle == handle)
			return seg;
	}

	return NULL;
}

/*
 * Get the address at which a dynamic shared memory segment is mapped.
 */
void *
dsm_segment_address(dsm_segment *seg)
{
	Assert(seg->mapped_address != NULL);
	return seg->mapped_address;
}

/*
 * Get the size of a mapping.
 */
Size
dsm_segment_map_length(dsm_segment *seg)
{
	Assert(seg->mapped_address != NULL);
	return seg->mapped_size;
}

/*
 * Get a handle for a mapping.
 *
 * To establish communication via dynamic shared memory between two backends,
 * one of them should first call dsm_create() to establish a new shared
 * memory mapping.  That process should then call dsm_segment_handle() to
 * obtain a handle for the mapping, and pass that handle to the
 * coordinating backend via some means (e.g. bgw_main_arg, or via the
 * main shared memory segment).  The recipient, once in possession of the
 * handle, should call dsm_attach().
 */
dsm_handle
dsm_segment_handle(dsm_segment *seg)
{
	return seg->handle;
}

/*
 * Register an on-detach callback for a dynamic shared memory segment.
 */
void
on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
{
	dsm_segment_detach_callback *cb;

	cb = MemoryContextAlloc(TopMemoryContext,
							sizeof(dsm_segment_detach_callback));
	cb->function = function;
	cb->arg = arg;
	slist_push_head(&seg->on_detach, &cb->node);
}

/*
 * Unregister an on-detach callback for a dynamic shared memory segment.
 */
void
cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
					 Datum arg)
{
	slist_mutable_iter iter;

	slist_foreach_modify(iter, &seg->on_detach)
	{
		dsm_segment_detach_callback *cb;

		cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
		if (cb->function == function && cb->arg == arg)
		{
			slist_delete_current(&iter);
			pfree(cb);
			break;
		}
	}
}

/*
 * Discard all registered on-detach callbacks without executing them.
 */
void
reset_on_dsm_detach(void)
{
	dlist_iter	iter;

	dlist_foreach(iter, &dsm_segment_list)
	{
		dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);

		/* Throw away explicit on-detach actions one by one. */
		while (!slist_is_empty(&seg->on_detach))
		{
			slist_node *node;
			dsm_segment_detach_callback *cb;

			node = slist_pop_head_node(&seg->on_detach);
			cb = slist_container(dsm_segment_detach_callback, node, node);
			pfree(cb);
		}

		/*
		 * Decrementing the reference count is a sort of implicit on-detach
		 * action; make sure we don't do that, either.
		 */
		seg->control_slot = INVALID_CONTROL_SLOT;
	}
}

/*
 * Create a segment descriptor.
 */
static dsm_segment *
dsm_create_descriptor(void)
{
	dsm_segment *seg;

	if (CurrentResourceOwner)
		ResourceOwnerEnlarge(CurrentResourceOwner);

	seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
	dlist_push_head(&dsm_segment_list, &seg->node);

	/* seg->handle must be initialized by the caller */
	seg->control_slot = INVALID_CONTROL_SLOT;
	seg->impl_private = NULL;
	seg->mapped_address = NULL;
	seg->mapped_size = 0;

	seg->resowner = CurrentResourceOwner;
	if (CurrentResourceOwner)
		ResourceOwnerRememberDSM(CurrentResourceOwner, seg);

	slist_init(&seg->on_detach);

	return seg;
}

/*
 * Sanity check a control segment.
 *
 * The goal here isn't to detect everything that could possibly be wrong with
 * the control segment; there's not enough information for that.  Rather, the
 * goal is to make sure that someone can iterate over the items in the segment
 * without overrunning the end of the mapping and crashing.  We also check
 * the magic number since, if that's messed up, this may not even be one of
 * our segments at all.
 */
static bool
dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
{
	if (mapped_size < offsetof(dsm_control_header, item))
		return false;			/* Mapped size too short to read header. */
	if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
		return false;			/* Magic number doesn't match. */
	if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
		return false;			/* Max item count won't fit in map. */
	if (control->nitems > control->maxitems)
		return false;			/* Overfull. */
	return true;
}

/*
 * Compute the number of control-segment bytes needed to store a given
 * number of items.
 */
static uint64
dsm_control_bytes_needed(uint32 nitems)
{
	return offsetof(dsm_control_header, item)
		+ sizeof(dsm_control_item) * (uint64) nitems;
}

static inline dsm_handle
make_main_region_dsm_handle(int slot)
{
	dsm_handle	handle;

	/*
	 * We need to create a handle that doesn't collide with any existing extra
	 * segment created by dsm_impl_op(), so we'll make it odd.  It also
	 * mustn't collide with any other main area pseudo-segment, so we'll
	 * include the slot number in some of the bits.  We also want to make an
	 * effort to avoid newly created and recently destroyed handles from being
	 * confused, so we'll make the rest of the bits random.
	 */
	handle = 1;
	handle |= slot << 1;
	handle |= pg_prng_uint32(&pg_global_prng_state) << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1);
	return handle;
}

static inline bool
is_main_region_dsm_handle(dsm_handle handle)
{
	return handle & 1;
}

/* ResourceOwner callbacks */

static void
ResOwnerReleaseDSM(Datum res)
{
	dsm_segment *seg = (dsm_segment *) DatumGetPointer(res);

	seg->resowner = NULL;
	dsm_detach(seg);
}
static char *
ResOwnerPrintDSM(Datum res)
{
	dsm_segment *seg = (dsm_segment *) DatumGetPointer(res);

	return psprintf("dynamic shared memory segment %u",
					dsm_segment_handle(seg));
}