postgresql/src/backend/partitioning/partdesc.c

/*-------------------------------------------------------------------------
 *
 * partdesc.c
 *		Support routines for manipulating partition descriptors
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *		  src/backend/partitioning/partdesc.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/genam.h"
#include "access/htup_details.h"
#include "access/table.h"
#include "catalog/partition.h"
#include "catalog/pg_inherits.h"
#include "partitioning/partbounds.h"
#include "partitioning/partdesc.h"
#include "storage/bufmgr.h"
#include "storage/sinval.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/partcache.h"
#include "utils/rel.h"
#include "utils/syscache.h"

/*
 * PartitionDirectoryData -- the state behind a PartitionDirectory.
 *
 * Filled in by CreatePartitionDirectory and consulted by
 * PartitionDirectoryLookup and DestroyPartitionDirectory.
 */
typedef struct PartitionDirectoryData
{
	/* memory context handed to CreatePartitionDirectory */
	MemoryContext pdir_mcxt;
	/* hash table of PartitionDirectoryEntry, keyed by relation OID */
	HTAB	   *pdir_hash;
	/* passed to RelationGetPartitionDesc on every first-time lookup */
	bool		omit_detached;
}			PartitionDirectoryData;

/*
 * PartitionDirectoryEntry -- one hash table entry of a partition directory.
 *
 * The hash key is the leading Oid-sized field (the table is created with
 * keysize = sizeof(Oid)).
 */
typedef struct PartitionDirectoryEntry
{
	/* hash key: OID of the relation that was looked up */
	Oid			reloid;
	/* relation whose reference count was bumped when the entry was made */
	Relation	rel;
	/* descriptor obtained at first lookup; returned on all later lookups */
	PartitionDesc pd;
} PartitionDirectoryEntry;

static PartitionDesc RelationBuildPartitionDesc(Relation rel,
												bool omit_detached);


/*
 * RelationGetPartitionDesc -- return the partition descriptor of a
 * partitioned table
 *
 * The relcache entry can carry two descriptors.  rd_partdesc lists every
 * partition, including any that are concurrently being marked detached;
 * rd_partdesc_nodetached leaves (some of) those out.  Whether the latter may
 * be reused depends on the active snapshot, so the pg_inherits.xmin it was
 * built against is kept next to it and is rechecked here on every call.
 *
 * Note: partition descriptors are arranged not to be freed until the
 * relcache entry's refcount drops to zero (see hacks in RelationClose,
 * RelationClearRelation, and RelationBuildPartitionDesc).  Hence, although
 * the pointer handed back points directly into the relcache entry, callers
 * may keep using it for as long as (a) they hold the relation open, and
 * (b) they hold a relation lock strong enough to keep the data from going
 * stale.
 */
PartitionDesc
RelationGetPartitionDesc(Relation rel, bool omit_detached)
{
	PartitionDesc full_desc;

	Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);

	/*
	 * The all-partitions descriptor, when cached, is the answer unless all
	 * three of these hold: it contains detached partitions, the caller wants
	 * them omitted, and there is an active snapshot.  (Without an active
	 * snapshot detached partitions are not omitted anyway.)
	 */
	full_desc = rel->rd_partdesc;
	if (likely(full_desc != NULL &&
			   !(full_desc->detached_exist && omit_detached &&
				 ActiveSnapshotSet())))
		return full_desc;

	/*
	 * A cached detached-omitting descriptor can be reused only if the
	 * pg_inherits.xmin recorded with it is not in progress according to the
	 * current active snapshot either.  Failing that, build a new one.
	 */
	if (omit_detached && rel->rd_partdesc_nodetached != NULL &&
		ActiveSnapshotSet())
	{
		TransactionId cached_xmin = rel->rd_partdesc_nodetached_xmin;

		Assert(TransactionIdIsValid(cached_xmin));

		if (!XidInMVCCSnapshot(cached_xmin, GetActiveSnapshot()))
			return rel->rd_partdesc_nodetached;
	}

	return RelationBuildPartitionDesc(rel, omit_detached);
}

/*
 * RelationBuildPartitionDesc
 *		Form rel's partition descriptor, and store in relcache entry
 *
 * Partition descriptor is a complex structure; to avoid complicated logic to
 * free individual elements whenever the relcache entry is flushed, we give it
 * its own memory context, a child of CacheMemoryContext, which can easily be
 * deleted on its own.  To avoid leaking memory in that context in case of an
 * error partway through this function, the context is initially created as a
 * child of CurTransactionContext and only re-parented to CacheMemoryContext
 * at the end, when no further errors are possible.  Also, we don't make this
 * context the current context except in very brief code sections, out of fear
 * that some of our callees allocate memory on their own which would be leaked
 * permanently.
 *
 * The result is stored in one of two places in the relcache entry.  When
 * omit_detached is requested, a detached partition exists, an active
 * snapshot is set and find_inheritance_children_extended reported a valid
 * xmin for the detached partition, the descriptor goes into
 * rd_partdesc_nodetached (memory context rd_pddcxt) and that xmin is saved
 * in rd_partdesc_nodetached_xmin.  In every other case it goes into
 * rd_partdesc (memory context rd_pdcxt).
 *
 * Returns the newly built descriptor.  Raises an error if a partition's
 * relpartbound cannot be found or is not a PartitionBoundSpec, or if the
 * default partition does not match pg_partitioned_table.partdefid.
 */
static PartitionDesc
RelationBuildPartitionDesc(Relation rel, bool omit_detached)
{
	PartitionDesc partdesc;
	PartitionBoundInfo boundinfo = NULL;
	List	   *inhoids;
	PartitionBoundSpec **boundspecs = NULL;
	Oid		   *oids = NULL;
	bool	   *is_leaf = NULL;
	bool		detached_exist;
	bool		is_omit;
	TransactionId detached_xmin;
	ListCell   *cell;
	int			i,
				nparts;
	PartitionKey key = RelationGetPartitionKey(rel);
	MemoryContext new_pdcxt;
	MemoryContext oldcxt;
	int		   *mapping = NULL;

	/*
	 * Get partition oids from pg_inherits.  This uses a single snapshot to
	 * fetch the list of children, so while more children may be getting added
	 * concurrently, whatever this function returns will be accurate as of
	 * some well-defined point in time.
	 */
	detached_exist = false;
	detached_xmin = InvalidTransactionId;
	inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
												 omit_detached, NoLock,
												 &detached_exist,
												 &detached_xmin);

	nparts = list_length(inhoids);

	/* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
	if (nparts > 0)
	{
		oids = (Oid *) palloc(nparts * sizeof(Oid));
		is_leaf = (bool *) palloc(nparts * sizeof(bool));
		boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
	}

	/* Collect bound spec nodes for each partition. */
	i = 0;
	foreach(cell, inhoids)
	{
		Oid			inhrelid = lfirst_oid(cell);
		HeapTuple	tuple;
		PartitionBoundSpec *boundspec = NULL;

		/* Try fetching the tuple from the catcache, for speed. */
		tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
		if (HeapTupleIsValid(tuple))
		{
			Datum		datum;
			bool		isnull;

			datum = SysCacheGetAttr(RELOID, tuple,
									Anum_pg_class_relpartbound,
									&isnull);
			if (!isnull)
				boundspec = stringToNode(TextDatumGetCString(datum));
			ReleaseSysCache(tuple);
		}

		/*
		 * The system cache may be out of date; if so, we may find no pg_class
		 * tuple or an old one where relpartbound is NULL.  In that case, try
		 * the table directly.  We can't just AcceptInvalidationMessages() and
		 * retry the system cache lookup because it's possible that a
		 * concurrent ATTACH PARTITION operation has removed itself from the
		 * ProcArray but not yet added invalidation messages to the shared
		 * queue; InvalidateSystemCaches() would work, but seems excessive.
		 *
		 * Note that this algorithm assumes that PartitionBoundSpec we manage
		 * to fetch is the right one -- so this is only good enough for
		 * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
		 * some hypothetical operation that changes the partition bounds.
		 */
		if (boundspec == NULL)
		{
			Relation	pg_class;
			SysScanDesc scan;
			ScanKeyData skey[1];

			pg_class = table_open(RelationRelationId, AccessShareLock);
			ScanKeyInit(&skey[0],
						Anum_pg_class_oid,
						BTEqualStrategyNumber, F_OIDEQ,
						ObjectIdGetDatum(inhrelid));
			scan = systable_beginscan(pg_class, ClassOidIndexId, true,
									  NULL, 1, skey);

			/*
			 * The scan normally yields exactly one tuple, but it can also
			 * yield none (for instance if the relation has gone away in the
			 * meantime).  Never hand an invalid tuple to heap_getattr; just
			 * leave boundspec NULL and let the sanity check below report it.
			 */
			tuple = systable_getnext(scan);
			if (HeapTupleIsValid(tuple))
			{
				Datum		datum;
				bool		isnull;

				datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
									 RelationGetDescr(pg_class), &isnull);
				if (!isnull)
					boundspec = stringToNode(TextDatumGetCString(datum));
			}
			systable_endscan(scan);
			table_close(pg_class, AccessShareLock);
		}

		/* Sanity checks. */
		if (!boundspec)
			elog(ERROR, "missing relpartbound for relation %u", inhrelid);
		if (!IsA(boundspec, PartitionBoundSpec))
			elog(ERROR, "invalid relpartbound for relation %u", inhrelid);

		/*
		 * If the PartitionBoundSpec says this is the default partition, its
		 * OID should match pg_partitioned_table.partdefid; if not, the
		 * catalog is corrupt.
		 */
		if (boundspec->is_default)
		{
			Oid			partdefid;

			partdefid = get_default_partition_oid(RelationGetRelid(rel));
			if (partdefid != inhrelid)
				elog(ERROR, "expected partdefid %u, but got %u",
					 inhrelid, partdefid);
		}

		/* Save results. */
		oids[i] = inhrelid;
		is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
		boundspecs[i] = boundspec;
		++i;
	}

	/*
	 * Create PartitionBoundInfo and mapping, working in the caller's context.
	 * This could fail, but we haven't done any damage if so.
	 */
	if (nparts > 0)
		boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);

	/*
	 * Now build the actual relcache partition descriptor, copying all the
	 * data into a new, small context.  As per above comment, we don't make
	 * this a long-lived context until it's finished.
	 */
	new_pdcxt = AllocSetContextCreate(CurTransactionContext,
									  "partition descriptor",
									  ALLOCSET_SMALL_SIZES);
	MemoryContextCopyAndSetIdentifier(new_pdcxt,
									  RelationGetRelationName(rel));

	partdesc = (PartitionDescData *)
		MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
	partdesc->nparts = nparts;
	partdesc->detached_exist = detached_exist;
	/* If there are no partitions, the rest of the partdesc can stay zero */
	if (nparts > 0)
	{
		oldcxt = MemoryContextSwitchTo(new_pdcxt);
		partdesc->boundinfo = partition_bounds_copy(boundinfo, key);

		/* Initialize caching fields for speeding up ExecFindPartition */
		partdesc->last_found_datum_index = -1;
		partdesc->last_found_part_index = -1;
		partdesc->last_found_count = 0;

		partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
		partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));

		/*
		 * Assign OIDs from the original array into mapped indexes of the
		 * result array.  The order of OIDs in the former is defined by the
		 * catalog scan that retrieved them, whereas that in the latter is
		 * defined by canonicalized representation of the partition bounds.
		 * Also save leaf-ness of each partition.
		 */
		for (i = 0; i < nparts; i++)
		{
			int			index = mapping[i];

			partdesc->oids[index] = oids[i];
			partdesc->is_leaf[index] = is_leaf[i];
		}
		MemoryContextSwitchTo(oldcxt);
	}

	/*
	 * Are we working with the partdesc that omits the detached partition, or
	 * the one that includes it?
	 *
	 * Note that if a partition was found by the catalog's scan to have been
	 * detached, but the pg_inherits tuple saying so was not visible to the
	 * active snapshot (find_inheritance_children_extended will not have set
	 * detached_xmin in that case), we consider there to be no "omittable"
	 * detached partitions.
	 */
	is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
		TransactionIdIsValid(detached_xmin);

	/*
	 * We have a fully valid partdesc.  Reparent it so that it has the right
	 * lifespan.
	 */
	MemoryContextSetParent(new_pdcxt, CacheMemoryContext);

	/*
	 * Store it into relcache.
	 *
	 * But first, a kluge: if there's an old context for this type of
	 * descriptor, it contains an old partition descriptor that may still be
	 * referenced somewhere.  Preserve it, while not leaking it, by
	 * reattaching it as a child context of the new one.  Eventually it will
	 * get dropped by either RelationClose or RelationClearRelation. (We keep
	 * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
	 * detached-partitions in rd_pddcxt.)
	 */
	if (is_omit)
	{
		if (rel->rd_pddcxt != NULL)
			MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
		rel->rd_pddcxt = new_pdcxt;
		rel->rd_partdesc_nodetached = partdesc;

		/*
		 * For partdescs built excluding detached partitions, which we save
		 * separately, we also record the pg_inherits.xmin of the detached
		 * partition that was omitted; this informs a future potential user of
		 * such a cached partdesc to only use it after cross-checking that the
		 * xmin is indeed visible to the snapshot it is going to be working
		 * with.
		 */
		Assert(TransactionIdIsValid(detached_xmin));
		rel->rd_partdesc_nodetached_xmin = detached_xmin;
	}
	else
	{
		if (rel->rd_pdcxt != NULL)
			MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
		rel->rd_pdcxt = new_pdcxt;
		rel->rd_partdesc = partdesc;
	}

	return partdesc;
}

/*
 * CreatePartitionDirectory
 *		Build a new, empty partition directory.
 *
 * The directory struct is palloc'd while mcxt is the current memory context,
 * and the hash table is created with mcxt as its hash context.  The
 * omit_detached flag is stored in the directory for later lookups.  The
 * caller's current memory context is restored before returning.
 */
PartitionDirectory
CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
{
	MemoryContext saved_cxt;
	PartitionDirectory dir;
	HASHCTL		hashinfo;

	/* Entries are PartitionDirectoryEntry structs keyed by their leading Oid */
	hashinfo.keysize = sizeof(Oid);
	hashinfo.entrysize = sizeof(PartitionDirectoryEntry);
	hashinfo.hcxt = mcxt;

	saved_cxt = MemoryContextSwitchTo(mcxt);

	dir = palloc(sizeof(PartitionDirectoryData));
	dir->pdir_mcxt = mcxt;
	dir->omit_detached = omit_detached;
	dir->pdir_hash = hash_create("partition directory", 256, &hashinfo,
								 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	MemoryContextSwitchTo(saved_cxt);

	return dir;
}

/*
 * PartitionDirectoryLookup
 *		Return the partition descriptor the directory holds for rel.
 *
 * The point of going through the directory is that a given relation always
 * yields the same PartitionDesc for as long as the same PartitionDirectory
 * is in use.  Concurrent DDL may cause PartitionDescs to be built from
 * differing views of the catalogs, but once a relation's descriptor has been
 * entered here, every later lookup of that OID returns that very descriptor.
 */
PartitionDesc
PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
{
	Oid			key = RelationGetRelid(rel);
	bool		already_present;
	PartitionDirectoryEntry *entry;

	entry = hash_search(pdir->pdir_hash, &key, HASH_ENTER, &already_present);
	if (already_present)
		return entry->pd;

	/*
	 * First lookup of this relation.  Take a reference count on it so that
	 * the PartitionDesc we are about to remember can't get destroyed while
	 * the directory still points at it.
	 */
	RelationIncrementReferenceCount(rel);
	entry->rel = rel;
	entry->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
	Assert(entry->pd != NULL);

	return entry->pd;
}

/*
 * DestroyPartitionDirectory
 *		Tear down a partition directory.
 *
 * Walks every entry in the directory's hash table and drops the relation
 * reference count that was taken when the entry was created.  Nothing is
 * freed here.
 */
void
DestroyPartitionDirectory(PartitionDirectory pdir)
{
	HASH_SEQ_STATUS scan;
	PartitionDirectoryEntry *entry;

	hash_seq_init(&scan, pdir->pdir_hash);
	for (entry = hash_seq_search(&scan);
		 entry != NULL;
		 entry = hash_seq_search(&scan))
	{
		RelationDecrementReferenceCount(entry->rel);
	}
}

/*
 * get_default_oid_from_partdesc
 *
 * Return the OID of the default partition recorded in the given partition
 * descriptor.  InvalidOid is returned when the descriptor is NULL, when it
 * has no bound info, or when the bound info has no default partition.
 */
Oid
get_default_oid_from_partdesc(PartitionDesc partdesc)
{
	PartitionBoundInfo bounds;

	if (partdesc == NULL)
		return InvalidOid;

	bounds = partdesc->boundinfo;
	if (bounds == NULL || !partition_bound_has_default(bounds))
		return InvalidOid;

	/* oids[] is indexed the same way as the bound info's partition indexes */
	return partdesc->oids[bounds->default_index];
}