469 lines
15 KiB
C
469 lines
15 KiB
C
/*-------------------------------------------------------------------------
 *
 * partdesc.c
 *		Support routines for manipulating partition descriptors
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *		  src/backend/partitioning/partdesc.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "access/genam.h"
|
|
#include "access/htup_details.h"
|
|
#include "access/table.h"
|
|
#include "catalog/partition.h"
|
|
#include "catalog/pg_inherits.h"
|
|
#include "partitioning/partbounds.h"
|
|
#include "partitioning/partdesc.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "storage/sinval.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/fmgroids.h"
|
|
#include "utils/hsearch.h"
|
|
#include "utils/inval.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/partcache.h"
|
|
#include "utils/rel.h"
|
|
#include "utils/syscache.h"
|
|
|
|
typedef struct PartitionDirectoryData
|
|
{
|
|
MemoryContext pdir_mcxt;
|
|
HTAB *pdir_hash;
|
|
bool omit_detached;
|
|
} PartitionDirectoryData;
|
|
|
|
typedef struct PartitionDirectoryEntry
|
|
{
|
|
Oid reloid;
|
|
Relation rel;
|
|
PartitionDesc pd;
|
|
} PartitionDirectoryEntry;
|
|
|
|
static PartitionDesc RelationBuildPartitionDesc(Relation rel,
|
|
bool omit_detached);
|
|
|
|
|
|
/*
|
|
* RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
|
|
*
|
|
* We keep two partdescs in relcache: rd_partdesc includes all partitions
|
|
* (even those being concurrently marked detached), while rd_partdesc_nodetached
|
|
* omits (some of) those. We store the pg_inherits.xmin value for the latter,
|
|
* to determine whether it can be validly reused in each case, since that
|
|
* depends on the active snapshot.
|
|
*
|
|
* Note: we arrange for partition descriptors to not get freed until the
|
|
* relcache entry's refcount goes to zero (see hacks in RelationClose,
|
|
* RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
|
|
* though we hand back a direct pointer into the relcache entry, it's safe
|
|
* for callers to continue to use that pointer as long as (a) they hold the
|
|
* relation open, and (b) they hold a relation lock strong enough to ensure
|
|
* that the data doesn't become stale.
|
|
*/
|
|
PartitionDesc
|
|
RelationGetPartitionDesc(Relation rel, bool omit_detached)
|
|
{
|
|
Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
|
|
|
|
/*
|
|
* If relcache has a partition descriptor, use that. However, we can only
|
|
* do so when we are asked to include all partitions including detached;
|
|
* and also when we know that there are no detached partitions.
|
|
*
|
|
* If there is no active snapshot, detached partitions aren't omitted
|
|
* either, so we can use the cached descriptor too in that case.
|
|
*/
|
|
if (likely(rel->rd_partdesc &&
|
|
(!rel->rd_partdesc->detached_exist || !omit_detached ||
|
|
!ActiveSnapshotSet())))
|
|
return rel->rd_partdesc;
|
|
|
|
/*
|
|
* If we're asked to omit detached partitions, we may be able to use a
|
|
* cached descriptor too. We determine that based on the pg_inherits.xmin
|
|
* that was saved alongside that descriptor: if the xmin that was not in
|
|
* progress for that active snapshot is also not in progress for the
|
|
* current active snapshot, then we can use it. Otherwise build one from
|
|
* scratch.
|
|
*/
|
|
if (omit_detached &&
|
|
rel->rd_partdesc_nodetached &&
|
|
ActiveSnapshotSet())
|
|
{
|
|
Snapshot activesnap;
|
|
|
|
Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
|
|
activesnap = GetActiveSnapshot();
|
|
|
|
if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
|
|
return rel->rd_partdesc_nodetached;
|
|
}
|
|
|
|
return RelationBuildPartitionDesc(rel, omit_detached);
|
|
}
|
|
|
|
/*
|
|
* RelationBuildPartitionDesc
|
|
* Form rel's partition descriptor, and store in relcache entry
|
|
*
|
|
* Partition descriptor is a complex structure; to avoid complicated logic to
|
|
* free individual elements whenever the relcache entry is flushed, we give it
|
|
* its own memory context, a child of CacheMemoryContext, which can easily be
|
|
* deleted on its own. To avoid leaking memory in that context in case of an
|
|
* error partway through this function, the context is initially created as a
|
|
* child of CurTransactionContext and only re-parented to CacheMemoryContext
|
|
* at the end, when no further errors are possible. Also, we don't make this
|
|
* context the current context except in very brief code sections, out of fear
|
|
* that some of our callees allocate memory on their own which would be leaked
|
|
* permanently.
|
|
*
|
|
* As a special case, partition descriptors that are requested to omit
|
|
* partitions being detached (and which contain such partitions) are transient
|
|
* and are not associated with the relcache entry. Such descriptors only last
|
|
* through the requesting Portal, so we use the corresponding memory context
|
|
* for them.
|
|
*/
|
|
static PartitionDesc
|
|
RelationBuildPartitionDesc(Relation rel, bool omit_detached)
|
|
{
|
|
PartitionDesc partdesc;
|
|
PartitionBoundInfo boundinfo = NULL;
|
|
List *inhoids;
|
|
PartitionBoundSpec **boundspecs = NULL;
|
|
Oid *oids = NULL;
|
|
bool *is_leaf = NULL;
|
|
bool detached_exist;
|
|
bool is_omit;
|
|
TransactionId detached_xmin;
|
|
ListCell *cell;
|
|
int i,
|
|
nparts;
|
|
PartitionKey key = RelationGetPartitionKey(rel);
|
|
MemoryContext new_pdcxt;
|
|
MemoryContext oldcxt;
|
|
int *mapping;
|
|
|
|
/*
|
|
* Get partition oids from pg_inherits. This uses a single snapshot to
|
|
* fetch the list of children, so while more children may be getting added
|
|
* concurrently, whatever this function returns will be accurate as of
|
|
* some well-defined point in time.
|
|
*/
|
|
detached_exist = false;
|
|
detached_xmin = InvalidTransactionId;
|
|
inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
|
|
omit_detached, NoLock,
|
|
&detached_exist,
|
|
&detached_xmin);
|
|
|
|
nparts = list_length(inhoids);
|
|
|
|
/* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
|
|
if (nparts > 0)
|
|
{
|
|
oids = (Oid *) palloc(nparts * sizeof(Oid));
|
|
is_leaf = (bool *) palloc(nparts * sizeof(bool));
|
|
boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
|
|
}
|
|
|
|
/* Collect bound spec nodes for each partition. */
|
|
i = 0;
|
|
foreach(cell, inhoids)
|
|
{
|
|
Oid inhrelid = lfirst_oid(cell);
|
|
HeapTuple tuple;
|
|
PartitionBoundSpec *boundspec = NULL;
|
|
|
|
/* Try fetching the tuple from the catcache, for speed. */
|
|
tuple = SearchSysCache1(RELOID, inhrelid);
|
|
if (HeapTupleIsValid(tuple))
|
|
{
|
|
Datum datum;
|
|
bool isnull;
|
|
|
|
datum = SysCacheGetAttr(RELOID, tuple,
|
|
Anum_pg_class_relpartbound,
|
|
&isnull);
|
|
if (!isnull)
|
|
boundspec = stringToNode(TextDatumGetCString(datum));
|
|
ReleaseSysCache(tuple);
|
|
}
|
|
|
|
/*
|
|
* The system cache may be out of date; if so, we may find no pg_class
|
|
* tuple or an old one where relpartbound is NULL. In that case, try
|
|
* the table directly. We can't just AcceptInvalidationMessages() and
|
|
* retry the system cache lookup because it's possible that a
|
|
* concurrent ATTACH PARTITION operation has removed itself from the
|
|
* ProcArray but not yet added invalidation messages to the shared
|
|
* queue; InvalidateSystemCaches() would work, but seems excessive.
|
|
*
|
|
* Note that this algorithm assumes that PartitionBoundSpec we manage
|
|
* to fetch is the right one -- so this is only good enough for
|
|
* concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
|
|
* some hypothetical operation that changes the partition bounds.
|
|
*/
|
|
if (boundspec == NULL)
|
|
{
|
|
Relation pg_class;
|
|
SysScanDesc scan;
|
|
ScanKeyData key[1];
|
|
Datum datum;
|
|
bool isnull;
|
|
|
|
pg_class = table_open(RelationRelationId, AccessShareLock);
|
|
ScanKeyInit(&key[0],
|
|
Anum_pg_class_oid,
|
|
BTEqualStrategyNumber, F_OIDEQ,
|
|
ObjectIdGetDatum(inhrelid));
|
|
scan = systable_beginscan(pg_class, ClassOidIndexId, true,
|
|
NULL, 1, key);
|
|
tuple = systable_getnext(scan);
|
|
datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
|
|
RelationGetDescr(pg_class), &isnull);
|
|
if (!isnull)
|
|
boundspec = stringToNode(TextDatumGetCString(datum));
|
|
systable_endscan(scan);
|
|
table_close(pg_class, AccessShareLock);
|
|
}
|
|
|
|
/* Sanity checks. */
|
|
if (!boundspec)
|
|
elog(ERROR, "missing relpartbound for relation %u", inhrelid);
|
|
if (!IsA(boundspec, PartitionBoundSpec))
|
|
elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
|
|
|
|
/*
|
|
* If the PartitionBoundSpec says this is the default partition, its
|
|
* OID should match pg_partitioned_table.partdefid; if not, the
|
|
* catalog is corrupt.
|
|
*/
|
|
if (boundspec->is_default)
|
|
{
|
|
Oid partdefid;
|
|
|
|
partdefid = get_default_partition_oid(RelationGetRelid(rel));
|
|
if (partdefid != inhrelid)
|
|
elog(ERROR, "expected partdefid %u, but got %u",
|
|
inhrelid, partdefid);
|
|
}
|
|
|
|
/* Save results. */
|
|
oids[i] = inhrelid;
|
|
is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
|
|
boundspecs[i] = boundspec;
|
|
++i;
|
|
}
|
|
|
|
/*
|
|
* Create PartitionBoundInfo and mapping, working in the caller's context.
|
|
* This could fail, but we haven't done any damage if so.
|
|
*/
|
|
if (nparts > 0)
|
|
boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
|
|
|
|
/*
|
|
* Now build the actual relcache partition descriptor, copying all the
|
|
* data into a new, small context. As per above comment, we don't make
|
|
* this a long-lived context until it's finished.
|
|
*/
|
|
new_pdcxt = AllocSetContextCreate(CurTransactionContext,
|
|
"partition descriptor",
|
|
ALLOCSET_SMALL_SIZES);
|
|
MemoryContextCopyAndSetIdentifier(new_pdcxt,
|
|
RelationGetRelationName(rel));
|
|
|
|
partdesc = (PartitionDescData *)
|
|
MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
|
|
partdesc->nparts = nparts;
|
|
partdesc->detached_exist = detached_exist;
|
|
/* If there are no partitions, the rest of the partdesc can stay zero */
|
|
if (nparts > 0)
|
|
{
|
|
oldcxt = MemoryContextSwitchTo(new_pdcxt);
|
|
partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
|
|
|
|
/* Initialize caching fields for speeding up ExecFindPartition */
|
|
partdesc->last_found_datum_index = -1;
|
|
partdesc->last_found_part_index = -1;
|
|
partdesc->last_found_count = 0;
|
|
|
|
partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
|
|
partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
|
|
|
|
/*
|
|
* Assign OIDs from the original array into mapped indexes of the
|
|
* result array. The order of OIDs in the former is defined by the
|
|
* catalog scan that retrieved them, whereas that in the latter is
|
|
* defined by canonicalized representation of the partition bounds.
|
|
* Also save leaf-ness of each partition.
|
|
*/
|
|
for (i = 0; i < nparts; i++)
|
|
{
|
|
int index = mapping[i];
|
|
|
|
partdesc->oids[index] = oids[i];
|
|
partdesc->is_leaf[index] = is_leaf[i];
|
|
}
|
|
MemoryContextSwitchTo(oldcxt);
|
|
}
|
|
|
|
/*
|
|
* Are we working with the partdesc that omits the detached partition, or
|
|
* the one that includes it?
|
|
*
|
|
* Note that if a partition was found by the catalog's scan to have been
|
|
* detached, but the pg_inherit tuple saying so was not visible to the
|
|
* active snapshot (find_inheritance_children_extended will not have set
|
|
* detached_xmin in that case), we consider there to be no "omittable"
|
|
* detached partitions.
|
|
*/
|
|
is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
|
|
TransactionIdIsValid(detached_xmin);
|
|
|
|
/*
|
|
* We have a fully valid partdesc. Reparent it so that it has the right
|
|
* lifespan.
|
|
*/
|
|
MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
|
|
|
|
/*
|
|
* Store it into relcache.
|
|
*
|
|
* But first, a kluge: if there's an old context for this type of
|
|
* descriptor, it contains an old partition descriptor that may still be
|
|
* referenced somewhere. Preserve it, while not leaking it, by
|
|
* reattaching it as a child context of the new one. Eventually it will
|
|
* get dropped by either RelationClose or RelationClearRelation. (We keep
|
|
* the regular partdesc in rd_pdcxt, and the partdesc-excluding-
|
|
* detached-partitions in rd_pddcxt.)
|
|
*/
|
|
if (is_omit)
|
|
{
|
|
if (rel->rd_pddcxt != NULL)
|
|
MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
|
|
rel->rd_pddcxt = new_pdcxt;
|
|
rel->rd_partdesc_nodetached = partdesc;
|
|
|
|
/*
|
|
* For partdescs built excluding detached partitions, which we save
|
|
* separately, we also record the pg_inherits.xmin of the detached
|
|
* partition that was omitted; this informs a future potential user of
|
|
* such a cached partdesc to only use it after cross-checking that the
|
|
* xmin is indeed visible to the snapshot it is going to be working
|
|
* with.
|
|
*/
|
|
Assert(TransactionIdIsValid(detached_xmin));
|
|
rel->rd_partdesc_nodetached_xmin = detached_xmin;
|
|
}
|
|
else
|
|
{
|
|
if (rel->rd_pdcxt != NULL)
|
|
MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
|
|
rel->rd_pdcxt = new_pdcxt;
|
|
rel->rd_partdesc = partdesc;
|
|
}
|
|
|
|
return partdesc;
|
|
}
|
|
|
|
/*
|
|
* CreatePartitionDirectory
|
|
* Create a new partition directory object.
|
|
*/
|
|
PartitionDirectory
|
|
CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
|
|
{
|
|
MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
|
|
PartitionDirectory pdir;
|
|
HASHCTL ctl;
|
|
|
|
pdir = palloc(sizeof(PartitionDirectoryData));
|
|
pdir->pdir_mcxt = mcxt;
|
|
|
|
ctl.keysize = sizeof(Oid);
|
|
ctl.entrysize = sizeof(PartitionDirectoryEntry);
|
|
ctl.hcxt = mcxt;
|
|
|
|
pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
|
|
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
|
|
pdir->omit_detached = omit_detached;
|
|
|
|
MemoryContextSwitchTo(oldcontext);
|
|
return pdir;
|
|
}
|
|
|
|
/*
|
|
* PartitionDirectoryLookup
|
|
* Look up the partition descriptor for a relation in the directory.
|
|
*
|
|
* The purpose of this function is to ensure that we get the same
|
|
* PartitionDesc for each relation every time we look it up. In the
|
|
* face of concurrent DDL, different PartitionDescs may be constructed with
|
|
* different views of the catalog state, but any single particular OID
|
|
* will always get the same PartitionDesc for as long as the same
|
|
* PartitionDirectory is used.
|
|
*/
|
|
PartitionDesc
|
|
PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
|
|
{
|
|
PartitionDirectoryEntry *pde;
|
|
Oid relid = RelationGetRelid(rel);
|
|
bool found;
|
|
|
|
pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
|
|
if (!found)
|
|
{
|
|
/*
|
|
* We must keep a reference count on the relation so that the
|
|
* PartitionDesc to which we are pointing can't get destroyed.
|
|
*/
|
|
RelationIncrementReferenceCount(rel);
|
|
pde->rel = rel;
|
|
pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
|
|
Assert(pde->pd != NULL);
|
|
}
|
|
return pde->pd;
|
|
}
|
|
|
|
/*
|
|
* DestroyPartitionDirectory
|
|
* Destroy a partition directory.
|
|
*
|
|
* Release the reference counts we're holding.
|
|
*/
|
|
void
|
|
DestroyPartitionDirectory(PartitionDirectory pdir)
|
|
{
|
|
HASH_SEQ_STATUS status;
|
|
PartitionDirectoryEntry *pde;
|
|
|
|
hash_seq_init(&status, pdir->pdir_hash);
|
|
while ((pde = hash_seq_search(&status)) != NULL)
|
|
RelationDecrementReferenceCount(pde->rel);
|
|
}
|
|
|
|
/*
|
|
* get_default_oid_from_partdesc
|
|
*
|
|
* Given a partition descriptor, return the OID of the default partition, if
|
|
* one exists; else, return InvalidOid.
|
|
*/
|
|
Oid
|
|
get_default_oid_from_partdesc(PartitionDesc partdesc)
|
|
{
|
|
if (partdesc && partdesc->boundinfo &&
|
|
partition_bound_has_default(partdesc->boundinfo))
|
|
return partdesc->oids[partdesc->boundinfo->default_index];
|
|
|
|
return InvalidOid;
|
|
}
|