postgresql/src/backend/catalog/partition.c

1945 lines
53 KiB
C
Raw Normal View History

Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
/*-------------------------------------------------------------------------
*
* partition.c
* Partitioning related data structures and functions.
*
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/catalog/partition.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/sysattr.h"
#include "catalog/dependency.h"
#include "catalog/indexing.h"
#include "catalog/objectaddress.h"
#include "catalog/partition.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_inherits.h"
#include "catalog/pg_inherits_fn.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_type.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/parsenodes.h"
#include "optimizer/clauses.h"
#include "optimizer/planmain.h"
#include "optimizer/var.h"
#include "rewrite/rewriteManip.h"
#include "storage/lmgr.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/memutils.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/ruleutils.h"
#include "utils/syscache.h"
/*
* Information about bounds of a partitioned relation
*
* A list partition datum that is known to be NULL is never put into the
* datums array. Instead, it is tracked using has_null and null_index fields.
*
* In the case of range partitioning, ndatums will typically be far less than
* 2 * nparts, because a partition's upper bound and the next partition's lower
* bound are the same in most common cases, and we only store one of them.
*
* In the case of list partitioning, the indexes array stores one entry for
* every datum, which is the index of the partition that accepts a given datum.
* In case of range partitioning, it stores one entry per distinct range
* datum, which is the index of the partition for which a given datum
* is an upper bound.
*/
/*
 * Ternary value to represent what's contained in a range bound datum.
 *
 * Each column of a range bound carries one of these flags.  Only columns
 * flagged RANGE_DATUM_FINITE have their datum copied into the bound info
 * when the partition descriptor is built; for the two infinite kinds the
 * corresponding datum slot is not filled in and must not be examined.
 */
typedef enum RangeDatumContent
{
RANGE_DATUM_FINITE = 0, /* actual datum stored elsewhere */
RANGE_DATUM_NEG_INF, /* negative infinity; no datum is stored */
RANGE_DATUM_POS_INF /* positive infinity; no datum is stored */
} RangeDatumContent;
/*
 * Canonical representation of the bounds of all partitions of one
 * partitioned table, as built by RelationBuildPartitionDesc().  The datums
 * array is kept in sorted order, and partition indexes are renumbered in
 * order of first appearance in that sorted array.
 */
typedef struct PartitionBoundInfoData
{
/* list or range bounds? (copied from the partition key's strategy) */
char strategy;
/* Number of entries in the datums array that follows */
int ndatums;
/* Array of datum-tuples with key->partnatts datums each (one for list) */
Datum **datums;
/*
 * What's contained in each range bound datum? (see RangeDatumContent);
 * NULL for list partitioned tables.  Same dimensions as datums.
 */
RangeDatumContent **content;
/*
 * Partition indexes; one entry per member of the datums array, plus one
 * trailing entry for range partitioned tables.  For range bounds, the
 * entry for a lower bound and the trailing entry are -1.
 */
int *indexes;
/* Is there a null-accepting partition? false for range partitioned tables */
bool has_null;
/*
 * Index of the null-accepting partition; -1 if there is none (and intended
 * to be -1 for range partitioned tables as well)
 */
int null_index;
} PartitionBoundInfoData;
/*
* When qsort'ing partition bounds after reading from the catalog, each bound
* is represented with one of the following structs.
*/
/*
 * One value coming from some (index'th) list partition.
 *
 * "index" is the position of the owning partition in the order its bound
 * spec was read from the catalog; "value" is the non-null constant taken
 * from that partition's list of accepted values.
 */
typedef struct PartitionListValue
{
int index;
Datum value;
} PartitionListValue;
/*
 * One bound of a range partition.
 *
 * Every range partition contributes two of these (its lower and its upper
 * bound).  "index" is the position of the owning partition in the order its
 * bound spec was read; callers checking a not-yet-created partition pass -1.
 */
typedef struct PartitionRangeBound
{
int index; /* owning partition, or -1 if none yet */
Datum *datums; /* range bound datums */
RangeDatumContent *content; /* what's contained in each datum? */
bool lower; /* this is the lower (vs upper) bound */
} PartitionRangeBound;
/*
 * Comparators passed to qsort_arg() when sorting the list values and range
 * bounds collected from the catalog; "arg" is the PartitionKey.
 */
static int32 qsort_partition_list_value_cmp(const void *a, const void *b,
void *arg);
static int32 qsort_partition_rbound_cmp(const void *a, const void *b,
void *arg);
/* Builders of the partition constraint expressions, one per strategy */
static List *get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec);
static List *get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec);
static Oid get_partition_operator(PartitionKey key, int col,
StrategyNumber strategy, bool *need_relabel);
static List *generate_partition_qual(Relation rel, bool recurse);
/* Construction and comparison of range bounds */
static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index,
List *datums, bool lower);
static int32 partition_rbound_cmp(PartitionKey key,
Datum *datums1, RangeDatumContent *content1, bool lower1,
PartitionRangeBound *b2);
static int32 partition_rbound_datum_cmp(PartitionKey key,
Datum *rb_datums, RangeDatumContent *rb_content,
Datum *tuple_datums);
/* Comparison against, and binary search within, a PartitionBoundInfo */
static int32 partition_bound_cmp(PartitionKey key,
PartitionBoundInfo boundinfo,
int offset, void *probe, bool probe_is_bound);
static int partition_bound_bsearch(PartitionKey key,
PartitionBoundInfo boundinfo,
void *probe, bool probe_is_bound, bool *is_equal);
/* Support get_partition_for_tuple() */
static void FormPartitionKeyDatum(PartitionDispatch pd,
TupleTableSlot *slot,
EState *estate,
Datum *values,
bool *isnull);
/*
 * RelationBuildPartitionDesc
 *		Form rel's partition descriptor
 *
 * Collects the bound specification of every partition of rel from pg_class,
 * sorts the bounds into a canonical order and stores the resulting
 * PartitionDesc in rel->rd_partdesc.  The descriptor is allocated in a
 * dedicated memory context, rel->rd_pdcxt, created as a child of
 * CacheMemoryContext.  Nothing is built if rel has no partition key yet.
 *
 * Partition indexes are renumbered ("mapped") in order of first appearance
 * in the sorted bounds, so that two tables with the same set of bounds end
 * up with identical bound info regardless of catalog scan order.
 *
 * Not flushed from the cache by RelationClearRelation() unless changed because
 * of addition or removal of partition.
 */
void
RelationBuildPartitionDesc(Relation rel)
{
	List	   *inhoids,
			   *partoids;
	Oid		   *oids = NULL;
	List	   *boundspecs = NIL;
	ListCell   *cell;
	int			i,
				nparts;
	PartitionKey key = RelationGetPartitionKey(rel);
	PartitionDesc result;
	MemoryContext oldcxt;
	int			ndatums = 0;

	/* List partitioning specific */
	PartitionListValue **all_values = NULL;
	bool		found_null = false;
	int			null_index = -1;

	/* Range partitioning specific */
	PartitionRangeBound **rbounds = NULL;

	/*
	 * The following could happen in situations where rel has a pg_class entry
	 * but not the pg_partitioned_table entry yet.
	 */
	if (key == NULL)
		return;

	/* Get partition oids from pg_inherits */
	inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);

	/* Collect bound spec nodes in a list */
	partoids = NIL;
	foreach(cell, inhoids)
	{
		Oid			inhrelid = lfirst_oid(cell);
		HeapTuple	tuple;
		Datum		datum;
		bool		isnull;
		Node	   *boundspec;

		tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));

		/* Don't dereference a failed lookup; report it instead. */
		if (!HeapTupleIsValid(tuple))
			elog(ERROR, "cache lookup failed for relation %u", inhrelid);

		/*
		 * It is possible that the pg_class tuple of a partition has not been
		 * updated yet to set its relpartbound field.  The only case where
		 * this happens is when we open the parent relation to check using its
		 * partition descriptor that a new partition's bound does not overlap
		 * some existing partition.
		 */
		if (!((Form_pg_class) GETSTRUCT(tuple))->relispartition)
		{
			ReleaseSysCache(tuple);
			continue;
		}

		datum = SysCacheGetAttr(RELOID, tuple,
								Anum_pg_class_relpartbound,
								&isnull);
		Assert(!isnull);
		boundspec = (Node *) stringToNode(TextDatumGetCString(datum));
		boundspecs = lappend(boundspecs, boundspec);
		partoids = lappend_oid(partoids, inhrelid);
		ReleaseSysCache(tuple);
	}

	nparts = list_length(partoids);

	if (nparts > 0)
	{
		oids = (Oid *) palloc(nparts * sizeof(Oid));
		i = 0;
		foreach(cell, partoids)
			oids[i++] = lfirst_oid(cell);

		/* Convert from node to the internal representation */
		if (key->strategy == PARTITION_STRATEGY_LIST)
		{
			List	   *non_null_values = NIL;

			/*
			 * Create a unified list of non-null values across all partitions.
			 */
			i = 0;
			found_null = false;
			null_index = -1;
			foreach(cell, boundspecs)
			{
				ListCell   *c;
				PartitionBoundSpec *spec = lfirst(cell);

				if (spec->strategy != PARTITION_STRATEGY_LIST)
					elog(ERROR, "invalid strategy in partition bound spec");

				foreach(c, spec->listdatums)
				{
					Const	   *val = lfirst(c);
					PartitionListValue *list_value = NULL;

					if (!val->constisnull)
					{
						list_value = (PartitionListValue *)
							palloc0(sizeof(PartitionListValue));
						list_value->index = i;
						list_value->value = val->constvalue;
					}
					else
					{
						/*
						 * Never put a null into the values array, flag
						 * instead for the code further down below where we
						 * construct the actual relcache struct.
						 */
						if (found_null)
							elog(ERROR, "found null more than once");
						found_null = true;
						null_index = i;
					}

					if (list_value)
						non_null_values = lappend(non_null_values,
												  list_value);
				}

				i++;
			}

			ndatums = list_length(non_null_values);

			/*
			 * Collect all list values in one array.  Alongside the value, we
			 * also save the index of partition the value comes from.  The
			 * entries were freshly allocated above, so the array can simply
			 * point at them.
			 */
			all_values = (PartitionListValue **) palloc(ndatums *
											   sizeof(PartitionListValue *));
			i = 0;
			foreach(cell, non_null_values)
				all_values[i++] = (PartitionListValue *) lfirst(cell);

			qsort_arg(all_values, ndatums, sizeof(PartitionListValue *),
					  qsort_partition_list_value_cmp, (void *) key);
		}
		else if (key->strategy == PARTITION_STRATEGY_RANGE)
		{
			int			j,
						k;
			PartitionRangeBound **all_bounds,
					   *prev;
			bool	   *distinct_indexes;

			all_bounds = (PartitionRangeBound **) palloc0(2 * nparts *
											  sizeof(PartitionRangeBound *));
			distinct_indexes = (bool *) palloc(2 * nparts * sizeof(bool));

			/*
			 * Create a unified list of range bounds across all the
			 * partitions.
			 */
			i = j = 0;
			foreach(cell, boundspecs)
			{
				PartitionBoundSpec *spec = lfirst(cell);
				PartitionRangeBound *lower,
						   *upper;

				if (spec->strategy != PARTITION_STRATEGY_RANGE)
					elog(ERROR, "invalid strategy in partition bound spec");

				lower = make_one_range_bound(key, i, spec->lowerdatums,
											 true);
				upper = make_one_range_bound(key, i, spec->upperdatums,
											 false);
				all_bounds[j] = lower;
				all_bounds[j + 1] = upper;
				j += 2;
				i++;
			}
			Assert(j == 2 * nparts);

			/* Sort all the bounds in ascending order */
			qsort_arg(all_bounds, 2 * nparts,
					  sizeof(PartitionRangeBound *),
					  qsort_partition_rbound_cmp,
					  (void *) key);

			/*
			 * Count the number of distinct bounds to allocate an array of
			 * that size.
			 */
			ndatums = 0;
			prev = NULL;
			for (i = 0; i < 2 * nparts; i++)
			{
				PartitionRangeBound *cur = all_bounds[i];
				bool		is_distinct = false;
				int			col;

				/* Is the current bound distinct from the previous one? */
				for (col = 0; col < key->partnatts; col++)
				{
					Datum		cmpval;

					if (prev == NULL)
					{
						is_distinct = true;
						break;
					}

					/*
					 * If either of them has an infinite element, we can't
					 * equate them.  Even when both are infinite, they'd have
					 * opposite signs, because only one of cur and prev is a
					 * lower bound.
					 */
					if (cur->content[col] != RANGE_DATUM_FINITE ||
						prev->content[col] != RANGE_DATUM_FINITE)
					{
						is_distinct = true;
						break;
					}

					cmpval = FunctionCall2Coll(&key->partsupfunc[col],
											   key->partcollation[col],
											   cur->datums[col],
											   prev->datums[col]);
					if (DatumGetInt32(cmpval) != 0)
					{
						is_distinct = true;
						break;
					}
				}

				/*
				 * Count the current bound if it is distinct from the previous
				 * one.  Also, store if the index i contains a distinct bound
				 * that we'd like put in the relcache array.
				 */
				if (is_distinct)
				{
					distinct_indexes[i] = true;
					ndatums++;
				}
				else
					distinct_indexes[i] = false;

				prev = cur;
			}

			/*
			 * Finally save them in an array from where they will be copied
			 * into the relcache.
			 */
			rbounds = (PartitionRangeBound **) palloc(ndatums *
											  sizeof(PartitionRangeBound *));
			k = 0;
			for (i = 0; i < 2 * nparts; i++)
			{
				if (distinct_indexes[i])
					rbounds[k++] = all_bounds[i];
			}
			Assert(k == ndatums);
		}
		else
			elog(ERROR, "unexpected partition strategy: %d",
				 (int) key->strategy);
	}

	/* Now build the actual relcache partition descriptor */
	rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext,
										  RelationGetRelationName(rel),
										  ALLOCSET_DEFAULT_SIZES);
	oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);

	result = (PartitionDescData *) palloc0(sizeof(PartitionDescData));
	result->nparts = nparts;
	if (nparts > 0)
	{
		PartitionBoundInfo boundinfo;
		int		   *mapping;
		int			next_index = 0;

		result->oids = (Oid *) palloc0(nparts * sizeof(Oid));

		boundinfo = (PartitionBoundInfoData *)
			palloc0(sizeof(PartitionBoundInfoData));
		boundinfo->strategy = key->strategy;
		boundinfo->ndatums = ndatums;

		/*
		 * Default to "no null-accepting partition", so that range
		 * partitioned tables get the documented -1 rather than the 0 left
		 * behind by palloc0.
		 */
		boundinfo->null_index = -1;
		boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));

		/* Initialize mapping array with invalid values */
		mapping = (int *) palloc(sizeof(int) * nparts);
		for (i = 0; i < nparts; i++)
			mapping[i] = -1;

		switch (key->strategy)
		{
			case PARTITION_STRATEGY_LIST:
				{
					boundinfo->has_null = found_null;
					boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));

					/*
					 * Copy values.  Indexes of individual values are mapped
					 * to canonical values so that they match for any two list
					 * partitioned tables with same number of partitions and
					 * same lists per partition.  One way to canonicalize is
					 * to assign the index in all_values[] of the smallest
					 * value of each partition, as the index of all of the
					 * partition's values.
					 */
					for (i = 0; i < ndatums; i++)
					{
						boundinfo->datums[i] = (Datum *) palloc(sizeof(Datum));
						boundinfo->datums[i][0] = datumCopy(all_values[i]->value,
														key->parttypbyval[0],
														 key->parttyplen[0]);

						/* If the old index has no mapping, assign one */
						if (mapping[all_values[i]->index] == -1)
							mapping[all_values[i]->index] = next_index++;

						boundinfo->indexes[i] = mapping[all_values[i]->index];
					}

					/*
					 * If null-accepting partition has no mapped index yet,
					 * assign one.  This could happen if such partition
					 * accepts only null and hence not covered in the above
					 * loop which only handled non-null values.
					 */
					if (found_null)
					{
						Assert(null_index >= 0);
						if (mapping[null_index] == -1)
							mapping[null_index] = next_index++;
					}

					/* All partition must now have a valid mapping */
					Assert(next_index == nparts);

					if (found_null)
						boundinfo->null_index = mapping[null_index];
					else
						boundinfo->null_index = -1;
					break;
				}

			case PARTITION_STRATEGY_RANGE:
				{
					boundinfo->content = (RangeDatumContent **) palloc(ndatums *
												sizeof(RangeDatumContent *));
					boundinfo->indexes = (int *) palloc((ndatums + 1) *
														sizeof(int));

					for (i = 0; i < ndatums; i++)
					{
						int			j;

						boundinfo->datums[i] = (Datum *) palloc(key->partnatts *
															  sizeof(Datum));
						boundinfo->content[i] = (RangeDatumContent *)
							palloc(key->partnatts *
								   sizeof(RangeDatumContent));
						for (j = 0; j < key->partnatts; j++)
						{
							/* Only finite bounds carry an actual datum. */
							if (rbounds[i]->content[j] == RANGE_DATUM_FINITE)
								boundinfo->datums[i][j] =
									datumCopy(rbounds[i]->datums[j],
											  key->parttypbyval[j],
											  key->parttyplen[j]);
							/* Remember, we are storing the tri-state value. */
							boundinfo->content[i][j] = rbounds[i]->content[j];
						}

						/*
						 * There is no mapping for invalid indexes.
						 *
						 * Any lower bounds in the rbounds array have invalid
						 * indexes assigned, because the values between the
						 * previous bound (if there is one) and this (lower)
						 * bound are not part of the range of any existing
						 * partition.
						 */
						if (rbounds[i]->lower)
							boundinfo->indexes[i] = -1;
						else
						{
							int			orig_index = rbounds[i]->index;

							/* If the old index has no mapping, assign one */
							if (mapping[orig_index] == -1)
								mapping[orig_index] = next_index++;

							boundinfo->indexes[i] = mapping[orig_index];
						}
					}

					/* Trailing entry: nothing lies above the last bound. */
					boundinfo->indexes[i] = -1;
					break;
				}

			default:
				elog(ERROR, "unexpected partition strategy: %d",
					 (int) key->strategy);
		}

		result->boundinfo = boundinfo;

		/*
		 * Now assign OIDs from the original array into mapped indexes of the
		 * result array.  Order of OIDs in the former is defined by the
		 * catalog scan that retrieved them, whereas that in the latter is
		 * defined by canonicalized representation of the list values or the
		 * range bounds.
		 */
		for (i = 0; i < nparts; i++)
			result->oids[mapping[i]] = oids[i];
		pfree(mapping);
	}

	MemoryContextSwitchTo(oldcxt);
	rel->rd_partdesc = result;
}
/*
 * Are two partition bound collections logically equal?
 *
 * Returns true only if b1 and b2 use the same strategy, hold the same number
 * of datums, agree on the null-accepting partition, and every datum, range
 * content flag and partition index matches position by position.  Datums are
 * compared with the partition key's comparison support functions.
 *
 * Used in the keep logic of relcache.c (ie, in RelationClearRelation()).
 * This is also useful when b1 and b2 are bound collections of two separate
 * relations, respectively, because PartitionBoundInfo is a canonical
 * representation of partition bounds.
 */
bool
partition_bounds_equal(PartitionKey key,
					   PartitionBoundInfo b1, PartitionBoundInfo b2)
{
	int			i;

	if (b1->strategy != b2->strategy)
		return false;

	if (b1->ndatums != b2->ndatums)
		return false;

	if (b1->has_null != b2->has_null)
		return false;

	if (b1->null_index != b2->null_index)
		return false;

	for (i = 0; i < b1->ndatums; i++)
	{
		int			j;

		for (j = 0; j < key->partnatts; j++)
		{
			int32		cmpval;

			/* For range partitions, the bounds might not be finite. */
			if (b1->content != NULL)
			{
				/*
				 * A finite bound always differs from an infinite bound, and
				 * different kinds of infinities differ from each other.
				 */
				if (b1->content[i][j] != b2->content[i][j])
					return false;

				/*
				 * Non-finite bounds are equal without further examination.
				 * Their datum slots were never filled in, so they must not
				 * be handed to the comparison function.
				 */
				if (b1->content[i][j] != RANGE_DATUM_FINITE)
					continue;
			}

			cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[j],
													 key->partcollation[j],
													 b1->datums[i][j],
													 b2->datums[i][j]));
			if (cmpval != 0)
				return false;
		}

		if (b1->indexes[i] != b2->indexes[i])
			return false;
	}

	/* There are ndatums+1 indexes in case of range partitions */
	if (key->strategy == PARTITION_STRATEGY_RANGE &&
		b1->indexes[i] != b2->indexes[i])
		return false;

	return true;
}
/*
 * check_new_partition_bound
 *
 * Checks if the new partition's bound overlaps any of the existing partitions
 * of parent.  Also performs additional checks as necessary per strategy.
 *
 * relname is the name of the new partition (used only in the error message),
 * parent is the partitioned table, and bound is the new partition's
 * PartitionBoundSpec.  Raises an error if a range bound would be empty or if
 * the new bound overlaps an existing partition; returns normally otherwise.
 */
void
check_new_partition_bound(char *relname, Relation parent, Node *bound)
{
	PartitionBoundSpec *spec = (PartitionBoundSpec *) bound;
	PartitionKey key = RelationGetPartitionKey(parent);
	PartitionDesc partdesc = RelationGetPartitionDesc(parent);
	ParseState *pstate = make_parsestate(NULL);
	int			with = -1;		/* index of the partition overlapped, if any */
	bool		overlap = false;

	switch (key->strategy)
	{
		case PARTITION_STRATEGY_LIST:
			{
				Assert(spec->strategy == PARTITION_STRATEGY_LIST);

				if (partdesc->nparts > 0)
				{
					PartitionBoundInfo boundinfo = partdesc->boundinfo;
					ListCell   *cell;

					Assert(boundinfo &&
						   boundinfo->strategy == PARTITION_STRATEGY_LIST &&
						   (boundinfo->ndatums > 0 || boundinfo->has_null));

					/* Any value already accepted elsewhere is an overlap. */
					foreach(cell, spec->listdatums)
					{
						Const	   *val = lfirst(cell);

						if (!val->constisnull)
						{
							int			offset;
							bool		equal;

							offset = partition_bound_bsearch(key, boundinfo,
															 &val->constvalue,
															 true, &equal);
							if (offset >= 0 && equal)
							{
								overlap = true;
								with = boundinfo->indexes[offset];
								break;
							}
						}
						else if (boundinfo->has_null)
						{
							overlap = true;
							with = boundinfo->null_index;
							break;
						}
					}
				}

				break;
			}

		case PARTITION_STRATEGY_RANGE:
			{
				PartitionRangeBound *lower,
						   *upper;

				Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
				lower = make_one_range_bound(key, -1, spec->lowerdatums, true);
				upper = make_one_range_bound(key, -1, spec->upperdatums, false);

				/*
				 * First check if the resulting range would be empty with
				 * specified lower and upper bounds
				 */
				if (partition_rbound_cmp(key, lower->datums, lower->content, true,
										 upper) >= 0)
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
					errmsg("cannot create range partition with empty range"),
							 parser_errposition(pstate, spec->location)));

				if (partdesc->nparts > 0)
				{
					PartitionBoundInfo boundinfo = partdesc->boundinfo;
					int			off1,
								off2;
					bool		equal = false;

					Assert(boundinfo && boundinfo->ndatums > 0 &&
						   boundinfo->strategy == PARTITION_STRATEGY_RANGE);

					/*
					 * Find the greatest index of a range bound that is less
					 * than or equal with the new lower bound.
					 */
					off1 = partition_bound_bsearch(key, boundinfo, lower, true,
												   &equal);

					/*
					 * If equal has been set to true, the new lower bound is
					 * equal to the bound at off1, which clearly means an
					 * overlap with the partition whose index is stored at
					 * off1 + 1.
					 *
					 * Otherwise, check if there is a "gap" that could be
					 * occupied by the new partition.  In case of a gap, the
					 * new upper bound should not cross past the upper
					 * boundary of the gap, that is, off2 == off1 should be
					 * true.
					 */
					if (!equal && boundinfo->indexes[off1 + 1] < 0)
					{
						off2 = partition_bound_bsearch(key, boundinfo, upper,
													   true, &equal);

						if (equal || off1 != off2)
						{
							overlap = true;

							/*
							 * Here off2 >= 0.  The bound at off2 is either
							 * the upper bound of an overlapped partition, in
							 * which case its own index entry identifies that
							 * partition, or a lower bound (entry is -1), in
							 * which case the partition is identified by the
							 * following bound's entry.  Looking only at
							 * off2 + 1 could yield -1 when the new upper
							 * bound reaches or passes an existing upper
							 * bound.
							 */
							if (boundinfo->indexes[off2] < 0)
								with = boundinfo->indexes[off2 + 1];
							else
								with = boundinfo->indexes[off2];
						}
					}
					else
					{
						overlap = true;
						with = boundinfo->indexes[off1 + 1];
					}
				}

				break;
			}

		default:
			elog(ERROR, "unexpected partition strategy: %d",
				 (int) key->strategy);
	}

	if (overlap)
	{
		Assert(with >= 0);
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
				 errmsg("partition \"%s\" would overlap partition \"%s\"",
						relname, get_rel_name(partdesc->oids[with])),
				 parser_errposition(pstate, spec->location)));
	}
}
/*
 * get_partition_parent
 *
 * Returns inheritance parent of a partition by scanning pg_inherits
 *
 * The scan looks for the pg_inherits row with inhrelid = relid and
 * inhseqno = 1.  An error is raised if no such row exists.
 *
 * Note: Because this function assumes that the relation whose OID is passed
 * as an argument will have precisely one parent, it should only be called
 * when it is known that the relation is a partition.
 */
Oid
get_partition_parent(Oid relid)
{
	Form_pg_inherits form;
	Relation	catalogRelation;
	SysScanDesc scan;
	ScanKeyData key[2];
	HeapTuple	tuple;
	Oid			result;

	catalogRelation = heap_open(InheritsRelationId, AccessShareLock);

	ScanKeyInit(&key[0],
				Anum_pg_inherits_inhrelid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(relid));
	ScanKeyInit(&key[1],
				Anum_pg_inherits_inhseqno,
				BTEqualStrategyNumber, F_INT4EQ,
				Int32GetDatum(1));

	scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, true,
							  NULL, 2, key);

	tuple = systable_getnext(scan);

	/*
	 * Check at run time rather than with an Assert, so that a missing row
	 * yields an error instead of a NULL dereference in non-assert builds.
	 */
	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "could not find tuple for parent of relation %u", relid);

	form = (Form_pg_inherits) GETSTRUCT(tuple);
	result = form->inhparent;

	systable_endscan(scan);
	heap_close(catalogRelation, AccessShareLock);

	return result;
}
/*
 * get_qual_from_partbound
 *		Build the partition constraint implied by a partition bound
 *
 * rel is the partition, parent is its partitioned table and bound is the
 * parser node (PartitionBoundSpec) describing the partition's bound.  The
 * result is a list of expressions whose Vars carry rel's attribute numbers.
 */
List *
get_qual_from_partbound(Relation rel, Relation parent, Node *bound)
{
	PartitionBoundSpec *spec = (PartitionBoundSpec *) bound;
	PartitionKey key = RelationGetPartitionKey(parent);
	TupleDesc	parent_tupdesc = RelationGetDescr(parent);
	List	   *my_qual = NIL;
	AttrNumber *partition_attnos;
	bool		found_whole_row;
	int			natts;
	int			idx;

	Assert(key != NULL);

	/* Generate the strategy-specific constraint, in terms of parent attnos */
	if (key->strategy == PARTITION_STRATEGY_LIST)
	{
		Assert(spec->strategy == PARTITION_STRATEGY_LIST);
		my_qual = get_qual_for_list(key, spec);
	}
	else if (key->strategy == PARTITION_STRATEGY_RANGE)
	{
		Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
		my_qual = get_qual_for_range(key, spec);
	}
	else
		elog(ERROR, "unexpected partition strategy: %d",
			 (int) key->strategy);

	/*
	 * The expressions built above reference the parent's physical attribute
	 * numbers, as dictated by the partition key.  A partition's columns may
	 * be numbered differently, so build a parent-attno -> partition-attno
	 * map by looking each column up by name.  Entries for dropped parent
	 * columns are left as zero.
	 */
	natts = parent_tupdesc->natts;
	partition_attnos = (AttrNumber *) palloc0(natts * sizeof(AttrNumber));
	for (idx = 0; idx < natts; idx++)
	{
		Form_pg_attribute attribute = parent_tupdesc->attrs[idx];

		if (!attribute->attisdropped)
			partition_attnos[idx] = get_attnum(RelationGetRelid(rel),
											   NameStr(attribute->attname));
	}

	/* Apply the map to every Var in the constraint */
	my_qual = (List *) map_variable_attnos((Node *) my_qual,
										   1, 0,
										   partition_attnos,
										   natts,
										   &found_whole_row);

	/* there can never be a whole-row reference here */
	if (found_whole_row)
		elog(ERROR, "unexpected whole-row reference found in partition key");

	return my_qual;
}
/*
 * RelationGetPartitionQual
 *
 * Returns a list of partition quals for rel, or NIL if rel is not a
 * partition.  The recurse flag is passed through unchanged to
 * generate_partition_qual().
 */
List *
RelationGetPartitionQual(Relation rel, bool recurse)
{
	/* Only a partition has a partition constraint to report */
	if (rel->rd_rel->relispartition)
		return generate_partition_qual(rel, recurse);

	return NIL;
}
/*
 * Turn an array of OIDs with N elements into a list
 *
 * Appends arr[0] .. arr[N-1], in order, to list using lappend_oid().  The
 * loop counter has a deliberately unusual name so that it cannot capture an
 * identifier (such as "i") appearing in the caller's arr, N or list
 * arguments.  N and list are evaluated more than once.
 */
#define OID_ARRAY_TO_LIST(arr, N, list) \
	do \
	{ \
		int		oid_array_to_list_idx_; \
		for (oid_array_to_list_idx_ = 0; \
			 oid_array_to_list_idx_ < (N); \
			 oid_array_to_list_idx_++) \
			(list) = lappend_oid((list), (arr)[oid_array_to_list_idx_]); \
	} while(0)
/*
* RelationGetPartitionDispatchInfo
* Returns information necessary to route tuples down a partition tree
*
* All the partitions will be locked with lockmode, unless it is NoLock.
* A list of the OIDs of all the leaf partition of rel is returned in
* *leaf_part_oids.
*/
PartitionDispatch *
RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
int *num_parted, List **leaf_part_oids)
{
PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
PartitionDispatchData **pd;
List *all_parts = NIL,
*parted_rels;
ListCell *lc;
int i,
k,
offset;
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
/*
* Lock partitions and make a list of the partitioned ones to prepare
* their PartitionDispatch objects below.
*
* Cannot use find_all_inheritors() here, because then the order of OIDs
* in parted_rels list would be unknown, which does not help, because we
* we assign indexes within individual PartitionDispatch in an order that
* is predetermined (determined by the order of OIDs in individual
* partition descriptors).
*/
*num_parted = 1;
parted_rels = list_make1(rel);
OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
foreach(lc, all_parts)
{
Relation partrel = heap_open(lfirst_oid(lc), lockmode);
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
/*
* If this partition is a partitioned table, add its children to the
* end of the list, so that they are processed as well.
*/
if (partdesc)
{
(*num_parted)++;
parted_rels = lappend(parted_rels, partrel);
OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
}
else
heap_close(partrel, NoLock);
/*
* We keep the partitioned ones open until we're done using the
* information being collected here (for example, see
* ExecEndModifyTable).
*/
}
/*
* We want to create two arrays - one for leaf partitions and another for
* partitioned tables (including the root table and internal partitions).
* While we only create the latter here, leaf partition array of suitable
* objects (such as, ResultRelInfo) is created by the caller using the
* list of OIDs we return. Indexes into these arrays get assigned in a
* breadth-first manner, whereby partitions of any given level are placed
* consecutively in the respective arrays.
*/
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
pd = (PartitionDispatchData **) palloc(*num_parted *
sizeof(PartitionDispatchData *));
*leaf_part_oids = NIL;
i = k = offset = 0;
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
foreach(lc, parted_rels)
{
Relation partrel = lfirst(lc);
PartitionKey partkey = RelationGetPartitionKey(partrel);
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
int j,
m;
pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
pd[i]->reldesc = partrel;
pd[i]->key = partkey;
pd[i]->keystate = NIL;
pd[i]->partdesc = partdesc;
pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
/*
* Indexes corresponding to the internal partitions are multiplied by
* -1 to distinguish them from those of leaf partitions. Encountering
* an index >= 0 means we found a leaf partition, which is immediately
* returned as the partition we are looking for. A negative index
* means we found a partitioned table, whose PartitionDispatch object
* is located at the above index multiplied back by -1. Using the
* PartitionDispatch object, search is continued further down the
* partition tree.
*/
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
m = 0;
for (j = 0; j < partdesc->nparts; j++)
{
Oid partrelid = partdesc->oids[j];
if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
{
*leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
pd[i]->indexes[j] = k++;
}
else
{
/*
* offset denotes the number of partitioned tables of upper
* levels including those of the current level. Any partition
* of this table must belong to the next level and hence will
* be placed after the last partitioned table of this level.
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
*/
pd[i]->indexes[j] = -(1 + offset + m);
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
m++;
}
}
i++;
/*
* This counts the number of partitioned tables at upper levels
* including those of the current level.
*/
offset += m;
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
}
return pd;
}
/* Module-local functions */
/*
 * get_qual_for_list
 *
 * Returns a list of expressions to use as a list partition's constraint.
 *
 *	key		partition key of the parent table; only column 0 is consulted
 *	spec	bound specification whose listdatums are the accepted values
 *
 * If the bound list contains no NULL, the result is the two-element list
 *		(keyCol IS NOT NULL), (keyCol = ANY (ARRAY[...]))
 * to be AND'd together by the caller.  If it does contain a NULL, the result
 * is the single expression
 *		(keyCol IS NULL) OR (keyCol = ANY (ARRAY[...]))
 *
 * spec->listdatums is not modified: the non-null values are copied into a
 * private list, so the caller's bound specification still lists every value
 * (including NULL) after this function returns.
 */
static List *
get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec)
{
    List       *result;
    List       *elems = NIL;
    ArrayExpr  *arr;
    ScalarArrayOpExpr *opexpr;
    ListCell   *cell;
    Node       *keyCol;
    Oid         operoid;
    bool        need_relabel,
                list_has_null = false;
    NullTest   *nulltest;

    /* Left operand is either a simple Var or arbitrary expression */
    if (key->partattrs[0] != 0)
        keyCol = (Node *) makeVar(1,
                                  key->partattrs[0],
                                  key->parttypid[0],
                                  key->parttypmod[0],
                                  key->parttypcoll[0],
                                  0);
    else
        keyCol = (Node *) copyObject(linitial(key->partexprs));

    /*
     * Collect the non-null values into our own list.  A NULL value cannot
     * take part in the = ANY (...) test, so it is only remembered here and
     * handled through a separate NullTest below.
     */
    foreach(cell, spec->listdatums)
    {
        Const      *val = (Const *) lfirst(cell);

        if (val->constisnull)
            list_has_null = true;
        else
            elems = lappend(elems, copyObject(val));
    }

    /*
     * Gin up the null test: IS NOT NULL (to be AND'd with the array test)
     * when NULL is not an accepted value, IS NULL (to be OR'd with it) when
     * it is.  This deliberately uses the key before any RelabelType is put
     * on top of it.
     */
    nulltest = makeNode(NullTest);
    nulltest->arg = (Expr *) keyCol;
    nulltest->nulltesttype = list_has_null ? IS_NULL : IS_NOT_NULL;
    nulltest->argisrow = false;
    nulltest->location = -1;

    /* Right operand is an ArrayExpr containing this partition's values */
    arr = makeNode(ArrayExpr);
    arr->array_typeid = !type_is_array(key->parttypid[0])
        ? get_array_type(key->parttypid[0])
        : key->parttypid[0];
    arr->array_collid = key->parttypcoll[0];
    arr->element_typeid = key->parttypid[0];
    arr->elements = elems;
    arr->multidims = false;
    arr->location = -1;

    /* Get the correct btree equality operator */
    operoid = get_partition_operator(key, 0, BTEqualStrategyNumber,
                                     &need_relabel);
    if (need_relabel || key->partcollation[0] != key->parttypcoll[0])
        keyCol = (Node *) makeRelabelType((Expr *) keyCol,
                                          key->partopcintype[0],
                                          -1,
                                          key->partcollation[0],
                                          COERCE_EXPLICIT_CAST);

    /* Build leftop = ANY (rightop) */
    opexpr = makeNode(ScalarArrayOpExpr);
    opexpr->opno = operoid;
    opexpr->opfuncid = get_opcode(operoid);
    opexpr->useOr = true;
    opexpr->inputcollid = key->partcollation[0];
    opexpr->args = list_make2(keyCol, arr);
    opexpr->location = -1;

    if (list_has_null)
    {
        Expr       *or;

        or = makeBoolExpr(OR_EXPR, list_make2(nulltest, opexpr), -1);
        result = list_make1(or);
    }
    else
        result = list_make2(nulltest, opexpr);

    return result;
}
/*
 * get_qual_for_range
 *
 * Get a list of OpExpr's to use as a range partition's constraint.
 *
 *	key		partition key of the parent table
 *	spec	bound specification supplying lowerdatums and upperdatums
 *
 * Columns are visited in key order.  For every visited expression (non-Var)
 * column an IS NOT NULL test is emitted.  While the lower and upper bounds
 * of a column are both finite and equal, (keyCol = lower_val) is emitted and
 * the next column is considered.  At the first column where that is not the
 * case, (keyCol >= lower_val) and/or (keyCol < upper_val) are emitted for
 * whichever bounds are finite, and processing stops.
 *
 * Whenever the key has to be relabeled for an operator, the RelabelType is
 * built on top of the bare key expression, so an operand never ends up
 * wrapped in more than one RelabelType even when both a lower and an upper
 * bound expression are generated for the same column.
 */
static List *
get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec)
{
    List       *result = NIL;
    ListCell   *cell1,
               *cell2,
               *partexprs_item;
    int         i;

    /*
     * Iterate over columns of the key, emitting an OpExpr for each using the
     * corresponding lower and upper datums as constant operands.
     */
    i = 0;
    partexprs_item = list_head(key->partexprs);
    forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
    {
        PartitionRangeDatum *ldatum = lfirst(cell1),
                   *udatum = lfirst(cell2);
        Node       *keyCol;
        Node       *operand;
        Const      *lower_val = NULL,
                   *upper_val = NULL;
        EState     *estate;
        MemoryContext oldcxt;
        Expr       *test_expr;
        ExprState  *test_exprstate;
        Datum       test_result;
        bool        isNull;
        bool        need_relabel = false;
        Oid         operoid;
        NullTest   *nulltest;

        /* Left operand */
        if (key->partattrs[i] != 0)
        {
            keyCol = (Node *) makeVar(1,
                                      key->partattrs[i],
                                      key->parttypid[i],
                                      key->parttypmod[i],
                                      key->parttypcoll[i],
                                      0);
        }
        else
        {
            keyCol = (Node *) copyObject(lfirst(partexprs_item));
            partexprs_item = lnext(partexprs_item);
        }

        /*
         * Emit a IS NOT NULL expression for non-Var keys, because whereas
         * simple attributes are covered by NOT NULL constraints, expression
         * keys are still nullable which is not acceptable in case of range
         * partitioning.
         */
        if (!IsA(keyCol, Var))
        {
            nulltest = makeNode(NullTest);
            nulltest->arg = (Expr *) keyCol;
            nulltest->nulltesttype = IS_NOT_NULL;
            nulltest->argisrow = false;
            nulltest->location = -1;
            result = lappend(result, nulltest);
        }

        /*
         * Stop at this column if either of lower or upper datum is infinite,
         * but do emit an OpExpr for the non-infinite datum.
         */
        if (!ldatum->infinite)
            lower_val = (Const *) ldatum->value;
        if (!udatum->infinite)
            upper_val = (Const *) udatum->value;

        /*
         * If lower_val and upper_val are both finite and happen to be equal,
         * emit only (keyCol = lower_val) for this column, because all rows in
         * this partition could only ever contain this value (ie, lower_val)
         * in the current partitioning column.  We must consider further
         * columns because the above condition does not fully constrain the
         * rows of this partition.
         */
        if (lower_val && upper_val)
        {
            /* Get the correct btree equality operator for the test */
            operoid = get_partition_operator(key, i, BTEqualStrategyNumber,
                                             &need_relabel);

            /*
             * Create and evaluate (lower_val = upper_val) in a throwaway
             * executor state; everything it allocates goes away with it.
             */
            estate = CreateExecutorState();
            oldcxt = MemoryContextSwitchTo(estate->es_query_cxt);
            test_expr = make_opclause(operoid,
                                      BOOLOID,
                                      false,
                                      (Expr *) lower_val,
                                      (Expr *) upper_val,
                                      InvalidOid,
                                      key->partcollation[i]);
            fix_opfuncids((Node *) test_expr);
            test_exprstate = ExecInitExpr(test_expr, NULL);
            test_result = ExecEvalExprSwitchContext(test_exprstate,
                                              GetPerTupleExprContext(estate),
                                                    &isNull, NULL);
            MemoryContextSwitchTo(oldcxt);
            FreeExecutorState(estate);

            /* A NULL comparison result is not "equal" */
            if (!isNull && DatumGetBool(test_result))
            {
                /* This can never be, but it's better to make sure */
                if (i == key->partnatts - 1)
                    elog(ERROR, "invalid range bound specification");

                operand = keyCol;
                if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
                    operand = (Node *) makeRelabelType((Expr *) keyCol,
                                                       key->partopcintype[i],
                                                       -1,
                                                       key->partcollation[i],
                                                       COERCE_EXPLICIT_CAST);
                result = lappend(result,
                                 make_opclause(operoid,
                                               BOOLOID,
                                               false,
                                               (Expr *) operand,
                                               (Expr *) lower_val,
                                               InvalidOid,
                                               key->partcollation[i]));

                /* Go over to consider the next column. */
                i++;
                continue;
            }
        }

        /*
         * We can say here that lower_val != upper_val.  Emit expressions
         * (keyCol >= lower_val) and (keyCol < upper_val), then stop.
         *
         * Each operand is derived from the bare keyCol, so that the upper
         * bound's operand is not a RelabelType stacked on the one already
         * built for the lower bound.
         */
        if (lower_val)
        {
            operoid = get_partition_operator(key, i,
                                             BTGreaterEqualStrategyNumber,
                                             &need_relabel);

            operand = keyCol;
            if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
                operand = (Node *) makeRelabelType((Expr *) keyCol,
                                                   key->partopcintype[i],
                                                   -1,
                                                   key->partcollation[i],
                                                   COERCE_EXPLICIT_CAST);
            result = lappend(result,
                             make_opclause(operoid,
                                           BOOLOID,
                                           false,
                                           (Expr *) operand,
                                           (Expr *) lower_val,
                                           InvalidOid,
                                           key->partcollation[i]));
        }

        if (upper_val)
        {
            operoid = get_partition_operator(key, i,
                                             BTLessStrategyNumber,
                                             &need_relabel);

            operand = keyCol;
            if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
                operand = (Node *) makeRelabelType((Expr *) keyCol,
                                                   key->partopcintype[i],
                                                   -1,
                                                   key->partcollation[i],
                                                   COERCE_EXPLICIT_CAST);
            result = lappend(result,
                             make_opclause(operoid,
                                           BOOLOID,
                                           false,
                                           (Expr *) operand,
                                           (Expr *) upper_val,
                                           InvalidOid,
                                           key->partcollation[i]));
        }

        /*
         * We can stop at this column, because we would not have checked the
         * next column when routing a given row into this partition.
         */
        break;
    }

    return result;
}
/*
 * get_partition_operator
 *
 * Look up, in the operator family of partition key column 'col', the
 * operator implementing 'strategy', and return its OID.
 *
 * *need_relabel is set to false when the operator was found for the column's
 * own type, and to true when the lookup had to fall back to the operator
 * class's declared input type.  Raises an error if neither lookup succeeds.
 */
static Oid
get_partition_operator(PartitionKey key, int col, StrategyNumber strategy,
                       bool *need_relabel)
{
    Oid         opfamily = key->partopfamily[col];
    Oid         coltype = key->parttypid[col];
    Oid         opcintype;
    Oid         found;

    /*
     * Preferred case: the family has an operator of this strategy taking the
     * column's type on both sides.
     */
    found = get_opfamily_member(opfamily, coltype, coltype, strategy);
    if (OidIsValid(found))
    {
        *need_relabel = false;
        return found;
    }

    /*
     * Otherwise fall back to an operator in the same operator family that
     * takes the operator class's declared input type.  That is OK, because
     * the column's type is known to be binary-coercible with the operator
     * class input type (otherwise, the operator class in question would not
     * have been accepted as the partitioning operator class).  The caller is
     * told to wrap the non-Const expression in a RelabelType node to denote
     * the implicit coercion, which keeps the resulting expression
     * structurally matching similarly processed expressions within the
     * optimizer.
     */
    opcintype = key->partopcintype[col];
    found = get_opfamily_member(opfamily, opcintype, opcintype, strategy);
    *need_relabel = true;

    if (!OidIsValid(found))
        elog(ERROR, "could not find operator for partitioning");

    return found;
}
/*
 * generate_partition_qual
 *
 * Generate partition predicate from rel's partition bound expression
 *
 *	rel			the partition whose constraint is wanted
 *	recurse		if true and the parent is itself a partition, the parent's
 *				quals (obtained recursively) are placed in front of rel's own
 *
 * Returns a list of expressions built in the current memory context.
 *
 * Only rel's own quals are cached, in rel->rd_partcheck; the copy is made in
 * CacheMemoryContext so that it survives as long as the relcache entry.  But
 * we should be running in a less long-lived working context.  To avoid
 * leaking cache memory if this routine fails partway through, we build in
 * working memory and then copy the completed structure into cache memory.
 *
 * Locking: the parent is opened with AccessShareLock.  When the cached quals
 * are used the lock is released again on return; when the quals have to be
 * built the lock is kept until commit.
 */
static List *
generate_partition_qual(Relation rel, bool recurse)
{
    HeapTuple   tuple;
    MemoryContext oldcxt;
    Datum       boundDatum;
    bool        isnull;
    Node       *bound;
    List       *my_qual = NIL,
               *result = NIL;
    Relation    parent;

    /* Guard against stack overflow due to overly deep partition tree */
    check_stack_depth();

    /* Grab at least an AccessShareLock on the parent table */
    parent = heap_open(get_partition_parent(RelationGetRelid(rel)),
                       AccessShareLock);

    /* Quick copy */
    if (rel->rd_partcheck)
    {
        if (parent->rd_rel->relispartition && recurse)
            result = list_concat(generate_partition_qual(parent, true),
                                 copyObject(rel->rd_partcheck));
        else
            result = copyObject(rel->rd_partcheck);

        heap_close(parent, AccessShareLock);
        return result;
    }

    /* Get pg_class.relpartbound */
    if (!rel->rd_rel->relispartition)   /* should not happen */
        elog(ERROR, "relation \"%s\" has relispartition = false",
             RelationGetRelationName(rel));

    tuple = SearchSysCache1(RELOID, RelationGetRelid(rel));

    /* Don't hand an invalid tuple to SysCacheGetAttr */
    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "cache lookup failed for relation %u",
             RelationGetRelid(rel));

    boundDatum = SysCacheGetAttr(RELOID, tuple,
                                 Anum_pg_class_relpartbound,
                                 &isnull);
    if (isnull)                 /* should not happen */
        elog(ERROR, "relation \"%s\" has relpartbound = null",
             RelationGetRelationName(rel));

    /* Convert the stored bound to a node tree before releasing the tuple */
    bound = stringToNode(TextDatumGetCString(boundDatum));
    ReleaseSysCache(tuple);

    my_qual = get_qual_from_partbound(rel, parent, bound);

    /* If requested, add parent's quals to the list (if any) */
    if (parent->rd_rel->relispartition && recurse)
    {
        List       *parent_check;

        parent_check = generate_partition_qual(parent, true);
        result = list_concat(parent_check, my_qual);
    }
    else
        result = my_qual;

    /* Save a copy of my_qual in the relcache */
    oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
    rel->rd_partcheck = copyObject(my_qual);
    MemoryContextSwitchTo(oldcxt);

    /* Keep the parent locked until commit */
    heap_close(parent, NoLock);

    return result;
}
/* ----------------
 *		FormPartitionKeyDatum
 *			Fill values[] and isnull[] with the partition key of a tuple.
 *
 *	pd				Partition dispatch object of the partitioned table
 *	slot			Heap tuple from which to extract partition key
 *	estate			executor state for evaluating any partition key
 *					expressions (must be non-NULL)
 *	values			Array of partition key Datums (output area)
 *	isnull			Array of is-null indicators (output area)
 *
 * The ecxt_scantuple slot of estate's per-tuple expr context must point to
 * the heap tuple passed in.
 *
 * Expression state for expression key columns is built on the first call
 * that needs it and kept in pd->keystate for later calls.
 * ----------------
 */
static void
FormPartitionKeyDatum(PartitionDispatch pd,
                      TupleTableSlot *slot,
                      EState *estate,
                      Datum *values,
                      bool *isnull)
{
    PartitionKey partkey = pd->key;
    ListCell   *expr_cell;
    int         keyno;

    /* First time through, set up expression evaluation state */
    if (pd->keystate == NIL && partkey->partexprs != NIL)
    {
        /* Check caller has set up context correctly */
        Assert(estate != NULL &&
               GetPerTupleExprContext(estate)->ecxt_scantuple == slot);

        pd->keystate = (List *) ExecPrepareExpr((Expr *) partkey->partexprs,
                                                estate);
    }

    /*
     * Walk the key columns; expr_cell advances through the expression states
     * each time an expression column (attribute number 0) is met.
     */
    expr_cell = list_head(pd->keystate);
    for (keyno = 0; keyno < partkey->partnatts; keyno++)
    {
        AttrNumber  attno = partkey->partattrs[keyno];

        if (attno == 0)
        {
            /* Expression; need to evaluate it */
            if (expr_cell == NULL)
                elog(ERROR, "wrong number of partition key expressions");

            values[keyno] =
                ExecEvalExprSwitchContext((ExprState *) lfirst(expr_cell),
                                          GetPerTupleExprContext(estate),
                                          &isnull[keyno],
                                          NULL);
            expr_cell = lnext(expr_cell);
        }
        else
        {
            /* Plain column; get the value directly from the heap tuple */
            values[keyno] = slot_getattr(slot, attno, &isnull[keyno]);
        }
    }

    /* Every expression state must have been consumed */
    if (expr_cell != NULL)
        elog(ERROR, "wrong number of partition key expressions");
}
/*
* get_partition_for_tuple
* Finds a leaf partition for tuple contained in *slot
*
* Returned value is the sequence number of the leaf partition thus found,
* or -1 if no leaf partition is found for the tuple. *failed_at is set
* to the OID of the partitioned table whose partition was not found in
* the latter case.
*/
int
get_partition_for_tuple(PartitionDispatch *pd,
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
TupleTableSlot *slot,
EState *estate,
Oid *failed_at)
{
PartitionDispatch parent;
Datum values[PARTITION_MAX_KEYS];
bool isnull[PARTITION_MAX_KEYS];
int cur_offset,
cur_index;
int i;
/* start with the root partitioned table */
parent = pd[0];
while (true)
{
PartitionKey key = parent->key;
PartitionDesc partdesc = parent->partdesc;
/* Quick exit */
if (partdesc->nparts == 0)
{
*failed_at = RelationGetRelid(parent->reldesc);
return -1;
}
/* Extract partition key from tuple */
FormPartitionKeyDatum(parent, slot, estate, values, isnull);
if (key->strategy == PARTITION_STRATEGY_RANGE)
{
/* Disallow nulls in the range partition key of the tuple */
for (i = 0; i < key->partnatts; i++)
if (isnull[i])
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("range partition key of row contains null")));
}
if (partdesc->boundinfo->has_null && isnull[0])
/* Tuple maps to the null-accepting list partition */
cur_index = partdesc->boundinfo->null_index;
else
{
/* Else bsearch in partdesc->boundinfo */
bool equal = false;
cur_offset = partition_bound_bsearch(key, partdesc->boundinfo,
values, false, &equal);
switch (key->strategy)
{
case PARTITION_STRATEGY_LIST:
if (cur_offset >= 0 && equal)
cur_index = partdesc->boundinfo->indexes[cur_offset];
else
cur_index = -1;
break;
case PARTITION_STRATEGY_RANGE:
/*
* Offset returned is such that the bound at offset is
* found to be less or equal with the tuple. So, the bound
* at offset+1 would be the upper bound.
*/
cur_index = partdesc->boundinfo->indexes[cur_offset + 1];
break;
default:
elog(ERROR, "unexpected partition strategy: %d",
(int) key->strategy);
}
}
/*
* cur_index < 0 means we failed to find a partition of this parent.
* cur_index >= 0 means we either found the leaf partition, or the
* next parent to find a partition of.
*/
if (cur_index < 0)
{
*failed_at = RelationGetRelid(parent->reldesc);
return -1;
}
else if (parent->indexes[cur_index] < 0)
parent = pd[-parent->indexes[cur_index]];
else
break;
}
return parent->indexes[cur_index];
}
/*
* qsort_partition_list_value_cmp
*
* Compare two list partition bound datums
*/
static int32
qsort_partition_list_value_cmp(const void *a, const void *b, void *arg)
{
Datum val1 = (*(const PartitionListValue **) a)->value,
val2 = (*(const PartitionListValue **) b)->value;
PartitionKey key = (PartitionKey) arg;
return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
key->partcollation[0],
val1, val2));
}
/*
 * make_one_range_bound
 *
 * Build a PartitionRangeBound from a list of PartitionRangeDatum elements.
 * 'index' is stored in the bound as given, and 'lower' tells whether this is
 * a lower bound: an infinite datum becomes negative infinity in a lower
 * bound and positive infinity in an upper one.  Made into a function because
 * there are multiple sites that want to use this facility.
 *
 * Raises an error if a finite datum is a null constant.
 */
static PartitionRangeBound *
make_one_range_bound(PartitionKey key, int index, List *datums, bool lower)
{
    PartitionRangeBound *rbound;
    ListCell   *lc;
    int         col = 0;

    rbound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound));
    rbound->index = index;
    rbound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum));
    rbound->content = (RangeDatumContent *) palloc0(key->partnatts *
                                                sizeof(RangeDatumContent));
    rbound->lower = lower;

    foreach(lc, datums)
    {
        PartitionRangeDatum *rdatum = lfirst(lc);

        if (rdatum->infinite)
        {
            /* No value to store; just record which infinity this is */
            rbound->content[col] = lower ? RANGE_DATUM_NEG_INF
                : RANGE_DATUM_POS_INF;
        }
        else
        {
            Const      *val = (Const *) rdatum->value;

            rbound->content[col] = RANGE_DATUM_FINITE;

            if (val->constisnull)
                elog(ERROR, "invalid range bound datum");
            rbound->datums[col] = val->constvalue;
        }

        col++;
    }

    return rbound;
}
/* Used when sorting range bounds across all range partitions */
static int32
qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
{
PartitionRangeBound *b1 = (*(PartitionRangeBound *const *) a);
PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b);
PartitionKey key = (PartitionKey) arg;
return partition_rbound_cmp(key, b1->datums, b1->content, b1->lower, b2);
}
/*
 * partition_rbound_cmp
 *
 * Compare two range bounds: the 1st one is given piecewise in datums1,
 * content1 and lower1, the 2nd one in *b2.  Returns a negative value, zero
 * or a positive value according to whether the 1st bound is less than,
 * equal to or greater than the 2nd.
 */
static int32
partition_rbound_cmp(PartitionKey key,
                     Datum *datums1, RangeDatumContent *content1, bool lower1,
                     PartitionRangeBound *b2)
{
    int         col;

    for (col = 0; col < key->partnatts; col++)
    {
        RangeDatumContent c1 = content1[col];
        RangeDatumContent c2 = b2->content[col];
        int32       cmpval;

        /*
         * First, handle cases involving infinity, which don't require
         * invoking the comparison proc.
         */
        if (c1 != RANGE_DATUM_FINITE && c2 != RANGE_DATUM_FINITE)
        {
            /*
             * Both are infinity, so they are equal unless one is negative
             * infinity and other positive (or vice versa)
             */
            if (c1 == c2)
                return 0;
            return (c1 < c2) ? -1 : 1;
        }
        if (c1 != RANGE_DATUM_FINITE)
            return (c1 == RANGE_DATUM_NEG_INF) ? -1 : 1;
        if (c2 != RANGE_DATUM_FINITE)
            return (c2 == RANGE_DATUM_NEG_INF) ? 1 : -1;

        /* Both finite: ask the column's comparison support function */
        cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[col],
                                                 key->partcollation[col],
                                                 datums1[col],
                                                 b2->datums[col]));

        /* If the comparison is anything other than equal, we're done */
        if (cmpval != 0)
            return cmpval;
    }

    /*
     * The datums compare equal, but we still have to consider whether the
     * boundaries are inclusive or exclusive.  Exclusive one is considered
     * smaller of the two.
     */
    if (lower1 != b2->lower)
        return lower1 ? 1 : -1;

    return 0;
}
/*
 * partition_rbound_datum_cmp
 *
 * Compare a range bound (described by rb_datums and rb_content) with the
 * partition key values of a tuple (tuple_datums), one key column at a time.
 *
 * The first column that is not "equal" decides the outcome:
 *	- a non-finite bound column yields -1 if it is negative infinity and 1
 *	  otherwise, without calling the comparison proc;
 *	- otherwise the column's partitioning support function is called with
 *	  the bound datum as first argument and the tuple datum as second, and
 *	  a nonzero result is returned as is.
 *
 * Returns 0 only if every column compared equal.  If the key has no columns
 * at all, the initial value -1 is returned.
 */
static int32
partition_rbound_datum_cmp(PartitionKey key,
						   Datum *rb_datums, RangeDatumContent *rb_content,
						   Datum *tuple_datums)
{
	int32		result = -1;
	int			col = 0;

	while (col < key->partnatts)
	{
		/* An infinite bound column settles the ordering on its own. */
		if (rb_content[col] != RANGE_DATUM_FINITE)
		{
			if (rb_content[col] == RANGE_DATUM_NEG_INF)
				return -1;
			return 1;
		}

		result = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[col],
												 key->partcollation[col],
												 rb_datums[col],
												 tuple_datums[col]));

		/* Stop at the first column that tells the two apart. */
		if (result != 0)
			return result;

		col++;
	}

	return result;
}
/*
 * partition_bound_cmp
 *
 * Compare the bound stored at 'offset' in boundinfo with the argument passed
 * in *probe, dispatching on the partitioning strategy of 'key'.
 *
 * List strategy: *probe is read as a single Datum and compared with the
 * bound's first datum using the first key column's support function.
 *
 * Range strategy: if probe_is_bound is true, *probe is treated as a
 * PartitionRangeBound and compared via partition_rbound_cmp(); otherwise it
 * is treated as a Datum array holding a tuple's partition key and compared
 * via partition_rbound_datum_cmp().
 *
 * The value returned is whatever the selected comparison produced; 0 means
 * the bound and the probe are equal (presumably negative/positive follow the
 * usual three-way comparison convention -- confirm against the support
 * function's contract).  Any other strategy raises an error.
 */
static int32
partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo,
					int offset, void *probe, bool probe_is_bound)
{
	Datum	   *bound_datums = boundinfo->datums[offset];

	if (key->strategy == PARTITION_STRATEGY_LIST)
	{
		Datum		probe_value = *(Datum *) probe;

		return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
											   key->partcollation[0],
											   bound_datums[0],
											   probe_value));
	}

	if (key->strategy == PARTITION_STRATEGY_RANGE)
	{
		RangeDatumContent *bound_content = boundinfo->content[offset];
		bool		bound_is_lower;

		/* Probe is a tuple's key: no lower/upper distinction is needed. */
		if (!probe_is_bound)
			return partition_rbound_datum_cmp(key,
											  bound_datums, bound_content,
											  (Datum *) probe);

		/*
		 * Probe is itself a bound.  Tell the comparator whether the stored
		 * bound is a lower bound (negative entry in indexes[]), so that a
		 * lower and an upper bound with equal values are not reported as
		 * equal.
		 */
		bound_is_lower = (boundinfo->indexes[offset] < 0);
		return partition_rbound_cmp(key,
									bound_datums, bound_content,
									bound_is_lower,
									(PartitionRangeBound *) probe);
	}

	elog(ERROR, "unexpected partition strategy: %d",
		 (int) key->strategy);

	/* Same fallback value the comparison result starts out with. */
	return -1;
}
/*
 * partition_bound_bsearch
 *
 * Binary search on a collection of partition bounds.  Returns the greatest
 * index of a bound in array boundinfo->datums which is less than or equal to
 * *probe.  If all bounds in the array are greater than *probe (or the array
 * is empty), -1 is returned.
 *
 * *probe could either be a partition bound or a Datum array representing
 * the partition key of a tuple being routed; probe_is_bound tells which.
 * We pass that down to the comparison function so that it can interpret the
 * contents of *probe accordingly.
 *
 * *is_equal is always written: it is set to whether the bound at the
 * returned index is equal to *probe, and to false when -1 is returned since
 * there is then no such bound.
 */
static int
partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo,
						void *probe, bool probe_is_bound, bool *is_equal)
{
	int			lo,
				hi,
				mid;

	/*
	 * Give the output flag a defined value up front.  The loop below only
	 * stores to it when some bound compares <= *probe, so without this a
	 * caller that did not pre-initialize its variable would read garbage
	 * whenever the result is -1.
	 */
	*is_equal = false;

	lo = -1;
	hi = boundinfo->ndatums - 1;
	while (lo < hi)
	{
		int32		cmpval;

		/* Round the midpoint up so that "lo = mid" always makes progress. */
		mid = (lo + hi + 1) / 2;
		cmpval = partition_bound_cmp(key, boundinfo, mid, probe,
									 probe_is_bound);
		if (cmpval <= 0)
		{
			/* Bound at mid is <= probe: it is the best candidate so far. */
			lo = mid;
			*is_equal = (cmpval == 0);
		}
		else
			hi = mid - 1;
	}

	return lo;
}