postgresql/src/backend/catalog/partition.c

/*-------------------------------------------------------------------------
 *
 * partition.c
 *		  Partitioning related data structures and functions.
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *		  src/backend/catalog/partition.c
 *
 *-------------------------------------------------------------------------
*/

#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/sysattr.h"
#include "catalog/dependency.h"
#include "catalog/indexing.h"
#include "catalog/objectaddress.h"
#include "catalog/partition.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_inherits.h"
#include "catalog/pg_inherits_fn.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_type.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/parsenodes.h"
#include "optimizer/clauses.h"
#include "optimizer/planmain.h"
#include "optimizer/var.h"
#include "rewrite/rewriteManip.h"
#include "storage/lmgr.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/memutils.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/ruleutils.h"
#include "utils/syscache.h"

/*
 * Information about bounds of a partitioned relation
 *
 * A list partition datum that is known to be NULL is never put into the
 * datums array. Instead, it is tracked using has_null and null_index fields.
 *
 * In the case of range partitioning, ndatums will typically be far less than
 * 2 * nparts, because a partition's upper bound and the next partition's lower
 * bound are the same in most common cases, and we only store one of them.
 *
 * In the case of list partitioning, the indexes array stores one entry for
 * every datum, which is the index of the partition that accepts a given datum.
 * In case of range partitioning, it stores one entry per distinct range
 * datum, which is the index of the partition for which a given datum
 * is an upper bound.
 */

/*
 * Ternary value to represent what's contained in a range bound datum.
 *
 * Only a RANGE_DATUM_FINITE entry has a corresponding Datum value; for the
 * two infinite kinds no datum is stored, so the matching slot of the datums
 * array must not be examined.
 */
typedef enum RangeDatumContent
{
	RANGE_DATUM_FINITE = 0,		/* actual datum stored elsewhere */
	RANGE_DATUM_NEG_INF,		/* negative infinity */
	RANGE_DATUM_POS_INF			/* positive infinity */
} RangeDatumContent;

/*
 * Canonical, sorted representation of the bounds of all partitions of one
 * partitioned table; built by RelationBuildPartitionDesc().
 */
typedef struct PartitionBoundInfoData
{
	char		strategy;		/* list or range bounds? */
	int			ndatums;		/* Length of the datums following array */
	Datum	  **datums;			/* Array of datum-tuples with key->partnatts
								 * datums each */
	RangeDatumContent **content;/* what's contained in each range bound datum?
								 * (see the above enum); NULL for list
								 * partitioned tables */
	int		   *indexes;		/* Partition indexes; one entry per member of
								 * the datums array (plus one if range
								 * partitioned table) */
	bool		has_null;		/* Is there a null-accepting partition? false
								 * for range partitioned tables */
	int			null_index;		/* Index of the null-accepting partition; -1
								 * for range partitioned tables */
} PartitionBoundInfoData;

/*
 * When qsort'ing partition bounds after reading from the catalog, each bound
 * is represented with one of the following structs.
 */

/*
 * One value coming from some (index'th) list partition.
 *
 * "index" is the position of the owning partition in the order in which the
 * partitions were read from the catalog, not its final canonical index.
 */
typedef struct PartitionListValue
{
	int			index;			/* partition the value belongs to */
	Datum		value;			/* the (non-null) list value itself */
} PartitionListValue;

/*
 * One bound of a range partition.
 *
 * Each range partition contributes two of these: its lower and its upper
 * bound.  datums and content are parallel arrays with one entry per
 * partition key column.
 */
typedef struct PartitionRangeBound
{
	int			index;			/* partition the bound belongs to */
	Datum	   *datums;			/* range bound datums */
	RangeDatumContent *content; /* what's contained in each datum? */
	bool		lower;			/* this is the lower (vs upper) bound */
} PartitionRangeBound;

/* qsort_arg() comparators used to sort bounds read from the catalog */
static int32 qsort_partition_list_value_cmp(const void *a, const void *b,
							   void *arg);
static int32 qsort_partition_rbound_cmp(const void *a, const void *b,
						   void *arg);

/* Generation of the expressions that make up a partition's constraint */
static List *get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec);
static List *get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec);
static Oid get_partition_operator(PartitionKey key, int col,
					   StrategyNumber strategy, bool *need_relabel);
static List *generate_partition_qual(Relation rel, bool recurse);

/* Construction and comparison of individual range bounds */
static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index,
					 List *datums, bool lower);
static int32 partition_rbound_cmp(PartitionKey key,
					 Datum *datums1, RangeDatumContent *content1, bool lower1,
					 PartitionRangeBound *b2);
static int32 partition_rbound_datum_cmp(PartitionKey key,
						   Datum *rb_datums, RangeDatumContent *rb_content,
						   Datum *tuple_datums);

/* Comparison against, and binary search over, a PartitionBoundInfo */
static int32 partition_bound_cmp(PartitionKey key,
					PartitionBoundInfo boundinfo,
					int offset, void *probe, bool probe_is_bound);
static int partition_bound_bsearch(PartitionKey key,
						PartitionBoundInfo boundinfo,
						void *probe, bool probe_is_bound, bool *is_equal);

/* Support get_partition_for_tuple() */
static void FormPartitionKeyDatum(PartitionDispatch pd,
					  TupleTableSlot *slot,
					  EState *estate,
					  Datum *values,
					  bool *isnull);

/*
 * RelationBuildPartitionDesc
 *		Form rel's partition descriptor
 *
 * Reads the bound specification of every partition of rel from pg_class
 * (the partitions themselves are found through pg_inherits), converts them
 * into a sorted, canonical PartitionBoundInfo and stores the resulting
 * PartitionDesc in rel->rd_partdesc.  The descriptor is allocated in a
 * dedicated memory context, rel->rd_pdcxt, created under CacheMemoryContext.
 *
 * Returns without doing anything if rel has no partition key yet.
 *
 * Not flushed from the cache by RelationClearRelation() unless changed because
 * of addition or removal of partition.
 */
void
RelationBuildPartitionDesc(Relation rel)
{
	List	   *inhoids,
			   *partoids;
	Oid		   *oids = NULL;
	List	   *boundspecs = NIL;
	ListCell   *cell;
	int			i,
				nparts;
	PartitionKey key = RelationGetPartitionKey(rel);
	PartitionDesc result;
	MemoryContext oldcxt;

	int			ndatums = 0;

	/* List partitioning specific */
	PartitionListValue **all_values = NULL;
	bool		found_null = false;
	int			null_index = -1;

	/* Range partitioning specific */
	PartitionRangeBound **rbounds = NULL;

	/*
	 * The following could happen in situations where rel has a pg_class entry
	 * but not the pg_partitioned_table entry yet.
	 */
	if (key == NULL)
		return;

	/* Get partition oids from pg_inherits */
	inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);

	/* Collect bound spec nodes in a list */
	partoids = NIL;
	foreach(cell, inhoids)
	{
		Oid			inhrelid = lfirst_oid(cell);
		HeapTuple	tuple;
		Datum		datum;
		bool		isnull;
		Node	   *boundspec;

		tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));

		/*
		 * The child could have vanished from pg_class; report that instead
		 * of dereferencing an invalid tuple below.
		 */
		if (!HeapTupleIsValid(tuple))
			elog(ERROR, "cache lookup failed for relation %u", inhrelid);

		/*
		 * It is possible that the pg_class tuple of a partition has not been
		 * updated yet to set its relpartbound field.  The only case where
		 * this happens is when we open the parent relation to check using its
		 * partition descriptor that a new partition's bound does not overlap
		 * some existing partition.
		 */
		if (!((Form_pg_class) GETSTRUCT(tuple))->relispartition)
		{
			ReleaseSysCache(tuple);
			continue;
		}

		datum = SysCacheGetAttr(RELOID, tuple,
								Anum_pg_class_relpartbound,
								&isnull);
		Assert(!isnull);
		boundspec = (Node *) stringToNode(TextDatumGetCString(datum));
		boundspecs = lappend(boundspecs, boundspec);
		partoids = lappend_oid(partoids, inhrelid);
		ReleaseSysCache(tuple);
	}

	nparts = list_length(partoids);

	if (nparts > 0)
	{
		oids = (Oid *) palloc(nparts * sizeof(Oid));
		i = 0;
		foreach(cell, partoids)
			oids[i++] = lfirst_oid(cell);

		/* Convert from node to the internal representation */
		if (key->strategy == PARTITION_STRATEGY_LIST)
		{
			List	   *non_null_values = NIL;

			/*
			 * Create a unified list of non-null values across all partitions.
			 */
			i = 0;
			found_null = false;
			null_index = -1;
			foreach(cell, boundspecs)
			{
				ListCell   *c;
				PartitionBoundSpec *spec = lfirst(cell);

				if (spec->strategy != PARTITION_STRATEGY_LIST)
					elog(ERROR, "invalid strategy in partition bound spec");

				foreach(c, spec->listdatums)
				{
					Const	   *val = lfirst(c);

					if (!val->constisnull)
					{
						PartitionListValue *list_value;

						list_value = (PartitionListValue *)
							palloc0(sizeof(PartitionListValue));
						list_value->index = i;
						list_value->value = val->constvalue;
						non_null_values = lappend(non_null_values,
												  list_value);
					}
					else
					{
						/*
						 * Never put a null into the values array, flag
						 * instead for the code further down below where we
						 * construct the actual relcache struct.
						 */
						if (found_null)
							elog(ERROR, "found null more than once");
						found_null = true;
						null_index = i;
					}
				}

				i++;
			}

			ndatums = list_length(non_null_values);

			/*
			 * Collect all list values in one array. Alongside the value, we
			 * also save the index of partition the value comes from.  The
			 * structs built above are referenced directly; there is no need
			 * to allocate a second copy of each.
			 */
			all_values = (PartitionListValue **) palloc(ndatums *
											   sizeof(PartitionListValue *));
			i = 0;
			foreach(cell, non_null_values)
				all_values[i++] = (PartitionListValue *) lfirst(cell);

			qsort_arg(all_values, ndatums, sizeof(PartitionListValue *),
					  qsort_partition_list_value_cmp, (void *) key);
		}
		else if (key->strategy == PARTITION_STRATEGY_RANGE)
		{
			int			j,
						k;
			PartitionRangeBound **all_bounds,
					   *prev;
			bool	   *distinct_indexes;

			all_bounds = (PartitionRangeBound **) palloc0(2 * nparts *
											  sizeof(PartitionRangeBound *));
			distinct_indexes = (bool *) palloc(2 * nparts * sizeof(bool));

			/*
			 * Create a unified list of range bounds across all the
			 * partitions.
			 */
			i = j = 0;
			foreach(cell, boundspecs)
			{
				PartitionBoundSpec *spec = lfirst(cell);
				PartitionRangeBound *lower,
						   *upper;

				if (spec->strategy != PARTITION_STRATEGY_RANGE)
					elog(ERROR, "invalid strategy in partition bound spec");

				lower = make_one_range_bound(key, i, spec->lowerdatums,
											 true);
				upper = make_one_range_bound(key, i, spec->upperdatums,
											 false);
				all_bounds[j] = lower;
				all_bounds[j + 1] = upper;
				j += 2;
				i++;
			}
			Assert(j == 2 * nparts);

			/* Sort all the bounds in ascending order */
			qsort_arg(all_bounds, 2 * nparts,
					  sizeof(PartitionRangeBound *),
					  qsort_partition_rbound_cmp,
					  (void *) key);

			/*
			 * Count the number of distinct bounds to allocate an array of
			 * that size.
			 */
			ndatums = 0;
			prev = NULL;
			for (i = 0; i < 2 * nparts; i++)
			{
				PartitionRangeBound *cur = all_bounds[i];
				bool		is_distinct = false;

				/* Is the current bound distinct from the previous one? */
				for (j = 0; j < key->partnatts; j++)
				{
					Datum		cmpval;

					if (prev == NULL)
					{
						is_distinct = true;
						break;
					}

					/*
					 * If either of them has infinite element, we can't equate
					 * them.  Even when both are infinite, they'd have
					 * opposite signs, because only one of cur and prev is a
					 * lower bound).
					 */
					if (cur->content[j] != RANGE_DATUM_FINITE ||
						prev->content[j] != RANGE_DATUM_FINITE)
					{
						is_distinct = true;
						break;
					}
					cmpval = FunctionCall2Coll(&key->partsupfunc[j],
											   key->partcollation[j],
											   cur->datums[j],
											   prev->datums[j]);
					if (DatumGetInt32(cmpval) != 0)
					{
						is_distinct = true;
						break;
					}
				}

				/*
				 * Count the current bound if it is distinct from the previous
				 * one.  Also, store if the index i contains a distinct bound
				 * that we'd like put in the relcache array.
				 */
				distinct_indexes[i] = is_distinct;
				if (is_distinct)
					ndatums++;

				prev = cur;
			}

			/*
			 * Finally save them in an array from where they will be copied
			 * into the relcache.
			 */
			rbounds = (PartitionRangeBound **) palloc(ndatums *
											  sizeof(PartitionRangeBound *));
			k = 0;
			for (i = 0; i < 2 * nparts; i++)
			{
				if (distinct_indexes[i])
					rbounds[k++] = all_bounds[i];
			}
			Assert(k == ndatums);
		}
		else
			elog(ERROR, "unexpected partition strategy: %d",
				 (int) key->strategy);
	}

	/* Now build the actual relcache partition descriptor */
	rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext,
										  RelationGetRelationName(rel),
										  ALLOCSET_DEFAULT_SIZES);
	oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);

	result = (PartitionDescData *) palloc0(sizeof(PartitionDescData));
	result->nparts = nparts;
	if (nparts > 0)
	{
		PartitionBoundInfo boundinfo;
		int		   *mapping;
		int			next_index = 0;

		result->oids = (Oid *) palloc0(nparts * sizeof(Oid));

		boundinfo = (PartitionBoundInfoData *)
			palloc0(sizeof(PartitionBoundInfoData));
		boundinfo->strategy = key->strategy;
		boundinfo->ndatums = ndatums;
		boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));

		/* Initialize mapping array with invalid values */
		mapping = (int *) palloc(sizeof(int) * nparts);
		for (i = 0; i < nparts; i++)
			mapping[i] = -1;

		switch (key->strategy)
		{
			case PARTITION_STRATEGY_LIST:
				{
					boundinfo->has_null = found_null;
					boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));

					/*
					 * Copy values.  Indexes of individual values are mapped
					 * to canonical values so that they match for any two list
					 * partitioned tables with same number of partitions and
					 * same lists per partition.  One way to canonicalize is
					 * to assign the index in all_values[] of the smallest
					 * value of each partition, as the index of all of the
					 * partition's values.
					 */
					for (i = 0; i < ndatums; i++)
					{
						boundinfo->datums[i] = (Datum *) palloc(sizeof(Datum));
						boundinfo->datums[i][0] = datumCopy(all_values[i]->value,
														key->parttypbyval[0],
														 key->parttyplen[0]);

						/* If the old index has no mapping, assign one */
						if (mapping[all_values[i]->index] == -1)
							mapping[all_values[i]->index] = next_index++;

						boundinfo->indexes[i] = mapping[all_values[i]->index];
					}

					/*
					 * If null-accepting partition has no mapped index yet,
					 * assign one.  This could happen if such partition
					 * accepts only null and hence not covered in the above
					 * loop which only handled non-null values.
					 */
					if (found_null)
					{
						Assert(null_index >= 0);
						if (mapping[null_index] == -1)
							mapping[null_index] = next_index++;
					}

					/* All partitions must now have a valid mapping */
					Assert(next_index == nparts);

					if (found_null)
						boundinfo->null_index = mapping[null_index];
					else
						boundinfo->null_index = -1;
					break;
				}

			case PARTITION_STRATEGY_RANGE:
				{
					boundinfo->content = (RangeDatumContent **) palloc(ndatums *
												sizeof(RangeDatumContent *));
					boundinfo->indexes = (int *) palloc((ndatums + 1) *
														sizeof(int));

					for (i = 0; i < ndatums; i++)
					{
						int			j;

						boundinfo->datums[i] = (Datum *) palloc(key->partnatts *
															  sizeof(Datum));
						boundinfo->content[i] = (RangeDatumContent *)
							palloc(key->partnatts *
								   sizeof(RangeDatumContent));
						for (j = 0; j < key->partnatts; j++)
						{
							/* Only finite bounds carry an actual datum */
							if (rbounds[i]->content[j] == RANGE_DATUM_FINITE)
								boundinfo->datums[i][j] =
									datumCopy(rbounds[i]->datums[j],
											  key->parttypbyval[j],
											  key->parttyplen[j]);
							/* Remember, we are storing the tri-state value. */
							boundinfo->content[i][j] = rbounds[i]->content[j];
						}

						/*
						 * There is no mapping for invalid indexes.
						 *
						 * Any lower bounds in the rbounds array have invalid
						 * indexes assigned, because the values between the
						 * previous bound (if there is one) and this (lower)
						 * bound are not part of the range of any existing
						 * partition.
						 */
						if (rbounds[i]->lower)
							boundinfo->indexes[i] = -1;
						else
						{
							int			orig_index = rbounds[i]->index;

							/* If the old index has no mapping, assign one */
							if (mapping[orig_index] == -1)
								mapping[orig_index] = next_index++;

							boundinfo->indexes[i] = mapping[orig_index];
						}
					}

					/* Extra trailing entry: nothing lies above the last bound */
					boundinfo->indexes[i] = -1;
					break;
				}

			default:
				elog(ERROR, "unexpected partition strategy: %d",
					 (int) key->strategy);
		}

		result->boundinfo = boundinfo;

		/*
		 * Now assign OIDs from the original array into mapped indexes of the
		 * result array.  Order of OIDs in the former is defined by the
		 * catalog scan that retrieved them, whereas that in the latter is
		 * defined by canonicalized representation of the list values or the
		 * range bounds.
		 */
		for (i = 0; i < nparts; i++)
			result->oids[mapping[i]] = oids[i];
		pfree(mapping);
	}

	MemoryContextSwitchTo(oldcxt);
	rel->rd_partdesc = result;
}

/*
 * Are two partition bound collections logically equal?
 *
 * Used in the keep logic of relcache.c (ie, in RelationClearRelation()).
 * This is also useful when b1 and b2 are bound collections of two separate
 * relations, respectively, because PartitionBoundInfo is a canonical
 * representation of partition bounds.
 *
 * key supplies the number of key columns and the comparison support
 * functions/collations used to compare the individual datums.  Returns true
 * only if strategy, number of datums, null handling, every datum (including
 * its finite/infinite classification) and every partition index match.
 */
bool
partition_bounds_equal(PartitionKey key,
					   PartitionBoundInfo b1, PartitionBoundInfo b2)
{
	int			i;

	if (b1->strategy != b2->strategy)
		return false;

	if (b1->ndatums != b2->ndatums)
		return false;

	if (b1->has_null != b2->has_null)
		return false;

	if (b1->null_index != b2->null_index)
		return false;

	for (i = 0; i < b1->ndatums; i++)
	{
		int			j;

		for (j = 0; j < key->partnatts; j++)
		{
			int32		cmpval;

			/* Range partitions can have infinite datums */
			if (b1->content != NULL)
			{
				/*
				 * A finite bound always differs from an infinite one, and
				 * the two kinds of infinity differ from each other.
				 */
				if (b1->content[i][j] != b2->content[i][j])
					return false;

				/*
				 * No datum is stored for an infinite bound, so the datums
				 * slot is not initialized and must not be handed to the
				 * comparison function.  Equal content already settles it.
				 */
				if (b1->content[i][j] != RANGE_DATUM_FINITE)
					continue;
			}

			cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[j],
													 key->partcollation[j],
													 b1->datums[i][j],
													 b2->datums[i][j]));
			if (cmpval != 0)
				return false;
		}

		if (b1->indexes[i] != b2->indexes[i])
			return false;
	}

	/* There are ndatums+1 indexes in case of range partitions */
	if (key->strategy == PARTITION_STRATEGY_RANGE &&
		b1->indexes[i] != b2->indexes[i])
		return false;

	return true;
}

/*
 * check_new_partition_bound
 *
 * Checks if the new partition's bound overlaps any of the existing partitions
 * of parent.  Also performs additional checks as necessary per strategy.
 *
 * relname is the name of the new partition and is used only in the error
 * message; bound is its PartitionBoundSpec.  Returns normally if the bound is
 * acceptable; otherwise raises an error, either because a range bound
 * describes an empty range or because the new bound overlaps an existing
 * partition.
 */
void
check_new_partition_bound(char *relname, Relation parent, Node *bound)
{
	PartitionBoundSpec *spec = (PartitionBoundSpec *) bound;
	PartitionKey key = RelationGetPartitionKey(parent);
	PartitionDesc partdesc = RelationGetPartitionDesc(parent);
	ParseState *pstate = make_parsestate(NULL);
	int			with = -1;		/* index of the partition overlapped with */
	bool		overlap = false;

	switch (key->strategy)
	{
		case PARTITION_STRATEGY_LIST:
			{
				Assert(spec->strategy == PARTITION_STRATEGY_LIST);

				if (partdesc->nparts > 0)
				{
					PartitionBoundInfo boundinfo = partdesc->boundinfo;
					ListCell   *cell;

					Assert(boundinfo &&
						   boundinfo->strategy == PARTITION_STRATEGY_LIST &&
						   (boundinfo->ndatums > 0 || boundinfo->has_null));

					/* Any value already accepted elsewhere is an overlap */
					foreach(cell, spec->listdatums)
					{
						Const	   *val = lfirst(cell);

						if (!val->constisnull)
						{
							int			offset;
							bool		equal;

							offset = partition_bound_bsearch(key, boundinfo,
															 &val->constvalue,
															 true, &equal);
							if (offset >= 0 && equal)
							{
								overlap = true;
								with = boundinfo->indexes[offset];
								break;
							}
						}
						else if (boundinfo->has_null)
						{
							overlap = true;
							with = boundinfo->null_index;
							break;
						}
					}
				}

				break;
			}

		case PARTITION_STRATEGY_RANGE:
			{
				PartitionRangeBound *lower,
						   *upper;

				Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
				lower = make_one_range_bound(key, -1, spec->lowerdatums, true);
				upper = make_one_range_bound(key, -1, spec->upperdatums, false);

				/*
				 * First check if the resulting range would be empty with
				 * specified lower and upper bounds
				 */
				if (partition_rbound_cmp(key, lower->datums, lower->content, true,
										 upper) >= 0)
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
					errmsg("cannot create range partition with empty range"),
							 parser_errposition(pstate, spec->location)));

				if (partdesc->nparts > 0)
				{
					PartitionBoundInfo boundinfo = partdesc->boundinfo;
					int			off1,
								off2;
					bool		equal = false;

					Assert(boundinfo && boundinfo->ndatums > 0 &&
						   boundinfo->strategy == PARTITION_STRATEGY_RANGE);

					/*
					 * Find the greatest index of a range bound that is less
					 * than or equal with the new lower bound.
					 */
					off1 = partition_bound_bsearch(key, boundinfo, lower, true,
												   &equal);

					/*
					 * If equal has been set to true, that means the new lower
					 * bound is found to be equal with the bound at off1,
					 * which clearly means an overlap with the partition at
					 * index off1+1).
					 *
					 * Otherwise, check if there is a "gap" that could be
					 * occupied by the new partition.  In case of a gap, the
					 * new upper bound should not cross past the upper
					 * boundary of the gap, that is, off2 == off1 should be
					 * true.
					 */
					if (!equal && boundinfo->indexes[off1 + 1] < 0)
					{
						off2 = partition_bound_bsearch(key, boundinfo, upper,
													   true, &equal);

						if (equal || off1 != off2)
						{
							overlap = true;

							/*
							 * The bound at off2 could be the lower bound of
							 * the partition with which the new partition
							 * overlaps; such a bound has no valid index of
							 * its own, so take the index stored with that
							 * partition's upper bound (at off2 + 1).  If the
							 * bound at off2 is instead an upper bound, its
							 * own index names the overlapped partition; the
							 * entry at off2 + 1 may then be -1 or belong to
							 * a different partition.
							 */
							if (boundinfo->indexes[off2] < 0)
								with = boundinfo->indexes[off2 + 1];
							else
								with = boundinfo->indexes[off2];
						}
					}
					else
					{
						overlap = true;
						with = boundinfo->indexes[off1 + 1];
					}
				}

				break;
			}

		default:
			elog(ERROR, "unexpected partition strategy: %d",
				 (int) key->strategy);
	}

	if (overlap)
	{
		Assert(with >= 0);
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
				 errmsg("partition \"%s\" would overlap partition \"%s\"",
						relname, get_rel_name(partdesc->oids[with])),
				 parser_errposition(pstate, spec->location)));
	}
}

/*
 * get_partition_parent
 *
 * Returns inheritance parent of a partition by scanning pg_inherits
 *
 * The scan looks for the pg_inherits row with inhrelid = relid and
 * inhseqno = 1.  An error is raised if no such row exists.
 *
 * Note: Because this function assumes that the relation whose OID is passed
 * as an argument will have precisely one parent, it should only be called
 * when it is known that the relation is a partition.
 */
Oid
get_partition_parent(Oid relid)
{
	Form_pg_inherits form;
	Relation	catalogRelation;
	SysScanDesc scan;
	ScanKeyData key[2];
	HeapTuple	tuple;
	Oid			result;

	catalogRelation = heap_open(InheritsRelationId, AccessShareLock);

	ScanKeyInit(&key[0],
				Anum_pg_inherits_inhrelid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(relid));
	ScanKeyInit(&key[1],
				Anum_pg_inherits_inhseqno,
				BTEqualStrategyNumber, F_INT4EQ,
				Int32GetDatum(1));

	scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, true,
							  NULL, 2, key);

	tuple = systable_getnext(scan);

	/*
	 * Report a missing row as an error rather than relying on an assertion,
	 * which would leave a NULL dereference in non-assert builds.
	 */
	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "could not find tuple for parent of relation %u", relid);

	/* Fetch the result before ending the scan that owns the tuple */
	form = (Form_pg_inherits) GETSTRUCT(tuple);
	result = form->inhparent;

	systable_endscan(scan);
	heap_close(catalogRelation, AccessShareLock);

	return result;
}

/*
 * get_qual_from_partbound
 *		Build the partition constraint implied by a partition bound
 *
 * 'bound' is the parser node (a PartitionBoundSpec) describing the bound of
 * partition 'rel' under 'parent'.  The result is a list of expressions whose
 * Vars carry the attribute numbers of 'rel'.
 */
List *
get_qual_from_partbound(Relation rel, Relation parent, Node *bound)
{
	PartitionBoundSpec *boundspec = (PartitionBoundSpec *) bound;
	PartitionKey partkey = RelationGetPartitionKey(parent);
	TupleDesc	pdesc = RelationGetDescr(parent);
	List	   *qual = NIL;
	AttrNumber *attmap;
	bool		whole_row;
	int			idx;

	Assert(partkey != NULL);

	/* Generate the strategy-specific expressions */
	if (partkey->strategy == PARTITION_STRATEGY_LIST)
	{
		Assert(boundspec->strategy == PARTITION_STRATEGY_LIST);
		qual = get_qual_for_list(partkey, boundspec);
	}
	else if (partkey->strategy == PARTITION_STRATEGY_RANGE)
	{
		Assert(boundspec->strategy == PARTITION_STRATEGY_RANGE);
		qual = get_qual_for_range(partkey, boundspec);
	}
	else
		elog(ERROR, "unexpected partition strategy: %d",
			 (int) partkey->strategy);

	/*
	 * The expressions built above reference columns by the parent's
	 * attribute numbers, since that is what the partition key records.  A
	 * partition may number the same columns differently, so build a
	 * parent-attno -> partition-attno map (matching columns by name) and
	 * rewrite the Vars with it.  Entries for dropped parent columns are left
	 * at zero.
	 */
	attmap = (AttrNumber *) palloc0(pdesc->natts * sizeof(AttrNumber));
	for (idx = 0; idx < pdesc->natts; idx++)
	{
		Form_pg_attribute att = pdesc->attrs[idx];

		if (!att->attisdropped)
			attmap[idx] = get_attnum(RelationGetRelid(rel),
									 NameStr(att->attname));
	}

	qual = (List *) map_variable_attnos((Node *) qual,
										1, 0,
										attmap,
										pdesc->natts,
										&whole_row);

	/* there can never be a whole-row reference here */
	if (whole_row)
		elog(ERROR, "unexpected whole-row reference found in partition key");

	return qual;
}

/*
 * RelationGetPartitionQual
 *
 * Returns the list of partition quals of rel, or NIL if rel is not a
 * partition.  'recurse' is passed through to generate_partition_qual().
 */
List *
RelationGetPartitionQual(Relation rel, bool recurse)
{
	/* Only partitions have a partition constraint */
	return rel->rd_rel->relispartition
		? generate_partition_qual(rel, recurse)
		: NIL;
}

/*
 * Turn an array of OIDs with N elements into a list
 *
 * Appends (arr)[0] .. (arr)[N - 1] to 'list', in that order, using
 * lappend_oid().  'list' must be an lvalue; it is reassigned on every
 * append.  The loop counter has a deliberately unusual name so that it
 * cannot capture an identifier (such as a caller's own "i") appearing in
 * any of the macro arguments.
 */
#define OID_ARRAY_TO_LIST(arr, N, list) \
	do\
	{\
		int		oid_array_to_list_idx_;\
		for (oid_array_to_list_idx_ = 0;\
			 oid_array_to_list_idx_ < (N);\
			 oid_array_to_list_idx_++)\
			(list) = lappend_oid((list), (arr)[oid_array_to_list_idx_]);\
	} while(0)

/*
 * RelationGetPartitionDispatchInfo
 *		Returns information necessary to route tuples down a partition tree
 *
 * All the partitions will be locked with lockmode, unless it is NoLock.
 * A list of the OIDs of all the leaf partition of rel is returned in
 * *leaf_part_oids.
 *
 * The return value is an array of *num_parted PartitionDispatch objects, one
 * for rel itself and one for every partitioned table found below it; element
 * 0 describes rel.  Partitioned tables opened here are left open (see the
 * comment in the first loop); rel itself is not opened by this function.
 */
PartitionDispatch *
RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
								 int *num_parted, List **leaf_part_oids)
{
	PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
	PartitionDispatchData **pd;
	List	   *all_parts = NIL,
			   *parted_rels;
	ListCell   *lc;
	int			i,
				k,
				offset;

	/*
	 * Lock partitions and make a list of the partitioned ones to prepare
	 * their PartitionDispatch objects below.
	 *
	 * Cannot use find_all_inheritors() here, because then the order of OIDs
	 * in parted_rels list would be unknown, which does not help, because we
	 * assign indexes within individual PartitionDispatch in an order that
	 * is predetermined (determined by the order of OIDs in individual
	 * partition descriptors).
	 *
	 * Note that all_parts grows while it is being iterated over: the
	 * children of each partitioned table found are appended to its end, so
	 * the whole tree is visited level by level.
	 */
	*num_parted = 1;
	parted_rels = list_make1(rel);
	OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
	foreach(lc, all_parts)
	{
		Relation	partrel = heap_open(lfirst_oid(lc), lockmode);
		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);

		/*
		 * If this partition is a partitioned table, add its children to the
		 * end of the list, so that they are processed as well.
		 */
		if (partdesc)
		{
			(*num_parted)++;
			parted_rels = lappend(parted_rels, partrel);
			OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
		}
		else
			heap_close(partrel, NoLock);	/* leaf: close it again right away */

		/*
		 * We keep the partitioned ones open until we're done using the
		 * information being collected here (for example, see
		 * ExecEndModifyTable).
		 */
	}

	/*
	 * We want to create two arrays - one for leaf partitions and another for
	 * partitioned tables (including the root table and internal partitions).
	 * While we only create the latter here, leaf partition array of suitable
	 * objects (such as, ResultRelInfo) is created by the caller using the
	 * list of OIDs we return.  Indexes into these arrays get assigned in a
	 * breadth-first manner, whereby partitions of any given level are placed
	 * consecutively in the respective arrays.
	 */
	pd = (PartitionDispatchData **) palloc(*num_parted *
										   sizeof(PartitionDispatchData *));
	*leaf_part_oids = NIL;

	/*
	 * i: position in pd[] of the table being processed; k: next index to hand
	 * out to a leaf partition; offset: number of partitioned children seen
	 * for all tables processed before the current one.
	 */
	i = k = offset = 0;
	foreach(lc, parted_rels)
	{
		Relation	partrel = lfirst(lc);
		PartitionKey partkey = RelationGetPartitionKey(partrel);
		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
		int			j,
					m;

		pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
		pd[i]->reldesc = partrel;
		pd[i]->key = partkey;
		pd[i]->keystate = NIL;
		pd[i]->partdesc = partdesc;
		pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));

		/*
		 * Indexes corresponding to the internal partitions are multiplied by
		 * -1 to distinguish them from those of leaf partitions.  Encountering
		 * an index >= 0 means we found a leaf partition, which is immediately
		 * returned as the partition we are looking for.  A negative index
		 * means we found a partitioned table, whose PartitionDispatch object
		 * is located at the above index multiplied back by -1.  Using the
		 * PartitionDispatch object, search is continued further down the
		 * partition tree.
		 */
		m = 0;					/* partitioned children of this table so far */
		for (j = 0; j < partdesc->nparts; j++)
		{
			Oid			partrelid = partdesc->oids[j];

			if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
			{
				*leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
				pd[i]->indexes[j] = k++;
			}
			else
			{
				/*
				 * offset denotes the number of partitioned tables of upper
				 * levels including those of the current level.  Any partition
				 * of this table must belong to the next level and hence will
				 * be placed after the last partitioned table of this level.
				 *
				 * The leading 1 accounts for the root table at pd[0].
				 */
				pd[i]->indexes[j] = -(1 + offset + m);
				m++;
			}
		}
		i++;

		/*
		 * This counts the number of partitioned tables at upper levels
		 * including those of the current level.
		 */
		offset += m;
	}

	return pd;
}

/* Module-local functions */

/*
 * get_qual_for_list
 *
 * Returns a list of expressions to use as a list partition's constraint.
 *
 * If the partition's list contains no NULL, the result is the two-element
 * (implicitly AND'ed) list: keycol IS NOT NULL, keycol = ANY (values).
 * If it does contain NULL, the result is the single expression
 * keycol IS NULL OR keycol = ANY (non-null values).
 *
 * The bound spec passed in is not modified; the array expression gets its
 * own copies of the non-null values.
 */
static List *
get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec)
{
	List	   *result;
	List	   *elems = NIL;
	ArrayExpr  *arr;
	ScalarArrayOpExpr *opexpr;
	NullTest   *nulltest;
	ListCell   *cell;
	Node	   *keyCol;
	Oid			operoid;
	bool		need_relabel,
				list_has_null = false;

	/* Left operand is either a simple Var or arbitrary expression */
	if (key->partattrs[0] != 0)
		keyCol = (Node *) makeVar(1,
								  key->partattrs[0],
								  key->parttypid[0],
								  key->parttypmod[0],
								  key->parttypcoll[0],
								  0);
	else
		keyCol = (Node *) copyObject(linitial(key->partexprs));

	/*
	 * Collect the non-null values into a list of our own.  A NULL in the
	 * list is only noted here and handled separately below.  Deleting cells
	 * from spec->listdatums instead would scribble on the caller's bound
	 * spec, which may still be needed with its NULL intact.
	 */
	foreach(cell, spec->listdatums)
	{
		Const	   *val = (Const *) lfirst(cell);

		if (val->constisnull)
			list_has_null = true;
		else
			elems = lappend(elems, copyObject(val));
	}

	/*
	 * Gin up a null test on the key column: col IS NULL, to be OR'd with the
	 * ANY expression, if the list accepts NULL; otherwise col IS NOT NULL, to
	 * be AND'd with it.  Note that this uses the key column as it is before
	 * any relabeling done below.
	 */
	nulltest = makeNode(NullTest);
	nulltest->arg = (Expr *) keyCol;
	nulltest->nulltesttype = list_has_null ? IS_NULL : IS_NOT_NULL;
	nulltest->argisrow = false;
	nulltest->location = -1;

	/* Right operand is an ArrayExpr containing this partition's values */
	arr = makeNode(ArrayExpr);
	arr->array_typeid = !type_is_array(key->parttypid[0])
		? get_array_type(key->parttypid[0])
		: key->parttypid[0];
	arr->array_collid = key->parttypcoll[0];
	arr->element_typeid = key->parttypid[0];
	arr->elements = elems;
	arr->multidims = false;
	arr->location = -1;

	/* Get the correct btree equality operator */
	operoid = get_partition_operator(key, 0, BTEqualStrategyNumber,
									 &need_relabel);
	if (need_relabel || key->partcollation[0] != key->parttypcoll[0])
		keyCol = (Node *) makeRelabelType((Expr *) keyCol,
										  key->partopcintype[0],
										  -1,
										  key->partcollation[0],
										  COERCE_EXPLICIT_CAST);

	/* Build leftop = ANY (rightop) */
	opexpr = makeNode(ScalarArrayOpExpr);
	opexpr->opno = operoid;
	opexpr->opfuncid = get_opcode(operoid);
	opexpr->useOr = true;
	opexpr->inputcollid = key->partcollation[0];
	opexpr->args = list_make2(keyCol, arr);
	opexpr->location = -1;

	if (list_has_null)
	{
		Expr	   *or;

		or = makeBoolExpr(OR_EXPR, list_make2(nulltest, opexpr), -1);
		result = list_make1(or);
	}
	else
		result = list_make2(nulltest, opexpr);

	return result;
}

/*
 * get_qual_for_range
 *
 * Get a list of OpExpr's to use as a range partition's constraint.
 */
static List *
get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec)
{
	List	   *result = NIL;
	ListCell   *cell1,
			   *cell2,
			   *partexprs_item;
	int			i;

	/*
	 * Iterate over columns of the key, emitting an OpExpr for each using the
	 * corresponding lower and upper datums as constant operands.
	 */
	i = 0;
	partexprs_item = list_head(key->partexprs);
	forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
	{
		PartitionRangeDatum *ldatum = lfirst(cell1),
				   *udatum = lfirst(cell2);
		Node	   *keyCol;
		Const	   *lower_val = NULL,
				   *upper_val = NULL;
		EState	   *estate;
		MemoryContext oldcxt;
		Expr	   *test_expr;
		ExprState  *test_exprstate;
		Datum		test_result;
		bool		isNull;
		bool		need_relabel = false;
		Oid			operoid;
		NullTest   *nulltest;

		/* Left operand */
		if (key->partattrs[i] != 0)
		{
			keyCol = (Node *) makeVar(1,
									  key->partattrs[i],
									  key->parttypid[i],
									  key->parttypmod[i],
									  key->parttypcoll[i],
									  0);
		}
		else
		{
			keyCol = (Node *) copyObject(lfirst(partexprs_item));
			partexprs_item = lnext(partexprs_item);
		}

		/*
		 * Emit a IS NOT NULL expression for non-Var keys, because whereas
		 * simple attributes are covered by NOT NULL constraints, expression
		 * keys are still nullable which is not acceptable in case of range
		 * partitioning.
		 */
		if (!IsA(keyCol, Var))
		{
			nulltest = makeNode(NullTest);
			nulltest->arg = (Expr *) keyCol;
			nulltest->nulltesttype = IS_NOT_NULL;
			nulltest->argisrow = false;
			nulltest->location = -1;
			result = lappend(result, nulltest);
		}

		/*
		 * Stop at this column if either of lower or upper datum is infinite,
		 * but do emit an OpExpr for the non-infinite datum.
		 */
		if (!ldatum->infinite)
			lower_val = (Const *) ldatum->value;
		if (!udatum->infinite)
			upper_val = (Const *) udatum->value;

		/*
		 * If lower_val and upper_val are both finite and happen to be equal,
		 * emit only (keyCol = lower_val) for this column, because all rows in
		 * this partition could only ever contain this value (ie, lower_val)
		 * in the current partitioning column.  We must consider further
		 * columns because the above condition does not fully constrain the
		 * rows of this partition.
		 */
		if (lower_val && upper_val)
		{
			/* Get the correct btree equality operator for the test */
			operoid = get_partition_operator(key, i, BTEqualStrategyNumber,
											 &need_relabel);

			/* Create the test expression */
			estate = CreateExecutorState();
			oldcxt = MemoryContextSwitchTo(estate->es_query_cxt);
			test_expr = make_opclause(operoid,
									  BOOLOID,
									  false,
									  (Expr *) lower_val,
									  (Expr *) upper_val,
									  InvalidOid,
									  key->partcollation[i]);
			fix_opfuncids((Node *) test_expr);
			test_exprstate = ExecInitExpr(test_expr, NULL);
			test_result = ExecEvalExprSwitchContext(test_exprstate,
											  GetPerTupleExprContext(estate),
													&isNull, NULL);
			MemoryContextSwitchTo(oldcxt);
			FreeExecutorState(estate);

			if (DatumGetBool(test_result))
			{
				/* This can never be, but it's better to make sure */
				if (i == key->partnatts - 1)
					elog(ERROR, "invalid range bound specification");

				if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
					keyCol = (Node *) makeRelabelType((Expr *) keyCol,
													  key->partopcintype[i],
													  -1,
													  key->partcollation[i],
													  COERCE_EXPLICIT_CAST);
				result = lappend(result,
								 make_opclause(operoid,
											   BOOLOID,
											   false,
											   (Expr *) keyCol,
											   (Expr *) lower_val,
											   InvalidOid,
											   key->partcollation[i]));

				/* Go over to consider the next column. */
				i++;
				continue;
			}
		}

		/*
		 * We can say here that lower_val != upper_val.  Emit expressions
		 * (keyCol >= lower_val) and (keyCol < upper_val), then stop.
		 */
		if (lower_val)
		{
			operoid = get_partition_operator(key, i,
											 BTGreaterEqualStrategyNumber,
											 &need_relabel);

			if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
				keyCol = (Node *) makeRelabelType((Expr *) keyCol,
												  key->partopcintype[i],
												  -1,
												  key->partcollation[i],
												  COERCE_EXPLICIT_CAST);
			result = lappend(result,
							 make_opclause(operoid,
										   BOOLOID,
										   false,
										   (Expr *) keyCol,
										   (Expr *) lower_val,
										   InvalidOid,
										   key->partcollation[i]));
		}

		if (upper_val)
		{
			operoid = get_partition_operator(key, i,
											 BTLessStrategyNumber,
											 &need_relabel);

			if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
				keyCol = (Node *) makeRelabelType((Expr *) keyCol,
												  key->partopcintype[i],
												  -1,
												  key->partcollation[i],
												  COERCE_EXPLICIT_CAST);

			result = lappend(result,
							 make_opclause(operoid,
										   BOOLOID,
										   false,
										   (Expr *) keyCol,
										   (Expr *) upper_val,
										   InvalidOid,
										   key->partcollation[i]));
		}

		/*
		 * We can stop at this column, because we would not have checked the
		 * next column when routing a given row into this partition.
		 */
		break;
	}

	return result;
}

/*
 * get_partition_operator
 *
 * Look up the operator implementing 'strategy' for partition key column
 * 'col' and return its OID.
 *
 * *need_relabel is set to false when an operator taking the column's own
 * type on both sides was found, and to true when the lookup had to fall back
 * to the operator class's declared input type.  Raises an error if neither
 * lookup yields a valid operator.
 */
static Oid
get_partition_operator(PartitionKey key, int col, StrategyNumber strategy,
					   bool *need_relabel)
{
	Oid			opfamily = key->partopfamily[col];
	Oid			coltype = key->parttypid[col];
	Oid			found;

	/*
	 * Preferred choice: an operator of the requested strategy in the
	 * column's partitioning operator family whose left and right input types
	 * are both the column's type.
	 */
	found = get_opfamily_member(opfamily, coltype, coltype, strategy);
	*need_relabel = !OidIsValid(found);

	if (*need_relabel)
	{
		Oid			opcintype = key->partopcintype[col];

		/*
		 * No such operator, so fall back on the one in the same operator
		 * family that takes the operator class's declared input type.  That
		 * is OK, because the column's type is known to be binary-coercible
		 * with the operator class input type (otherwise, the operator class
		 * in question would not have been accepted as the partitioning
		 * operator class).  The caller is told, via *need_relabel, to wrap
		 * the non-Const expression with a RelabelType node to denote the
		 * implicit coercion, which ensures that the resulting expression
		 * structurally matches similarly processed expressions within the
		 * optimizer.
		 */
		found = get_opfamily_member(opfamily, opcintype, opcintype, strategy);

		if (!OidIsValid(found))
			elog(ERROR, "could not find operator for partitioning");
	}

	return found;
}

/*
 * generate_partition_qual
 *
 * Generate partition predicate from rel's partition bound expression
 *
 * If 'recurse' is true and rel's parent is itself a partition, the parent's
 * predicate (generated recursively) is prepended to rel's own.
 *
 * rel's own predicate is cached in rel->rd_partcheck, allocated in
 * CacheMemoryContext so that it survives as long as the relcache entry.  But
 * we should be running in a less long-lived working context.  To avoid
 * leaking cache memory if this routine fails partway through, we build in
 * working memory and then copy the completed structure into cache memory.
 * Callers always receive a copy, never the cached list itself.
 *
 * The parent is opened with AccessShareLock, and that lock is retained when
 * the parent is closed, on both the cached and the uncached path.
 */
static List *
generate_partition_qual(Relation rel, bool recurse)
{
	HeapTuple	tuple;
	MemoryContext oldcxt;
	Datum		boundDatum;
	bool		isnull;
	Node	   *bound;
	List	   *my_qual = NIL,
			   *result = NIL;
	Relation	parent;

	/* Guard against stack overflow due to overly deep partition tree */
	check_stack_depth();

	/* Grab at least an AccessShareLock on the parent table */
	parent = heap_open(get_partition_parent(RelationGetRelid(rel)),
					   AccessShareLock);

	/* Quick copy */
	if (rel->rd_partcheck)
	{
		if (parent->rd_rel->relispartition && recurse)
			result = list_concat(generate_partition_qual(parent, true),
								 copyObject(rel->rd_partcheck));
		else
			result = copyObject(rel->rd_partcheck);

		/* Keep the parent locked until commit, same as the slow path */
		heap_close(parent, NoLock);
		return result;
	}

	/* Get pg_class.relpartbound */
	if (!rel->rd_rel->relispartition)	/* should not happen */
		elog(ERROR, "relation \"%s\" has relispartition = false",
			 RelationGetRelationName(rel));
	tuple = SearchSysCache1(RELOID,
							ObjectIdGetDatum(RelationGetRelid(rel)));
	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "cache lookup failed for relation %u",
			 RelationGetRelid(rel));
	boundDatum = SysCacheGetAttr(RELOID, tuple,
								 Anum_pg_class_relpartbound,
								 &isnull);
	if (isnull)					/* should not happen */
		elog(ERROR, "relation \"%s\" has relpartbound = null",
			 RelationGetRelationName(rel));
	bound = stringToNode(TextDatumGetCString(boundDatum));
	ReleaseSysCache(tuple);

	my_qual = get_qual_from_partbound(rel, parent, bound);

	/* If requested, add parent's quals to the list (if any) */
	if (parent->rd_rel->relispartition && recurse)
	{
		List	   *parent_check;

		parent_check = generate_partition_qual(parent, true);
		result = list_concat(parent_check, my_qual);
	}
	else
		result = my_qual;

	/* Save a copy of my_qual (not the combined list) in the relcache */
	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
	rel->rd_partcheck = copyObject(my_qual);
	MemoryContextSwitchTo(oldcxt);

	/* Keep the parent locked until commit */
	heap_close(parent, NoLock);

	return result;
}

/* ----------------
 *		FormPartitionKeyDatum
 *			Fill values[] and isnull[] with the partition key of a tuple,
 *			one entry per partition key column.
 *
 *	pd				Partition dispatch object of the partitioned table
 *	slot			Heap tuple from which to extract partition key
 *	estate			executor state for evaluating any partition key
 *					expressions (must be non-NULL)
 *	values			Array of partition key Datums (output area)
 *	isnull			Array of is-null indicators (output area)
 *
 * the ecxt_scantuple slot of estate's per-tuple expr context must point to
 * the heap tuple passed in.
 *
 * Expression evaluation state is built lazily on the first call and kept in
 * pd->keystate for later calls.
 * ----------------
 */
static void
FormPartitionKeyDatum(PartitionDispatch pd,
					  TupleTableSlot *slot,
					  EState *estate,
					  Datum *values,
					  bool *isnull)
{
	PartitionKey key = pd->key;
	ListCell   *next_expr;
	int			keyno;

	/* First time through, set up expression evaluation state */
	if (key->partexprs != NIL && pd->keystate == NIL)
	{
		/* Check caller has set up context correctly */
		Assert(estate != NULL &&
			   GetPerTupleExprContext(estate)->ecxt_scantuple == slot);

		pd->keystate = (List *) ExecPrepareExpr((Expr *) key->partexprs,
												estate);
	}

	next_expr = list_head(pd->keystate);
	for (keyno = 0; keyno < key->partnatts; keyno++)
	{
		AttrNumber	attno = key->partattrs[keyno];

		if (attno == 0)
		{
			/* Expression key: evaluate the next prepared expression */
			ExprState  *exprstate;

			if (next_expr == NULL)
				elog(ERROR, "wrong number of partition key expressions");

			exprstate = (ExprState *) lfirst(next_expr);
			values[keyno] = ExecEvalExprSwitchContext(exprstate,
											  GetPerTupleExprContext(estate),
													  &isnull[keyno],
													  NULL);
			next_expr = lnext(next_expr);
		}
		else
		{
			/* Plain column; get the value directly from the heap tuple */
			values[keyno] = slot_getattr(slot, attno, &isnull[keyno]);
		}
	}

	/* Every prepared expression must have been consumed by some column */
	if (next_expr != NULL)
		elog(ERROR, "wrong number of partition key expressions");
}

/*
 * get_partition_for_tuple
 *		Finds a leaf partition for tuple contained in *slot
 *
 * Starting from pd[0], the tuple's partition key is computed for the current
 * partitioned table and looked up in its bounds.  A negative entry in the
 * table's indexes[] array redirects the search to pd[-entry]; a non-negative
 * entry ends the search and is returned.
 *
 * Returned value is the sequence number of the leaf partition thus found,
 * or -1 if no leaf partition is found for the tuple.  *failed_at is set
 * to the OID of the partitioned table whose partition was not found in
 * the latter case.
 *
 * An error is raised if any range partition key column of the tuple is null.
 * A null list partition key is routed to the null-accepting partition if
 * there is one; otherwise no partition is found for it.
 */
int
get_partition_for_tuple(PartitionDispatch *pd,
						TupleTableSlot *slot,
						EState *estate,
						Oid *failed_at)
{
	PartitionDispatch parent;
	Datum		values[PARTITION_MAX_KEYS];
	bool		isnull[PARTITION_MAX_KEYS];
	int			cur_offset,
				cur_index;
	int			i;

	/* start with the root partitioned table */
	parent = pd[0];
	while (true)
	{
		PartitionKey key = parent->key;
		PartitionDesc partdesc = parent->partdesc;

		/* Quick exit */
		if (partdesc->nparts == 0)
		{
			*failed_at = RelationGetRelid(parent->reldesc);
			return -1;
		}

		/* Extract partition key from tuple */
		FormPartitionKeyDatum(parent, slot, estate, values, isnull);

		if (key->strategy == PARTITION_STRATEGY_RANGE)
		{
			/* Disallow nulls in the range partition key of the tuple */
			for (i = 0; i < key->partnatts; i++)
				if (isnull[i])
					ereport(ERROR,
							(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
						errmsg("range partition key of row contains null")));
		}

		/*
		 * A null partition key is only acceptable if a null-accepting list
		 * partition exists.  It must never reach the binary search below,
		 * because the comparison support function would then be handed a
		 * null datum; leave cur_index at -1 in that case so that the failure
		 * is reported to the caller.
		 */
		cur_index = -1;
		if (isnull[0] && partdesc->boundinfo->has_null)
		{
			/* Tuple maps to the null-accepting list partition */
			cur_index = partdesc->boundinfo->null_index;
		}
		else if (!isnull[0])
		{
			/* Else bsearch in partdesc->boundinfo */
			bool		equal = false;

			cur_offset = partition_bound_bsearch(key, partdesc->boundinfo,
												 values, false, &equal);
			switch (key->strategy)
			{
				case PARTITION_STRATEGY_LIST:
					/* Only an exact match selects a list partition */
					if (cur_offset >= 0 && equal)
						cur_index = partdesc->boundinfo->indexes[cur_offset];
					else
						cur_index = -1;
					break;

				case PARTITION_STRATEGY_RANGE:

					/*
					 * Offset returned is such that the bound at offset is
					 * found to be less or equal with the tuple. So, the bound
					 * at offset+1 would be the upper bound.
					 */
					cur_index = partdesc->boundinfo->indexes[cur_offset + 1];
					break;

				default:
					elog(ERROR, "unexpected partition strategy: %d",
						 (int) key->strategy);
			}
		}

		/*
		 * cur_index < 0 means we failed to find a partition of this parent.
		 * cur_index >= 0 means we either found the leaf partition, or the
		 * next parent to find a partition of.
		 */
		if (cur_index < 0)
		{
			*failed_at = RelationGetRelid(parent->reldesc);
			return -1;
		}
		else if (parent->indexes[cur_index] < 0)
			parent = pd[-parent->indexes[cur_index]];
		else
			break;
	}

	return parent->indexes[cur_index];
}

/*
 * qsort_partition_list_value_cmp
 *
 * Compare two list partition bound datums
 */
static int32
qsort_partition_list_value_cmp(const void *a, const void *b, void *arg)
{
	Datum		val1 = (*(const PartitionListValue **) a)->value,
				val2 = (*(const PartitionListValue **) b)->value;
	PartitionKey key = (PartitionKey) arg;

	return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
										   key->partcollation[0],
										   val1, val2));
}

/*
 * make_one_range_bound
 *
 * Build a freshly allocated PartitionRangeBound from a list of
 * PartitionRangeDatum elements.  'index' is stored as given, and 'lower'
 * tells whether the bound is a lower bound: an infinite datum is recorded as
 * negative infinity in a lower bound and as positive infinity otherwise.
 * A finite datum must be a non-null Const, else an error is raised.
 *
 * Made into a function because there are multiple sites that want to use
 * this facility.
 */
static PartitionRangeBound *
make_one_range_bound(PartitionKey key, int index, List *datums, bool lower)
{
	PartitionRangeBound *rbound;
	ListCell   *lc;
	int			col = 0;

	rbound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound));
	rbound->index = index;
	rbound->lower = lower;
	rbound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum));
	rbound->content = (RangeDatumContent *) palloc0(key->partnatts *
													sizeof(RangeDatumContent));

	foreach(lc, datums)
	{
		PartitionRangeDatum *prd = lfirst(lc);

		if (prd->infinite)
		{
			/* No value to store; only note which infinity this is */
			rbound->content[col] = lower ? RANGE_DATUM_NEG_INF
				: RANGE_DATUM_POS_INF;
		}
		else
		{
			Const	   *val = (Const *) prd->value;

			rbound->content[col] = RANGE_DATUM_FINITE;

			if (val->constisnull)
				elog(ERROR, "invalid range bound datum");
			rbound->datums[col] = val->constvalue;
		}

		col++;
	}

	return rbound;
}

/* Used when sorting range bounds across all range partitions */
static int32
qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
{
	PartitionRangeBound *b1 = (*(PartitionRangeBound *const *) a);
	PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b);
	PartitionKey key = (PartitionKey) arg;

	return partition_rbound_cmp(key, b1->datums, b1->content, b1->lower, b2);
}

/*
 * partition_rbound_cmp
 *
 * Compare two range bounds, the 1st one given as datums1, content1 and
 * lower1, the 2nd one as *b2.  The result is negative, zero or positive
 * according to whether the 1st bound sorts before, the same as, or after
 * the 2nd one.
 *
 * Columns are compared left to right.  An infinite datum decides the result
 * at once; finite datums are compared with the column's support function,
 * and the first nonzero result is returned.
 */
static int32
partition_rbound_cmp(PartitionKey key,
					 Datum *datums1, RangeDatumContent *content1, bool lower1,
					 PartitionRangeBound *b2)
{
	int32		cmpval = 0;		/* placate compiler */
	int			col;

	for (col = 0; col < key->partnatts; col++)
	{
		RangeDatumContent c1 = content1[col];
		RangeDatumContent c2 = b2->content[col];

		/*
		 * First, handle cases involving infinity, which don't require
		 * invoking the comparison proc.
		 */
		if (c1 != RANGE_DATUM_FINITE && c2 != RANGE_DATUM_FINITE)
		{
			/*
			 * Both are infinity, so they are equal unless one is negative
			 * infinity and other positive (or vice versa)
			 */
			if (c1 == c2)
				return 0;
			return (c1 < c2) ? -1 : 1;
		}

		if (c1 != RANGE_DATUM_FINITE)
			return (c1 == RANGE_DATUM_NEG_INF) ? -1 : 1;

		if (c2 != RANGE_DATUM_FINITE)
			return (c2 == RANGE_DATUM_NEG_INF) ? 1 : -1;

		/* Both finite: ask the column's comparison support function */
		cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[col],
												 key->partcollation[col],
												 datums1[col],
												 b2->datums[col]));
		if (cmpval != 0)
			break;
	}

	/* If the comparison is anything other than equal, we're done. */
	if (cmpval != 0)
		return cmpval;

	/*
	 * All columns compared equal.  Two bounds of the same kind are then
	 * equal; otherwise the lower bound is made to sort after the upper bound
	 * having the same datums.
	 */
	if (lower1 == b2->lower)
		return 0;

	return lower1 ? 1 : -1;
}

/*
 * partition_rbound_datum_cmp
 *
 * Compare a range bound (specified in rb_datums and rb_content) with the
 * partition key of a tuple (tuple_datums).  The result is negative, zero or
 * positive according to whether the bound sorts before, the same as, or
 * after the tuple's key.
 *
 * An infinite bound datum decides the result at once: negative infinity
 * yields -1, positive infinity 1.
 */
static int32
partition_rbound_datum_cmp(PartitionKey key,
						   Datum *rb_datums, RangeDatumContent *rb_content,
						   Datum *tuple_datums)
{
	int32		result = -1;
	int			col = 0;

	while (col < key->partnatts)
	{
		if (rb_content[col] == RANGE_DATUM_NEG_INF)
			return -1;
		if (rb_content[col] != RANGE_DATUM_FINITE)
			return 1;

		result = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[col],
												 key->partcollation[col],
												 rb_datums[col],
												 tuple_datums[col]));
		if (result != 0)
			return result;

		col++;
	}

	return result;
}

/*
 * partition_bound_cmp
 *
 * Compare the bound at 'offset' in boundinfo with the argument specified in
 * *probe.  The result is negative, zero or positive according to whether
 * the bound sorts before, the same as, or after the probe.
 *
 * For list partitioning *probe is read as a single Datum.  For range
 * partitioning it is read as a PartitionRangeBound if probe_is_bound is
 * true, and as an array of Datums otherwise.
 */
static int32
partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo,
					int offset, void *probe, bool probe_is_bound)
{
	Datum	   *bound_datums = boundinfo->datums[offset];

	if (key->strategy == PARTITION_STRATEGY_LIST)
	{
		Datum		probe_datum = *(Datum *) probe;

		return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
											   key->partcollation[0],
											   bound_datums[0],
											   probe_datum));
	}

	if (key->strategy == PARTITION_STRATEGY_RANGE)
	{
		RangeDatumContent *content = boundinfo->content[offset];
		bool		is_lower;

		if (!probe_is_bound)
			return partition_rbound_datum_cmp(key,
											  bound_datums, content,
											  (Datum *) probe);

		/*
		 * We need to pass whether the existing bound is a lower bound, so
		 * that two equal-valued lower and upper bounds are not regarded
		 * equal.  A negative indexes[] entry is what marks the existing
		 * bound as a lower bound here.
		 */
		is_lower = boundinfo->indexes[offset] < 0;

		return partition_rbound_cmp(key,
									bound_datums, content, is_lower,
									(PartitionRangeBound *) probe);
	}

	elog(ERROR, "unexpected partition strategy: %d",
		 (int) key->strategy);

	return -1;					/* keep compiler quiet */
}

/*
 * partition_bound_bsearch
 *
 * Binary search on a collection of partition bounds. Returns greatest index
 * of bound in array boundinfo->datums which is less or equal with *probe.
 * If all bounds in the array are greater than *probe, -1 is returned.
 *
 * *probe could either be a partition bound or a Datum array representing
 * the partition key of a tuple being routed; probe_is_bound tells which.
 * We pass that down to the comparison function so that it can interpret the
 * contents of *probe accordingly.
 *
 * *is_equal is set to whether the bound at the returned index is equal with
 * *probe.  It is always written: when -1 is returned (which includes the
 * case of an empty bounds array), it is set to false.
 */
static int
partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo,
						void *probe, bool probe_is_bound, bool *is_equal)
{
	int			lo,
				hi,
				mid;

	/*
	 * Until some bound is found to be <= *probe there is nothing that could
	 * be equal to it.  Setting this up front means callers need not
	 * initialize the flag themselves.
	 */
	*is_equal = false;

	lo = -1;
	hi = boundinfo->ndatums - 1;
	while (lo < hi)
	{
		int32		cmpval;

		/* Round up, so that mid > lo and the loop always makes progress */
		mid = (lo + hi + 1) / 2;
		cmpval = partition_bound_cmp(key, boundinfo, mid, probe,
									 probe_is_bound);
		if (cmpval <= 0)
		{
			/* bound at mid is a candidate; remember whether it is equal */
			lo = mid;
			*is_equal = (cmpval == 0);
		}
		else
			hi = mid - 1;
	}

	return lo;
}