Allow left join removals and unique joins on partitioned tables

This allows left join removals and unique joins to work with partitioned
tables.  The planner just lacked sufficient proofs that a given join
would not cause any row duplication.  Unique indexes currently serve as
that proof, so have get_relation_info() populate the indexlist for
partitioned tables too.

Author: Arne Roland
Reviewed-by: Alvaro Herrera, Zhihong Yu, Amit Langote, David Rowley
Discussion: https://postgr.es/m/c3b2408b7a39433b8230bbcd02e9f302@index.de
This commit is contained in:
David Rowley 2023-01-09 17:15:08 +13:00
parent 216a784829
commit 3c569049b7
7 changed files with 179 additions and 124 deletions

View File

@ -109,7 +109,9 @@ static void set_baserel_partition_constraint(Relation relation,
* If inhparent is true, all we need to do is set up the attr arrays:
* the RelOptInfo actually represents the appendrel formed by an inheritance
* tree, and so the parent rel's physical size and index information isn't
* important for it.
* important for it, however, for partitioned tables, we do populate the
* indexlist as the planner uses unique indexes as unique proofs for certain
* optimizations.
*/
void
get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
@ -175,10 +177,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
/*
* Make list of indexes. Ignore indexes on system catalogs if told to.
* Don't bother with indexes for an inheritance parent, either.
* Don't bother with indexes from traditional inheritance parents. For
* partitioned tables, we need a list of at least unique indexes as these
* serve as unique proofs for certain planner optimizations. However,
* let's not discriminate here and just record all partitioned indexes
* whether they're unique indexes or not.
*/
if (inhparent ||
(IgnoreSystemIndexes && IsSystemRelation(relation)))
if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
|| (IgnoreSystemIndexes && IsSystemRelation(relation)))
hasindex = false;
else
hasindex = relation->rd_rel->relhasindex;
@ -231,16 +237,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
continue;
}
/*
* Ignore partitioned indexes, since they are not usable for
* queries.
*/
if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
{
index_close(indexRelation, NoLock);
continue;
}
/*
* If the index is valid, but cannot yet be used, ignore it; but
* mark the plan we are generating as transient. See
@ -285,105 +281,129 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->relam = indexRelation->rd_rel->relam;
/* We copy just the fields we need, not all of rd_indam */
amroutine = indexRelation->rd_indam;
info->amcanorderbyop = amroutine->amcanorderbyop;
info->amoptionalkey = amroutine->amoptionalkey;
info->amsearcharray = amroutine->amsearcharray;
info->amsearchnulls = amroutine->amsearchnulls;
info->amcanparallel = amroutine->amcanparallel;
info->amhasgettuple = (amroutine->amgettuple != NULL);
info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
relation->rd_tableam->scan_bitmap_next_block != NULL;
info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
amroutine->amrestrpos != NULL);
info->amcostestimate = amroutine->amcostestimate;
Assert(info->amcostestimate != NULL);
/* Fetch index opclass options */
info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
/*
* Fetch the ordering information for the index, if any.
* We don't have an AM for partitioned indexes, so we'll just
* NULLify the AM related fields for those.
*/
if (info->relam == BTREE_AM_OID)
if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
{
/* We copy just the fields we need, not all of rd_indam */
amroutine = indexRelation->rd_indam;
info->amcanorderbyop = amroutine->amcanorderbyop;
info->amoptionalkey = amroutine->amoptionalkey;
info->amsearcharray = amroutine->amsearcharray;
info->amsearchnulls = amroutine->amsearchnulls;
info->amcanparallel = amroutine->amcanparallel;
info->amhasgettuple = (amroutine->amgettuple != NULL);
info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
relation->rd_tableam->scan_bitmap_next_block != NULL;
info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
amroutine->amrestrpos != NULL);
info->amcostestimate = amroutine->amcostestimate;
Assert(info->amcostestimate != NULL);
/* Fetch index opclass options */
info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
/*
* If it's a btree index, we can use its opfamily OIDs
* directly as the sort ordering opfamily OIDs.
* Fetch the ordering information for the index, if any.
*/
Assert(amroutine->amcanorder);
info->sortopfamily = info->opfamily;
info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
for (i = 0; i < nkeycolumns; i++)
if (info->relam == BTREE_AM_OID)
{
int16 opt = indexRelation->rd_indoption[i];
/*
* If it's a btree index, we can use its opfamily OIDs
* directly as the sort ordering opfamily OIDs.
*/
Assert(amroutine->amcanorder);
info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
info->sortopfamily = info->opfamily;
info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
for (i = 0; i < nkeycolumns; i++)
{
int16 opt = indexRelation->rd_indoption[i];
info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
}
}
}
else if (amroutine->amcanorder)
{
/*
* Otherwise, identify the corresponding btree opfamilies by
* trying to map this index's "<" operators into btree. Since
* "<" uniquely defines the behavior of a sort order, this is
* a sufficient test.
*
* XXX This method is rather slow and also requires the
* undesirable assumption that the other index AM numbers its
* strategies the same as btree. It'd be better to have a way
* to explicitly declare the corresponding btree opfamily for
* each opfamily of the other index type. But given the lack
* of current or foreseeable amcanorder index types, it's not
* worth expending more effort on now.
*/
info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
for (i = 0; i < nkeycolumns; i++)
else if (amroutine->amcanorder)
{
int16 opt = indexRelation->rd_indoption[i];
Oid ltopr;
Oid btopfamily;
Oid btopcintype;
int16 btstrategy;
/*
* Otherwise, identify the corresponding btree opfamilies
* by trying to map this index's "<" operators into btree.
* Since "<" uniquely defines the behavior of a sort
* order, this is a sufficient test.
*
* XXX This method is rather slow and also requires the
* undesirable assumption that the other index AM numbers
* its strategies the same as btree. It'd be better to
* have a way to explicitly declare the corresponding
* btree opfamily for each opfamily of the other index
* type. But given the lack of current or foreseeable
* amcanorder index types, it's not worth expending more
* effort on now.
*/
info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
for (i = 0; i < nkeycolumns; i++)
{
int16 opt = indexRelation->rd_indoption[i];
Oid ltopr;
Oid btopfamily;
Oid btopcintype;
int16 btstrategy;
ltopr = get_opfamily_member(info->opfamily[i],
info->opcintype[i],
info->opcintype[i],
BTLessStrategyNumber);
if (OidIsValid(ltopr) &&
get_ordering_op_properties(ltopr,
&btopfamily,
&btopcintype,
&btstrategy) &&
btopcintype == info->opcintype[i] &&
btstrategy == BTLessStrategyNumber)
{
/* Successful mapping */
info->sortopfamily[i] = btopfamily;
}
else
{
/* Fail ... quietly treat index as unordered */
info->sortopfamily = NULL;
info->reverse_sort = NULL;
info->nulls_first = NULL;
break;
info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
ltopr = get_opfamily_member(info->opfamily[i],
info->opcintype[i],
info->opcintype[i],
BTLessStrategyNumber);
if (OidIsValid(ltopr) &&
get_ordering_op_properties(ltopr,
&btopfamily,
&btopcintype,
&btstrategy) &&
btopcintype == info->opcintype[i] &&
btstrategy == BTLessStrategyNumber)
{
/* Successful mapping */
info->sortopfamily[i] = btopfamily;
}
else
{
/* Fail ... quietly treat index as unordered */
info->sortopfamily = NULL;
info->reverse_sort = NULL;
info->nulls_first = NULL;
break;
}
}
}
else
{
info->sortopfamily = NULL;
info->reverse_sort = NULL;
info->nulls_first = NULL;
}
}
else
{
info->amcanorderbyop = false;
info->amoptionalkey = false;
info->amsearcharray = false;
info->amsearchnulls = false;
info->amcanparallel = false;
info->amhasgettuple = false;
info->amhasgetbitmap = false;
info->amcanmarkpos = false;
info->amcostestimate = NULL;
info->sortopfamily = NULL;
info->reverse_sort = NULL;
info->nulls_first = NULL;
@ -416,31 +436,45 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
* the number-of-tuples estimate to equal the parent table; if it
* is partial then we have to use the same methods as we would for
* a table, except we can be sure that the index is not larger
* than the table.
* than the table. We must ignore partitioned indexes here as as
* there are not physical indexes.
*/
if (info->indpred == NIL)
if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
{
info->pages = RelationGetNumberOfBlocks(indexRelation);
info->tuples = rel->tuples;
}
else
{
double allvisfrac; /* dummy */
estimate_rel_size(indexRelation, NULL,
&info->pages, &info->tuples, &allvisfrac);
if (info->tuples > rel->tuples)
if (info->indpred == NIL)
{
info->pages = RelationGetNumberOfBlocks(indexRelation);
info->tuples = rel->tuples;
}
}
else
{
double allvisfrac; /* dummy */
if (info->relam == BTREE_AM_OID)
{
/* For btrees, get tree height while we have the index open */
info->tree_height = _bt_getrootheight(indexRelation);
estimate_rel_size(indexRelation, NULL,
&info->pages, &info->tuples, &allvisfrac);
if (info->tuples > rel->tuples)
info->tuples = rel->tuples;
}
if (info->relam == BTREE_AM_OID)
{
/*
* For btrees, get tree height while we have the index
* open
*/
info->tree_height = _bt_getrootheight(indexRelation);
}
else
{
/* For other index types, just set it to "unknown" for now */
info->tree_height = -1;
}
}
else
{
/* For other index types, just set it to "unknown" for now */
/* Zero these out for partitioned indexes */
info->pages = 0;
info->tuples = 0.0;
info->tree_height = -1;
}

View File

@ -5994,6 +5994,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
rte = root->simple_rte_array[rel->relid];
Assert(rte->rtekind == RTE_RELATION);
/* ignore partitioned tables. Any indexes here are not real indexes */
if (rte->relkind == RELKIND_PARTITIONED_TABLE)
return false;
/* Search through the indexes to see if any match our problem */
foreach(lc, rel->indexlist)
{

View File

@ -653,7 +653,7 @@ typedef struct PartitionSchemeData *PartitionScheme;
* lateral_referencers - relids of rels that reference this one laterally
* (includes both direct and indirect lateral references)
* indexlist - list of IndexOptInfo nodes for relation's indexes
* (always NIL if it's not a table)
* (always NIL if it's not a table or partitioned table)
* pages - number of disk pages in relation (zero if not a table)
* tuples - number of tuples in relation (not considering restrictions)
* allvisfrac - fraction of disk pages that are marked all-visible
@ -1097,11 +1097,11 @@ struct IndexOptInfo
Oid *opfamily pg_node_attr(array_size(nkeycolumns));
/* OIDs of opclass declared input data types */
Oid *opcintype pg_node_attr(array_size(nkeycolumns));
/* OIDs of btree opfamilies, if orderable */
/* OIDs of btree opfamilies, if orderable. NULL if partitioned index */
Oid *sortopfamily pg_node_attr(array_size(nkeycolumns));
/* is sort order descending? */
/* is sort order descending? or NULL if partitioned index */
bool *reverse_sort pg_node_attr(array_size(nkeycolumns));
/* do NULLs come first in the sort order? */
/* do NULLs come first in the sort order? or NULL if partitioned index */
bool *nulls_first pg_node_attr(array_size(nkeycolumns));
/* opclass-specific options for columns */
bytea **opclassoptions pg_node_attr(read_write_ignore);
@ -1139,7 +1139,7 @@ struct IndexOptInfo
/*
* Remaining fields are copied from the index AM's API struct
* (IndexAmRoutine).
* (IndexAmRoutine). These fields are not set for partitioned indexes.
*/
bool amcanorderbyop;
bool amoptionalkey;

View File

@ -4860,6 +4860,16 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
Filter: (a.id = i)
(4 rows)
CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
-- test join removals on a partitioned table
explain (costs off)
select a.* from a left join parted_b pb on a.b_id = pb.id;
QUERY PLAN
---------------
Seq Scan on a
(1 row)
rollback;
create temp table parent (k int primary key, pd int);
create temp table child (k int unique, cd int);

View File

@ -4874,7 +4874,7 @@ ANALYZE fract_t;
SET max_parallel_workers_per_gather = 0;
SET enable_partitionwise_join = on;
EXPLAIN (COSTS OFF)
SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
QUERY PLAN
-----------------------------------------------------------------------
Limit
@ -4891,7 +4891,7 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
(11 rows)
EXPLAIN (COSTS OFF)
SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
QUERY PLAN
--------------------------------------------------------------------------------
Limit

View File

@ -1709,6 +1709,13 @@ explain (costs off)
select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
lateral generate_series(1, q.id) gs(i) where q.id = gs.i;
CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
-- test join removals on a partitioned table
explain (costs off)
select a.* from a left join parted_b pb on a.b_id = pb.id;
rollback;
create temp table parent (k int primary key, pd int);

View File

@ -1157,10 +1157,10 @@ SET max_parallel_workers_per_gather = 0;
SET enable_partitionwise_join = on;
EXPLAIN (COSTS OFF)
SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
EXPLAIN (COSTS OFF)
SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
-- cleanup
DROP TABLE fract_t;