diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index e62e3d8fba..6588ebd6dc 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2316,6 +2316,7 @@ CopyFrom(CopyState cstate) bool *nulls; ResultRelInfo *resultRelInfo; ResultRelInfo *target_resultRelInfo; + ResultRelInfo *prevResultRelInfo = NULL; EState *estate = CreateExecutorState(); /* for ExecConstraints() */ ModifyTableState *mtstate; ExprContext *econtext; @@ -2331,7 +2332,6 @@ CopyFrom(CopyState cstate) CopyInsertMethod insertMethod; uint64 processed = 0; int nBufferedTuples = 0; - int prev_leaf_part_index = -1; bool has_before_insert_row_trig; bool has_instead_insert_row_trig; bool leafpart_use_multi_insert = false; @@ -2515,8 +2515,12 @@ CopyFrom(CopyState cstate) /* * If there are any triggers with transition tables on the named relation, * we need to be prepared to capture transition tuples. + * + * Because partition tuple routing would like to know about whether + * transition capture is active, we also set it in mtstate, which is + * passed to ExecFindPartition() below. */ - cstate->transition_capture = + cstate->transition_capture = mtstate->mt_transition_capture = MakeTransitionCaptureState(cstate->rel->trigdesc, RelationGetRelid(cstate->rel), CMD_INSERT); @@ -2526,19 +2530,8 @@ CopyFrom(CopyState cstate) * CopyFrom tuple routing. */ if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { proute = ExecSetupPartitionTupleRouting(NULL, cstate->rel); - /* - * If we are capturing transition tuples, they may need to be - * converted from partition format back to partitioned table format - * (this is only ever necessary if a BEFORE trigger modifies the - * tuple). - */ - if (cstate->transition_capture != NULL) - ExecSetupChildParentMapForLeaf(proute); - } - /* * It's more efficient to prepare a bunch of tuples for insertion, and * insert them in one heap_multi_insert() call, than call heap_insert() @@ -2694,25 +2687,17 @@ CopyFrom(CopyState cstate) /* Determine the partition to heap_insert the tuple into */ if (proute) { - int leaf_part_index; TupleConversionMap *map; /* - * Away we go ... If we end up not finding a partition after all, - * ExecFindPartition() does not return and errors out instead. - * Otherwise, the returned value is to be used as an index into - * arrays mt_partitions[] and mt_partition_tupconv_maps[] that - * will get us the ResultRelInfo and TupleConversionMap for the - * partition, respectively. + * Attempt to find a partition suitable for this tuple. + * ExecFindPartition() will raise an error if none can be found or + * if the found partition is not suitable for INSERTs. 
*/ - leaf_part_index = ExecFindPartition(target_resultRelInfo, - proute->partition_dispatch_info, - slot, - estate); - Assert(leaf_part_index >= 0 && - leaf_part_index < proute->num_partitions); + resultRelInfo = ExecFindPartition(mtstate, target_resultRelInfo, + proute, slot, estate); - if (prev_leaf_part_index != leaf_part_index) + if (prevResultRelInfo != resultRelInfo) { /* Check if we can multi-insert into this partition */ if (insertMethod == CIM_MULTI_CONDITIONAL) @@ -2725,12 +2710,9 @@ CopyFrom(CopyState cstate) if (nBufferedTuples > 0) { ExprContext *swapcontext; - ResultRelInfo *presultRelInfo; - - presultRelInfo = proute->partitions[prev_leaf_part_index]; CopyFromInsertBatch(cstate, estate, mycid, hi_options, - presultRelInfo, myslot, bistate, + prevResultRelInfo, myslot, bistate, nBufferedTuples, bufferedTuples, firstBufferedLineNo); nBufferedTuples = 0; @@ -2787,21 +2769,6 @@ CopyFrom(CopyState cstate) } } - /* - * Overwrite resultRelInfo with the corresponding partition's - * one. - */ - resultRelInfo = proute->partitions[leaf_part_index]; - if (unlikely(resultRelInfo == NULL)) - { - resultRelInfo = ExecInitPartitionInfo(mtstate, - target_resultRelInfo, - proute, estate, - leaf_part_index); - proute->partitions[leaf_part_index] = resultRelInfo; - Assert(resultRelInfo != NULL); - } - /* Determine which triggers exist on this partition */ has_before_insert_row_trig = (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_insert_before_row); @@ -2827,7 +2794,7 @@ CopyFrom(CopyState cstate) * buffer when the partition being inserted into changes. */ ReleaseBulkInsertStatePin(bistate); - prev_leaf_part_index = leaf_part_index; + prevResultRelInfo = resultRelInfo; } /* @@ -2837,7 +2804,7 @@ CopyFrom(CopyState cstate) /* * If we're capturing transition tuples, we might need to convert - * from the partition rowtype to parent rowtype. + * from the partition rowtype to root rowtype. */ if (cstate->transition_capture != NULL) { @@ -2850,8 +2817,7 @@ CopyFrom(CopyState cstate) */ cstate->transition_capture->tcs_original_insert_tuple = NULL; cstate->transition_capture->tcs_map = - TupConvMapForLeaf(proute, target_resultRelInfo, - leaf_part_index); + resultRelInfo->ri_PartitionInfo->pi_PartitionToRootMap; } else { @@ -2865,18 +2831,18 @@ CopyFrom(CopyState cstate) } /* - * We might need to convert from the parent rowtype to the - * partition rowtype. + * We might need to convert from the root rowtype to the partition + * rowtype. 
			 */
-			map = proute->parent_child_tupconv_maps[leaf_part_index];
+			map = resultRelInfo->ri_PartitionInfo->pi_RootToPartitionMap;
 			if (map != NULL)
 			{
 				TupleTableSlot *new_slot;
 				MemoryContext oldcontext;
 
-				Assert(proute->partition_tuple_slots != NULL &&
-					   proute->partition_tuple_slots[leaf_part_index] != NULL);
-				new_slot = proute->partition_tuple_slots[leaf_part_index];
+				new_slot = resultRelInfo->ri_PartitionInfo->pi_PartitionTupleSlot;
+				Assert(new_slot != NULL);
+
 				slot = execute_attr_map_slot(map->attrMap, slot, new_slot);
 
 				/*
@@ -3021,12 +2987,8 @@ CopyFrom(CopyState cstate)
 	{
 		if (insertMethod == CIM_MULTI_CONDITIONAL)
 		{
-			ResultRelInfo *presultRelInfo;
-
-			presultRelInfo = proute->partitions[prev_leaf_part_index];
-
 			CopyFromInsertBatch(cstate, estate, mycid, hi_options,
-								presultRelInfo, myslot, bistate,
+								prevResultRelInfo, myslot, bistate,
 								nBufferedTuples, bufferedTuples,
 								firstBufferedLineNo);
 		}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 74398eb464..757df0705d 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1345,7 +1345,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 
 	resultRelInfo->ri_PartitionCheck = partition_check;
 	resultRelInfo->ri_PartitionRoot = partition_root;
-	resultRelInfo->ri_PartitionReadyForRouting = false;
+	resultRelInfo->ri_PartitionInfo = NULL; /* may be set later */
 }
 
 /*
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index e11fe68712..e3cb4fb1be 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -33,21 +33,98 @@
 
 /*-----------------------
- * PartitionDispatch - information about one partitioned table in a partition
- * hierarchy required to route a tuple to one of its partitions
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions.
  *
- * reldesc		Relation descriptor of the table
- * key			Partition key information of the table
- * keystate		Execution state required for expressions in the partition key
- * partdesc		Partition descriptor of the table
- * tupslot		A standalone TupleTableSlot initialized with this table's tuple
- *				descriptor
- * tupmap		TupleConversionMap to convert from the parent's rowtype to
- *				this table's rowtype (when extracting the partition key of a
- *				tuple just before routing it through this table)
- * indexes		Array with partdesc->nparts members (for details on what
- *				individual members represent, see how they are set in
- *				get_partition_dispatch_recurse())
+ * partition_root
+ *		The partitioned table that's the target of the command.
+ *
+ * partition_dispatch_info
+ *		Array of 'max_dispatch' elements containing a pointer to a
+ *		PartitionDispatch object for every partitioned table touched by tuple
+ *		routing.  The entry for the target partitioned table is *always*
+ *		present in the 0th element of this array.  See comment for
+ *		PartitionDispatchData->indexes for details on how this array is
+ *		indexed.
+ *
+ * num_dispatch
+ *		The current number of items stored in the 'partition_dispatch_info'
+ *		array.  Also serves as the index of the next free array element for
+ *		new PartitionDispatch objects that need to be stored.
+ *
+ * max_dispatch
+ *		The current allocated size of the 'partition_dispatch_info' array.
+ *
+ * partitions
+ *		Array of 'max_partitions' elements containing a pointer to a
+ *		ResultRelInfo for every leaf partition touched by tuple routing.
+ *		Some of these are pointers to ResultRelInfos which are borrowed out
+ *		of 'subplan_resultrel_htab'.  The remainder have been built
+ *		especially for tuple routing.  See comment for
+ *		PartitionDispatchData->indexes for details on how this array is
+ *		indexed.
+ *
+ * num_partitions
+ *		The current number of items stored in the 'partitions' array.  Also
+ *		serves as the index of the next free array element for new
+ *		ResultRelInfo objects that need to be stored.
+ *
+ * max_partitions
+ *		The current allocated size of the 'partitions' array.
+ *
+ * subplan_resultrel_htab
+ *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
+ *		cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
+ *		NULL in other cases.  Some of these may be useful for tuple routing
+ *		to save having to build duplicates.
+ *
+ * memcxt
+ *		Memory context used to allocate subsidiary structs.
+ *-----------------------
+ */
+typedef struct PartitionTupleRouting
+{
+	Relation	partition_root;
+	PartitionDispatch *partition_dispatch_info;
+	int			num_dispatch;
+	int			max_dispatch;
+	ResultRelInfo **partitions;
+	int			num_partitions;
+	int			max_partitions;
+	HTAB	   *subplan_resultrel_htab;
+	MemoryContext memcxt;
+} PartitionTupleRouting;
+
+/*-----------------------
+ * PartitionDispatch - information about one partitioned table in a partition
+ * hierarchy required to route a tuple to any of its partitions.  A
+ * PartitionDispatch is always encapsulated inside a PartitionTupleRouting
+ * struct and stored inside its 'partition_dispatch_info' array.
+ *
+ * reldesc
+ *		Relation descriptor of the table
+ * key
+ *		Partition key information of the table
+ * keystate
+ *		Execution state required for expressions in the partition key
+ * partdesc
+ *		Partition descriptor of the table
+ * tupslot
+ *		A standalone TupleTableSlot initialized with this table's tuple
+ *		descriptor, or NULL if no tuple conversion from the parent's rowtype
+ *		is required.
+ * tupmap
+ *		TupleConversionMap to convert from the parent's rowtype to this
+ *		table's rowtype (when extracting the partition key of a tuple just
+ *		before routing it through this table).  A NULL value is stored if no
+ *		tuple conversion is required.
+ * indexes
+ *		Array of partdesc->nparts elements.  For leaf partitions the index
+ *		corresponds to the partition's ResultRelInfo in the encapsulating
+ *		PartitionTupleRouting's partitions array.  For sub-partitioned
+ *		tables, the index corresponds to the PartitionDispatch for it in its
+ *		partition_dispatch_info array.  -1 indicates we've not yet allocated
+ *		anything in PartitionTupleRouting for the partition.
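+ *
+ *		As a worked illustration (partition names hypothetical): for a root
+ *		table "p" with leaf partition "p1" and sub-partitioned table "p2",
+ *		itself with leaf partition "p21", after one tuple has been routed to
+ *		each leaf we could have:
+ *
+ *			partition_dispatch_info = [PD(p), PD(p2)]
+ *			partitions              = [RRI(p1), RRI(p21)]
+ *			PD(p)->indexes          = [0, 1]  (p1 = partitions[0],
+ *			                                   p2 = partition_dispatch_info[1])
+ *			PD(p2)->indexes         = [1]     (p21 = partitions[1])
+ *
+ *		Which of the two arrays an indexes[] element refers to is determined
+ *		by partdesc->is_leaf[] for the corresponding partition.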
*----------------------- */ typedef struct PartitionDispatchData @@ -58,14 +135,32 @@ typedef struct PartitionDispatchData PartitionDesc partdesc; TupleTableSlot *tupslot; AttrNumber *tupmap; - int *indexes; + int indexes[FLEXIBLE_ARRAY_MEMBER]; } PartitionDispatchData; +/* struct to hold result relations coming from UPDATE subplans */ +typedef struct SubplanResultRelHashElem +{ + Oid relid; /* hash key -- must be first */ + ResultRelInfo *rri; +} SubplanResultRelHashElem; -static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids); -static void get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids); + +static void ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate, + PartitionTupleRouting *proute); +static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, + EState *estate, PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *rootResultRelInfo, + int partidx); +static void ExecInitRoutingInfo(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + PartitionDispatch dispatch, + ResultRelInfo *partRelInfo, + int partidx); +static PartitionDispatch ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, + Oid partoid, PartitionDispatch parent_pd, int partidx); static void FormPartitionKeyDatum(PartitionDispatch pd, TupleTableSlot *slot, EState *estate, @@ -92,131 +187,87 @@ static void find_matching_subplans_recurse(PartitionPruningData *prunedata, * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. * - * While we allocate the arrays of pointers of ResultRelInfo and - * TupleConversionMap for all partitions here, actual objects themselves are - * lazily allocated for a given partition if a tuple is actually routed to it; - * see ExecInitPartitionInfo. However, if the function is invoked for update - * tuple routing, caller would already have initialized ResultRelInfo's for - * some of the partitions, which are reused and assigned to their respective - * slot in the aforementioned array. For such partitions, we delay setting - * up objects such as TupleConversionMap until those are actually chosen as - * the partitions to route tuples to. See ExecPrepareTupleRouting. + * Callers must use the returned PartitionTupleRouting during calls to + * ExecFindPartition(). The actual ResultRelInfo for a partition is only + * allocated when the partition is found for the first time. + * + * The current memory context is used to allocate this struct and all + * subsidiary structs that will be allocated from it later on. Typically + * it should be estate->es_query_cxt. */ PartitionTupleRouting * ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) { - List *leaf_parts; - ListCell *cell; - int i; - ResultRelInfo *update_rri = NULL; - int num_update_rri = 0, - update_rri_index = 0; PartitionTupleRouting *proute; - int nparts; ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL; - /* - * Get the information about the partition tree after locking all the - * partitions. - */ + /* Lock all the partitions. 
*/ (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); - proute->partition_dispatch_info = - RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch, - &leaf_parts); - proute->num_partitions = nparts = list_length(leaf_parts); - proute->partitions = - (ResultRelInfo **) palloc(nparts * sizeof(ResultRelInfo *)); - proute->parent_child_tupconv_maps = - (TupleConversionMap **) palloc0(nparts * sizeof(TupleConversionMap *)); - proute->partition_oids = (Oid *) palloc(nparts * sizeof(Oid)); - - /* Set up details specific to the type of tuple routing we are doing. */ - if (node && node->operation == CMD_UPDATE) - { - update_rri = mtstate->resultRelInfo; - num_update_rri = list_length(node->plans); - proute->subplan_partition_offsets = - palloc(num_update_rri * sizeof(int)); - proute->num_subplan_partition_offsets = num_update_rri; - - /* - * We need an additional tuple slot for storing transient tuples that - * are converted to the root table descriptor. - */ - proute->root_tuple_slot = MakeTupleTableSlot(RelationGetDescr(rel), - &TTSOpsHeapTuple); - } - - i = 0; - foreach(cell, leaf_parts) - { - ResultRelInfo *leaf_part_rri = NULL; - Oid leaf_oid = lfirst_oid(cell); - - proute->partition_oids[i] = leaf_oid; - - /* - * If the leaf partition is already present in the per-subplan result - * rels, we re-use that rather than initialize a new result rel. The - * per-subplan resultrels and the resultrels of the leaf partitions - * are both in the same canonical order. So while going through the - * leaf partition oids, we need to keep track of the next per-subplan - * result rel to be looked for in the leaf partition resultrels. - */ - if (update_rri_index < num_update_rri && - RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) - { - leaf_part_rri = &update_rri[update_rri_index]; - - /* - * This is required in order to convert the partition's tuple to - * be compatible with the root partitioned table's tuple - * descriptor. When generating the per-subplan result rels, this - * was not set. - */ - leaf_part_rri->ri_PartitionRoot = rel; - - /* Remember the subplan offset for this ResultRelInfo */ - proute->subplan_partition_offsets[update_rri_index] = i; - - update_rri_index++; - } - - proute->partitions[i] = leaf_part_rri; - i++; - } /* - * For UPDATE, we should have found all the per-subplan resultrels in the - * leaf partitions. (If this is an INSERT, both values will be zero.) + * Here we attempt to expend as little effort as possible in setting up + * the PartitionTupleRouting. Each partition's ResultRelInfo is built on + * demand, only when we actually need to route a tuple to that partition. + * The reason for this is that a common case is for INSERT to insert a + * single tuple into a partitioned table and this must be fast. */ - Assert(update_rri_index == num_update_rri); + proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); + proute->partition_root = rel; + proute->memcxt = CurrentMemoryContext; + /* Rest of members initialized by zeroing */ + + /* + * Initialize this table's PartitionDispatch object. Here we pass in the + * parent as NULL as we don't need to care about any parent of the target + * partitioned table. + */ + ExecInitPartitionDispatchInfo(proute, RelationGetRelid(rel), NULL, 0); + + /* + * If performing an UPDATE with tuple routing, we can reuse partition + * sub-plan result rels. 
We build a hash table to map the OIDs of
+	 * partitions present in mtstate->resultRelInfo to their ResultRelInfos.
+	 * Every time a tuple is routed to a partition that we've yet to set the
+	 * ResultRelInfo for, before we go to the trouble of making one, we check
+	 * for a pre-made one in the hash table.
+	 */
+	if (node && node->operation == CMD_UPDATE)
+		ExecHashSubPlanResultRelsByOid(mtstate, proute);
 
 	return proute;
 }
 
 /*
- * ExecFindPartition -- Find a leaf partition in the partition tree rooted
- * at parent, for the heap tuple contained in *slot
+ * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that
+ * the tuple contained in *slot should belong to.
+ *
+ * If the partition's ResultRelInfo does not yet exist in 'proute' then we
+ * set one up or reuse one from mtstate's resultRelInfo array.  When reusing
+ * a ResultRelInfo from the mtstate we verify that the relation is a valid
+ * target for INSERTs and then set up a PartitionRoutingInfo for it.
+ *
+ * rootResultRelInfo is the relation named in the query.
  *
  * estate must be non-NULL; we'll need it to compute any expressions in the
- * partition key(s)
+ * partition keys.  Also, its per-tuple contexts are used as evaluation
+ * scratch space.
  *
  * If no leaf partition is found, this routine errors out with the appropriate
- * error message, else it returns the leaf partition sequence number
- * as an index into the array of (ResultRelInfos of) all leaf partitions in
- * the partition tree.
+ * error message.  An error may also be raised if the found target partition
+ * is not a valid target for an INSERT.
  */
-int
-ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
+ResultRelInfo *
+ExecFindPartition(ModifyTableState *mtstate,
+				  ResultRelInfo *rootResultRelInfo,
+				  PartitionTupleRouting *proute,
 				  TupleTableSlot *slot, EState *estate)
 {
-	int			result;
+	PartitionDispatch *pd = proute->partition_dispatch_info;
 	Datum		values[PARTITION_MAX_KEYS];
 	bool		isnull[PARTITION_MAX_KEYS];
 	Relation	rel;
 	PartitionDispatch dispatch;
+	PartitionDesc partdesc;
 	ExprContext *ecxt = GetPerTupleExprContext(estate);
 	TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
 	TupleTableSlot *myslot = NULL;
@@ -229,25 +280,31 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
 	 * First check the root table's partition constraint, if any.  No point in
 	 * routing the tuple if it doesn't belong in the root table itself.
 	 */
-	if (resultRelInfo->ri_PartitionCheck)
-		ExecPartitionCheck(resultRelInfo, slot, estate, true);
+	if (rootResultRelInfo->ri_PartitionCheck)
+		ExecPartitionCheck(rootResultRelInfo, slot, estate, true);
 
 	/* start with the root partitioned table */
 	dispatch = pd[0];
 	while (true)
 	{
 		AttrNumber *map = dispatch->tupmap;
-		int			cur_index = -1;
+		int			partidx = -1;
+
+		CHECK_FOR_INTERRUPTS();
 
 		rel = dispatch->reldesc;
+		partdesc = dispatch->partdesc;
 
 		/*
 		 * Convert the tuple to this parent's layout, if different from the
 		 * current relation.
 		 */
 		myslot = dispatch->tupslot;
-		if (myslot != NULL && map != NULL)
+		if (myslot != NULL)
+		{
+			Assert(map != NULL);
 			slot = execute_attr_map_slot(map, slot, myslot);
+		}
 
 		/*
 		 * Extract partition key from tuple. Expression evaluation machinery
@@ -261,97 +318,196 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
 		FormPartitionKeyDatum(dispatch, slot, estate, values, isnull);
 
 		/*
-		 * Nothing for get_partition_for_tuple() to do if there are no
-		 * partitions to begin with.
+ * If this partitioned table has no partitions or no partition for + * these values, error out. */ - if (dispatch->partdesc->nparts == 0) + if (partdesc->nparts == 0 || + (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0) { - result = -1; - break; + char *val_desc; + + val_desc = ExecBuildSlotPartitionKeyDescription(rel, + values, isnull, 64); + Assert(OidIsValid(RelationGetRelid(rel))); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + RelationGetRelationName(rel)), + val_desc ? + errdetail("Partition key of the failing row contains %s.", + val_desc) : 0)); } - cur_index = get_partition_for_tuple(dispatch, values, isnull); + if (partdesc->is_leaf[partidx]) + { + ResultRelInfo *rri; - /* - * cur_index < 0 means we failed to find a partition of this parent. - * cur_index >= 0 means we either found the leaf partition, or the - * next parent to find a partition of. - */ - if (cur_index < 0) - { - result = -1; - break; - } - else if (dispatch->indexes[cur_index] >= 0) - { - result = dispatch->indexes[cur_index]; - /* success! */ - break; + /* + * Look to see if we've already got a ResultRelInfo for this + * partition. + */ + if (likely(dispatch->indexes[partidx] >= 0)) + { + /* ResultRelInfo already built */ + Assert(dispatch->indexes[partidx] < proute->num_partitions); + rri = proute->partitions[dispatch->indexes[partidx]]; + } + else + { + bool found = false; + + /* + * We have not yet set up a ResultRelInfo for this partition, + * but if we have a subplan hash table, we might have one + * there. If not, we'll have to create one. + */ + if (proute->subplan_resultrel_htab) + { + Oid partoid = partdesc->oids[partidx]; + SubplanResultRelHashElem *elem; + + elem = hash_search(proute->subplan_resultrel_htab, + &partoid, HASH_FIND, NULL); + if (elem) + { + found = true; + rri = elem->rri; + + /* Verify this ResultRelInfo allows INSERTs */ + CheckValidResultRel(rri, CMD_INSERT); + + /* Set up the PartitionRoutingInfo for it */ + ExecInitRoutingInfo(mtstate, estate, proute, dispatch, + rri, partidx); + } + } + + /* We need to create a new one. */ + if (!found) + rri = ExecInitPartitionInfo(mtstate, estate, proute, + dispatch, + rootResultRelInfo, partidx); + } + + /* Release the tuple in the lowest parent's dedicated slot. */ + if (slot == myslot) + ExecClearTuple(myslot); + + MemoryContextSwitchTo(oldcxt); + ecxt->ecxt_scantuple = ecxt_scantuple_old; + return rri; } else { - /* move down one level */ - dispatch = pd[-dispatch->indexes[cur_index]]; + /* + * Partition is a sub-partitioned table; get the PartitionDispatch + */ + if (likely(dispatch->indexes[partidx] >= 0)) + { + /* Already built. */ + Assert(dispatch->indexes[partidx] < proute->num_dispatch); + + /* + * Move down to the next partition level and search again + * until we find a leaf partition that matches this tuple + */ + dispatch = pd[dispatch->indexes[partidx]]; + } + else + { + /* Not yet built. Do that now. */ + PartitionDispatch subdispatch; + + /* + * Create the new PartitionDispatch. We pass the current one + * in as the parent PartitionDispatch + */ + subdispatch = ExecInitPartitionDispatchInfo(proute, + partdesc->oids[partidx], + dispatch, partidx); + Assert(dispatch->indexes[partidx] >= 0 && + dispatch->indexes[partidx] < proute->num_dispatch); + dispatch = subdispatch; + } } } +} - /* Release the tuple in the lowest parent's dedicated slot. 
*/ - if (slot == myslot) - ExecClearTuple(myslot); +/* + * ExecHashSubPlanResultRelsByOid + * Build a hash table to allow fast lookups of subplan ResultRelInfos by + * partition Oid. We also populate the subplan ResultRelInfo with an + * ri_PartitionRoot. + */ +static void +ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate, + PartitionTupleRouting *proute) +{ + HASHCTL ctl; + HTAB *htab; + int i; - /* A partition was not found. */ - if (result < 0) + memset(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(SubplanResultRelHashElem); + ctl.hcxt = CurrentMemoryContext; + + htab = hash_create("PartitionTupleRouting table", mtstate->mt_nplans, + &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + proute->subplan_resultrel_htab = htab; + + /* Hash all subplans by their Oid */ + for (i = 0; i < mtstate->mt_nplans; i++) { - char *val_desc; + ResultRelInfo *rri = &mtstate->resultRelInfo[i]; + bool found; + Oid partoid = RelationGetRelid(rri->ri_RelationDesc); + SubplanResultRelHashElem *elem; - val_desc = ExecBuildSlotPartitionKeyDescription(rel, - values, isnull, 64); - Assert(OidIsValid(RelationGetRelid(rel))); - ereport(ERROR, - (errcode(ERRCODE_CHECK_VIOLATION), - errmsg("no partition of relation \"%s\" found for row", - RelationGetRelationName(rel)), - val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); + elem = (SubplanResultRelHashElem *) + hash_search(htab, &partoid, HASH_ENTER, &found); + Assert(!found); + elem->rri = rri; + + /* + * This is required in order to convert the partition's tuple to be + * compatible with the root partitioned table's tuple descriptor. When + * generating the per-subplan result rels, this was not set. + */ + rri->ri_PartitionRoot = proute->partition_root; } - - MemoryContextSwitchTo(oldcxt); - ecxt->ecxt_scantuple = ecxt_scantuple_old; - - return result; } /* * ExecInitPartitionInfo * Initialize ResultRelInfo and other information for a partition + * and store it in the next empty slot in the proute->partitions array. * * Returns the ResultRelInfo */ -ResultRelInfo * -ExecInitPartitionInfo(ModifyTableState *mtstate, - ResultRelInfo *resultRelInfo, +static ResultRelInfo * +ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, PartitionTupleRouting *proute, - EState *estate, int partidx) + PartitionDispatch dispatch, + ResultRelInfo *rootResultRelInfo, + int partidx) { ModifyTable *node = (ModifyTable *) mtstate->ps.plan; - Relation rootrel = resultRelInfo->ri_RelationDesc, + Relation rootrel = rootResultRelInfo->ri_RelationDesc, partrel; Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; ResultRelInfo *leaf_part_rri; - MemoryContext oldContext; + MemoryContext oldcxt; AttrNumber *part_attnos = NULL; bool found_whole_row; + oldcxt = MemoryContextSwitchTo(proute->memcxt); + /* * We locked all the partitions in ExecSetupPartitionTupleRouting * including the leaf partitions. */ - partrel = heap_open(proute->partition_oids[partidx], NoLock); - - /* - * Keep ResultRelInfo and other information for this partition in the - * per-query memory context so they'll survive throughout the query. 
- */ - oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + partrel = heap_open(dispatch->partdesc->oids[partidx], NoLock); leaf_part_rri = makeNode(ResultRelInfo); InitResultRelInfo(leaf_part_rri, @@ -367,18 +523,6 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, */ CheckValidResultRel(leaf_part_rri, CMD_INSERT); - /* - * Since we've just initialized this ResultRelInfo, it's not in any list - * attached to the estate as yet. Add it, so that it can be found later. - * - * Note that the entries in this list appear in no predetermined order, - * because partition result rels are initialized as and when they're - * needed. - */ - estate->es_tuple_routing_result_relations = - lappend(estate->es_tuple_routing_result_relations, - leaf_part_rri); - /* * Open partition indices. The user may have asked to check for conflicts * within this leaf partition and do "nothing" instead of throwing an @@ -522,14 +666,14 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, } /* Set up information needed for routing tuples to the partition. */ - ExecInitRoutingInfo(mtstate, estate, proute, leaf_part_rri, partidx); + ExecInitRoutingInfo(mtstate, estate, proute, dispatch, + leaf_part_rri, partidx); /* * If there is an ON CONFLICT clause, initialize state for it. */ if (node && node->onConflictAction != ONCONFLICT_NONE) { - TupleConversionMap *map = proute->parent_child_tupconv_maps[partidx]; int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; TupleDesc partrelDesc = RelationGetDescr(partrel); ExprContext *econtext = mtstate->ps.ps_ExprContext; @@ -542,7 +686,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, * list and searching for ancestry relationships to each index in the * ancestor table. */ - if (list_length(resultRelInfo->ri_onConflictArbiterIndexes) > 0) + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) > 0) { List *childIdxs; @@ -555,7 +699,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, ListCell *lc2; ancestors = get_partition_ancestors(childIdx); - foreach(lc2, resultRelInfo->ri_onConflictArbiterIndexes) + foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes) { if (list_member_oid(ancestors, lfirst_oid(lc2))) arbiterIndexes = lappend_oid(arbiterIndexes, childIdx); @@ -569,7 +713,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, * (This shouldn't happen, since arbiter index selection should not * pick up an invalid index.) */ - if (list_length(resultRelInfo->ri_onConflictArbiterIndexes) != + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) != list_length(arbiterIndexes)) elog(ERROR, "invalid arbiter index list"); leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes; @@ -579,8 +723,12 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, */ if (node->onConflictAction == ONCONFLICT_UPDATE) { + TupleConversionMap *map; + + map = leaf_part_rri->ri_PartitionInfo->pi_RootToPartitionMap; + Assert(node->onConflictSet != NIL); - Assert(resultRelInfo->ri_onConflict != NULL); + Assert(rootResultRelInfo->ri_onConflict != NULL); /* * If the partition's tuple descriptor matches exactly the root @@ -589,7 +737,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, * need to create state specific to this partition. 
*/ if (map == NULL) - leaf_part_rri->ri_onConflict = resultRelInfo->ri_onConflict; + leaf_part_rri->ri_onConflict = rootResultRelInfo->ri_onConflict; else { List *onconflset; @@ -680,37 +828,51 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, } } - Assert(proute->partitions[partidx] == NULL); - proute->partitions[partidx] = leaf_part_rri; + /* + * Since we've just initialized this ResultRelInfo, it's not in any list + * attached to the estate as yet. Add it, so that it can be found later. + * + * Note that the entries in this list appear in no predetermined order, + * because partition result rels are initialized as and when they're + * needed. + */ + MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_tuple_routing_result_relations = + lappend(estate->es_tuple_routing_result_relations, + leaf_part_rri); - MemoryContextSwitchTo(oldContext); + MemoryContextSwitchTo(oldcxt); return leaf_part_rri; } /* * ExecInitRoutingInfo - * Set up information needed for routing tuples to a leaf partition + * Set up information needed for translating tuples between root + * partitioned table format and partition format, and keep track of it + * in PartitionTupleRouting. */ -void +static void ExecInitRoutingInfo(ModifyTableState *mtstate, EState *estate, PartitionTupleRouting *proute, + PartitionDispatch dispatch, ResultRelInfo *partRelInfo, int partidx) { - MemoryContext oldContext; + MemoryContext oldcxt; + PartitionRoutingInfo *partrouteinfo; + int rri_index; - /* - * Switch into per-query memory context. - */ - oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + oldcxt = MemoryContextSwitchTo(proute->memcxt); + + partrouteinfo = palloc(sizeof(PartitionRoutingInfo)); /* * Set up a tuple conversion map to convert a tuple routed to the * partition from the parent's type to the partition's. */ - proute->parent_child_tupconv_maps[partidx] = + partrouteinfo->pi_RootToPartitionMap = convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_PartitionRoot), RelationGetDescr(partRelInfo->ri_RelationDesc), gettext_noop("could not convert row type")); @@ -721,29 +883,36 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, * for various operations that are applied to tuples after routing, such * as checking constraints. */ - if (proute->parent_child_tupconv_maps[partidx] != NULL) + if (partrouteinfo->pi_RootToPartitionMap != NULL) { Relation partrel = partRelInfo->ri_RelationDesc; - /* - * Initialize the array in proute where these slots are stored, if not - * already done. - */ - if (proute->partition_tuple_slots == NULL) - proute->partition_tuple_slots = (TupleTableSlot **) - palloc0(proute->num_partitions * - sizeof(TupleTableSlot *)); - /* * Initialize the slot itself setting its descriptor to this * partition's TupleDesc; TupleDesc reference will be released at the * end of the command. */ - proute->partition_tuple_slots[partidx] = - ExecInitExtraTupleSlot(estate, - RelationGetDescr(partrel), + partrouteinfo->pi_PartitionTupleSlot = + ExecInitExtraTupleSlot(estate, RelationGetDescr(partrel), &TTSOpsHeapTuple); } + else + partrouteinfo->pi_PartitionTupleSlot = NULL; + + /* + * Also, if transition capture is required, store a map to convert tuples + * from partition's rowtype to the root partition table's. 
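+	 *
+	 * For instance, CopyFrom() and ExecPrepareTupleRouting() install this
+	 * map directly as the transition capture's tcs_map, roughly:
+	 *
+	 *     mtstate->mt_transition_capture->tcs_map =
+	 *         partrouteinfo->pi_PartitionToRootMap;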
+ */ + if (mtstate && + (mtstate->mt_transition_capture || mtstate->mt_oc_transition_capture)) + { + partrouteinfo->pi_PartitionToRootMap = + convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_RelationDesc), + RelationGetDescr(partRelInfo->ri_PartitionRoot), + gettext_noop("could not convert row type")); + } + else + partrouteinfo->pi_PartitionToRootMap = NULL; /* * If the partition is a foreign table, let the FDW init itself for @@ -753,73 +922,138 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL) partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo); - MemoryContextSwitchTo(oldContext); - - partRelInfo->ri_PartitionReadyForRouting = true; -} - -/* - * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition - * child-to-root tuple conversion map array. - * - * This map is required for capturing transition tuples when the target table - * is a partitioned table. For a tuple that is routed by an INSERT or UPDATE, - * we need to convert it from the leaf partition to the target table - * descriptor. - */ -void -ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute) -{ - Assert(proute != NULL); + partRelInfo->ri_PartitionInfo = partrouteinfo; /* - * These array elements get filled up with maps on an on-demand basis. - * Initially just set all of them to NULL. + * Keep track of it in the PartitionTupleRouting->partitions array. */ - proute->child_parent_tupconv_maps = - (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * - proute->num_partitions); + Assert(dispatch->indexes[partidx] == -1); - /* Same is the case for this array. All the values are set to false */ - proute->child_parent_map_not_required = - (bool *) palloc0(sizeof(bool) * proute->num_partitions); + rri_index = proute->num_partitions++; + + /* Allocate or enlarge the array, as needed */ + if (proute->num_partitions >= proute->max_partitions) + { + if (proute->max_partitions == 0) + { + proute->max_partitions = 8; + proute->partitions = (ResultRelInfo **) + palloc(sizeof(ResultRelInfo *) * proute->max_partitions); + } + else + { + proute->max_partitions *= 2; + proute->partitions = (ResultRelInfo **) + repalloc(proute->partitions, sizeof(ResultRelInfo *) * + proute->max_partitions); + } + } + + proute->partitions[rri_index] = partRelInfo; + dispatch->indexes[partidx] = rri_index; + + MemoryContextSwitchTo(oldcxt); } /* - * TupConvMapForLeaf -- Get the tuple conversion map for a given leaf partition - * index. + * ExecInitPartitionDispatchInfo + * Initialize PartitionDispatch for a partitioned table and store it in + * the next available slot in the proute->partition_dispatch_info array. + * Also, record the index into this array in the parent_pd->indexes[] + * array in the partidx element so that we can properly retrieve the + * newly created PartitionDispatch later. */ -TupleConversionMap * -TupConvMapForLeaf(PartitionTupleRouting *proute, - ResultRelInfo *rootRelInfo, int leaf_index) +static PartitionDispatch +ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid, + PartitionDispatch parent_pd, int partidx) { - ResultRelInfo **resultRelInfos = proute->partitions; - TupleConversionMap **map; - TupleDesc tupdesc; + Relation rel; + PartitionDesc partdesc; + PartitionDispatch pd; + int dispatchidx; + MemoryContext oldcxt; - /* Don't call this if we're not supposed to be using this type of map. 
*/ - Assert(proute->child_parent_tupconv_maps != NULL); + oldcxt = MemoryContextSwitchTo(proute->memcxt); - /* If it's already known that we don't need a map, return NULL. */ - if (proute->child_parent_map_not_required[leaf_index]) - return NULL; + if (partoid != RelationGetRelid(proute->partition_root)) + rel = heap_open(partoid, NoLock); + else + rel = proute->partition_root; + partdesc = RelationGetPartitionDesc(rel); - /* If we've already got a map, return it. */ - map = &proute->child_parent_tupconv_maps[leaf_index]; - if (*map != NULL) - return *map; + pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) + + partdesc->nparts * sizeof(int)); + pd->reldesc = rel; + pd->key = RelationGetPartitionKey(rel); + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent_pd != NULL) + { + TupleDesc tupdesc = RelationGetDescr(rel); - /* No map yet; try to create one. */ - tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc); - *map = - convert_tuples_by_name(tupdesc, - RelationGetDescr(rootRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); + /* + * For sub-partitioned tables where the column order differs from its + * direct parent partitioned table, we must store a tuple table slot + * initialized with its tuple descriptor and a tuple conversion map to + * convert a tuple from its parent's rowtype to its own. This is to + * make sure that we are looking at the correct row using the correct + * tuple descriptor when computing its partition key for tuple + * routing. + */ + pd->tupmap = convert_tuples_by_name_map_if_req(RelationGetDescr(parent_pd->reldesc), + tupdesc, + gettext_noop("could not convert row type")); + pd->tupslot = pd->tupmap ? + MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple) : NULL; + } + else + { + /* Not required for the root partitioned table */ + pd->tupmap = NULL; + pd->tupslot = NULL; + } - /* If it turns out no map is needed, remember for next time. */ - proute->child_parent_map_not_required[leaf_index] = (*map == NULL); + /* + * Initialize with -1 to signify that the corresponding partition's + * ResultRelInfo or PartitionDispatch has not been created yet. + */ + memset(pd->indexes, -1, sizeof(int) * partdesc->nparts); - return *map; + /* Track in PartitionTupleRouting for later use */ + dispatchidx = proute->num_dispatch++; + + /* Allocate or enlarge the array, as needed */ + if (proute->num_dispatch >= proute->max_dispatch) + { + if (proute->max_dispatch == 0) + { + proute->max_dispatch = 4; + proute->partition_dispatch_info = (PartitionDispatch *) + palloc(sizeof(PartitionDispatch) * proute->max_dispatch); + } + else + { + proute->max_dispatch *= 2; + proute->partition_dispatch_info = (PartitionDispatch *) + repalloc(proute->partition_dispatch_info, + sizeof(PartitionDispatch) * proute->max_dispatch); + } + } + proute->partition_dispatch_info[dispatchidx] = pd; + + /* + * Finally, if setting up a PartitionDispatch for a sub-partitioned table, + * install a downlink in the parent to allow quick descent. 
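+	 *
+	 * That is, a later descent in ExecFindPartition() can simply do:
+	 *
+	 *     dispatch = pd[dispatch->indexes[partidx]];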
+ */ + if (parent_pd) + { + Assert(parent_pd->indexes[partidx] == -1); + parent_pd->indexes[partidx] = dispatchidx; + } + + MemoryContextSwitchTo(oldcxt); + + return pd; } /* @@ -832,8 +1066,8 @@ void ExecCleanupTupleRouting(ModifyTableState *mtstate, PartitionTupleRouting *proute) { + HTAB *htab = proute->subplan_resultrel_htab; int i; - int subplan_index = 0; /* * Remember, proute->partition_dispatch_info[0] corresponds to the root @@ -847,187 +1081,40 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate, PartitionDispatch pd = proute->partition_dispatch_info[i]; heap_close(pd->reldesc, NoLock); - ExecDropSingleTupleTableSlot(pd->tupslot); + + if (pd->tupslot) + ExecDropSingleTupleTableSlot(pd->tupslot); } for (i = 0; i < proute->num_partitions; i++) { ResultRelInfo *resultRelInfo = proute->partitions[i]; - /* skip further processing for uninitialized partitions */ - if (resultRelInfo == NULL) - continue; + /* + * Check if this result rel is one belonging to the node's subplans, + * if so, let ExecEndPlan() clean it up. + */ + if (htab) + { + Oid partoid; + bool found; + + partoid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + (void) hash_search(htab, &partoid, HASH_FIND, &found); + if (found) + continue; + } /* Allow any FDWs to shut down if they've been exercised */ - if (resultRelInfo->ri_PartitionReadyForRouting && - resultRelInfo->ri_FdwRoutine != NULL && + if (resultRelInfo->ri_FdwRoutine != NULL && resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL) resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state, resultRelInfo); - /* - * If this result rel is one of the UPDATE subplan result rels, let - * ExecEndPlan() close it. For INSERT or COPY, - * proute->subplan_partition_offsets will always be NULL. Note that - * the subplan_partition_offsets array and the partitions array have - * the partitions in the same order. So, while we iterate over - * partitions array, we also iterate over the - * subplan_partition_offsets array in order to figure out which of the - * result rels are present in the UPDATE subplans. - */ - if (proute->subplan_partition_offsets && - subplan_index < proute->num_subplan_partition_offsets && - proute->subplan_partition_offsets[subplan_index] == i) - { - subplan_index++; - continue; - } - ExecCloseIndices(resultRelInfo); heap_close(resultRelInfo->ri_RelationDesc, NoLock); } - - /* Release the standalone partition tuple descriptors, if any */ - if (proute->root_tuple_slot) - ExecDropSingleTupleTableSlot(proute->root_tuple_slot); -} - -/* - * RelationGetPartitionDispatchInfo - * Returns information necessary to route tuples down a partition tree - * - * The number of elements in the returned array (that is, the number of - * PartitionDispatch objects for the partitioned tables in the partition tree) - * is returned in *num_parted and a list of the OIDs of all the leaf - * partitions of rel is returned in *leaf_part_oids. - * - * All the relations in the partition tree (including 'rel') must have been - * locked (using at least the AccessShareLock) by the caller. 
- */ -static PartitionDispatch * -RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids) -{ - List *pdlist = NIL; - PartitionDispatchData **pd; - ListCell *lc; - int i; - - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - - *num_parted = 0; - *leaf_part_oids = NIL; - - get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); - *num_parted = list_length(pdlist); - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - i = 0; - foreach(lc, pdlist) - { - pd[i++] = lfirst(lc); - } - - return pd; -} - -/* - * get_partition_dispatch_recurse - * Recursively expand partition tree rooted at rel - * - * As the partition tree is expanded in a depth-first manner, we maintain two - * global lists: of PartitionDispatch objects corresponding to partitioned - * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. - * - * Note that the order of OIDs of leaf partitions in leaf_part_oids matches - * the order in which the planner's expand_partitioned_rtentry() processes - * them. It's not necessarily the case that the offsets match up exactly, - * because constraint exclusion might prune away some partitions on the - * planner side, whereas we'll always have the complete list; but unpruned - * partitions will appear in the same order in the plan as they are returned - * here. - */ -static void -get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids) -{ - TupleDesc tupdesc = RelationGetDescr(rel); - PartitionDesc partdesc = RelationGetPartitionDesc(rel); - PartitionKey partkey = RelationGetPartitionKey(rel); - PartitionDispatch pd; - int i; - - check_stack_depth(); - - /* Build a PartitionDispatch for this table and add it to *pds. */ - pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - *pds = lappend(*pds, pd); - pd->reldesc = rel; - pd->key = partkey; - pd->keystate = NIL; - pd->partdesc = partdesc; - if (parent != NULL) - { - /* - * For every partitioned table other than the root, we must store a - * tuple table slot initialized with its tuple descriptor and a tuple - * conversion map to convert a tuple from its parent's rowtype to its - * own. That is to make sure that we are looking at the correct row - * using the correct tuple descriptor when computing its partition key - * for tuple routing. - */ - pd->tupslot = MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple); - pd->tupmap = convert_tuples_by_name_map_if_req(RelationGetDescr(parent), - tupdesc, - gettext_noop("could not convert row type")); - } - else - { - /* Not required for the root partitioned table */ - pd->tupslot = NULL; - pd->tupmap = NULL; - } - - /* - * Go look at each partition of this table. If it's a leaf partition, - * simply add its OID to *leaf_part_oids. If it's a partitioned table, - * recursively call get_partition_dispatch_recurse(), so that its - * partitions are processed as well and a corresponding PartitionDispatch - * object gets added to *pds. - * - * The 'indexes' array is used when searching for a partition matching a - * given tuple. The actual value we store here depends on whether the - * array element belongs to a leaf partition or a subpartitioned table. - * For leaf partitions we store the index into *leaf_part_oids, and for - * sub-partitioned tables we store a negative version of the index into - * the *pds list. Both indexes are 0-based, but the first element of the - * *pds list is the root partition, so 0 always means the first leaf. 
When - * searching, if we see a negative value, the search must continue in the - * corresponding sub-partition; otherwise, we've identified the correct - * partition. - */ - pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); - for (i = 0; i < partdesc->nparts; i++) - { - Oid partrelid = partdesc->oids[i]; - - if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) - { - *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd->indexes[i] = list_length(*leaf_part_oids) - 1; - } - else - { - /* - * We assume all tables in the partition tree were already locked - * by the caller. - */ - Relation partrel = heap_open(partrelid, NoLock); - - pd->indexes[i] = -list_length(*pds); - get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); - } - } } /* ---------------- diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index bb344a7070..65d46c8ea8 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -68,7 +68,6 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, ResultRelInfo *targetRelInfo, TupleTableSlot *slot); static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node); -static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate); static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate); static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node, int whichplan); @@ -1157,7 +1156,8 @@ lreplace:; tupconv_map = tupconv_map_for_subplan(mtstate, map_index); if (tupconv_map != NULL) slot = execute_attr_map_slot(tupconv_map->attrMap, - slot, proute->root_tuple_slot); + slot, + mtstate->mt_root_tuple_slot); /* * Prepare for tuple routing, making it look like we're inserting @@ -1653,7 +1653,7 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) if (mtstate->mt_transition_capture != NULL || mtstate->mt_oc_transition_capture != NULL) { - ExecSetupChildParentMapForTcs(mtstate); + ExecSetupChildParentMapForSubplan(mtstate); /* * Install the conversion map for the first plan for UPDATE and DELETE @@ -1686,52 +1686,21 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, TupleTableSlot *slot) { ModifyTable *node; - int partidx; ResultRelInfo *partrel; + PartitionRoutingInfo *partrouteinfo; HeapTuple tuple; TupleConversionMap *map; /* - * Determine the target partition. If ExecFindPartition does not find a - * partition after all, it doesn't return here; otherwise, the returned - * value is to be used as an index into the arrays for the ResultRelInfo - * and TupleConversionMap for the partition. + * Lookup the target partition's ResultRelInfo. If ExecFindPartition does + * not find a valid partition for the tuple in 'slot' then an error is + * raised. An error may also be raised if the found partition is not a + * valid target for INSERTs. This is required since a partitioned table + * UPDATE to another partition becomes a DELETE+INSERT. */ - partidx = ExecFindPartition(targetRelInfo, - proute->partition_dispatch_info, - slot, - estate); - Assert(partidx >= 0 && partidx < proute->num_partitions); - - /* - * Get the ResultRelInfo corresponding to the selected partition; if not - * yet there, initialize it. 
- */ - partrel = proute->partitions[partidx]; - if (partrel == NULL) - partrel = ExecInitPartitionInfo(mtstate, targetRelInfo, - proute, estate, - partidx); - - /* - * Check whether the partition is routable if we didn't yet - * - * Note: an UPDATE of a partition key invokes an INSERT that moves the - * tuple to a new partition. This check would be applied to a subplan - * partition of such an UPDATE that is chosen as the partition to route - * the tuple to. The reason we do this check here rather than in - * ExecSetupPartitionTupleRouting is to avoid aborting such an UPDATE - * unnecessarily due to non-routable subplan partitions that may not be - * chosen for update tuple movement after all. - */ - if (!partrel->ri_PartitionReadyForRouting) - { - /* Verify the partition is a valid target for INSERT. */ - CheckValidResultRel(partrel, CMD_INSERT); - - /* Set up information needed for routing tuples to the partition. */ - ExecInitRoutingInfo(mtstate, estate, proute, partrel, partidx); - } + partrel = ExecFindPartition(mtstate, targetRelInfo, proute, slot, estate); + partrouteinfo = partrel->ri_PartitionInfo; + Assert(partrouteinfo != NULL); /* * Make it look like we are inserting into the partition. @@ -1743,7 +1712,7 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, /* * If we're capturing transition tuples, we might need to convert from the - * partition rowtype to parent rowtype. + * partition rowtype to root partitioned table's rowtype. */ if (mtstate->mt_transition_capture != NULL) { @@ -1756,7 +1725,7 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, */ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; mtstate->mt_transition_capture->tcs_map = - TupConvMapForLeaf(proute, targetRelInfo, partidx); + partrouteinfo->pi_PartitionToRootMap; } else { @@ -1771,20 +1740,17 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, if (mtstate->mt_oc_transition_capture != NULL) { mtstate->mt_oc_transition_capture->tcs_map = - TupConvMapForLeaf(proute, targetRelInfo, partidx); + partrouteinfo->pi_PartitionToRootMap; } /* * Convert the tuple, if necessary. */ - map = proute->parent_child_tupconv_maps[partidx]; + map = partrouteinfo->pi_RootToPartitionMap; if (map != NULL) { - TupleTableSlot *new_slot; + TupleTableSlot *new_slot = partrouteinfo->pi_PartitionTupleSlot; - Assert(proute->partition_tuple_slots != NULL && - proute->partition_tuple_slots[partidx] != NULL); - new_slot = proute->partition_tuple_slots[partidx]; slot = execute_attr_map_slot(map->attrMap, slot, new_slot); } @@ -1822,17 +1788,6 @@ ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate) int numResultRelInfos = mtstate->mt_nplans; int i; - /* - * First check if there is already a per-subplan array allocated. Even if - * there is already a per-leaf map array, we won't require a per-subplan - * one, since we will use the subplan offset array to convert the subplan - * index to per-leaf index. - */ - if (mtstate->mt_per_subplan_tupconv_maps || - (mtstate->mt_partition_tuple_routing && - mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps)) - return; - /* * Build array of conversion maps from each child's TupleDesc to the one * used in the target relation. The map pointers may be NULL when no @@ -1854,79 +1809,18 @@ ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate) } } -/* - * Initialize the child-to-root tuple conversion map array required for - * capturing transition tuples. - * - * The map array can be indexed either by subplan index or by leaf-partition - * index. 
For transition tables, we need a subplan-indexed access to the map, - * and where tuple-routing is present, we also require a leaf-indexed access. - */ -static void -ExecSetupChildParentMapForTcs(ModifyTableState *mtstate) -{ - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - - /* - * If partition tuple routing is set up, we will require partition-indexed - * access. In that case, create the map array indexed by partition; we - * will still be able to access the maps using a subplan index by - * converting the subplan index to a partition index using - * subplan_partition_offsets. If tuple routing is not set up, it means we - * don't require partition-indexed access. In that case, create just a - * subplan-indexed map. - */ - if (proute) - { - /* - * If a partition-indexed map array is to be created, the subplan map - * array has to be NULL. If the subplan map array is already created, - * we won't be able to access the map using a partition index. - */ - Assert(mtstate->mt_per_subplan_tupconv_maps == NULL); - - ExecSetupChildParentMapForLeaf(proute); - } - else - ExecSetupChildParentMapForSubplan(mtstate); -} - /* * For a given subplan index, get the tuple conversion map. */ static TupleConversionMap * tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) { - /* - * If a partition-index tuple conversion map array is allocated, we need - * to first get the index into the partition array. Exactly *one* of the - * two arrays is allocated. This is because if there is a partition array - * required, we don't require subplan-indexed array since we can translate - * subplan index into partition index. And, we create a subplan-indexed - * array *only* if partition-indexed array is not required. - */ + /* If nobody else set the per-subplan array of maps, do so ourselves. */ if (mtstate->mt_per_subplan_tupconv_maps == NULL) - { - int leaf_index; - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + ExecSetupChildParentMapForSubplan(mtstate); - /* - * If subplan-indexed array is NULL, things should have been arranged - * to convert the subplan index to partition index. - */ - Assert(proute && proute->subplan_partition_offsets != NULL && - whichplan < proute->num_subplan_partition_offsets); - - leaf_index = proute->subplan_partition_offsets[whichplan]; - - return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate), - leaf_index); - } - else - { - Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); - return mtstate->mt_per_subplan_tupconv_maps[whichplan]; - } + Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); + return mtstate->mt_per_subplan_tupconv_maps[whichplan]; } /* ---------------------------------------------------------------- @@ -2370,10 +2264,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * descriptor of a source partition does not match the root partitioned * table descriptor. In such a case we need to convert tuples to the root * tuple descriptor, because the search for destination partition starts - * from the root. Skip this setup if it's not a partition key update. + * from the root. We'll also need a slot to store these converted tuples. + * We can skip this setup if it's not a partition key update. */ if (update_tuple_routing_needed) + { ExecSetupChildParentMapForSubplan(mtstate); + mtstate->mt_root_tuple_slot = MakeTupleTableSlot(RelationGetDescr(rel), + &TTSOpsHeapTuple); + } /* * Initialize any WITH CHECK OPTION constraints if needed. 
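(A brief aside on the mt_root_tuple_slot allocated just above: it is the
transient slot a cross-partition UPDATE converts into before re-routing the
tuple.  A minimal sketch, condensed from the ExecUpdate() path touched
earlier in this patch; 'map_index' is the subplan index computed there:)

	/* Convert the subplan tuple into the root table's rowtype, if needed */
	tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
	if (tupconv_map != NULL)
		slot = execute_attr_map_slot(tupconv_map->attrMap,
									 slot,
									 mtstate->mt_root_tuple_slot);

	/* ... then route it as though it were an INSERT into the root table */
	slot = ExecPrepareTupleRouting(mtstate, estate, proute,
								   getTargetResultRelInfo(mtstate), slot);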
@@ -2716,10 +2615,18 @@ ExecEndModifyTable(ModifyTableState *node)
 											   resultRelInfo);
 	}
 
-	/* Close all the partitioned tables, leaf partitions, and their indices */
+	/*
+	 * Close all the partitioned tables, leaf partitions, and their indices
+	 * and release the slot used for tuple routing, if set.
+	 */
 	if (node->mt_partition_tuple_routing)
+	{
 		ExecCleanupTupleRouting(node, node->mt_partition_tuple_routing);
+		if (node->mt_root_tuple_slot)
+			ExecDropSingleTupleTableSlot(node->mt_root_tuple_slot);
+	}
+
 
 	/*
 	 * Free the exprcontext
 	 */
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index d5720518a8..2a1c1cb2e1 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -1657,9 +1657,6 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti)
 /*
  * expand_partitioned_rtentry
  *		Recursively expand an RTE for a partitioned table.
- *
- * Note that RelationGetPartitionDispatchInfo will expand partitions in the
- * same order as this code.
  */
 static void
 expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte,
diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c
index 07653f312b..7856b47cdd 100644
--- a/src/backend/utils/cache/partcache.c
+++ b/src/backend/utils/cache/partcache.c
@@ -340,15 +340,23 @@ RelationBuildPartitionDesc(Relation rel)
 	oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);
 	partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
 	partdesc->oids = (Oid *) palloc(partdesc->nparts * sizeof(Oid));
+	partdesc->is_leaf = (bool *) palloc(partdesc->nparts * sizeof(bool));
 
 	/*
 	 * Now assign OIDs from the original array into mapped indexes of the
-	 * result array. Order of OIDs in the former is defined by the catalog
-	 * scan that retrieved them, whereas that in the latter is defined by
-	 * canonicalized representation of the partition bounds.
+	 * result array. The order of OIDs in the former is defined by the
+	 * catalog scan that retrieved them, whereas that in the latter is defined
+	 * by canonicalized representation of the partition bounds.
 	 */
 	for (i = 0; i < partdesc->nparts; i++)
-		partdesc->oids[mapping[i]] = oids_orig[i];
+	{
+		int			index = mapping[i];
+
+		partdesc->oids[index] = oids_orig[i];
+		/* Record if the partition is a leaf partition */
+		partdesc->is_leaf[index] =
+			(get_rel_relkind(oids_orig[i]) != RELKIND_PARTITIONED_TABLE);
+	}
 
 	MemoryContextSwitchTo(oldcxt);
 	rel->rd_partdesc = partdesc;
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index a53de2372e..59c7a6ab69 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -25,7 +25,11 @@
 typedef struct PartitionDescData
 {
 	int			nparts;			/* Number of partitions */
-	Oid		   *oids;			/* OIDs of partitions */
+	Oid		   *oids;			/* Array of 'nparts' elements containing
+								 * partition OIDs in order of their bounds */
+	bool	   *is_leaf;		/* Array of 'nparts' elements storing whether
+								 * the corresponding 'oids' element belongs to
+								 * a leaf partition or not */
 	PartitionBoundInfo boundinfo;	/* collection of partition bounds */
 } PartitionDescData;
 
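[Editor's illustration of the new is_leaf[] array; count_leaf_partitions is a hypothetical helper, not part of the patch, and assumes the usual PostgreSQL internal headers. Because is_leaf[] is populated alongside oids[] in bound order, a caller can tell leaf partitions from partitioned children without a get_rel_relkind() syscache lookup per OID.]

    static int
    count_leaf_partitions(PartitionDesc partdesc)
    {
        int         i;
        int         nleaves = 0;

        for (i = 0; i < partdesc->nparts; i++)
        {
            /* is_leaf[i] describes the partition whose OID is oids[i] */
            if (partdesc->is_leaf[i])
                nleaves++;
        }
        return nleaves;
    }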
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index 3e08104ea4..d3cfb55f9f 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -18,74 +18,36 @@
 #include "nodes/plannodes.h"
 #include "partitioning/partprune.h"
 
-/* See execPartition.c for the definition. */
+/* See execPartition.c for the definitions. */
 typedef struct PartitionDispatchData *PartitionDispatch;
+typedef struct PartitionTupleRouting PartitionTupleRouting;
 
-/*-----------------------
- * PartitionTupleRouting - Encapsulates all information required to execute
- * tuple-routing between partitions.
+/*
+ * PartitionRoutingInfo
  *
- * partition_dispatch_info		Array of PartitionDispatch objects with one
- *								entry for every partitioned table in the
- *								partition tree.
- * num_dispatch					number of partitioned tables in the partition
- *								tree (= length of partition_dispatch_info[])
- * partition_oids				Array of leaf partitions OIDs with one entry
- *								for every leaf partition in the partition tree,
- *								initialized in full by
- *								ExecSetupPartitionTupleRouting.
- * partitions					Array of ResultRelInfo* objects with one entry
- *								for every leaf partition in the partition tree,
- *								initialized lazily by ExecInitPartitionInfo.
- * num_partitions				Number of leaf partitions in the partition tree
- *								(= 'partitions_oid'/'partitions' array length)
- * parent_child_tupconv_maps	Array of TupleConversionMap objects with one
- *								entry for every leaf partition (required to
- *								convert tuple from the root table's rowtype to
- *								a leaf partition's rowtype after tuple routing
- *								is done)
- * child_parent_tupconv_maps	Array of TupleConversionMap objects with one
- *								entry for every leaf partition (required to
- *								convert an updated tuple from the leaf
- *								partition's rowtype to the root table's rowtype
- *								so that tuple routing can be done)
- * child_parent_map_not_required  Array of bool. True value means that a map is
- *								determined to be not required for the given
- *								partition. False means either we haven't yet
- *								checked if a map is required, or it was
- *								determined to be required.
- * subplan_partition_offsets	Integer array ordered by UPDATE subplans. Each
- *								element of this array has the index into the
- *								corresponding partition in partitions array.
- * num_subplan_partition_offsets  Length of 'subplan_partition_offsets' array
- * partition_tuple_slots		Array of TupleTableSlot objects; if non-NULL,
- *								contains one entry for every leaf partition,
- *								of which only those of the leaf partitions
- *								whose attribute numbers differ from the root
- *								parent have a non-NULL value. NULL if all of
- *								the partitions encountered by a given command
- *								happen to have same rowtype as the root parent
- * root_tuple_slot				TupleTableSlot to be used to transiently hold
- *								copy of a tuple that's being moved across
- *								partitions in the root partitioned table's
- *								rowtype
- *-----------------------
+ * Additional result relation information specific to routing tuples to a
+ * table partition.
  */
-typedef struct PartitionTupleRouting
+typedef struct PartitionRoutingInfo
 {
-	PartitionDispatch *partition_dispatch_info;
-	int			num_dispatch;
-	Oid		   *partition_oids;
-	ResultRelInfo **partitions;
-	int			num_partitions;
-	TupleConversionMap **parent_child_tupconv_maps;
-	TupleConversionMap **child_parent_tupconv_maps;
-	bool	   *child_parent_map_not_required;
-	int		   *subplan_partition_offsets;
-	int			num_subplan_partition_offsets;
-	TupleTableSlot **partition_tuple_slots;
-	TupleTableSlot *root_tuple_slot;
-} PartitionTupleRouting;
+	/*
+	 * Map for converting tuples in root partitioned table format into
+	 * partition format, or NULL if no conversion is required.
+	 */
+	TupleConversionMap *pi_RootToPartitionMap;
+
+	/*
+	 * Map for converting tuples in partition format into the root partitioned
+	 * table format, or NULL if no conversion is required.
+	 */
+	TupleConversionMap *pi_PartitionToRootMap;
+
+	/*
+	 * Slot to store tuples in partition format, or NULL when no translation
+	 * is required between root and partition.
+	 */
+	TupleTableSlot *pi_PartitionTupleSlot;
+} PartitionRoutingInfo;
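[Editor's sketch of the consumer pattern this struct enables, condensed from the ExecPrepareTupleRouting and CopyFrom call sites elsewhere in this patch; variable names follow those call sites, and the surrounding executor state (mtstate, proute, slot, estate) is assumed.]

    ResultRelInfo *partrel;
    PartitionRoutingInfo *partrouteinfo;
    TupleConversionMap *map;

    /* Route the tuple; ExecFindPartition errors out if no partition fits. */
    partrel = ExecFindPartition(mtstate, rootResultRelInfo, proute,
                                slot, estate);
    partrouteinfo = partrel->ri_PartitionInfo;

    /* Convert to the partition's rowtype only when the rowtypes differ. */
    map = partrouteinfo->pi_RootToPartitionMap;
    if (map != NULL)
        slot = execute_attr_map_slot(map->attrMap, slot,
                                     partrouteinfo->pi_PartitionTupleSlot);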
 
 /*
  * PartitionedRelPruningData - Per-partitioned-table data for run-time pruning
@@ -175,22 +137,11 @@ typedef struct PartitionPruneState
 
 extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
 							   Relation rel);
-extern int ExecFindPartition(ResultRelInfo *resultRelInfo,
-				  PartitionDispatch *pd,
+extern ResultRelInfo *ExecFindPartition(ModifyTableState *mtstate,
+						 ResultRelInfo *rootResultRelInfo,
+						 PartitionTupleRouting *proute,
 				  TupleTableSlot *slot,
 				  EState *estate);
-extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
-					  ResultRelInfo *resultRelInfo,
-					  PartitionTupleRouting *proute,
-					  EState *estate, int partidx);
-extern void ExecInitRoutingInfo(ModifyTableState *mtstate,
-					EState *estate,
-					PartitionTupleRouting *proute,
-					ResultRelInfo *partRelInfo,
-					int partidx);
-extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute);
-extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute,
-					ResultRelInfo *rootRelInfo, int leaf_index);
 extern void ExecCleanupTupleRouting(ModifyTableState *mtstate,
 					PartitionTupleRouting *proute);
 extern PartitionPruneState *ExecCreatePartitionPruneState(PlanState *planstate,
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 63c871e6d0..569cc7c476 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -33,6 +33,7 @@
 
 struct PlanState;				/* forward references in this file */
+struct PartitionRoutingInfo;
 struct ParallelHashJoinState;
 struct ExecRowMark;
 struct ExprState;
@@ -469,8 +470,8 @@ typedef struct ResultRelInfo
 	/* relation descriptor for root partitioned table */
 	Relation	ri_PartitionRoot;
 
-	/* true if ready for tuple routing */
-	bool		ri_PartitionReadyForRouting;
+	/* Additional information specific to partition tuple routing */
+	struct PartitionRoutingInfo *ri_PartitionInfo;
 } ResultRelInfo;
 
 /* ----------------
@@ -1112,6 +1113,12 @@ typedef struct ModifyTableState
 	List	   *mt_excludedtlist;	/* the excluded pseudo relation's tlist */
 	TupleTableSlot *mt_conflproj;	/* CONFLICT ... SET ... projection target */
 
+	/*
+	 * Slot for storing tuples in the root partitioned table's rowtype during
+	 * an UPDATE of a partitioned table.
+	 */
+	TupleTableSlot *mt_root_tuple_slot;
+
 	/* Tuple-routing support info */
 	struct PartitionTupleRouting *mt_partition_tuple_routing;
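[Editor's note on the overall design choice, with an illustrative before/after fragment rather than literal patch lines: per-partition routing state moves off PartitionTupleRouting's parallel arrays, which callers had to index with a separately tracked partition index, and onto the ResultRelInfo returned by routing itself, so the index bookkeeping disappears along with the arrays.]

    /* before: parallel arrays in PartitionTupleRouting, indexed by partidx */
    map = proute->parent_child_tupconv_maps[partidx];

    /* after: reached directly from the routed-to ResultRelInfo */
    map = partrel->ri_PartitionInfo->pi_RootToPartitionMap;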