From 11cad29c91524aac1d0b61e0ea0357398ab79bf8 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 14 Oct 2010 16:56:39 -0400 Subject: [PATCH] Support MergeAppend plans, to allow sorted output from append relations. This patch eliminates the former need to sort the output of an Append scan when an ordered scan of an inheritance tree is wanted. This should be particularly useful for fast-start cases such as queries with LIMIT. Original patch by Greg Stark, with further hacking by Hans-Jurgen Schonig, Robert Haas, and Tom Lane. --- src/backend/commands/explain.c | 57 +++- src/backend/executor/Makefile | 2 +- src/backend/executor/execAmi.c | 5 + src/backend/executor/execCurrent.c | 23 ++ src/backend/executor/execProcnode.c | 14 + src/backend/executor/nodeMergeAppend.c | 392 ++++++++++++++++++++++++ src/backend/nodes/copyfuncs.c | 28 ++ src/backend/nodes/outfuncs.c | 42 +++ src/backend/optimizer/README | 5 +- src/backend/optimizer/path/allpaths.c | 169 ++++++++-- src/backend/optimizer/path/costsize.c | 78 ++++- src/backend/optimizer/path/equivclass.c | 14 +- src/backend/optimizer/plan/createplan.c | 205 +++++++++++-- src/backend/optimizer/plan/setrefs.c | 18 ++ src/backend/optimizer/plan/subselect.c | 16 + src/backend/optimizer/util/pathnode.c | 71 +++++ src/backend/utils/adt/ruleutils.c | 14 +- src/include/executor/nodeMergeAppend.h | 24 ++ src/include/nodes/execnodes.h | 27 ++ src/include/nodes/nodes.h | 3 + src/include/nodes/plannodes.h | 16 + src/include/nodes/relation.h | 25 +- src/include/optimizer/cost.h | 4 + src/include/optimizer/pathnode.h | 4 + src/test/regress/expected/inherit.out | 95 ++++++ src/test/regress/sql/inherit.sql | 33 ++ 26 files changed, 1316 insertions(+), 68 deletions(-) create mode 100644 src/backend/executor/nodeMergeAppend.c create mode 100644 src/include/executor/nodeMergeAppend.h diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index b66caa4c08..f494ec98e5 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -73,6 +73,11 @@ static void show_upper_qual(List *qual, const char *qlabel, ExplainState *es); static void show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es); +static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors, + ExplainState *es); +static void show_sort_keys_common(PlanState *planstate, + int nkeys, AttrNumber *keycols, + List *ancestors, ExplainState *es); static void show_sort_info(SortState *sortstate, ExplainState *es); static void show_hash_info(HashState *hashstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); @@ -647,6 +652,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_Append: pname = sname = "Append"; break; + case T_MergeAppend: + pname = sname = "Merge Append"; + break; case T_RecursiveUnion: pname = sname = "Recursive Union"; break; @@ -1074,6 +1082,10 @@ ExplainNode(PlanState *planstate, List *ancestors, show_sort_keys((SortState *) planstate, ancestors, es); show_sort_info((SortState *) planstate, es); break; + case T_MergeAppend: + show_merge_append_keys((MergeAppendState *) planstate, + ancestors, es); + break; case T_Result: show_upper_qual((List *) ((Result *) plan)->resconstantqual, "One-Time Filter", planstate, ancestors, es); @@ -1170,6 +1182,7 @@ ExplainNode(PlanState *planstate, List *ancestors, innerPlanState(planstate) || IsA(plan, ModifyTable) || IsA(plan, Append) || + IsA(plan, MergeAppend) || IsA(plan, BitmapAnd) || IsA(plan, BitmapOr) || IsA(plan, SubqueryScan) || @@ -1208,6 +1221,11 @@ ExplainNode(PlanState *planstate, List *ancestors, ((AppendState *) planstate)->appendplans, ancestors, es); break; + case T_MergeAppend: + ExplainMemberNodes(((MergeAppend *) plan)->mergeplans, + ((MergeAppendState *) planstate)->mergeplans, + ancestors, es); + break; case T_BitmapAnd: ExplainMemberNodes(((BitmapAnd *) plan)->bitmapplans, ((BitmapAndState *) planstate)->bitmapplans, @@ -1265,7 +1283,9 @@ show_plan_tlist(PlanState *planstate, List *ancestors, ExplainState *es) /* The tlist of an Append isn't real helpful, so suppress it */ if (IsA(plan, Append)) return; - /* Likewise for RecursiveUnion */ + /* Likewise for MergeAppend and RecursiveUnion */ + if (IsA(plan, MergeAppend)) + return; if (IsA(plan, RecursiveUnion)) return; @@ -1369,8 +1389,31 @@ static void show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es) { Sort *plan = (Sort *) sortstate->ss.ps.plan; - int nkeys = plan->numCols; - AttrNumber *keycols = plan->sortColIdx; + + show_sort_keys_common((PlanState *) sortstate, + plan->numCols, plan->sortColIdx, + ancestors, es); +} + +/* + * Likewise, for a MergeAppend node. + */ +static void +show_merge_append_keys(MergeAppendState *mstate, List *ancestors, + ExplainState *es) +{ + MergeAppend *plan = (MergeAppend *) mstate->ps.plan; + + show_sort_keys_common((PlanState *) mstate, + plan->numCols, plan->sortColIdx, + ancestors, es); +} + +static void +show_sort_keys_common(PlanState *planstate, int nkeys, AttrNumber *keycols, + List *ancestors, ExplainState *es) +{ + Plan *plan = planstate->plan; List *context; List *result = NIL; bool useprefix; @@ -1381,7 +1424,7 @@ show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es) return; /* Set up deparsing context */ - context = deparse_context_for_planstate((Node *) sortstate, + context = deparse_context_for_planstate((Node *) planstate, ancestors, es->rtable); useprefix = (list_length(es->rtable) > 1 || es->verbose); @@ -1390,7 +1433,7 @@ show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es) { /* find key expression in tlist */ AttrNumber keyresno = keycols[keyno]; - TargetEntry *target = get_tle_by_resno(plan->plan.targetlist, + TargetEntry *target = get_tle_by_resno(plan->targetlist, keyresno); if (!target) @@ -1603,8 +1646,8 @@ ExplainScanTarget(Scan *plan, ExplainState *es) } /* - * Explain the constituent plans of a ModifyTable, Append, BitmapAnd, - * or BitmapOr node. + * Explain the constituent plans of a ModifyTable, Append, MergeAppend, + * BitmapAnd, or BitmapOr node. * * The ancestors list should already contain the immediate parent of these * plans. diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index f260bcfd64..da4fbb440b 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -18,7 +18,7 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \ nodeBitmapAnd.o nodeBitmapOr.o \ nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeHash.o \ nodeHashjoin.o nodeIndexscan.o nodeLimit.o nodeLockRows.o \ - nodeMaterial.o nodeMergejoin.o nodeModifyTable.o \ + nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \ nodeNestloop.o nodeFunctionscan.o nodeRecursiveunion.o nodeResult.o \ nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \ nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \ diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 20a50192e0..d999a4481d 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -30,6 +30,7 @@ #include "executor/nodeLimit.h" #include "executor/nodeLockRows.h" #include "executor/nodeMaterial.h" +#include "executor/nodeMergeAppend.h" #include "executor/nodeMergejoin.h" #include "executor/nodeModifyTable.h" #include "executor/nodeNestloop.h" @@ -129,6 +130,10 @@ ExecReScan(PlanState *node) ExecReScanAppend((AppendState *) node); break; + case T_MergeAppendState: + ExecReScanMergeAppend((MergeAppendState *) node); + break; + case T_RecursiveUnionState: ExecReScanRecursiveUnion((RecursiveUnionState *) node); break; diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c index 79a1425a97..cf552a016a 100644 --- a/src/backend/executor/execCurrent.c +++ b/src/backend/executor/execCurrent.c @@ -295,6 +295,29 @@ search_plan_tree(PlanState *node, Oid table_oid) return result; } + /* + * Similarly for MergeAppend + */ + case T_MergeAppendState: + { + MergeAppendState *mstate = (MergeAppendState *) node; + ScanState *result = NULL; + int i; + + for (i = 0; i < mstate->ms_nplans; i++) + { + ScanState *elem = search_plan_tree(mstate->mergeplans[i], + table_oid); + + if (!elem) + continue; + if (result) + return NULL; /* multiple matches */ + result = elem; + } + return result; + } + /* * Result and Limit can be descended through (these are safe * because they always return their input's current row) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 64cb701e41..edd175e129 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -93,6 +93,7 @@ #include "executor/nodeLimit.h" #include "executor/nodeLockRows.h" #include "executor/nodeMaterial.h" +#include "executor/nodeMergeAppend.h" #include "executor/nodeMergejoin.h" #include "executor/nodeModifyTable.h" #include "executor/nodeNestloop.h" @@ -158,6 +159,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; + case T_MergeAppend: + result = (PlanState *) ExecInitMergeAppend((MergeAppend *) node, + estate, eflags); + break; + case T_RecursiveUnion: result = (PlanState *) ExecInitRecursiveUnion((RecursiveUnion *) node, estate, eflags); @@ -363,6 +369,10 @@ ExecProcNode(PlanState *node) result = ExecAppend((AppendState *) node); break; + case T_MergeAppendState: + result = ExecMergeAppend((MergeAppendState *) node); + break; + case T_RecursiveUnionState: result = ExecRecursiveUnion((RecursiveUnionState *) node); break; @@ -581,6 +591,10 @@ ExecEndNode(PlanState *node) ExecEndAppend((AppendState *) node); break; + case T_MergeAppendState: + ExecEndMergeAppend((MergeAppendState *) node); + break; + case T_RecursiveUnionState: ExecEndRecursiveUnion((RecursiveUnionState *) node); break; diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c new file mode 100644 index 0000000000..9ded25d8fc --- /dev/null +++ b/src/backend/executor/nodeMergeAppend.c @@ -0,0 +1,392 @@ +/*------------------------------------------------------------------------- + * + * nodeMergeAppend.c + * routines to handle MergeAppend nodes. + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeMergeAppend.c + * + *------------------------------------------------------------------------- + */ +/* INTERFACE ROUTINES + * ExecInitMergeAppend - initialize the MergeAppend node + * ExecMergeAppend - retrieve the next tuple from the node + * ExecEndMergeAppend - shut down the MergeAppend node + * ExecReScanMergeAppend - rescan the MergeAppend node + * + * NOTES + * A MergeAppend node contains a list of one or more subplans. + * These are each expected to deliver tuples that are sorted according + * to a common sort key. The MergeAppend node merges these streams + * to produce output sorted the same way. + * + * MergeAppend nodes don't make use of their left and right + * subtrees, rather they maintain a list of subplans so + * a typical MergeAppend node looks like this in the plan tree: + * + * ... + * / + * MergeAppend---+------+------+--- nil + * / \ | | | + * nil nil ... ... ... + * subplans + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "executor/execdebug.h" +#include "executor/nodeMergeAppend.h" +#include "utils/lsyscache.h" + +/* + * It gets quite confusing having a heap array (indexed by integers) which + * contains integers which index into the slots array. These typedefs try to + * clear it up, but they're only documentation. + */ +typedef int SlotNumber; +typedef int HeapPosition; + +static void heap_insert_slot(MergeAppendState *node, SlotNumber new_slot); +static void heap_siftup_slot(MergeAppendState *node); +static int32 heap_compare_slots(MergeAppendState *node, SlotNumber slot1, SlotNumber slot2); + + +/* ---------------------------------------------------------------- + * ExecInitMergeAppend + * + * Begin all of the subscans of the MergeAppend node. + * ---------------------------------------------------------------- + */ +MergeAppendState * +ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) +{ + MergeAppendState *mergestate = makeNode(MergeAppendState); + PlanState **mergeplanstates; + int nplans; + int i; + ListCell *lc; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * Set up empty vector of subplan states + */ + nplans = list_length(node->mergeplans); + + mergeplanstates = (PlanState **) palloc0(nplans * sizeof(PlanState *)); + + /* + * create new MergeAppendState for our node + */ + mergestate->ps.plan = (Plan *) node; + mergestate->ps.state = estate; + mergestate->mergeplans = mergeplanstates; + mergestate->ms_nplans = nplans; + + mergestate->ms_slots = (TupleTableSlot **) palloc0(sizeof(TupleTableSlot *) * nplans); + mergestate->ms_heap = (int *) palloc0(sizeof(int) * nplans); + + /* + * Miscellaneous initialization + * + * MergeAppend plans don't have expression contexts because they never + * call ExecQual or ExecProject. + */ + + /* + * MergeAppend nodes do have Result slots, which hold pointers to tuples, + * so we have to initialize them. + */ + ExecInitResultTupleSlot(estate, &mergestate->ps); + + /* + * call ExecInitNode on each of the plans to be executed and save the + * results into the array "mergeplans". + */ + i = 0; + foreach(lc, node->mergeplans) + { + Plan *initNode = (Plan *) lfirst(lc); + + mergeplanstates[i] = ExecInitNode(initNode, estate, eflags); + i++; + } + + /* + * initialize output tuple type + */ + ExecAssignResultTypeFromTL(&mergestate->ps); + mergestate->ps.ps_ProjInfo = NULL; + + /* + * initialize sort-key information + */ + mergestate->ms_nkeys = node->numCols; + mergestate->ms_scankeys = palloc0(sizeof(ScanKeyData) * node->numCols); + + for (i = 0; i < node->numCols; i++) + { + Oid sortFunction; + bool reverse; + + if (!get_compare_function_for_ordering_op(node->sortOperators[i], + &sortFunction, &reverse)) + elog(ERROR, "operator %u is not a valid ordering operator", + node->sortOperators[i]); + + /* + * We needn't fill in sk_strategy or sk_subtype since these scankeys + * will never be passed to an index. + */ + ScanKeyInit(&mergestate->ms_scankeys[i], + node->sortColIdx[i], + InvalidStrategy, + sortFunction, + (Datum) 0); + + /* However, we use btree's conventions for encoding directionality */ + if (reverse) + mergestate->ms_scankeys[i].sk_flags |= SK_BT_DESC; + if (node->nullsFirst[i]) + mergestate->ms_scankeys[i].sk_flags |= SK_BT_NULLS_FIRST; + } + + /* + * initialize to show we have not run the subplans yet + */ + mergestate->ms_heap_size = 0; + mergestate->ms_initialized = false; + mergestate->ms_last_slot = -1; + + return mergestate; +} + +/* ---------------------------------------------------------------- + * ExecMergeAppend + * + * Handles iteration over multiple subplans. + * ---------------------------------------------------------------- + */ +TupleTableSlot * +ExecMergeAppend(MergeAppendState *node) +{ + TupleTableSlot *result; + SlotNumber i; + + if (!node->ms_initialized) + { + /* + * First time through: pull the first tuple from each subplan, + * and set up the heap. + */ + for (i = 0; i < node->ms_nplans; i++) + { + node->ms_slots[i] = ExecProcNode(node->mergeplans[i]); + if (!TupIsNull(node->ms_slots[i])) + heap_insert_slot(node, i); + } + node->ms_initialized = true; + } + else + { + /* + * Otherwise, pull the next tuple from whichever subplan we returned + * from last time, and insert it into the heap. (We could simplify + * the logic a bit by doing this before returning from the prior call, + * but it's better to not pull tuples until necessary.) + */ + i = node->ms_last_slot; + node->ms_slots[i] = ExecProcNode(node->mergeplans[i]); + if (!TupIsNull(node->ms_slots[i])) + heap_insert_slot(node, i); + } + + if (node->ms_heap_size > 0) + { + /* Return the topmost heap node, and sift up the remaining nodes */ + i = node->ms_heap[0]; + result = node->ms_slots[i]; + node->ms_last_slot = i; + heap_siftup_slot(node); + } + else + { + /* All the subplans are exhausted, and so is the heap */ + result = ExecClearTuple(node->ps.ps_ResultTupleSlot); + } + + return result; +} + +/* + * Insert a new slot into the heap. The slot must contain a valid tuple. + */ +static void +heap_insert_slot(MergeAppendState *node, SlotNumber new_slot) +{ + SlotNumber *heap = node->ms_heap; + HeapPosition j; + + Assert(!TupIsNull(node->ms_slots[new_slot])); + + j = node->ms_heap_size++; /* j is where the "hole" is */ + while (j > 0) + { + int i = (j-1)/2; + + if (heap_compare_slots(node, new_slot, node->ms_heap[i]) >= 0) + break; + heap[j] = heap[i]; + j = i; + } + heap[j] = new_slot; +} + +/* + * Delete the heap top (the slot in heap[0]), and sift up. + */ +static void +heap_siftup_slot(MergeAppendState *node) +{ + SlotNumber *heap = node->ms_heap; + HeapPosition i, + n; + + if (--node->ms_heap_size <= 0) + return; + n = node->ms_heap_size; /* heap[n] needs to be reinserted */ + i = 0; /* i is where the "hole" is */ + for (;;) + { + int j = 2 * i + 1; + + if (j >= n) + break; + if (j+1 < n && heap_compare_slots(node, heap[j], heap[j+1]) > 0) + j++; + if (heap_compare_slots(node, heap[n], heap[j]) <= 0) + break; + heap[i] = heap[j]; + i = j; + } + heap[i] = heap[n]; +} + +/* + * Compare the tuples in the two given slots. + */ +static int32 +heap_compare_slots(MergeAppendState *node, SlotNumber slot1, SlotNumber slot2) +{ + TupleTableSlot *s1 = node->ms_slots[slot1]; + TupleTableSlot *s2 = node->ms_slots[slot2]; + int nkey; + + Assert(!TupIsNull(s1)); + Assert(!TupIsNull(s2)); + + for (nkey = 0; nkey < node->ms_nkeys; nkey++) + { + ScanKey scankey = node->ms_scankeys + nkey; + AttrNumber attno = scankey->sk_attno; + Datum datum1, + datum2; + bool isNull1, + isNull2; + int32 compare; + + datum1 = slot_getattr(s1, attno, &isNull1); + datum2 = slot_getattr(s2, attno, &isNull2); + + if (isNull1) + { + if (isNull2) + continue; /* NULL "=" NULL */ + else if (scankey->sk_flags & SK_BT_NULLS_FIRST) + return -1; /* NULL "<" NOT_NULL */ + else + return 1; /* NULL ">" NOT_NULL */ + } + else if (isNull2) + { + if (scankey->sk_flags & SK_BT_NULLS_FIRST) + return 1; /* NOT_NULL ">" NULL */ + else + return -1; /* NOT_NULL "<" NULL */ + } + else + { + compare = DatumGetInt32(FunctionCall2(&scankey->sk_func, + datum1, datum2)); + if (compare != 0) + { + if (scankey->sk_flags & SK_BT_DESC) + compare = -compare; + return compare; + } + } + } + return 0; +} + +/* ---------------------------------------------------------------- + * ExecEndMergeAppend + * + * Shuts down the subscans of the MergeAppend node. + * + * Returns nothing of interest. + * ---------------------------------------------------------------- + */ +void +ExecEndMergeAppend(MergeAppendState *node) +{ + PlanState **mergeplans; + int nplans; + int i; + + /* + * get information from the node + */ + mergeplans = node->mergeplans; + nplans = node->ms_nplans; + + /* + * shut down each of the subscans + */ + for (i = 0; i < nplans; i++) + ExecEndNode(mergeplans[i]); +} + +void +ExecReScanMergeAppend(MergeAppendState *node) +{ + int i; + + for (i = 0; i < node->ms_nplans; i++) + { + PlanState *subnode = node->mergeplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) + UpdateChangedParamSet(subnode, node->ps.chgParam); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (subnode->chgParam == NULL) + ExecReScan(subnode); + } + node->ms_heap_size = 0; + node->ms_initialized = false; + node->ms_last_slot = -1; +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 2118a333a3..6ce3984bff 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -202,6 +202,31 @@ _copyAppend(Append *from) return newnode; } +/* + * _copyMergeAppend + */ +static MergeAppend * +_copyMergeAppend(MergeAppend *from) +{ + MergeAppend *newnode = makeNode(MergeAppend); + + /* + * copy node superclass fields + */ + CopyPlanFields((Plan *) from, (Plan *) newnode); + + /* + * copy remainder of node + */ + COPY_NODE_FIELD(mergeplans); + COPY_SCALAR_FIELD(numCols); + COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); + + return newnode; +} + /* * _copyRecursiveUnion */ @@ -3625,6 +3650,9 @@ copyObject(void *from) case T_Append: retval = _copyAppend(from); break; + case T_MergeAppend: + retval = _copyMergeAppend(from); + break; case T_RecursiveUnion: retval = _copyRecursiveUnion(from); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 55665ca20e..ee2aeb0ad3 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -344,6 +344,32 @@ _outAppend(StringInfo str, Append *node) WRITE_NODE_FIELD(appendplans); } +static void +_outMergeAppend(StringInfo str, MergeAppend *node) +{ + int i; + + WRITE_NODE_TYPE("MERGEAPPEND"); + + _outPlanInfo(str, (Plan *) node); + + WRITE_NODE_FIELD(mergeplans); + + WRITE_INT_FIELD(numCols); + + appendStringInfo(str, " :sortColIdx"); + for (i = 0; i < node->numCols; i++) + appendStringInfo(str, " %d", node->sortColIdx[i]); + + appendStringInfo(str, " :sortOperators"); + for (i = 0; i < node->numCols; i++) + appendStringInfo(str, " %u", node->sortOperators[i]); + + appendStringInfo(str, " :nullsFirst"); + for (i = 0; i < node->numCols; i++) + appendStringInfo(str, " %s", booltostr(node->nullsFirst[i])); +} + static void _outRecursiveUnion(StringInfo str, RecursiveUnion *node) { @@ -1459,6 +1485,16 @@ _outAppendPath(StringInfo str, AppendPath *node) WRITE_NODE_FIELD(subpaths); } +static void +_outMergeAppendPath(StringInfo str, MergeAppendPath *node) +{ + WRITE_NODE_TYPE("MERGEAPPENDPATH"); + + _outPathInfo(str, (Path *) node); + + WRITE_NODE_FIELD(subpaths); +} + static void _outResultPath(StringInfo str, ResultPath *node) { @@ -2498,6 +2534,9 @@ _outNode(StringInfo str, void *obj) case T_Append: _outAppend(str, obj); break; + case T_MergeAppend: + _outMergeAppend(str, obj); + break; case T_RecursiveUnion: _outRecursiveUnion(str, obj); break; @@ -2745,6 +2784,9 @@ _outNode(StringInfo str, void *obj) case T_AppendPath: _outAppendPath(str, obj); break; + case T_MergeAppendPath: + _outMergeAppendPath(str, obj); + break; case T_ResultPath: _outResultPath(str, obj); break; diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README index fc793ac8ae..d6402cf911 100644 --- a/src/backend/optimizer/README +++ b/src/backend/optimizer/README @@ -343,11 +343,12 @@ RelOptInfo - a relation or joined relations join clauses) Path - every way to generate a RelOptInfo(sequential,index,joins) - SeqScan - a plain Path node with pathtype = T_SeqScan - IndexPath - index scans + SeqScan - represents a sequential scan plan + IndexPath - index scan BitmapHeapPath - top of a bitmapped index scan TidPath - scan by CTID AppendPath - append multiple subpaths together + MergeAppendPath - merge multiple subpaths, preserving their common sort order ResultPath - a Result plan node (used for FROM-less SELECT) MaterialPath - a Material plan node UniquePath - remove duplicate rows diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index b7cf0b815b..aa9a90cbfa 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -51,6 +51,7 @@ static void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); static void set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); +static List *accumulate_append_subpath(List *subpaths, Path *path); static void set_dummy_rel_pathlist(RelOptInfo *rel); static void set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); @@ -283,7 +284,9 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte) { int parentRTindex = rti; + List *live_childrels = NIL; List *subpaths = NIL; + List *all_child_pathkeys = NIL; double parent_rows; double parent_size; double *parent_attrsizes; @@ -321,7 +324,7 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RelOptInfo *childrel; List *childquals; Node *childqual; - Path *childpath; + ListCell *lcp; ListCell *parentvars; ListCell *childvars; @@ -395,13 +398,15 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, /* * We have to make child entries in the EquivalenceClass data - * structures as well. + * structures as well. This is needed either if the parent + * participates in some eclass joins (because we will want to + * consider inner-indexscan joins on the individual children) + * or if the parent has useful pathkeys (because we should try + * to build MergeAppend paths that produce those sort orderings). */ - if (rel->has_eclass_joins) - { + if (rel->has_eclass_joins || has_useful_pathkeys(root, rel)) add_child_rel_equivalences(root, appinfo, rel, childrel); - childrel->has_eclass_joins = true; - } + childrel->has_eclass_joins = rel->has_eclass_joins; /* * Note: we could compute appropriate attr_needed data for the child's @@ -411,23 +416,52 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, * otherrels. So we just leave the child's attr_needed empty. */ + /* Remember which childrels are live, for MergeAppend logic below */ + live_childrels = lappend(live_childrels, childrel); + /* * Compute the child's access paths, and add the cheapest one to the * Append path we are constructing for the parent. - * - * It's possible that the child is itself an appendrel, in which case - * we can "cut out the middleman" and just add its child paths to our - * own list. (We don't try to do this earlier because we need to - * apply both levels of transformation to the quals.) */ set_rel_pathlist(root, childrel, childRTindex, childRTE); - childpath = childrel->cheapest_total_path; - if (IsA(childpath, AppendPath)) - subpaths = list_concat(subpaths, - list_copy(((AppendPath *) childpath)->subpaths)); - else - subpaths = lappend(subpaths, childpath); + subpaths = accumulate_append_subpath(subpaths, + childrel->cheapest_total_path); + + /* + * Collect a list of all the available path orderings for all the + * children. We use this as a heuristic to indicate which sort + * orderings we should build MergeAppend paths for. + */ + foreach(lcp, childrel->pathlist) + { + Path *childpath = (Path *) lfirst(lcp); + List *childkeys = childpath->pathkeys; + ListCell *lpk; + bool found = false; + + /* Ignore unsorted paths */ + if (childkeys == NIL) + continue; + + /* Have we already seen this ordering? */ + foreach(lpk, all_child_pathkeys) + { + List *existing_pathkeys = (List *) lfirst(lpk); + + if (compare_pathkeys(existing_pathkeys, + childkeys) == PATHKEYS_EQUAL) + { + found = true; + break; + } + } + if (!found) + { + /* No, so add it to all_child_pathkeys */ + all_child_pathkeys = lappend(all_child_pathkeys, childkeys); + } + } /* * Accumulate size information from each child. @@ -483,16 +517,106 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, pfree(parent_attrsizes); /* - * Finally, build Append path and install it as the only access path for - * the parent rel. (Note: this is correct even if we have zero or one - * live subpath due to constraint exclusion.) + * Next, build an unordered Append path for the rel. (Note: this is + * correct even if we have zero or one live subpath due to constraint + * exclusion.) */ add_path(rel, (Path *) create_append_path(rel, subpaths)); - /* Select cheapest path (pretty easy in this case...) */ + /* + * Next, build MergeAppend paths based on the collected list of child + * pathkeys. We consider both cheapest-startup and cheapest-total + * cases, ie, for each interesting ordering, collect all the cheapest + * startup subpaths and all the cheapest total paths, and build a + * MergeAppend path for each list. + */ + foreach(l, all_child_pathkeys) + { + List *pathkeys = (List *) lfirst(l); + List *startup_subpaths = NIL; + List *total_subpaths = NIL; + bool startup_neq_total = false; + ListCell *lcr; + + /* Select the child paths for this ordering... */ + foreach(lcr, live_childrels) + { + RelOptInfo *childrel = (RelOptInfo *) lfirst(lcr); + Path *cheapest_startup, + *cheapest_total; + + /* Locate the right paths, if they are available. */ + cheapest_startup = + get_cheapest_path_for_pathkeys(childrel->pathlist, + pathkeys, + STARTUP_COST); + cheapest_total = + get_cheapest_path_for_pathkeys(childrel->pathlist, + pathkeys, + TOTAL_COST); + + /* + * If we can't find any paths with the right order just add the + * cheapest-total path; we'll have to sort it. + */ + if (cheapest_startup == NULL) + cheapest_startup = childrel->cheapest_total_path; + if (cheapest_total == NULL) + cheapest_total = childrel->cheapest_total_path; + + /* + * Notice whether we actually have different paths for the + * "cheapest" and "total" cases; frequently there will be no + * point in two create_merge_append_path() calls. + */ + if (cheapest_startup != cheapest_total) + startup_neq_total = true; + + startup_subpaths = + accumulate_append_subpath(startup_subpaths, cheapest_startup); + total_subpaths = + accumulate_append_subpath(total_subpaths, cheapest_total); + } + + /* ... and build the MergeAppend paths */ + add_path(rel, (Path *) create_merge_append_path(root, + rel, + startup_subpaths, + pathkeys)); + if (startup_neq_total) + add_path(rel, (Path *) create_merge_append_path(root, + rel, + total_subpaths, + pathkeys)); + } + + /* Select cheapest path */ set_cheapest(rel); } +/* + * accumulate_append_subpath + * Add a subpath to the list being built for an Append or MergeAppend + * + * It's possible that the child is itself an Append path, in which case + * we can "cut out the middleman" and just add its child paths to our + * own list. (We don't try to do this earlier because we need to + * apply both levels of transformation to the quals.) + */ +static List * +accumulate_append_subpath(List *subpaths, Path *path) +{ + if (IsA(path, AppendPath)) + { + AppendPath *apath = (AppendPath *) path; + + /* list_copy is important here to avoid sharing list substructure */ + return list_concat(subpaths, list_copy(apath->subpaths)); + } + else + return lappend(subpaths, path); +} + /* * set_dummy_rel_pathlist * Build a dummy path for a relation that's been excluded by constraints @@ -1385,6 +1509,9 @@ print_path(PlannerInfo *root, Path *path, int indent) case T_AppendPath: ptype = "Append"; break; + case T_MergeAppendPath: + ptype = "MergeAppend"; + break; case T_ResultPath: ptype = "Result"; break; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index b27dc53fef..067cbca125 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1209,6 +1209,70 @@ cost_sort(Path *path, PlannerInfo *root, path->total_cost = startup_cost + run_cost; } +/* + * cost_merge_append + * Determines and returns the cost of a MergeAppend node. + * + * MergeAppend merges several pre-sorted input streams, using a heap that + * at any given instant holds the next tuple from each stream. If there + * are N streams, we need about N*log2(N) tuple comparisons to construct + * the heap at startup, and then for each output tuple, about log2(N) + * comparisons to delete the top heap entry and another log2(N) comparisons + * to insert its successor from the same stream. + * + * (The effective value of N will drop once some of the input streams are + * exhausted, but it seems unlikely to be worth trying to account for that.) + * + * The heap is never spilled to disk, since we assume N is not very large. + * So this is much simpler than cost_sort. + * + * As in cost_sort, we charge two operator evals per tuple comparison. + * + * 'pathkeys' is a list of sort keys + * 'n_streams' is the number of input streams + * 'input_startup_cost' is the sum of the input streams' startup costs + * 'input_total_cost' is the sum of the input streams' total costs + * 'tuples' is the number of tuples in all the streams + */ +void +cost_merge_append(Path *path, PlannerInfo *root, + List *pathkeys, int n_streams, + Cost input_startup_cost, Cost input_total_cost, + double tuples) +{ + Cost startup_cost = 0; + Cost run_cost = 0; + Cost comparison_cost; + double N; + double logN; + + /* + * Avoid log(0)... + */ + N = (n_streams < 2) ? 2.0 : (double) n_streams; + logN = LOG2(N); + + /* Assumed cost per tuple comparison */ + comparison_cost = 2.0 * cpu_operator_cost; + + /* Heap creation cost */ + startup_cost += comparison_cost * N * logN; + + /* Per-tuple heap maintenance cost */ + run_cost += tuples * comparison_cost * 2.0 * logN; + + /* + * Also charge a small amount (arbitrarily set equal to operator cost) per + * extracted tuple. We don't charge cpu_tuple_cost because a MergeAppend + * node doesn't do qual-checking or projection, so it has less overhead + * than most plan nodes. + */ + run_cost += cpu_operator_cost * tuples; + + path->startup_cost = startup_cost + input_startup_cost; + path->total_cost = startup_cost + run_cost + input_total_cost; +} + /* * cost_material * Determines and returns the cost of materializing a relation, including @@ -1405,7 +1469,9 @@ cost_group(Path *path, PlannerInfo *root, * output row count, which may be lower than the restriction-clause-only row * count of its parent. (We don't include this case in the PATH_ROWS macro * because it applies *only* to a nestloop's inner relation.) We have to - * be prepared to recurse through Append nodes in case of an appendrel. + * be prepared to recurse through Append or MergeAppend nodes in case of an + * appendrel. (It's not clear MergeAppend can be seen here, but we may as + * well handle it if so.) */ static double nestloop_inner_path_rows(Path *path) @@ -1426,6 +1492,16 @@ nestloop_inner_path_rows(Path *path) result += nestloop_inner_path_rows((Path *) lfirst(l)); } } + else if (IsA(path, MergeAppendPath)) + { + ListCell *l; + + result = 0; + foreach(l, ((MergeAppendPath *) path)->subpaths) + { + result += nestloop_inner_path_rows((Path *) lfirst(l)); + } + } else result = PATH_ROWS(path); diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index a20ed5f36c..e44e960b54 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -1611,8 +1611,8 @@ exprs_known_equal(PlannerInfo *root, Node *item1, Node *item2) * Search for EC members that reference (only) the parent_rel, and * add transformed members referencing the child_rel. * - * We only need to do this for ECs that could generate join conditions, - * since the child members are only used for creating inner-indexscan paths. + * Note that this function won't be called at all unless we have at least some + * reason to believe that the EC members it generates will be useful. * * parent_rel and child_rel could be derived from appinfo, but since the * caller has already computed them, we might as well just pass them in. @@ -1631,10 +1631,14 @@ add_child_rel_equivalences(PlannerInfo *root, ListCell *lc2; /* - * Won't generate joinclauses if const or single-member (the latter - * test covers the volatile case too) + * If this EC contains a constant, then it's not useful for sorting + * or driving an inner index-scan, so we skip generating child EMs. + * + * If this EC contains a volatile expression, then generating child + * EMs would be downright dangerous. We rely on a volatile EC having + * only one EM. */ - if (cur_ec->ec_has_const || list_length(cur_ec->ec_members) <= 1) + if (cur_ec->ec_has_const || cur_ec->ec_has_volatile) continue; /* No point in searching if parent rel not mentioned in eclass */ diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 2c398d2eed..52dd27b158 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -25,6 +25,7 @@ #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" +#include "optimizer/paths.h" #include "optimizer/plancat.h" #include "optimizer/planmain.h" #include "optimizer/predtest.h" @@ -45,6 +46,7 @@ static void disuse_physical_tlist(Plan *plan, Path *path); static Plan *create_gating_plan(PlannerInfo *root, Plan *plan, List *quals); static Plan *create_join_plan(PlannerInfo *root, JoinPath *best_path); static Plan *create_append_plan(PlannerInfo *root, AppendPath *best_path); +static Plan *create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path); static Result *create_result_plan(PlannerInfo *root, ResultPath *best_path); static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path); static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path); @@ -134,6 +136,13 @@ static MergeJoin *make_mergejoin(List *tlist, static Sort *make_sort(PlannerInfo *root, Plan *lefttree, int numCols, AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst, double limit_tuples); +static Plan *prepare_sort_from_pathkeys(PlannerInfo *root, + Plan *lefttree, List *pathkeys, + bool adjust_tlist_in_place, + int *p_numsortkeys, + AttrNumber **p_sortColIdx, + Oid **p_sortOperators, + bool **p_nullsFirst); static Material *make_material(Plan *lefttree); @@ -203,6 +212,10 @@ create_plan_recurse(PlannerInfo *root, Path *best_path) plan = create_append_plan(root, (AppendPath *) best_path); break; + case T_MergeAppend: + plan = create_merge_append_plan(root, + (MergeAppendPath *) best_path); + break; case T_Result: plan = (Plan *) create_result_plan(root, (ResultPath *) best_path); @@ -619,6 +632,97 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path) return (Plan *) plan; } +/* + * create_merge_append_plan + * Create a MergeAppend plan for 'best_path' and (recursively) plans + * for its subpaths. + * + * Returns a Plan node. + */ +static Plan * +create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path) +{ + MergeAppend *node = makeNode(MergeAppend); + Plan *plan = &node->plan; + List *tlist = build_relation_tlist(best_path->path.parent); + List *pathkeys = best_path->path.pathkeys; + List *subplans = NIL; + ListCell *subpaths; + + /* + * We don't have the actual creation of the MergeAppend node split out + * into a separate make_xxx function. This is because we want to run + * prepare_sort_from_pathkeys on it before we do so on the individual + * child plans, to make cross-checking the sort info easier. + */ + copy_path_costsize(plan, (Path *) best_path); + plan->targetlist = tlist; + plan->qual = NIL; + plan->lefttree = NULL; + plan->righttree = NULL; + + /* Compute sort column info, and adjust MergeAppend's tlist as needed */ + (void) prepare_sort_from_pathkeys(root, plan, pathkeys, + true, + &node->numCols, + &node->sortColIdx, + &node->sortOperators, + &node->nullsFirst); + + /* + * Now prepare the child plans. We must apply prepare_sort_from_pathkeys + * even to subplans that don't need an explicit sort, to make sure they + * are returning the same sort key columns the MergeAppend expects. + */ + foreach(subpaths, best_path->subpaths) + { + Path *subpath = (Path *) lfirst(subpaths); + Plan *subplan; + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + bool *nullsFirst; + + /* Build the child plan */ + subplan = create_plan_recurse(root, subpath); + + /* Compute sort column info, and adjust subplan's tlist as needed */ + subplan = prepare_sort_from_pathkeys(root, subplan, pathkeys, + false, + &numsortkeys, + &sortColIdx, + &sortOperators, + &nullsFirst); + + /* + * Check that we got the same sort key information. We just Assert + * that the sortops match, since those depend only on the pathkeys; + * but it seems like a good idea to check the sort column numbers + * explicitly, to ensure the tlists really do match up. + */ + Assert(numsortkeys == node->numCols); + if (memcmp(sortColIdx, node->sortColIdx, + numsortkeys * sizeof(AttrNumber)) != 0) + elog(ERROR, "MergeAppend child's targetlist doesn't match MergeAppend"); + Assert(memcmp(sortOperators, node->sortOperators, + numsortkeys * sizeof(Oid)) == 0); + Assert(memcmp(nullsFirst, node->nullsFirst, + numsortkeys * sizeof(bool)) == 0); + + /* Now, insert a Sort node if subplan isn't sufficiently ordered */ + if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + subplan = (Plan *) make_sort(root, subplan, numsortkeys, + sortColIdx, sortOperators, nullsFirst, + -1.0); + + subplans = lappend(subplans, subplan); + } + + node->mergeplans = subplans; + + return (Plan *) node; +} + /* * create_result_plan * Create a Result plan for 'best_path'. @@ -2502,7 +2606,7 @@ order_qual_clauses(PlannerInfo *root, List *clauses) /* * Copy cost and size info from a Path node to the Plan node created from it. - * The executor won't use this info, but it's needed by EXPLAIN. + * The executor usually won't use this info, but it's needed by EXPLAIN. */ static void copy_path_costsize(Plan *dest, Path *src) @@ -2525,9 +2629,7 @@ copy_path_costsize(Plan *dest, Path *src) /* * Copy cost and size info from a lower plan node to an inserted node. - * This is not critical, since the decisions have already been made, - * but it helps produce more reasonable-looking EXPLAIN output. - * (Some callers alter the info after copying it.) + * (Most callers alter the info after copying it.) */ static void copy_plan_costsize(Plan *dest, Plan *src) @@ -2796,6 +2898,12 @@ make_append(List *appendplans, List *tlist) * Compute cost as sum of subplan costs. We charge nothing extra for the * Append itself, which perhaps is too optimistic, but since it doesn't do * any selection or projection, it is a pretty cheap node. + * + * If you change this, see also create_append_path(). Also, the size + * calculations should match set_append_rel_pathlist(). It'd be better + * not to duplicate all this logic, but some callers of this function + * aren't working from an appendrel or AppendPath, so there's noplace + * to copy the data from. */ plan->startup_cost = 0; plan->total_cost = 0; @@ -3102,26 +3210,41 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first, } /* - * make_sort_from_pathkeys - * Create sort plan to sort according to given pathkeys + * prepare_sort_from_pathkeys + * Prepare to sort according to given pathkeys * + * This is used to set up for both Sort and MergeAppend nodes. It calculates + * the executor's representation of the sort key information, and adjusts the + * plan targetlist if needed to add resjunk sort columns. + * + * Input parameters: * 'lefttree' is the node which yields input tuples * 'pathkeys' is the list of pathkeys by which the result is to be sorted - * 'limit_tuples' is the bound on the number of output tuples; - * -1 if no bound + * 'adjust_tlist_in_place' is TRUE if lefttree must be modified in-place * * We must convert the pathkey information into arrays of sort key column - * numbers and sort operator OIDs. + * numbers and sort operator OIDs, which is the representation the executor + * wants. These are returned into the output parameters *p_numsortkeys etc. * * If the pathkeys include expressions that aren't simple Vars, we will * usually need to add resjunk items to the input plan's targetlist to - * compute these expressions (since the Sort node itself won't do it). - * If the input plan type isn't one that can do projections, this means - * adding a Result node just to do the projection. + * compute these expressions, since the Sort/MergeAppend node itself won't + * do any such calculations. If the input plan type isn't one that can do + * projections, this means adding a Result node just to do the projection. + * However, the caller can pass adjust_tlist_in_place = TRUE to force the + * lefttree tlist to be modified in-place regardless of whether the node type + * can project --- we use this for fixing the tlist of MergeAppend itself. + * + * Returns the node which is to be the input to the Sort (either lefttree, + * or a Result stacked atop lefttree). */ -Sort * -make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, - double limit_tuples) +static Plan * +prepare_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, + bool adjust_tlist_in_place, + int *p_numsortkeys, + AttrNumber **p_sortColIdx, + Oid **p_sortOperators, + bool **p_nullsFirst) { List *tlist = lefttree->targetlist; ListCell *i; @@ -3185,7 +3308,12 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, { EquivalenceMember *em = (EquivalenceMember *) lfirst(j); - if (em->em_is_const || em->em_is_child) + /* + * We shouldn't be trying to sort by an equivalence class that + * contains a constant, so no need to consider such cases any + * further. + */ + if (em->em_is_const) continue; tle = tlist_member((Node *) em->em_expr, tlist); @@ -3221,7 +3349,7 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, List *exprvars; ListCell *k; - if (em->em_is_const || em->em_is_child) + if (em->em_is_const) continue; sortexpr = em->em_expr; exprvars = pull_var_clause((Node *) sortexpr, @@ -3244,7 +3372,8 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, /* * Do we need to insert a Result node? */ - if (!is_projection_capable_plan(lefttree)) + if (!adjust_tlist_in_place && + !is_projection_capable_plan(lefttree)) { /* copy needed so we don't modify input's tlist below */ tlist = copyObject(tlist); @@ -3252,6 +3381,9 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, lefttree); } + /* Don't bother testing is_projection_capable_plan again */ + adjust_tlist_in_place = true; + /* * Add resjunk entry to input's tlist */ @@ -3292,6 +3424,42 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, Assert(numsortkeys > 0); + /* Return results */ + *p_numsortkeys = numsortkeys; + *p_sortColIdx = sortColIdx; + *p_sortOperators = sortOperators; + *p_nullsFirst = nullsFirst; + + return lefttree; +} + +/* + * make_sort_from_pathkeys + * Create sort plan to sort according to given pathkeys + * + * 'lefttree' is the node which yields input tuples + * 'pathkeys' is the list of pathkeys by which the result is to be sorted + * 'limit_tuples' is the bound on the number of output tuples; + * -1 if no bound + */ +Sort * +make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, + double limit_tuples) +{ + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + bool *nullsFirst; + + /* Compute sort column info, and adjust lefttree as needed */ + lefttree = prepare_sort_from_pathkeys(root, lefttree, pathkeys, + false, + &numsortkeys, + &sortColIdx, + &sortOperators, + &nullsFirst); + + /* Now build the Sort node */ return make_sort(root, lefttree, numsortkeys, sortColIdx, sortOperators, nullsFirst, limit_tuples); } @@ -3998,6 +4166,7 @@ is_projection_capable_plan(Plan *plan) case T_Limit: case T_ModifyTable: case T_Append: + case T_MergeAppend: case T_RecursiveUnion: return false; default: diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index d5e9212f6a..9aef7fc35a 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -554,6 +554,24 @@ set_plan_refs(PlannerGlobal *glob, Plan *plan, int rtoffset) } } break; + case T_MergeAppend: + { + MergeAppend *splan = (MergeAppend *) plan; + + /* + * MergeAppend, like Sort et al, doesn't actually evaluate its + * targetlist or check quals. + */ + set_dummy_tlist_references(plan, rtoffset); + Assert(splan->plan.qual == NIL); + foreach(l, splan->mergeplans) + { + lfirst(l) = set_plan_refs(glob, + (Plan *) lfirst(l), + rtoffset); + } + } + break; case T_RecursiveUnion: /* This doesn't evaluate targetlist or check quals either */ set_dummy_tlist_references(plan, rtoffset); diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index ab62e2a234..137e2ce049 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2065,6 +2065,22 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, } break; + case T_MergeAppend: + { + ListCell *l; + + foreach(l, ((MergeAppend *) plan)->mergeplans) + { + context.paramids = + bms_add_members(context.paramids, + finalize_plan(root, + (Plan *) lfirst(l), + valid_params, + scan_params)); + } + } + break; + case T_BitmapAnd: { ListCell *l; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 71e0e75a56..8214acc571 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -636,6 +636,8 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals) * create_append_path * Creates a path corresponding to an Append plan, returning the * pathnode. + * + * Note that we must handle subpaths = NIL, representing a dummy access path. */ AppendPath * create_append_path(RelOptInfo *rel, List *subpaths) @@ -649,6 +651,13 @@ create_append_path(RelOptInfo *rel, List *subpaths) * unsorted */ pathnode->subpaths = subpaths; + /* + * Compute cost as sum of subplan costs. We charge nothing extra for the + * Append itself, which perhaps is too optimistic, but since it doesn't do + * any selection or projection, it is a pretty cheap node. + * + * If you change this, see also make_append(). + */ pathnode->path.startup_cost = 0; pathnode->path.total_cost = 0; foreach(l, subpaths) @@ -663,6 +672,68 @@ create_append_path(RelOptInfo *rel, List *subpaths) return pathnode; } +/* + * create_merge_append_path + * Creates a path corresponding to a MergeAppend plan, returning the + * pathnode. + */ +MergeAppendPath * +create_merge_append_path(PlannerInfo *root, + RelOptInfo *rel, + List *subpaths, + List *pathkeys) +{ + MergeAppendPath *pathnode = makeNode(MergeAppendPath); + Cost input_startup_cost; + Cost input_total_cost; + ListCell *l; + + pathnode->path.pathtype = T_MergeAppend; + pathnode->path.parent = rel; + pathnode->path.pathkeys = pathkeys; + pathnode->subpaths = subpaths; + + /* Add up all the costs of the input paths */ + input_startup_cost = 0; + input_total_cost = 0; + foreach(l, subpaths) + { + Path *subpath = (Path *) lfirst(l); + + if (pathkeys_contained_in(pathkeys, subpath->pathkeys)) + { + /* Subpath is adequately ordered, we won't need to sort it */ + input_startup_cost += subpath->startup_cost; + input_total_cost += subpath->total_cost; + } + else + { + /* We'll need to insert a Sort node, so include cost for that */ + Path sort_path; /* dummy for result of cost_sort */ + + cost_sort(&sort_path, + root, + pathkeys, + subpath->total_cost, + subpath->parent->tuples, + subpath->parent->width, + 0.0, + work_mem, + -1.0); + input_startup_cost += sort_path.startup_cost; + input_total_cost += sort_path.total_cost; + } + } + + /* Now we can compute total costs of the MergeAppend */ + cost_merge_append(&pathnode->path, root, + pathkeys, list_length(subpaths), + input_startup_cost, input_total_cost, + rel->tuples); + + return pathnode; +} + /* * create_result_path * Creates a path representing a Result-and-nothing-else plan. diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index f1c1d04ee0..b5437612a9 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -2174,15 +2174,17 @@ set_deparse_planstate(deparse_namespace *dpns, PlanState *ps) dpns->planstate = ps; /* - * We special-case Append to pretend that the first child plan is the - * OUTER referent; we have to interpret OUTER Vars in the Append's tlist - * according to one of the children, and the first one is the most natural - * choice. Likewise special-case ModifyTable to pretend that the first - * child plan is the OUTER referent; this is to support RETURNING lists - * containing references to non-target relations. + * We special-case Append and MergeAppend to pretend that the first child + * plan is the OUTER referent; we have to interpret OUTER Vars in their + * tlists according to one of the children, and the first one is the most + * natural choice. Likewise special-case ModifyTable to pretend that the + * first child plan is the OUTER referent; this is to support RETURNING + * lists containing references to non-target relations. */ if (IsA(ps, AppendState)) dpns->outer_planstate = ((AppendState *) ps)->appendplans[0]; + else if (IsA(ps, MergeAppendState)) + dpns->outer_planstate = ((MergeAppendState *) ps)->mergeplans[0]; else if (IsA(ps, ModifyTableState)) dpns->outer_planstate = ((ModifyTableState *) ps)->mt_plans[0]; else diff --git a/src/include/executor/nodeMergeAppend.h b/src/include/executor/nodeMergeAppend.h new file mode 100644 index 0000000000..7e7e494b71 --- /dev/null +++ b/src/include/executor/nodeMergeAppend.h @@ -0,0 +1,24 @@ +/*------------------------------------------------------------------------- + * + * nodeMergeAppend.h + * + * + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/nodeMergeAppend.h + * + *------------------------------------------------------------------------- + */ +#ifndef NODEMERGEAPPEND_H +#define NODEMERGEAPPEND_H + +#include "nodes/execnodes.h" + +extern MergeAppendState *ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags); +extern TupleTableSlot *ExecMergeAppend(MergeAppendState *node); +extern void ExecEndMergeAppend(MergeAppendState *node); +extern void ExecReScanMergeAppend(MergeAppendState *node); + +#endif /* NODEMERGEAPPEND_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 8a1f2ee047..78188a3954 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1049,6 +1049,33 @@ typedef struct AppendState int as_whichplan; } AppendState; +/* ---------------- + * MergeAppendState information + * + * nplans how many plans are in the array + * nkeys number of sort key columns + * scankeys sort keys in ScanKey representation + * slots current output tuple of each subplan + * heap heap of active tuples (represented as array indexes) + * heap_size number of active heap entries + * initialized true if we have fetched first tuple from each subplan + * last_slot last subplan fetched from (which must be re-called) + * ---------------- + */ +typedef struct MergeAppendState +{ + PlanState ps; /* its first field is NodeTag */ + PlanState **mergeplans; /* array of PlanStates for my inputs */ + int ms_nplans; + int ms_nkeys; + ScanKey ms_scankeys; /* array of length ms_nkeys */ + TupleTableSlot **ms_slots; /* array of length ms_nplans */ + int *ms_heap; /* array of length ms_nplans */ + int ms_heap_size; /* current active length of ms_heap[] */ + bool ms_initialized; /* are subplans started? */ + int ms_last_slot; /* last subplan slot we returned from */ +} MergeAppendState; + /* ---------------- * RecursiveUnionState information * diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 0d33a2ed5f..15dabe31ce 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -45,6 +45,7 @@ typedef enum NodeTag T_Result, T_ModifyTable, T_Append, + T_MergeAppend, T_RecursiveUnion, T_BitmapAnd, T_BitmapOr, @@ -87,6 +88,7 @@ typedef enum NodeTag T_ResultState, T_ModifyTableState, T_AppendState, + T_MergeAppendState, T_RecursiveUnionState, T_BitmapAndState, T_BitmapOrState, @@ -215,6 +217,7 @@ typedef enum NodeTag T_HashPath, T_TidPath, T_AppendPath, + T_MergeAppendPath, T_ResultPath, T_MaterialPath, T_UniquePath, diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index f2f99f4a2b..81038f6ad1 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -182,6 +182,22 @@ typedef struct Append List *appendplans; } Append; +/* ---------------- + * MergeAppend node - + * Merge the results of pre-sorted sub-plans to preserve the ordering. + * ---------------- + */ +typedef struct MergeAppend +{ + Plan plan; + List *mergeplans; + /* remaining fields are just like the sort-key info in struct Sort */ + int numCols; /* number of sort-key columns */ + AttrNumber *sortColIdx; /* their indexes in the target list */ + Oid *sortOperators; /* OIDs of operators to sort them by */ + bool *nullsFirst; /* NULLS FIRST/LAST directions */ +} MergeAppend; + /* ---------------- * RecursiveUnion node - * Generate a recursive union of two subplans. diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 91f4c5c1c4..6e3d0f3518 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -256,9 +256,9 @@ typedef struct PlannerInfo * the entire append relation. The member RTEs are otherrels. The parent * is present in the query join tree but the members are not. The member * RTEs and otherrels are used to plan the scans of the individual tables or - * subqueries of the append set; then the parent baserel is given an Append - * plan comprising the best plans for the individual member rels. (See - * comments for AppendRelInfo for more information.) + * subqueries of the append set; then the parent baserel is given Append + * and/or MergeAppend paths comprising the best paths for the individual + * member rels. (See comments for AppendRelInfo for more information.) * * At one time we also made otherrels to represent join RTEs, for use in * handling join alias Vars. Currently this is not needed because all join @@ -542,10 +542,11 @@ typedef struct EquivalenceClass * * em_is_child signifies that this element was built by transposing a member * for an inheritance parent relation to represent the corresponding expression - * on an inheritance child. The element should be ignored for all purposes - * except constructing inner-indexscan paths for the child relation. (Other - * types of join are driven from transposed joininfo-list entries.) Note - * that the EC's ec_relids field does NOT include the child relation. + * on an inheritance child. These elements are used for constructing + * inner-indexscan paths for the child relation (other types of join are + * driven from transposed joininfo-list entries) and for constructing + * MergeAppend paths for the whole inheritance tree. Note that the EC's + * ec_relids field does NOT include the child relation. * * em_datatype is usually the same as exprType(em_expr), but can be * different when dealing with a binary-compatible opfamily; in particular @@ -755,6 +756,16 @@ typedef struct AppendPath #define IS_DUMMY_PATH(p) \ (IsA((p), AppendPath) && ((AppendPath *) (p))->subpaths == NIL) +/* + * MergeAppendPath represents a MergeAppend plan, ie, the merging of sorted + * results from several member plans to produce similarly-sorted output. + */ +typedef struct MergeAppendPath +{ + Path path; + List *subpaths; /* list of component Paths */ +} MergeAppendPath; + /* * ResultPath represents use of a Result plan node to compute a variable-free * targetlist with no underlying tables (a "SELECT expressions" query). diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 5a4b33f2b1..e1dcd6df14 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -86,6 +86,10 @@ extern void cost_sort(Path *path, PlannerInfo *root, List *pathkeys, Cost input_cost, double tuples, int width, Cost comparison_cost, int sort_mem, double limit_tuples); +extern void cost_merge_append(Path *path, PlannerInfo *root, + List *pathkeys, int n_streams, + Cost input_startup_cost, Cost input_total_cost, + double tuples); extern void cost_material(Path *path, Cost input_startup_cost, Cost input_total_cost, double tuples, int width); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 5e0ebe046b..53ebe5756b 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -47,6 +47,10 @@ extern BitmapOrPath *create_bitmap_or_path(PlannerInfo *root, extern TidPath *create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals); extern AppendPath *create_append_path(RelOptInfo *rel, List *subpaths); +extern MergeAppendPath *create_merge_append_path(PlannerInfo *root, + RelOptInfo *rel, + List *subpaths, + List *pathkeys); extern ResultPath *create_result_path(List *quals); extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath); extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel, diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 6030cba1df..581cc7d0e7 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -1147,3 +1147,98 @@ DETAIL: drop cascades to table t2 drop cascades to table ts drop cascades to table t3 drop cascades to table t4 +-- +-- Test merge-append plans for inheritance trees +-- +create table matest0 (id serial primary key, name text); +NOTICE: CREATE TABLE will create implicit sequence "matest0_id_seq" for serial column "matest0.id" +NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "matest0_pkey" for table "matest0" +create table matest1 (id integer primary key) inherits (matest0); +NOTICE: merging column "id" with inherited definition +NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "matest1_pkey" for table "matest1" +create table matest2 (id integer primary key) inherits (matest0); +NOTICE: merging column "id" with inherited definition +NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "matest2_pkey" for table "matest2" +create table matest3 (id integer primary key) inherits (matest0); +NOTICE: merging column "id" with inherited definition +NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "matest3_pkey" for table "matest3" +create index matest0i on matest0 ((1-id)); +create index matest1i on matest1 ((1-id)); +-- create index matest2i on matest2 ((1-id)); -- intentionally missing +create index matest3i on matest3 ((1-id)); +insert into matest1 (name) values ('Test 1'); +insert into matest1 (name) values ('Test 2'); +insert into matest2 (name) values ('Test 3'); +insert into matest2 (name) values ('Test 4'); +insert into matest3 (name) values ('Test 5'); +insert into matest3 (name) values ('Test 6'); +set enable_indexscan = off; -- force use of seqscan/sort, so no merge +explain (verbose, costs off) select * from matest0 order by 1-id; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Output: public.matest0.id, public.matest0.name, ((1 - public.matest0.id)) + Sort Key: ((1 - public.matest0.id)) + -> Result + Output: public.matest0.id, public.matest0.name, (1 - public.matest0.id) + -> Append + -> Seq Scan on public.matest0 + Output: public.matest0.id, public.matest0.name + -> Seq Scan on public.matest1 matest0 + Output: public.matest0.id, public.matest0.name + -> Seq Scan on public.matest2 matest0 + Output: public.matest0.id, public.matest0.name + -> Seq Scan on public.matest3 matest0 + Output: public.matest0.id, public.matest0.name +(14 rows) + +select * from matest0 order by 1-id; + id | name +----+-------- + 6 | Test 6 + 5 | Test 5 + 4 | Test 4 + 3 | Test 3 + 2 | Test 2 + 1 | Test 1 +(6 rows) + +reset enable_indexscan; +set enable_seqscan = off; -- plan with fewest seqscans should be merge +explain (verbose, costs off) select * from matest0 order by 1-id; + QUERY PLAN +--------------------------------------------------------------------------------------------- + Result + Output: public.matest0.id, public.matest0.name, ((1 - public.matest0.id)) + -> Merge Append + Sort Key: ((1 - public.matest0.id)) + -> Index Scan using matest0i on public.matest0 + Output: public.matest0.id, public.matest0.name, (1 - public.matest0.id) + -> Index Scan using matest1i on public.matest1 matest0 + Output: public.matest0.id, public.matest0.name, (1 - public.matest0.id) + -> Sort + Output: public.matest0.id, public.matest0.name, ((1 - public.matest0.id)) + Sort Key: ((1 - public.matest0.id)) + -> Seq Scan on public.matest2 matest0 + Output: public.matest0.id, public.matest0.name, (1 - public.matest0.id) + -> Index Scan using matest3i on public.matest3 matest0 + Output: public.matest0.id, public.matest0.name, (1 - public.matest0.id) +(15 rows) + +select * from matest0 order by 1-id; + id | name +----+-------- + 6 | Test 6 + 5 | Test 5 + 4 | Test 4 + 3 | Test 3 + 2 | Test 2 + 1 | Test 1 +(6 rows) + +reset enable_seqscan; +drop table matest0 cascade; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table matest1 +drop cascades to table matest2 +drop cascades to table matest3 diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 435b1cd3b4..e87cf66110 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -373,3 +373,36 @@ SELECT a.attrelid::regclass, a.attname, a.attinhcount, e.expected ORDER BY a.attrelid::regclass::name, a.attnum; DROP TABLE t1, s1 CASCADE; + +-- +-- Test merge-append plans for inheritance trees +-- + +create table matest0 (id serial primary key, name text); +create table matest1 (id integer primary key) inherits (matest0); +create table matest2 (id integer primary key) inherits (matest0); +create table matest3 (id integer primary key) inherits (matest0); + +create index matest0i on matest0 ((1-id)); +create index matest1i on matest1 ((1-id)); +-- create index matest2i on matest2 ((1-id)); -- intentionally missing +create index matest3i on matest3 ((1-id)); + +insert into matest1 (name) values ('Test 1'); +insert into matest1 (name) values ('Test 2'); +insert into matest2 (name) values ('Test 3'); +insert into matest2 (name) values ('Test 4'); +insert into matest3 (name) values ('Test 5'); +insert into matest3 (name) values ('Test 6'); + +set enable_indexscan = off; -- force use of seqscan/sort, so no merge +explain (verbose, costs off) select * from matest0 order by 1-id; +select * from matest0 order by 1-id; +reset enable_indexscan; + +set enable_seqscan = off; -- plan with fewest seqscans should be merge +explain (verbose, costs off) select * from matest0 order by 1-id; +select * from matest0 order by 1-id; +reset enable_seqscan; + +drop table matest0 cascade;