diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index ef475d95a1..8b797e3b46 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2809,6 +2809,226 @@ cost_agg(Path *path, PlannerInfo *root, path->total_cost = total_cost; } +/* + * get_windowclause_startup_tuples + * Estimate how many tuples we'll need to fetch from a WindowAgg's + * subnode before we can output the first WindowAgg tuple. + * + * How many tuples need to be read depends on the WindowClause. For example, + * a WindowClause with no PARTITION BY and no ORDER BY requires that all + * subnode tuples are read and aggregated before the WindowAgg can output + * anything. If there's a PARTITION BY, then we only need to look at tuples + * in the first partition. Here we attempt to estimate just how many + * 'input_tuples' the WindowAgg will need to read for the given WindowClause + * before the first tuple can be output. + */ +static double +get_windowclause_startup_tuples(PlannerInfo *root, WindowClause *wc, + double input_tuples) +{ + int frameOptions = wc->frameOptions; + double partition_tuples; + double return_tuples; + double peer_tuples; + + /* + * First, figure out how many partitions there are likely to be and set + * partition_tuples according to that estimate. + */ + if (wc->partitionClause != NIL) + { + double num_partitions; + List *partexprs = get_sortgrouplist_exprs(wc->partitionClause, + root->parse->targetList); + + num_partitions = estimate_num_groups(root, partexprs, input_tuples, + NULL, NULL); + list_free(partexprs); + + partition_tuples = input_tuples / num_partitions; + } + else + { + /* all tuples belong to the same partition */ + partition_tuples = input_tuples; + } + + /* estimate the number of tuples in each peer group */ + if (wc->orderClause != NIL) + { + double num_groups; + List *orderexprs; + + orderexprs = get_sortgrouplist_exprs(wc->orderClause, + root->parse->targetList); + + /* estimate out how many peer groups there are in the partition */ + num_groups = estimate_num_groups(root, orderexprs, + partition_tuples, NULL, + NULL); + list_free(orderexprs); + peer_tuples = partition_tuples / num_groups; + } + else + { + /* no ORDER BY so only 1 tuple belongs in each peer group */ + peer_tuples = 1.0; + } + + if (frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING) + { + /* include all partition rows */ + return_tuples = partition_tuples; + } + else if (frameOptions & FRAMEOPTION_END_CURRENT_ROW) + { + if (frameOptions & FRAMEOPTION_ROWS) + { + /* just count the current row */ + return_tuples = 1.0; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* + * When in RANGE/GROUPS mode, it's more complex. If there's no + * ORDER BY, then all rows in the partition are peers, otherwise + * we'll need to read the first group of peers. + */ + if (wc->orderClause == NIL) + return_tuples = partition_tuples; + else + return_tuples = peer_tuples; + } + else + { + /* + * Something new we don't support yet? This needs attention. + * We'll just return 1.0 in the meantime. + */ + Assert(false); + return_tuples = 1.0; + } + } + else if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING) + { + /* + * BETWEEN ... AND N PRECEDING will only need to read the WindowAgg's + * subnode after N ROWS/RANGES/GROUPS. N can be 0, but not negative, + * so we'll just assume only the current row needs to be read to fetch + * the first WindowAgg row. + */ + return_tuples = 1.0; + } + else if (frameOptions & FRAMEOPTION_END_OFFSET_FOLLOWING) + { + Const *endOffset = (Const *) wc->endOffset; + double end_offset_value; + + /* try and figure out the value specified in the endOffset. */ + if (IsA(endOffset, Const)) + { + if (endOffset->constisnull) + { + /* + * NULLs are not allowed, but currently, there's no code to + * error out if there's a NULL Const. We'll only discover + * this during execution. For now, just pretend everything is + * fine and assume that just the current row/range/group will + * be needed. + */ + end_offset_value = 1.0; + } + else + { + switch (endOffset->consttype) + { + case INT2OID: + end_offset_value = + (double) DatumGetInt16(endOffset->constvalue); + break; + case INT4OID: + end_offset_value = + (double) DatumGetInt32(endOffset->constvalue); + break; + case INT8OID: + end_offset_value = + (double) DatumGetInt64(endOffset->constvalue); + break; + default: + end_offset_value = + partition_tuples / peer_tuples * + DEFAULT_INEQ_SEL; + break; + } + } + } + else + { + /* + * When the end bound is not a Const, we'll just need to guess. We + * just make use of DEFAULT_INEQ_SEL. + */ + end_offset_value = + partition_tuples / peer_tuples * DEFAULT_INEQ_SEL; + } + + if (frameOptions & FRAMEOPTION_ROWS) + { + /* include the N FOLLOWING and the current row */ + return_tuples = end_offset_value + 1.0; + } + else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS)) + { + /* include N FOLLOWING ranges/group and the initial range/group */ + return_tuples = peer_tuples * (end_offset_value + 1.0); + } + else + { + /* + * Something new we don't support yet? This needs attention. + * We'll just return 1.0 in the meantime. + */ + Assert(false); + return_tuples = 1.0; + } + } + else + { + /* + * Something new we don't support yet? This needs attention. We'll + * just return 1.0 in the meantime. + */ + Assert(false); + return_tuples = 1.0; + } + + if (wc->partitionClause != NIL || wc->orderClause != NIL) + { + /* + * Cap the return value to the estimated partition tuples and account + * for the extra tuple WindowAgg will need to read to confirm the next + * tuple does not belong to the same partition or peer group. + */ + return_tuples = Min(return_tuples + 1.0, partition_tuples); + } + else + { + /* + * Cap the return value so it's never higher than the expected tuples + * in the partition. + */ + return_tuples = Min(return_tuples, partition_tuples); + } + + /* + * We needn't worry about any EXCLUDE options as those only exclude rows + * from being aggregated, not from being read from the WindowAgg's + * subnode. + */ + + return clamp_row_est(return_tuples); +} + /* * cost_windowagg * Determines and returns the cost of performing a WindowAgg plan node, @@ -2818,17 +3038,32 @@ cost_agg(Path *path, PlannerInfo *root, */ void cost_windowagg(Path *path, PlannerInfo *root, - List *windowFuncs, int numPartCols, int numOrderCols, + List *windowFuncs, WindowClause *winclause, Cost input_startup_cost, Cost input_total_cost, double input_tuples) { Cost startup_cost; Cost total_cost; + double startup_tuples; + int numPartCols; + int numOrderCols; ListCell *lc; + numPartCols = list_length(winclause->partitionClause); + numOrderCols = list_length(winclause->orderClause); + startup_cost = input_startup_cost; total_cost = input_total_cost; + /* + * Estimate how many tuples we'll need to read from the subnode before we + * can output the first WindowAgg row. + */ + startup_tuples = get_windowclause_startup_tuples(root, winclause, + input_tuples); + + elog(DEBUG1, "startup_tuples = %g", startup_tuples); /* XXX not for commit */ + /* * Window functions are assumed to cost their stated execution cost, plus * the cost of evaluating their input expressions, per tuple. Since they @@ -2880,6 +3115,18 @@ cost_windowagg(Path *path, PlannerInfo *root, path->rows = input_tuples; path->startup_cost = startup_cost; path->total_cost = total_cost; + + /* + * Also, take into account how many tuples we need to read from the + * subnode in order to produce the first tuple from the WindowAgg. To do + * this we proportion the run cost (total cost not including startup cost) + * over the estimated startup tuples. We already included the startup + * cost of the subnode, so we only need to do this when the estimated + * startup tuples is above 1.0. + */ + if (startup_tuples > 1.0) + path->startup_cost += (total_cost - startup_cost) / input_tuples * + (startup_tuples - 1.0); } /* diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index f123fcb41e..754f0b9f34 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3457,8 +3457,7 @@ create_windowagg_path(PlannerInfo *root, */ cost_windowagg(&pathnode->path, root, windowFuncs, - list_length(winclause->partitionClause), - list_length(winclause->orderClause), + winclause, subpath->startup_cost, subpath->total_cost, subpath->rows); diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 6cf49705d3..bee090ffc2 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -131,7 +131,7 @@ extern void cost_agg(Path *path, PlannerInfo *root, Cost input_startup_cost, Cost input_total_cost, double input_tuples, double input_width); extern void cost_windowagg(Path *path, PlannerInfo *root, - List *windowFuncs, int numPartCols, int numOrderCols, + List *windowFuncs, WindowClause *winclause, Cost input_startup_cost, Cost input_total_cost, double input_tuples); extern void cost_group(Path *path, PlannerInfo *root, diff --git a/src/test/regress/expected/window.out b/src/test/regress/expected/window.out index 1d4b78b9b2..2628033327 100644 --- a/src/test/regress/expected/window.out +++ b/src/test/regress/expected/window.out @@ -4808,6 +4808,80 @@ SELECT i, b, bool_and(b) OVER w, bool_or(b) OVER w 5 | t | t | t (5 rows) +-- +-- Test WindowAgg costing takes into account the number of rows that need to +-- be fetched before the first row can be output. +-- +-- Ensure we get a cheap start up plan as the WindowAgg can output the first +-- row after reading 1 row from the join. +EXPLAIN (COSTS OFF) +SELECT COUNT(*) OVER (ORDER BY t1.unique1) +FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous +LIMIT 1; + QUERY PLAN +-------------------------------------------------------------------------- + Limit + -> WindowAgg + -> Nested Loop + -> Index Only Scan using tenk1_unique1 on tenk1 t1 + -> Index Only Scan using tenk1_thous_tenthous on tenk1 t2 + Index Cond: (tenthous = t1.unique1) +(6 rows) + +-- Ensure we get a cheap total plan. Lack of ORDER BY in the WindowClause +-- means that all rows must be read from the join, so a cheap startup plan +-- isn't a good choice. +EXPLAIN (COSTS OFF) +SELECT COUNT(*) OVER () +FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous +LIMIT 1; + QUERY PLAN +-------------------------------------------------------------------------------- + Limit + -> WindowAgg + -> Hash Join + Hash Cond: (t1.unique1 = t2.tenthous) + -> Index Only Scan using tenk1_unique1 on tenk1 t1 + -> Hash + -> Index Only Scan using tenk1_thous_tenthous on tenk1 t2 +(7 rows) + +-- Ensure we get a cheap total plan. This time use UNBOUNDED FOLLOWING, which +-- needs to read all join rows to output the first WindowAgg row. +EXPLAIN (COSTS OFF) +SELECT COUNT(*) OVER (ORDER BY t1.unique1 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous +LIMIT 1; + QUERY PLAN +-------------------------------------------------------------------------------- + Limit + -> WindowAgg + -> Merge Join + Merge Cond: (t1.unique1 = t2.tenthous) + -> Index Only Scan using tenk1_unique1 on tenk1 t1 + -> Sort + Sort Key: t2.tenthous + -> Index Only Scan using tenk1_thous_tenthous on tenk1 t2 +(8 rows) + +-- Ensure we get a cheap total plan. This time use 10000 FOLLOWING so we need +-- to read all join rows. +EXPLAIN (COSTS OFF) +SELECT COUNT(*) OVER (ORDER BY t1.unique1 ROWS BETWEEN UNBOUNDED PRECEDING AND 10000 FOLLOWING) +FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous +LIMIT 1; + QUERY PLAN +-------------------------------------------------------------------------------- + Limit + -> WindowAgg + -> Merge Join + Merge Cond: (t1.unique1 = t2.tenthous) + -> Index Only Scan using tenk1_unique1 on tenk1 t1 + -> Sort + Sort Key: t2.tenthous + -> Index Only Scan using tenk1_thous_tenthous on tenk1 t2 +(8 rows) + -- Tests for problems with failure to walk or mutate expressions -- within window frame clauses. -- test walker (fails with collation error if expressions are not walked) diff --git a/src/test/regress/sql/window.sql b/src/test/regress/sql/window.sql index 3ab6ac715d..4789de0937 100644 --- a/src/test/regress/sql/window.sql +++ b/src/test/regress/sql/window.sql @@ -1716,6 +1716,40 @@ SELECT i, b, bool_and(b) OVER w, bool_or(b) OVER w FROM (VALUES (1,true), (2,true), (3,false), (4,false), (5,true)) v(i,b) WINDOW w AS (ORDER BY i ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING); +-- +-- Test WindowAgg costing takes into account the number of rows that need to +-- be fetched before the first row can be output. +-- + +-- Ensure we get a cheap start up plan as the WindowAgg can output the first +-- row after reading 1 row from the join. +EXPLAIN (COSTS OFF) +SELECT COUNT(*) OVER (ORDER BY t1.unique1) +FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous +LIMIT 1; + +-- Ensure we get a cheap total plan. Lack of ORDER BY in the WindowClause +-- means that all rows must be read from the join, so a cheap startup plan +-- isn't a good choice. +EXPLAIN (COSTS OFF) +SELECT COUNT(*) OVER () +FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous +LIMIT 1; + +-- Ensure we get a cheap total plan. This time use UNBOUNDED FOLLOWING, which +-- needs to read all join rows to output the first WindowAgg row. +EXPLAIN (COSTS OFF) +SELECT COUNT(*) OVER (ORDER BY t1.unique1 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous +LIMIT 1; + +-- Ensure we get a cheap total plan. This time use 10000 FOLLOWING so we need +-- to read all join rows. +EXPLAIN (COSTS OFF) +SELECT COUNT(*) OVER (ORDER BY t1.unique1 ROWS BETWEEN UNBOUNDED PRECEDING AND 10000 FOLLOWING) +FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous +LIMIT 1; + -- Tests for problems with failure to walk or mutate expressions -- within window frame clauses.