From 9391f71523b6e57f1194d9f6543bc7948c16411b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 4 Jan 2024 18:36:19 -0500 Subject: [PATCH] Teach estimate_array_length() to use statistics where available. If we have DECHIST statistics about the argument expression, use the average number of distinct elements as the array length estimate. (It'd be better to use the average total number of elements, but that is not currently calculated by compute_array_stats(), and it's unclear that it'd be worth extra effort to get.) To do this, we have to change the signature of estimate_array_length to pass the "root" pointer. While at it, also change its result type to "double". That's probably not really necessary, but it avoids any risk of overflow of the value extracted from DECHIST. All existing callers are going to use the result in a "double" calculation anyway. Paul Jungwirth, reviewed by Jian He and myself Discussion: https://postgr.es/m/CA+renyUnM2d+SmrxKpDuAdpiq6FOM=FByvi6aS6yi__qyf6j9A@mail.gmail.com --- src/backend/optimizer/path/costsize.c | 10 +++--- src/backend/utils/adt/arrayfuncs.c | 2 +- src/backend/utils/adt/selfuncs.c | 47 ++++++++++++++++++++++----- src/include/utils/selfuncs.h | 2 +- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 7cfebc95d6..8b76e98529 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1256,7 +1256,7 @@ cost_tidscan(Path *path, PlannerInfo *root, QualCost qpqual_cost; Cost cpu_per_tuple; QualCost tid_qual_cost; - int ntuples; + double ntuples; ListCell *l; double spc_random_page_cost; @@ -1283,7 +1283,7 @@ cost_tidscan(Path *path, PlannerInfo *root, ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual; Node *arraynode = (Node *) lsecond(saop->args); - ntuples += estimate_array_length(arraynode); + ntuples += estimate_array_length(root, arraynode); } else if (IsA(qual, CurrentOfExpr)) { @@ -4770,7 +4770,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) Node *arraynode = (Node *) lsecond(saop->args); QualCost sacosts; QualCost hcosts; - int estarraylen = estimate_array_length(arraynode); + double estarraylen = estimate_array_length(context->root, arraynode); set_sa_opfuncid(saop); sacosts.startup = sacosts.per_tuple = 0; @@ -4808,7 +4808,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) */ context->total.startup += sacosts.startup; context->total.per_tuple += sacosts.per_tuple * - estimate_array_length(arraynode) * 0.5; + estimate_array_length(context->root, arraynode) * 0.5; } } else if (IsA(node, Aggref) || @@ -4859,7 +4859,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) context->total.startup += perelemcost.startup; if (perelemcost.per_tuple > 0) context->total.per_tuple += perelemcost.per_tuple * - estimate_array_length((Node *) acoerce->arg); + estimate_array_length(context->root, (Node *) acoerce->arg); } else if (IsA(node, RowCompareExpr)) { diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index 957d21a0a0..f3fee54e37 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -6340,7 +6340,7 @@ array_unnest_support(PG_FUNCTION_ARGS) /* We can use estimated argument values here */ arg1 = estimate_expression_value(req->root, linitial(args)); - req->rows = estimate_array_length(arg1); + req->rows = estimate_array_length(req->root, arg1); ret = (Node *) req; } } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 7a3f69f2d9..dbcd98d985 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -2128,10 +2128,11 @@ scalararraysel(PlannerInfo *root, /* * Estimate number of elements in the array yielded by an expression. * - * It's important that this agree with scalararraysel. + * Note: the result is integral, but we use "double" to avoid overflow + * concerns. Most callers will use it in double-type expressions anyway. */ -int -estimate_array_length(Node *arrayexpr) +double +estimate_array_length(PlannerInfo *root, Node *arrayexpr) { /* look through any binary-compatible relabeling of arrayexpr */ arrayexpr = strip_array_coercion(arrayexpr); @@ -2152,11 +2153,39 @@ estimate_array_length(Node *arrayexpr) { return list_length(((ArrayExpr *) arrayexpr)->elements); } - else + else if (arrayexpr) { - /* default guess --- see also scalararraysel */ - return 10; + /* See if we can find any statistics about it */ + VariableStatData vardata; + AttStatsSlot sslot; + double nelem = 0; + + examine_variable(root, arrayexpr, 0, &vardata); + if (HeapTupleIsValid(vardata.statsTuple)) + { + /* + * Found stats, so use the average element count, which is stored + * in the last stanumbers element of the DECHIST statistics. + * Actually that is the average count of *distinct* elements; + * perhaps we should scale it up somewhat? + */ + if (get_attstatsslot(&sslot, vardata.statsTuple, + STATISTIC_KIND_DECHIST, InvalidOid, + ATTSTATSSLOT_NUMBERS)) + { + if (sslot.nnumbers > 0) + nelem = clamp_row_est(sslot.numbers[sslot.nnumbers - 1]); + free_attstatsslot(&sslot); + } + } + ReleaseVariableStats(vardata); + + if (nelem > 0) + return nelem; } + + /* Else use a default guess --- this should match scalararraysel */ + return 10; } /* @@ -6540,7 +6569,7 @@ genericcostestimate(PlannerInfo *root, if (IsA(rinfo->clause, ScalarArrayOpExpr)) { ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause; - int alength = estimate_array_length(lsecond(saop->args)); + double alength = estimate_array_length(root, lsecond(saop->args)); if (alength > 1) num_sa_scans *= alength; @@ -6820,7 +6849,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, { ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; Node *other_operand = (Node *) lsecond(saop->args); - int alength = estimate_array_length(other_operand); + double alength = estimate_array_length(root, other_operand); clause_op = saop->opno; found_saop = true; @@ -7414,7 +7443,7 @@ gincost_scalararrayopexpr(PlannerInfo *root, { counts->exactEntries++; counts->searchEntries++; - counts->arrayScans *= estimate_array_length(rightop); + counts->arrayScans *= estimate_array_length(root, rightop); return true; } diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 6dd5171d54..2fa4c4fc1b 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -200,7 +200,7 @@ extern Selectivity scalararraysel(PlannerInfo *root, ScalarArrayOpExpr *clause, bool is_join_clause, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo); -extern int estimate_array_length(Node *arrayexpr); +extern double estimate_array_length(PlannerInfo *root, Node *arrayexpr); extern Selectivity rowcomparesel(PlannerInfo *root, RowCompareExpr *clause, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo);