From 9c79e646c6f0f8df06d966c536d0c6aa33bf1b06 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Mon, 7 Sep 2020 12:03:04 -0400
Subject: [PATCH] Frob numeric.c loop so that clang will auto-vectorize it too.

Experimentation shows that clang will auto-vectorize the critical
multiplication loop if the termination condition is written "i2 < limit"
rather than "i2 <= limit". This seems unbelievably stupid, but I've
reproduced it on both clang 9.0.1 (RHEL8) and 11.0.3 (macOS Catalina).
gcc doesn't care, so tweak the code to do it that way.

Discussion: https://postgr.es/m/CAJ3gD9evtA_vBo+WMYMyT-u=keHX7-r8p2w7OSRfXf42LTwCZQ@mail.gmail.com
---
 src/backend/utils/adt/numeric.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index d2a42b811d..dfd455fc74 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -8191,7 +8191,6 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 	int			res_weight;
 	int			maxdigits;
 	int		   *dig;
-	int		   *dig_i1_2;
 	int			carry;
 	int			maxdig;
 	int			newdig;
@@ -8327,7 +8326,7 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 		 * Add the appropriate multiple of var2 into the accumulator.
 		 *
 		 * As above, digits of var2 can be ignored if they don't contribute,
-		 * so we only include digits for which i1+i2+2 <= res_ndigits - 1.
+		 * so we only include digits for which i1+i2+2 < res_ndigits.
 		 *
 		 * This inner loop is the performance bottleneck for multiplication,
 		 * so we want to keep it simple enough so that it can be
@@ -8336,10 +8335,13 @@ mul_var(const NumericVar *var1, const NumericVar *var2, NumericVar *result,
 		 * Since we aren't propagating carries in this loop, the order does
 		 * not matter.
 		 */
-		i = Min(var2ndigits - 1, res_ndigits - i1 - 3);
-		dig_i1_2 = &dig[i1 + 2];
-		for (i2 = 0; i2 <= i; i2++)
-			dig_i1_2[i2] += var1digit * var2digits[i2];
+		{
+			int			i2limit = Min(var2ndigits, res_ndigits - i1 - 2);
+			int		   *dig_i1_2 = &dig[i1 + 2];
+
+			for (i2 = 0; i2 < i2limit; i2++)
+				dig_i1_2[i2] += var1digit * var2digits[i2];
+		}
 	}
 
 	/*