Speed-up build of MCV lists with many distinct values

When building multi-column MCV lists, we compute base frequency for each
item, i.e. a product of per-column frequencies for values from the item.
As a value may be in multiple groups, the code was scanning the whole
array of groups while adding items to the MCV list.  This works fine as
long as the number of distinct groups is small, but it's easy to trigger
trigger O(N^2) behavior, especially after increasing statistics target.

This commit precomputes frequencies for values in all columns, so that
when computing the base frequency it's enough to make a simple bsearch
lookup in the array.

Backpatch to 12, where multi-column MCV lists were introduced.

Discussion: https://postgr.es/m/20190618205920.qtlzcu73whfpfqne@development
This commit is contained in:
Tomas Vondra 2019-07-04 23:02:02 +02:00
parent 1970da1a0a
commit 57f459cf6c
1 changed files with 122 additions and 8 deletions

View File

@ -78,6 +78,9 @@ static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
static SortItem *build_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss, int *ndistinct);
static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
MultiSortSupport mss, int *ncounts);
static int count_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss);
@ -242,6 +245,20 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
if (nitems > 0)
{
int j;
SortItem key;
MultiSortSupport tmp;
/* frequencies for values in each attribute */
SortItem **freqs;
int *nfreqs;
/* used to search values */
tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
+ sizeof(SortSupportData));
/* compute frequencies for values in each column */
nfreqs = (int *) palloc0(sizeof(int) * numattrs);
freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
/*
* Allocate the MCV list structure, set the global parameters.
@ -281,18 +298,26 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
item->base_frequency = 1.0;
for (j = 0; j < numattrs; j++)
{
int count = 0;
int k;
SortItem *freq;
for (k = 0; k < ngroups; k++)
{
if (multi_sort_compare_dim(j, &groups[i], &groups[k], mss) == 0)
count += groups[k].count;
}
/* single dimension */
tmp->ndims = 1;
tmp->ssup[0] = mss->ssup[j];
item->base_frequency *= (double) count / numrows;
/* fill search key */
key.values = &groups[i].values[j];
key.isnull = &groups[i].isnull[j];
freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
sizeof(SortItem),
multi_sort_compare, tmp);
item->base_frequency *= ((double) freq->count) / numrows;
}
}
pfree(nfreqs);
pfree(freqs);
}
pfree(items);
@ -419,6 +444,95 @@ build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
return groups;
}
/* compare sort items (single dimension) */
static int
sort_item_compare(const void *a, const void *b, void *arg)
{
SortSupport ssup = (SortSupport) arg;
SortItem *ia = (SortItem *) a;
SortItem *ib = (SortItem *) b;
return ApplySortComparator(ia->values[0], ia->isnull[0],
ib->values[0], ib->isnull[0],
ssup);
}
/*
* build_column_frequencies
* compute frequencies of values in each column
*
* This returns an array of SortItems for each attibute the MCV is built
* on, with a frequency (number of occurrences) for each value. This is
* then used to compute "base" frequency of MCV items.
*
* All the memory is allocated in a single chunk, so that a single pfree
* is enough to release it. We do not allocate space for values/isnull
* arrays in the SortItems, because we can simply point into the input
* groups directly.
*/
static SortItem **
build_column_frequencies(SortItem *groups, int ngroups,
MultiSortSupport mss, int *ncounts)
{
int i,
dim;
SortItem **result;
char *ptr;
Assert(groups);
Assert(ncounts);
/* allocate arrays for all columns as a single chunk */
ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
/* initial array of pointers */
result = (SortItem **) ptr;
ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
for (dim = 0; dim < mss->ndims; dim++)
{
SortSupport ssup = &mss->ssup[dim];
/* array of values for a single column */
result[dim] = (SortItem *) ptr;
ptr += MAXALIGN(sizeof(SortItem) * ngroups);
/* extract data for the dimension */
for (i = 0; i < ngroups; i++)
{
/* point into the input groups */
result[dim][i].values = &groups[i].values[dim];
result[dim][i].isnull = &groups[i].isnull[dim];
result[dim][i].count = groups[i].count;
}
/* sort the values, deduplicate */
qsort_arg((void *) result[dim], ngroups, sizeof(SortItem),
sort_item_compare, ssup);
/*
* Identify distinct values, compute frequency (there might be
* multiple MCV items containing this value, so we need to sum
* counts from all of them.
*/
ncounts[dim] = 1;
for (i = 1; i < ngroups; i++)
{
if (sort_item_compare(&result[dim][i-1], &result[dim][i], ssup) == 0)
{
result[dim][ncounts[dim]-1].count += result[dim][i].count;
continue;
}
result[dim][ncounts[dim]] = result[dim][i];
ncounts[dim]++;
}
}
return result;
}
/*
* statext_mcv_load