From da11977de9c685ef808d3a293727f9ce26753ec4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 12 Jul 2017 22:03:38 +0300
Subject: [PATCH] Reduce memory usage of tsvector type analyze function.

compute_tsvector_stats() detoasted and kept in memory every tsvector value
in the sample, but that can be a lot of memory. The original bug report
described a case using over 10 gigabytes, with statistics target of 10000
(the maximum).

To fix, allocate a separate copy of just the lexemes that we keep around,
and free the detoasted tsvector values as we go. This adds some
palloc/pfree overhead, when you have a lot of distinct lexemes in the
sample, but it's better than running out of memory.

Fixes bug #14654 reported by James C. Reviewed by Tom Lane. Backport to
all supported versions.

Discussion: https://www.postgresql.org/message-id/20170514200602.1451.46797@wrigleys.postgresql.org
---
 src/backend/tsearch/ts_typanalyze.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index ab224b76b8..320c7f1a61 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -232,9 +232,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 
 		/*
 		 * We loop through the lexemes in the tsvector and add them to our
-		 * tracking hashtable. Note: the hashtable entries will point into
-		 * the (detoasted) tsvector value, therefore we cannot free that
-		 * storage until we're done.
+		 * tracking hashtable.
 		 */
 		lexemesptr = STRPTR(vector);
 		curentryptr = ARRPTR(vector);
@@ -242,7 +240,12 @@ compute_tsvector_stats(VacAttrStats *stats,
 		{
 			bool found;
 
-			/* Construct a hash key */
+			/*
+			 * Construct a hash key. The key points into the (detoasted)
+			 * tsvector value at this point, but if a new entry is created, we
+			 * make a copy of it. This way we can free the tsvector value
+			 * once we've processed all its lexemes.
+			 */
 			hash_key.lexeme = lexemesptr + curentryptr->pos;
 			hash_key.length = curentryptr->len;
 
@@ -261,6 +264,9 @@ compute_tsvector_stats(VacAttrStats *stats,
 				/* Initialize new tracking list element */
 				item->frequency = 1;
 				item->delta = b_current - 1;
+
+				item->key.lexeme = palloc(hash_key.length);
+				memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length);
 			}
 
 			/* lexeme_no is the number of elements processed (ie N) */
@@ -276,6 +282,10 @@ compute_tsvector_stats(VacAttrStats *stats,
 			/* Advance to the next WordEntry in the tsvector */
 			curentryptr++;
 		}
+
+		/* If the vector was toasted, free the detoasted copy. */
+		if (TSVectorGetDatum(vector) != value)
+			pfree(vector);
 	}
 
 	/* We can only compute real stats if we found some non-null values. */
@@ -447,9 +457,12 @@ prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current)
 	{
 		if (item->frequency + item->delta <= b_current)
 		{
+			char *lexeme = item->key.lexeme;
+
 			if (hash_search(lexemes_tab, (const void *) &item->key,
 							HASH_REMOVE, NULL) == NULL)
 				elog(ERROR, "hash table corrupted");
+			pfree(lexeme);
 		}
 	}
 }