diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index c4dfdf2a3d..2e6fa21935 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -122,9 +122,23 @@ * thousands of trigrams would be slow, and would likely produce so many * false positives that we would have to traverse a large fraction of the * index, the graph is simplified further in a lossy fashion by removing - * color trigrams until the number of trigrams after expansion is below - * the MAX_TRGM_COUNT threshold. When a color trigram is removed, the states - * connected by any arcs labelled with that trigram are merged. + * color trigrams. When a color trigram is removed, the states connected by + * any arcs labelled with that trigram are merged. + * + * Trigrams do not all have equivalent value for searching: some of them are + * more frequent and some of them are less frequent. Ideally, we would like + * to know the distribution of trigrams, but we don't. But because of padding + * we know for sure that the empty character is more frequent than others, + * so we can penalize trigrams according to presence of whitespace. The + * penalty assigned to each color trigram is the number of simple trigrams + * it would produce, times the penalties[] multiplier associated with its + * whitespace content. (The penalties[] constants were calculated by analysis + * of some real-life text.) We eliminate color trigrams starting with the + * highest-penalty one, until we get to a total penalty of no more than + * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would + * lead to merging the initial and final states, so we may not be able to + * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than + * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail. * * 4) Pack the graph into a compact representation * ----------------------------------------------- @@ -199,13 +213,30 @@ * MAX_EXPANDED_STATES - How many states we allow in expanded graph * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted + * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties * COLOR_COUNT_LIMIT - Maximum number of characters per color */ #define MAX_EXPANDED_STATES 128 #define MAX_EXPANDED_ARCS 1024 #define MAX_TRGM_COUNT 256 +#define WISH_TRGM_PENALTY 16 #define COLOR_COUNT_LIMIT 256 +/* + * Penalty multipliers for trigram counts depending on whitespace contents. + * Numbers based on analysis of real-life texts. + */ +const float4 penalties[8] = { + 1.0, /* "aaa" */ + 3.5, /* "aa " */ + 0.0, /* "a a" (impossible) */ + 0.0, /* "a " (impossible) */ + 4.2, /* " aa" */ + 2.1, /* " a " */ + 25.0, /* " a" */ + 0.0 /* " " (impossible) */ +}; + /* Struct representing a single pg_wchar, converted back to multibyte form */ typedef struct { @@ -339,6 +370,7 @@ typedef struct ColorTrgm ctrgm; int number; int count; + float4 penalty; bool expanded; List *arcs; } ColorTrgmInfo; @@ -459,7 +491,7 @@ static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext); static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]); static void mergeStates(TrgmState *state1, TrgmState *state2); static int colorTrgmInfoCmp(const void *p1, const void *p2); -static int colorTrgmInfoCountCmp(const void *p1, const void *p2); +static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2); static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext); static int packArcInfoCmp(const void *a1, const void *a2); @@ -1424,6 +1456,7 @@ selectColorTrigrams(TrgmNFA *trgmNFA) TrgmState *state; ColorTrgmInfo *colorTrgms; int64 totalTrgmCount; + float4 totalTrgmPenalty; int number; /* Collect color trigrams from all arcs */ @@ -1482,53 +1515,67 @@ selectColorTrigrams(TrgmNFA *trgmNFA) } /* - * Count number of simple trigrams generated by each color trigram. + * Count number of simple trigrams generated by each color trigram, and + * also compute a penalty value, which is the number of simple trigrams + * times a multiplier that depends on its whitespace content. * * Note: per-color-trigram counts cannot overflow an int so long as * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about * 1290. However, the grand total totalTrgmCount might conceivably - * overflow an int, so we use int64 for that within this routine. + * overflow an int, so we use int64 for that within this routine. Also, + * penalties are calculated in float4 arithmetic to avoid any overflow + * worries. */ totalTrgmCount = 0; + totalTrgmPenalty = 0.0f; for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { ColorTrgmInfo *trgmInfo = &colorTrgms[i]; int j, - count = 1; + count = 1, + typeIndex = 0; for (j = 0; j < 3; j++) { TrgmColor c = trgmInfo->ctrgm.colors[j]; - if (c != COLOR_BLANK) + typeIndex *= 2; + if (c == COLOR_BLANK) + typeIndex++; + else count *= trgmNFA->colorInfo[c].wordCharsCount; } trgmInfo->count = count; totalTrgmCount += count; + trgmInfo->penalty = penalties[typeIndex] * (float4) count; + totalTrgmPenalty += trgmInfo->penalty; } - /* Sort color trigrams in descending order of simple trigram counts */ + /* Sort color trigrams in descending order of their penalties */ qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), - colorTrgmInfoCountCmp); + colorTrgmInfoPenaltyCmp); /* - * Remove color trigrams from the graph so long as total number of simple - * trigrams exceeds MAX_TRGM_COUNT. We prefer to remove color trigrams - * with the most associated simple trigrams, since those are the most - * promising for reducing the total number of simple trigrams. When - * removing a color trigram we have to merge states connected by arcs - * labeled with that trigram. It's necessary to not merge initial and - * final states, because our graph becomes useless if that happens; so we - * cannot always remove the trigram we'd prefer to. + * Remove color trigrams from the graph so long as total penalty of color + * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to + * WISH_TRGM_PENALTY, it's OK so long as total count is no more than + * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher + * penalty, since those are the most promising for reducing the total + * penalty. When removing a color trigram we have to merge states + * connected by arcs labeled with that trigram. It's necessary to not + * merge initial and final states, because our graph becomes useless if + * that happens; so we cannot always remove the trigram we'd prefer to. */ - for (i = 0; - (i < trgmNFA->colorTrgmsCount) && (totalTrgmCount > MAX_TRGM_COUNT); - i++) + for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { ColorTrgmInfo *trgmInfo = &colorTrgms[i]; bool canRemove = true; ListCell *cell; + /* Done if we've reached the target */ + if (totalTrgmPenalty <= WISH_TRGM_PENALTY) + break; + /* * Does any arc of this color trigram connect initial and final * states? If so we can't remove it. @@ -1570,9 +1617,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA) mergeStates(source, target); } - /* Mark trigram unexpanded, and update totalTrgmCount */ + /* Mark trigram unexpanded, and update totals */ trgmInfo->expanded = false; totalTrgmCount -= trgmInfo->count; + totalTrgmPenalty -= trgmInfo->penalty; } /* Did we succeed in fitting into MAX_TRGM_COUNT? */ @@ -1746,17 +1794,17 @@ colorTrgmInfoCmp(const void *p1, const void *p2) /* * Compare function for sorting color trigrams in descending order of - * their simple trigrams counts. + * their penalty fields. */ static int -colorTrgmInfoCountCmp(const void *p1, const void *p2) +colorTrgmInfoPenaltyCmp(const void *p1, const void *p2) { - const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1; - const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2; + float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty; + float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty; - if (c1->count < c2->count) + if (penalty1 < penalty2) return 1; - else if (c1->count == c2->count) + else if (penalty1 == penalty2) return 0; else return -1;