diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c
index 8c25f3ebbf..11624145d6 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -275,8 +275,24 @@ dsnowball_lexize(PG_FUNCTION_ARGS)
 	char	   *txt = lowerstr_with_len(in, len);
 	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
 
-	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+	/*
+	 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
+	 * surely not words in any human language.  This restriction avoids
+	 * wasting cycles on stuff like base64-encoded data, and it protects us
+	 * against possible inefficiency or misbehavior in the stemmer.  (For
+	 * example, the Turkish stemmer has an indefinite recursion, so it can
+	 * crash on long-enough strings.)  However, Snowball dictionaries are
+	 * defined to recognize all strings, so we can't reject the string as an
+	 * unknown word.
+	 */
+	if (len > 1000)
 	{
+		/* return the lexeme lowercased, but otherwise unmodified */
+		res->lexeme = txt;
+	}
+	else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+	{
+		/* empty or stopword, so report as stopword */
 		pfree(txt);
 	}
 	else
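The pattern the patch applies is a length guard in front of an expensive (and potentially unsafe) stemmer call: overlong input is returned lowercased but otherwise untouched, so the dictionary still "recognizes" every string without ever handing pathological input to the stemmer. Below is a minimal standalone C sketch of that pattern, not PostgreSQL code; the names `MAX_STEM_LEN`, `stem_word`, and `lexize_guarded` are hypothetical, and the toy stemmer stands in for the real Snowball machinery.

```c
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_STEM_LEN 1000		/* same threshold the patch uses */

/* stand-in for a stemmer that may be slow or recursive on long input */
static void
stem_word(char *word)
{
	size_t		n = strlen(word);

	if (n > 1 && word[n - 1] == 's')	/* toy rule: strip a plural 's' */
		word[n - 1] = '\0';
}

/* lowercase the input; stem it only if it is plausibly a real word */
static char *
lexize_guarded(const char *in, size_t len)
{
	char	   *txt = malloc(len + 1);
	size_t		i;

	if (txt == NULL)
		return NULL;
	for (i = 0; i < len; i++)
		txt[i] = (char) tolower((unsigned char) in[i]);
	txt[len] = '\0';

	/* overlong input: return it lowercased, skip the stemmer entirely */
	if (len > MAX_STEM_LEN)
		return txt;

	stem_word(txt);
	return txt;
}

int
main(void)
{
	const char *word = "Cats";
	char	   *lex = lexize_guarded(word, strlen(word));

	printf("%s -> %s\n", word, lex);	/* prints: Cats -> cat */
	free(lex);
	return 0;
}
```

The key design point mirrored from the patch is that the guard short-circuits before the stemmer runs at all, rather than truncating the input or reporting it as an unknown word, since Snowball dictionaries are defined to accept every string.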