diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 5025a449fb..d8b6921234 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -35,6 +35,7 @@ #include "regex/regex.h" #include "utils/array.h" #include "utils/builtins.h" +#include "utils/memutils.h" #include "utils/varlena.h" #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \ @@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx /* workspace for build_regexp_match_result() */ Datum *elems; /* has npatterns elements */ bool *nulls; /* has npatterns elements */ + pg_wchar *wide_str; /* wide-char version of original string */ + char *conv_buf; /* conversion buffer */ + int conv_bufsiz; /* size thereof */ } regexp_matches_ctx; /* @@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *flags, Oid collation, bool use_subpatterns, - bool ignore_degenerate); -static void cleanup_regexp_matches(regexp_matches_ctx *matchctx); + bool ignore_degenerate, + bool fetching_unmatched); static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx); static Datum build_regexp_split_result(regexp_matches_ctx *splitctx); @@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS) errhint("Use the regexp_matches function instead."))); matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, - PG_GET_COLLATION(), true, false); + PG_GET_COLLATION(), true, false, false); if (matchctx->nmatches == 0) PG_RETURN_NULL(); @@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS) matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, &re_flags, PG_GET_COLLATION(), - true, false); + true, false, false); /* Pre-create workspace that build_regexp_match_result needs */ matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns); @@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS) SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); } - /* release space in multi-call ctx to avoid intraquery memory leak */ - cleanup_regexp_matches(matchctx); - SRF_RETURN_DONE(funcctx); } @@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS) * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * - * The two bool parameters have only two patterns (one for matching, one for + * The three bool parameters have only two patterns (one for matching, one for * splitting) but it seems clearer to distinguish the functionality this way - * than to key it all off one "is_split" flag. + * than to key it all off one "is_split" flag. We don't currently assume that + * fetching_unmatched is exclusive of fetching the matched text too; if it's + * set, the conversion buffer is large enough to fetch any single matched or + * unmatched string, but not any larger substring. (In practice, when splitting + * the matches are usually small anyway, and it didn't seem worth complicating + * the code further.) */ static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, Oid collation, bool use_subpatterns, - bool ignore_degenerate) + bool ignore_degenerate, + bool fetching_unmatched) { regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); + int eml = pg_database_encoding_max_length(); int orig_len; pg_wchar *wide_str; int wide_len; @@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, int array_idx; int prev_match_end; int start_search; + int maxlen = 0; /* largest fetch length in characters */ /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; @@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, /* temporary output space for RE package */ pmatch = palloc(sizeof(regmatch_t) * pmatch_len); - /* the real output space (grown dynamically if needed) */ - array_len = re_flags->glob ? 256 : 32; + /* + * the real output space (grown dynamically if needed) + * + * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather + * than at 2^27 + */ + array_len = re_flags->glob ? 255 : 31; matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); array_idx = 0; @@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, pmatch[0].rm_eo > prev_match_end)) { /* enlarge output space if needed */ - while (array_idx + matchctx->npatterns * 2 > array_len) + while (array_idx + matchctx->npatterns * 2 + 1 > array_len) { - array_len *= 2; + array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ + if (array_len > MaxAllocSize/sizeof(int)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("too many regular expression matches"))); matchctx->match_locs = (int *) repalloc(matchctx->match_locs, sizeof(int) * array_len); } @@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, for (i = 1; i <= matchctx->npatterns; i++) { - matchctx->match_locs[array_idx++] = pmatch[i].rm_so; - matchctx->match_locs[array_idx++] = pmatch[i].rm_eo; + int so = pmatch[i].rm_so; + int eo = pmatch[i].rm_eo; + matchctx->match_locs[array_idx++] = so; + matchctx->match_locs[array_idx++] = eo; + if (so >= 0 && eo >= 0 && (eo - so) > maxlen) + maxlen = (eo - so); } } else { - matchctx->match_locs[array_idx++] = pmatch[0].rm_so; - matchctx->match_locs[array_idx++] = pmatch[0].rm_eo; + int so = pmatch[0].rm_so; + int eo = pmatch[0].rm_eo; + matchctx->match_locs[array_idx++] = so; + matchctx->match_locs[array_idx++] = eo; + if (so >= 0 && eo >= 0 && (eo - so) > maxlen) + maxlen = (eo - so); } matchctx->nmatches++; + + /* + * check length of unmatched portion between end of previous match + * and start of current one + */ + if (fetching_unmatched && + pmatch[0].rm_so >= 0 && + (pmatch[0].rm_so - prev_match_end) > maxlen) + maxlen = (pmatch[0].rm_so - prev_match_end); } prev_match_end = pmatch[0].rm_eo; @@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, break; } + /* + * check length of unmatched portion between end of last match and end of + * input string + */ + if (fetching_unmatched && + (wide_len - prev_match_end) > maxlen) + maxlen = (wide_len - prev_match_end); + + /* + * Keep a note of the end position of the string for the benefit of + * splitting code. + */ + matchctx->match_locs[array_idx] = wide_len; + + if (eml > 1) + { + int64 maxsiz = eml * (int64) maxlen; + int conv_bufsiz; + + /* + * Make the conversion buffer large enough for any substring of + * interest. + * + * Worst case: assume we need the maximum size (maxlen*eml), but take + * advantage of the fact that the original string length in bytes is an + * upper bound on the byte length of any fetched substring (and we know + * that len+1 is safe to allocate because the varlena header is longer + * than 1 byte). + */ + if (maxsiz > orig_len) + conv_bufsiz = orig_len + 1; + else + conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */ + + matchctx->conv_buf = palloc(conv_bufsiz); + matchctx->conv_bufsiz = conv_bufsiz; + matchctx->wide_str = wide_str; + } + else + { + /* No need to keep the wide string if we're in a single-byte charset. */ + pfree(wide_str); + matchctx->wide_str = NULL; + matchctx->conv_buf = NULL; + matchctx->conv_bufsiz = 0; + } + /* Clean up temp storage */ - pfree(wide_str); pfree(pmatch); return matchctx; } -/* - * cleanup_regexp_matches - release memory of a regexp_matches_ctx - */ -static void -cleanup_regexp_matches(regexp_matches_ctx *matchctx) -{ - pfree(matchctx->orig_str); - pfree(matchctx->match_locs); - if (matchctx->elems) - pfree(matchctx->elems); - if (matchctx->nulls) - pfree(matchctx->nulls); - pfree(matchctx); -} - /* * build_regexp_match_result - build output array for current match */ static ArrayType * build_regexp_match_result(regexp_matches_ctx *matchctx) { + char *buf = matchctx->conv_buf; + int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz; Datum *elems = matchctx->elems; bool *nulls = matchctx->nulls; int dims[1]; @@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx) elems[i] = (Datum) 0; nulls[i] = true; } + else if (buf) + { + int len = pg_wchar2mb_with_len(matchctx->wide_str + so, + buf, + eo - so); + Assert(len < bufsiz); + elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len)); + nulls[i] = false; + } else { elems[i] = DirectFunctionCall3(text_substr, @@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS) splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, &re_flags, PG_GET_COLLATION(), - false, true); + false, true, true); MemoryContextSwitchTo(oldcontext); funcctx->user_fctx = (void *) splitctx; @@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS) SRF_RETURN_NEXT(funcctx, result); } - /* release space in multi-call ctx to avoid intraquery memory leak */ - cleanup_regexp_matches(splitctx); - SRF_RETURN_DONE(funcctx); } @@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS) PG_GETARG_TEXT_PP(1), &re_flags, PG_GET_COLLATION(), - false, true); + false, true, true); while (splitctx->next_match <= splitctx->nmatches) { @@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS) splitctx->next_match++; } - /* - * We don't call cleanup_regexp_matches here; it would try to pfree the - * input string, which we didn't copy. The space is not in a long-lived - * memory context anyway. - */ - PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext)); } @@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS) static Datum build_regexp_split_result(regexp_matches_ctx *splitctx) { + char *buf = splitctx->conv_buf; int startpos; int endpos; @@ -1271,7 +1340,21 @@ build_regexp_split_result(regexp_matches_ctx *splitctx) if (startpos < 0) elog(ERROR, "invalid match ending position"); - if (splitctx->next_match < splitctx->nmatches) + if (buf) + { + int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz; + int len; + + endpos = splitctx->match_locs[splitctx->next_match * 2]; + if (endpos < startpos) + elog(ERROR, "invalid match starting position"); + len = pg_wchar2mb_with_len(splitctx->wide_str + startpos, + buf, + endpos-startpos); + Assert(len < bufsiz); + return PointerGetDatum(cstring_to_text_with_len(buf, len)); + } + else { endpos = splitctx->match_locs[splitctx->next_match * 2]; if (endpos < startpos) @@ -1281,13 +1364,6 @@ build_regexp_split_result(regexp_matches_ctx *splitctx) Int32GetDatum(startpos + 1), Int32GetDatum(endpos - startpos)); } - else - { - /* no more matches, return rest of string */ - return DirectFunctionCall2(text_substr_no_len, - PointerGetDatum(splitctx->orig_str), - Int32GetDatum(startpos + 1)); - } } /*