From d074b4e50d11768ab6da696b13d40ec05e4823fb Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 31 Jul 2013 11:31:22 -0400 Subject: [PATCH] Fix regexp_matches() handling of zero-length matches. We'd find the same match twice if it was of zero length and not immediately adjacent to the previous match. replace_text_regexp() got similar cases right, so adjust this search logic to match that. Note that even though the regexp_split_to_xxx() functions share this code, they did not display equivalent misbehavior, because the second match would be considered degenerate and ignored. Jeevan Chalke, with some cosmetic changes by me. --- src/backend/utils/adt/regexp.c | 13 +++--- src/backend/utils/adt/varlena.c | 5 ++- src/test/regress/expected/strings.out | 58 +++++++++++++++++++++++++++ src/test/regress/sql/strings.sql | 7 ++++ 4 files changed, 75 insertions(+), 8 deletions(-) diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 6a89fabf4b..ee37dfe991 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -957,14 +957,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags, break; /* - * Advance search position. Normally we start just after the end of - * the previous match, but always advance at least one character (the - * special case can occur if the pattern matches zero characters just - * after the prior match or at the end of the string). + * Advance search position. Normally we start the next search at the + * end of the previous match; but if the match was of zero length, we + * have to advance by one character, or we'd just find the same match + * again. */ - if (start_search < pmatch[0].rm_eo) - start_search = pmatch[0].rm_eo; - else + start_search = prev_match_end; + if (pmatch[0].rm_so == pmatch[0].rm_eo) start_search++; if (start_search > wide_len) break; diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 4aa344896f..5e2c2ddc53 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -3083,7 +3083,10 @@ replace_text_regexp(text *src_text, void *regexp, break; /* - * Search from next character when the matching text is zero width. + * Advance search position. Normally we start the next search at the + * end of the previous match; but if the match was of zero length, we + * have to advance by one character, or we'd just find the same match + * again. */ search_start = data_pos; if (pmatch[0].rm_so == pmatch[0].rm_eo) diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 281c69528a..19708c32fd 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -440,6 +440,64 @@ SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$); {barbeque} (1 row) +-- start/end-of-line matches are of zero length +SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^', 'mg'); + regexp_matches +---------------- + {""} + {""} + {""} + {""} +(4 rows) + +SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '$', 'mg'); + regexp_matches +---------------- + {""} + {""} + {""} + {""} +(4 rows) + +SELECT regexp_matches('1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '^.?', 'mg'); + regexp_matches +---------------- + {1} + {2} + {3} + {4} + {""} +(5 rows) + +SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '.?$', 'mg'); + regexp_matches +---------------- + {""} + {1} + {""} + {2} + {""} + {3} + {""} + {4} + {""} + {""} +(10 rows) + +SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4', '.?$', 'mg'); + regexp_matches +---------------- + {""} + {1} + {""} + {2} + {""} + {3} + {""} + {4} + {""} +(9 rows) + -- give me errors SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz'); ERROR: invalid regexp option: "z" diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index e7841aa20d..f9cfaeb44a 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -158,6 +158,13 @@ SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$); -- no capture groups SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$); +-- start/end-of-line matches are of zero length +SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '^', 'mg'); +SELECT regexp_matches('foo' || chr(10) || 'bar' || chr(10) || 'bequq' || chr(10) || 'baz', '$', 'mg'); +SELECT regexp_matches('1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '^.?', 'mg'); +SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4' || chr(10), '.?$', 'mg'); +SELECT regexp_matches(chr(10) || '1' || chr(10) || '2' || chr(10) || '3' || chr(10) || '4', '.?$', 'mg'); + -- give me errors SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'gz'); SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);