postgresql/src/test/regress/expected/regex.out

--
-- Regular expression tests
--
-- Don't want to have to double backslashes in regexes
set standard_conforming_strings = on;
-- Test simple quantified backrefs
select 'bbbbb' ~ '^([bc])\1*$' as t;
 t 
---
 t
(1 row)

select 'ccc' ~ '^([bc])\1*$' as t;
 t 
---
 t
(1 row)

select 'xxx' ~ '^([bc])\1*$' as f;
 f 
---
 f
(1 row)

select 'bbc' ~ '^([bc])\1*$' as f;
 f 
---
 f
(1 row)

select 'b' ~ '^([bc])\1*$' as t;
 t 
---
 t
(1 row)

-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
 t 
---
 t
(1 row)

select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
 f 
---
 f
(1 row)

select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
 f 
---
 f
(1 row)

select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
 t 
---
 t
(1 row)

select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
 f 
---
 f
(1 row)

select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
 f 
---
 f
(1 row)

-- Test some cases that crashed in 9.2beta1 due to pmatch[] array overrun
select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');
 substring 
-----------
 foo
(1 row)

select substring('a' from '((a))+');
 substring 
-----------
 a
(1 row)

select substring('a' from '((a)+)');
 substring 
-----------
 a
(1 row)

-- Test conversion of regex patterns to indexable conditions
explain (costs off) select * from pg_proc where proname ~ 'abc';
            QUERY PLAN             
-----------------------------------
 Seq Scan on pg_proc
   Filter: (proname ~ 'abc'::text)
(2 rows)

explain (costs off) select * from pg_proc where proname ~ '^abc';
                              QUERY PLAN                              
----------------------------------------------------------------------
 Index Scan using pg_proc_proname_args_nsp_index on pg_proc
   Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))
   Filter: (proname ~ '^abc'::text)
(3 rows)

explain (costs off) select * from pg_proc where proname ~ '^abc$';
                         QUERY PLAN                         
------------------------------------------------------------
 Index Scan using pg_proc_proname_args_nsp_index on pg_proc
   Index Cond: (proname = 'abc'::name)
   Filter: (proname ~ '^abc$'::text)
(3 rows)

explain (costs off) select * from pg_proc where proname ~ '^abcd*e';
                              QUERY PLAN                              
----------------------------------------------------------------------
 Index Scan using pg_proc_proname_args_nsp_index on pg_proc
   Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))
   Filter: (proname ~ '^abcd*e'::text)
(3 rows)

explain (costs off) select * from pg_proc where proname ~ '^abc+d';
                              QUERY PLAN                              
----------------------------------------------------------------------
 Index Scan using pg_proc_proname_args_nsp_index on pg_proc
   Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))
   Filter: (proname ~ '^abc+d'::text)
(3 rows)

explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
                                 QUERY PLAN                                 
----------------------------------------------------------------------------
 Index Scan using pg_proc_proname_args_nsp_index on pg_proc
   Index Cond: ((proname >= 'abcdef'::name) AND (proname < 'abcdeg'::name))
   Filter: (proname ~ '^(abc)(def)'::text)
(3 rows)

explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
                         QUERY PLAN                         
------------------------------------------------------------
 Index Scan using pg_proc_proname_args_nsp_index on pg_proc
   Index Cond: (proname = 'abc'::name)
   Filter: (proname ~ '^(abc)$'::text)
(3 rows)

explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
               QUERY PLAN               
----------------------------------------
 Seq Scan on pg_proc
   Filter: (proname ~ '^(abc)?d'::text)
(2 rows)

-- Test for infinite loop in pullback() (CVE-2007-4772)
select 'a' ~ '($|^)*';
 ?column? 
----------
 t
(1 row)

-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
select 'a' ~ '((((((a)*)*)*)*)*)*';
 ?column? 
----------
 t
(1 row)

select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';
 ?column? 
----------
 t
(1 row)

-- Test backref in combination with non-greedy quantifier
-- https://core.tcl.tk/tcl/tktview/6585b21ca8fa6f3678d442b97241fdd43dba2ec0
select 'Programmer' ~ '(\w).*?\1' as t;
 t 
---
 t
(1 row)

select regexp_matches('Programmer', '(\w)(.*?\1)', 'g');
 regexp_matches 
----------------
 {r,ogr}
 {m,m}
(2 rows)

-- Test for proper matching of non-greedy iteration (bug #11478)
select regexp_matches('foo/bar/baz',
                      '^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', '');
 regexp_matches 
----------------
 {foo,bar,baz}
(1 row)

-- Test for infinite loop in cfindloop with zero-length possible match
-- but no actual match (can only happen in the presence of backrefs)
select 'a' ~ '$()|^\1';
 ?column? 
----------
 f
(1 row)

select 'a' ~ '.. ()|\1';
 ?column? 
----------
 f
(1 row)

select 'a' ~ '()*\1';
 ?column? 
----------
 t
(1 row)

select 'a' ~ '()+\1';
 ?column? 
----------
 t
(1 row)
Fix regex back-references that are directly quantified with *. The syntax "\n*", that is a backref with a * quantifier directly applied to it, has never worked correctly in Spencer's library. This has been an open bug in the Tcl bug tracker since 2005: https://sourceforge.net/tracker/index.php?func=detail&aid=1115587&group_id=10894&atid=110894 The core of the problem is in parseqatom(), which first changes "\n*" to "\n+|" and then applies repeat() to the NFA representing the backref atom. repeat() thinks that any arc leading into its "rp" argument is part of the sub-NFA to be repeated. Unfortunately, since parseqatom() already created the arc that was intended to represent the empty bypass around "\n+", this arc gets moved too, so that it now leads into the state loop created by repeat(). Thus, what was supposed to be an "empty" bypass gets turned into something that represents zero or more repetitions of the NFA representing the backref atom. In the original example, in place of ^([bc])\1*$ we now have something that acts like ^([bc])(\1+|[bc]*)$ At runtime, the branch involving the actual backref fails, as it's supposed to, but then the other branch succeeds anyway. We could no doubt fix this by some rearrangement of the operations in parseqatom(), but that code is plenty ugly already, and what's more the whole business of converting "x*" to "x+|" probably needs to go away to fix another problem I'll mention in a moment. Instead, this patch suppresses the *-conversion when the target is a simple backref atom, leaving the case of m == 0 to be handled at runtime. This makes the patch in regcomp.c a one-liner, at the cost of having to tweak cbrdissect() a little. In the event I went a bit further than that and rewrote cbrdissect() to check all the string-length-related conditions before it starts comparing characters. 
It seems a bit stupid to possibly iterate through many copies of an n-character backreference, only to fail at the end because the target string's length isn't a multiple of n --- we could have found that out before starting. The existing coding could only be a win if integer division is hugely expensive compared to character comparison, but I don't know of any modern machine where that might be true. This does not fix all the problems with quantified back-references. In particular, the code is still broken for back-references that appear within a larger expression that is quantified (so that direct insertion of the quantification limits into the BACKREF node doesn't apply). I think fixing that will take some major surgery on the NFA code, specifically introducing an explicit iteration node type instead of trying to transform iteration into concatenation of modified regexps. Back-patch to all supported branches. In HEAD, also add a regression test case for this. (It may seem a bit silly to create a regression test file for just one test case; but I'm expecting that we will soon import a whole bunch of regex regression tests from Tcl, so might as well create the infrastructure now.) 2012-02-20 06:52:33 +01:00			`--`
			`-- Regular expression tests`
			`--`
			`-- Don't want to have to double backslashes in regexes`
			`set standard_conforming_strings = on;`
			`-- Test simple quantified backrefs`
			`select 'bbbbb' ~ '^([bc])\1*$' as t;`
			`t`
			`---`
			`t`
			`(1 row)`

			`select 'ccc' ~ '^([bc])\1*$' as t;`
			`t`
			`---`
			`t`
			`(1 row)`

			`select 'xxx' ~ '^([bc])\1*$' as f;`
			`f`
			`---`
			`f`
			`(1 row)`

			`select 'bbc' ~ '^([bc])\1*$' as f;`
			`f`
			`---`
			`f`
			`(1 row)`

			`select 'b' ~ '^([bc])\1*$' as t;`
			`t`
			`---`
			`t`
			`(1 row)`

Fix the general case of quantified regex back-references. Cases where a back-reference is part of a larger subexpression that is quantified have never worked in Spencer's regex engine, because he used a compile-time transformation that neglected the need to check the back-reference match in iterations before the last one. (That was okay for capturing parens, and we still do it if the regex has only capturing parens ... but it's not okay for backrefs.) To make this work properly, we have to add an "iteration" node type to the regex engine's vocabulary of sub-regex nodes. Since this is a moderately large change with a fair risk of introducing new bugs of its own, apply to HEAD only, even though it's a fix for a longstanding bug. 2012-02-24 07:40:18 +01:00			`-- Test quantified backref within a larger expression`
			`select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;`
			`t`
			`---`
			`t`
			`(1 row)`

			`select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;`
			`f`
			`---`
			`f`
			`(1 row)`

			`select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;`
			`f`
			`---`
			`f`
			`(1 row)`

			`select 'abc abc abc' ~ '^(.+)( \1)+$' as t;`
			`t`
			`---`
			`t`
			`(1 row)`

			`select 'abc abd abc' ~ '^(.+)( \1)+$' as f;`
			`f`
			`---`
			`f`
			`(1 row)`

			`select 'abc abc abd' ~ '^(.+)( \1)+$' as f;`
			`f`
			`---`
			`f`
			`(1 row)`

Fix array overrun in regex code. zaptreesubs() was coded to unconditionally reset a capture subre's corresponding pmatch[] entry. However, in regexes without backrefs, that array is caller-supplied and might not have as many entries as the regex has capturing parens. So check the array length and do nothing if there is no corresponding entry, much as subset() does. Failure to check this resulted in a stack clobber in the case reported by Marko Kreen. This bug appears to have been latent in the regex library from the beginning. It was not exposed because find() called dissect() not cdissect(), and the dissect() code path didn't ever call zaptreesubs() (formerly zapmem()). When I unified dissect() and cdissect() in commit 4dd78bf37aa29d04b3f358b08c4a2fa43cf828e7, the problem was exposed. Now that I've seen this, I'm rather suspicious that we might need to back-patch it; but will refrain for now, for lack of evidence that the case can be hit in the previous coding. 2012-05-24 19:56:16 +02:00			`-- Test some cases that crashed in 9.2beta1 due to pmatch[] array overrun`
			`select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');`
			`substring`
			`-----------`
			`foo`
			`(1 row)`

			`select substring('a' from '((a))+');`
			`substring`
			`-----------`
			`a`
			`(1 row)`

			`select substring('a' from '((a)+)');`
			`substring`
			`-----------`
			`a`
			`(1 row)`

Re-implement extraction of fixed prefixes from regular expressions. To generate btree-indexable conditions from regex WHERE conditions (such as WHERE indexed_col ~ '^foo'), we need to be able to identify any fixed prefix that a regex might have; that is, find any string that must be a prefix of all strings satisfying the regex. We used to do that with entirely ad-hoc code that looked at the source text of the regex. It didn't know very much about regex syntax, which mostly meant that it would fail to identify some optimizable cases; but Viktor Rosenfeld reported that it would produce actively wrong answers for quantified parenthesized subexpressions, such as '^(foo)?bar'. Rather than trying to extend the ad-hoc code to cover this, let's get rid of it altogether in favor of identifying prefixes by examining the compiled form of a regex. To do this, I've added a new entry point "pg_regprefix" to the regex library; hopefully it is defined in a sufficiently general fashion that it can remain in the library when/if that code gets split out as a standalone project. Since this bug has been there for a very long time, this fix needs to get back-patched. However it depends on some other recent commits (particularly the addition of wchar-to-database-encoding conversion), so I'll commit this separately and then go to work on back-porting the necessary fixes. 2012-07-10 20:54:37 +02:00			`-- Test conversion of regex patterns to indexable conditions`
			`explain (costs off) select * from pg_proc where proname ~ 'abc';`
			`QUERY PLAN`
			`-----------------------------------`
			`Seq Scan on pg_proc`
			`Filter: (proname ~ 'abc'::text)`
			`(2 rows)`

			`explain (costs off) select * from pg_proc where proname ~ '^abc';`
			`QUERY PLAN`
			`----------------------------------------------------------------------`
			`Index Scan using pg_proc_proname_args_nsp_index on pg_proc`
			`Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))`
			`Filter: (proname ~ '^abc'::text)`
			`(3 rows)`

			`explain (costs off) select * from pg_proc where proname ~ '^abc$';`
			`QUERY PLAN`
			`------------------------------------------------------------`
			`Index Scan using pg_proc_proname_args_nsp_index on pg_proc`
			`Index Cond: (proname = 'abc'::name)`
			`Filter: (proname ~ '^abc$'::text)`
			`(3 rows)`

			`explain (costs off) select * from pg_proc where proname ~ '^abcd*e';`
			`QUERY PLAN`
			`----------------------------------------------------------------------`
			`Index Scan using pg_proc_proname_args_nsp_index on pg_proc`
			`Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))`
			`Filter: (proname ~ '^abcd*e'::text)`
			`(3 rows)`

			`explain (costs off) select * from pg_proc where proname ~ '^abc+d';`
			`QUERY PLAN`
			`----------------------------------------------------------------------`
			`Index Scan using pg_proc_proname_args_nsp_index on pg_proc`
			`Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))`
			`Filter: (proname ~ '^abc+d'::text)`
			`(3 rows)`

			`explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';`
			`QUERY PLAN`
			`----------------------------------------------------------------------------`
			`Index Scan using pg_proc_proname_args_nsp_index on pg_proc`
			`Index Cond: ((proname >= 'abcdef'::name) AND (proname < 'abcdeg'::name))`
			`Filter: (proname ~ '^(abc)(def)'::text)`
			`(3 rows)`

			`explain (costs off) select * from pg_proc where proname ~ '^(abc)$';`
			`QUERY PLAN`
			`------------------------------------------------------------`
			`Index Scan using pg_proc_proname_args_nsp_index on pg_proc`
			`Index Cond: (proname = 'abc'::name)`
			`Filter: (proname ~ '^(abc)$'::text)`
			`(3 rows)`

			`explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';`
			`QUERY PLAN`
			`----------------------------------------`
			`Seq Scan on pg_proc`
			`Filter: (proname ~ '^(abc)?d'::text)`
			`(2 rows)`

Fix infinite-loop risk in fixempties() stage of regex compilation. The previous coding of this function could get into situations where it would never terminate, because successive passes would re-add EMPTY arcs that had been removed by the previous pass. Rewrite the function completely using a new algorithm that is guaranteed to terminate, and also seems to be usually faster than the old one. Per Tcl bugs 3604074 and 3606683. Tom Lane and Don Porter 2013-03-07 17:51:03 +01:00			`-- Test for infinite loop in pullback() (CVE-2007-4772)`
			`select 'a' ~ '($|^)*';`
			`?column?`
			`----------`
			`t`
			`(1 row)`

			`-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)`
			`select 'a' ~ '((((((a)*)*)*)*)*)*';`
			`?column?`
			`----------`
			`t`
			`(1 row)`

			`select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';`
			`?column?`
			`----------`
			`t`
			`(1 row)`

Fix regex match failures for backrefs combined with non-greedy quantifiers. An ancient logic error in cfindloop() could cause the regex engine to fail to find matches that begin later than the start of the string. This function is only used when the regex pattern contains a back reference, and so far as we can tell the error is only reachable if the pattern is non-greedy (i.e. its first quantifier uses the ? modifier). Furthermore, the actual match must begin after some potential match that satisfies the DFA but then fails the back-reference's match test. Reported and fixed by Jeevan Chalke, with cosmetic adjustments by me. 2013-07-19 03:22:37 +02:00			`-- Test backref in combination with non-greedy quantifier`
			`-- https://core.tcl.tk/tcl/tktview/6585b21ca8fa6f3678d442b97241fdd43dba2ec0`
			`select 'Programmer' ~ '(\w).*?\1' as t;`
			`t`
			`---`
			`t`
			`(1 row)`

			`select regexp_matches('Programmer', '(\w)(.*?\1)', 'g');`
			`regexp_matches`
			`----------------`
			`{r,ogr}`
			`{m,m}`
			`(2 rows)`

Fix incorrect search for "x?" style matches in creviterdissect(). When the number of allowed iterations is limited (either a "?" quantifier or a bound expression), the last sub-match has to reach to the end of the target string. The previous coding here first tried the shortest possible match (one character, usually) and then gave up and back-tracked if that didn't work, typically leading to failure to match overall, as shown in bug #11478 from Christoph Berg. The minimum change to fix that would be to not decrement k before "goto backtrack"; but that would be a pretty stupid solution, because we'd laboriously try each possible sub-match length before finally discovering that only ending at the end can work. Instead, force the sub-match endpoint limit up to the end for even the first shortest() call if we cannot have any more sub-matches after this one. Bug introduced in my rewrite that added the iterdissect logic, commit 173e29aa5deefd9e71c183583ba37805c8102a72. The shortest-first search code was too closely modeled on the longest-first code, which hasn't got this issue since it tries a match reaching to the end to start with anyway. Back-patch to all affected branches. 2014-09-24 02:25:31 +02:00			`-- Test for proper matching of non-greedy iteration (bug #11478)`
			`select regexp_matches('foo/bar/baz',`
			`'^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', '');`
			`regexp_matches`
			`----------------`
			`{foo,bar,baz}`
			`(1 row)`

Fix potential infinite loop in regular expression execution. In cfindloop(), if the initial call to shortest() reports that a zero-length match is possible at the current search start point, but then it is unable to construct any actual match to that, it'll just loop around with the same start point, and thus make no progress. We need to force the start point to be advanced. This is safe because the loop over "begin" points has already tried and failed to match starting at "close", so there is surely no need to try that again. This bug was introduced in commit e2bd904955e2221eddf01110b1f25002de2aaa83, wherein we allowed continued searching after we'd run out of match possibilities, but evidently failed to think hard enough about exactly where we needed to search next. Because of the way this code works, such a match failure is only possible in the presence of backrefs --- otherwise, shortest()'s judgment that a match is possible should always be correct. That probably explains how come the bug has escaped detection for several years. The actual fix is a one-liner, but I took the trouble to add/improve some comments related to the loop logic. After fixing that, the submitted test case "()*\1" didn't loop anymore. But it reported failure, though it seems like it ought to match a zero-length string; both Tcl and Perl think it does. That seems to be from overenthusiastic optimization on my part when I rewrote the iteration match logic in commit 173e29aa5deefd9e71c183583ba37805c8102a72: we can't just "declare victory" for a zero-length match without bothering to set match data for capturing parens inside the iterator node. Per fuzz testing by Greg Stark. The first part of this is a bug in all supported branches, and the second part is a bug since 9.2 where the iteration rewrite happened. 2015-10-02 20:26:36 +02:00			`-- Test for infinite loop in cfindloop with zero-length possible match`
			`-- but no actual match (can only happen in the presence of backrefs)`
			`select 'a' ~ '$()|^\1';`
			`?column?`
			`----------`
			`f`
			`(1 row)`

			`select 'a' ~ '.. ()|\1';`
			`?column?`
			`----------`
			`f`
			`(1 row)`

			`select 'a' ~ '()*\1';`
			`?column?`
			`----------`
			`t`
			`(1 row)`

			`select 'a' ~ '()+\1';`
			`?column?`
			`----------`
			`t`
			`(1 row)`