2011-02-14 02:06:41 +01:00
|
|
|
-- Install the pg_trgm extension that the rest of this script exercises.
CREATE EXTENSION pg_trgm;
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2016-11-29 21:05:22 +01:00
|
|
|
-- Check whether any of our opclasses fail amvalidate
-- (the query lists access method / opclass name pairs for which
-- amvalidate() returns false).
-- NOTE(review): opc.oid >= 16384 presumably limits the check to
-- non-built-in opclasses -- confirm.
|
|
|
|
SELECT amname, opcname
|
|
|
|
FROM pg_opclass opc LEFT JOIN pg_am am ON am.oid = opcmethod
|
|
|
|
WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid);
|
|
|
|
|
2017-12-12 12:59:27 +01:00
|
|
|
--backslash is used in tests below, installcheck will fail if
|
|
|
|
--standard_conforming_strings is off
|
|
|
|
set standard_conforming_strings=on;
|
|
|
|
|
Change floating-point output format for improved performance.
Previously, floating-point output was done by rounding to a specific
decimal precision; by default, to 6 or 15 decimal digits (losing
information) or as requested using extra_float_digits. Drivers that
wanted exact float values, and applications like pg_dump that must
preserve values exactly, set extra_float_digits=3 (or sometimes 2 for
historical reasons, though this isn't enough for float4).
Unfortunately, decimal rounded output is slow enough to become a
noticeable bottleneck when dealing with large result sets or COPY of
large tables when many floating-point values are involved.
Floating-point output can be done much faster when the output is not
rounded to a specific decimal length, but rather is chosen as the
shortest decimal representation that is closer to the original float
value than to any other value representable in the same precision. The
recently published Ryu algorithm by Ulf Adams is both relatively
simple and remarkably fast.
Accordingly, change float4out/float8out to output shortest decimal
representations if extra_float_digits is greater than 0, and make that
the new default. Applications that need rounded output can set
extra_float_digits back to 0 or below, and take the resulting
performance hit.
We make one concession to portability for systems with buggy
floating-point input: we do not output decimal values that fall
exactly halfway between adjacent representable binary values (which
would rely on the reader doing round-to-nearest-even correctly). This
is known to be a problem at least for VS2013 on Windows.
Our version of the Ryu code originates from
https://github.com/ulfjack/ryu/ at commit c9c3fb1979, but with the
following (significant) modifications:
- Output format is changed to use fixed-point notation for small
exponents, as printf would, and also to use lowercase 'e', a
minimum of 2 exponent digits, and a mandatory sign on the exponent,
to keep the formatting as close as possible to previous output.
- The output of exact midpoint values is disabled as noted above.
- The integer fast-path code is changed somewhat (since we have
fixed-point output and the upstream did not).
- Our project style has been largely applied to the code with the
exception of C99 declaration-after-statement, which has been
retained as an exception to our present policy.
- Most of upstream's debugging and conditionals are removed, and we
use our own configure tests to determine things like uint128
availability.
Changing the float output format obviously affects a number of
regression tests. This patch uses an explicit setting of
extra_float_digits=0 for test output that is not expected to be
exactly reproducible (e.g. due to numerical instability or differing
algorithms for transcendental functions).
Conversions from floats to numeric are unchanged by this patch. These
may appear in index expressions and it is not yet clear whether any
change should be made, so that can be left for another day.
This patch assumes that the only supported floating point format is
now IEEE format, and the documentation is updated to reflect that.
Code by me, adapting the work of Ulf Adams and other contributors.
References:
https://dl.acm.org/citation.cfm?id=3192369
Reviewed-by: Tom Lane, Andres Freund, Donald Dong
Discussion: https://postgr.es/m/87r2el1bx6.fsf@news-spur.riddles.org.uk
2019-02-13 16:20:33 +01:00
|
|
|
-- reduce noise: extra_float_digits = 0 selects rounded float output
-- instead of the shortest-exact representation
|
|
|
|
set extra_float_digits = 0;
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
-- show_trgm() on the empty string, on punctuation-only input, and on
-- short words with varying padding, letter case and trailing symbols
select show_trgm('');
|
|
|
|
select show_trgm('(*&^$@%@');
|
|
|
|
select show_trgm('a b c');
|
|
|
|
select show_trgm(' a b c ');
|
|
|
|
select show_trgm('aA bB cC');
|
|
|
|
select show_trgm(' aA bB cC ');
|
|
|
|
select show_trgm('a b C0*%^');
|
|
|
|
|
|
|
|
-- similarity() between literals that differ in letter case and padding,
-- and between literals made up only of punctuation characters
select similarity('wow','WOWa ');
|
|
|
|
select similarity('wow',' WOW ');
|
|
|
|
|
2013-02-13 20:07:06 +01:00
|
|
|
select similarity('---', '####---');
|
|
|
|
|
2016-03-16 16:59:21 +01:00
|
|
|
-- Test corpus: a single text column, filled from data/trgm.data with
-- psql's \copy.
-- NOTE(review): COLLATE "C" presumably keeps the "order by ..., t"
-- results below independent of the database locale -- confirm.
CREATE TABLE test_trgm(t text COLLATE "C");
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2013-04-09 07:05:55 +02:00
|
|
|
\copy test_trgm from 'data/trgm.data'
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
-- Baseline: similarity searches (% operator) and a distance-ordered
-- search (<->) run before any index has been created on test_trgm.
-- The same queries are repeated further down after each index build.
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
|
|
|
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
|
|
|
|
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
|
2010-12-04 06:16:21 +01:00
|
|
|
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
|
Fix contrib/pg_trgm's extraction of trigrams from regular expressions.
The logic for removing excess trigrams from the result was faulty.
It intends to avoid merging the initial and final states of the NFA,
which is necessary, but in testing whether removal of a specific trigram
would cause that, it failed to consider the combined effects of all the
state merges that that trigram's removal would cause. This could result
in a broken final graph that would never match anything, leading to GIN
or GiST indexscans not finding anything.
To fix, add a "tentParent" field that is used only within this loop,
and set it to show state merges that we are tentatively going to do.
While examining a particular arc, we must chase up through tentParent
links as well as regular parent links (the former can only appear atop
the latter), and we must account for state init/fin flag merges that
haven't actually been done yet.
To simplify the latter, combine the separate init and fin bool fields
into a bitmap flags field. I also chose to get rid of the "children"
state list, which seems entirely inessential.
Per bug #14563 from Alexey Isayko, which the added test cases are based on.
Back-patch to 9.3 where this code was added.
Report: https://postgr.es/m/20170222111446.1256.67547@wrigleys.postgresql.org
Discussion: https://postgr.es/m/8816.1487787594@sss.pgh.pa.us
2017-02-22 21:04:07 +01:00
|
|
|
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
-- Build a GiST trigram index with default opclass options and repeat the
-- similarity searches; sequential scans are disabled first.
-- NOTE(review): disabling seqscan is presumably what steers the planner
-- to the new index -- confirm with the explain output further down.
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
|
|
|
|
set enable_seqscan=off;
|
|
|
|
|
|
|
|
|
|
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
|
|
|
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
|
|
|
|
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
|
Implement operator class parameters
PostgreSQL provides a set of template index access methods, where opclasses have
much freedom in the semantics of indexing. These index AMs are GiST, GIN,
SP-GiST and BRIN. Their opclasses define the representation of keys, operations on
them and supported search strategies. So, it's natural that opclasses may
face some tradeoffs, which require a user-side decision. This commit implements
opclass parameters allowing users to set some values, which tell the opclass how to
index the particular dataset.
This commit doesn't introduce new storage in system catalog. Instead it uses
pg_attribute.attoptions, which is used for table column storage options but
unused for index attributes.
In order to avoid changing the signature of each opclass support function, we
implement a unified way to pass options to opclass support functions. Options
are set in fn_expr as a constant bytea expression. This is possible due to the
fact that opclass support functions are executed outside of expressions, so
fn_expr is unused for them.
This commit comes with some examples of opclass options usage. We parametrize
signature length in GiST. That applies to multiple opclasses: tsvector_ops,
gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and
gist_hstore_ops. Also we parametrize maximum number of integer ranges for
gist__int_ops. However, the main future usage of this feature is expected
to be json, where users would be able to specify which way to index particular
json parts.
Catversion is bumped.
Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru
Author: Nikita Glukhov, revised by me
Reviewed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
|
|
|
explain (costs off)
|
|
|
|
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
|
|
|
|
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
|
|
|
|
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
|
|
|
|
|
|
|
|
-- Replace the GiST index with one that sets the siglen (signature length)
-- opclass parameter explicitly.
-- NOTE(review): all three CREATE INDEX statements use the name trgm_idx,
-- so the siglen=0 and siglen=2025 attempts are presumably expected to be
-- rejected as out of range, leaving siglen=2024 as the one that succeeds
-- -- confirm against the expected output.
drop index trgm_idx;
|
|
|
|
create index trgm_idx on test_trgm using gist (t gist_trgm_ops(siglen=0));
|
|
|
|
create index trgm_idx on test_trgm using gist (t gist_trgm_ops(siglen=2025));
|
|
|
|
create index trgm_idx on test_trgm using gist (t gist_trgm_ops(siglen=2024));
|
|
|
|
set enable_seqscan=off;
|
|
|
|
|
|
|
|
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
|
|
|
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
|
|
|
|
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
|
2010-12-04 06:16:21 +01:00
|
|
|
explain (costs off)
|
|
|
|
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
|
|
|
|
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
|
Fix contrib/pg_trgm's extraction of trigrams from regular expressions.
The logic for removing excess trigrams from the result was faulty.
It intends to avoid merging the initial and final states of the NFA,
which is necessary, but in testing whether removal of a specific trigram
would cause that, it failed to consider the combined effects of all the
state merges that that trigram's removal would cause. This could result
in a broken final graph that would never match anything, leading to GIN
or GiST indexscans not finding anything.
To fix, add a "tentParent" field that is used only within this loop,
and set it to show state merges that we are tentatively going to do.
While examining a particular arc, we must chase up through tentParent
links as well as regular parent links (the former can only appear atop
the latter), and we must account for state init/fin flag merges that
haven't actually been done yet.
To simplify the latter, combine the separate init and fin bool fields
into a bitmap flags field. I also chose to get rid of the "children"
state list, which seems entirely inessential.
Per bug #14563 from Alexey Isayko, which the added test cases are based on.
Back-patch to 9.3 where this code was added.
Report: https://postgr.es/m/20170222111446.1256.67547@wrigleys.postgresql.org
Discussion: https://postgr.es/m/8816.1487787594@sss.pgh.pa.us
2017-02-22 21:04:07 +01:00
|
|
|
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2007-03-14 15:15:40 +01:00
|
|
|
-- Swap the GiST index for a GIN trigram index under the same name; the
-- similarity and regex queries that follow are run against it with
-- sequential scans disabled.
drop index trgm_idx;
|
|
|
|
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
|
|
|
|
set enable_seqscan=off;
|
|
|
|
|
|
|
|
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
|
|
|
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
|
|
|
|
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
|
Fix contrib/pg_trgm's extraction of trigrams from regular expressions.
The logic for removing excess trigrams from the result was faulty.
It intends to avoid merging the initial and final states of the NFA,
which is necessary, but in testing whether removal of a specific trigram
would cause that, it failed to consider the combined effects of all the
state merges that that trigram's removal would cause. This could result
in a broken final graph that would never match anything, leading to GIN
or GiST indexscans not finding anything.
To fix, add a "tentParent" field that is used only within this loop,
and set it to show state merges that we are tentatively going to do.
While examining a particular arc, we must chase up through tentParent
links as well as regular parent links (the former can only appear atop
the latter), and we must account for state init/fin flag merges that
haven't actually been done yet.
To simplify the latter, combine the separate init and fin bool fields
into a bitmap flags field. I also chose to get rid of the "children"
state list, which seems entirely inessential.
Per bug #14563 from Alexey Isayko, which the added test cases are based on.
Back-patch to 9.3 where this code was added.
Report: https://postgr.es/m/20170222111446.1256.67547@wrigleys.postgresql.org
Discussion: https://postgr.es/m/8816.1487787594@sss.pgh.pa.us
2017-02-22 21:04:07 +01:00
|
|
|
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
|
2011-02-01 03:33:55 +01:00
|
|
|
|
Avoid full scan of GIN indexes when possible
The strategy of a GIN index scan is driven by the opclass-specific extract_query
method. This method may report that the needed search mode is GIN_SEARCH_MODE_ALL.
This mode means that a matching tuple may contain none of the extracted entries.
A simple example is the '!term' tsquery, which doesn't need any term to exist in
a matching tsvector.
In order to handle such a scan key, GIN calculates a virtual entry, which contains
all TIDs of all entries of the attribute. In fact this is a full scan of the index
attribute. Typically this is very slow, but it allows some queries to be handled
correctly in GIN. However, the current algorithm calculates such a virtual entry for
each GIN_SEARCH_MODE_ALL scan key even if there are multiple of them for the same
attribute. This is clearly not optimal.
This commit improves the situation by introducing "exclude only" scan keys.
Such scan keys are not capable of returning a set of matching TIDs. Instead, they
are capable only of filtering TIDs produced by normal scan keys. Therefore,
each attribute should contain at least one normal scan key, while the rest of them
may be "exclude only" if the search mode is GIN_SEARCH_MODE_ALL.
The same optimization might be applied to the whole scan, not per-attribute.
But that leads to NULL values elimination problem. There is trade-off between
multiple possible ways to do this. We probably want to do this later using
some cost-based decision algorithm.
Discussion: https://postgr.es/m/CAOBaU_YGP5-BEt5Cc0%3DzMve92vocPzD%2BXiZgiZs1kjY0cj%3DXBg%40mail.gmail.com
Author: Nikita Glukhov, Alexander Korotkov, Tom Lane, Julien Rouhaud
Reviewed-by: Julien Rouhaud, Tomas Vondra, Tom Lane
2020-01-17 23:11:39 +01:00
|
|
|
-- check handling of indexquals that generate no searchable conditions
|
|
|
|
explain (costs off)
|
|
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qwerty%';
|
|
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qwerty%';
|
|
|
|
explain (costs off)
|
|
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qw%';
|
|
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qw%';
|
|
|
|
-- ensure that pending-list items are handled correctly, too
|
|
|
|
create temp table t_test_trgm(t text COLLATE "C");
|
|
|
|
create index t_trgm_idx on t_test_trgm using gin (t gin_trgm_ops);
|
|
|
|
insert into t_test_trgm values ('qwerty99'), ('qwerty01');
|
|
|
|
explain (costs off)
|
|
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%';
|
|
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%';
|
|
|
|
explain (costs off)
|
|
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qw%';
|
|
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qw%';
|
|
|
|
|
|
|
|
-- run the same queries with sequential scan to check the results
-- (bitmap scans are switched off and sequential scans back on for these
-- four counts; afterwards only enable_bitmapscan is reset, so
-- enable_seqscan remains on until it is set again)
|
|
|
|
set enable_bitmapscan=off;
|
|
|
|
set enable_seqscan=on;
|
|
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qwerty%';
|
|
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qw%';
|
|
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%';
|
|
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qw%';
|
|
|
|
reset enable_bitmapscan;
|
|
|
|
|
2016-03-16 16:59:21 +01:00
|
|
|
-- Small hand-written table queried by the LIKE/ILIKE, regular-expression
-- and equality tests that follow; more rows are inserted below.
create table test2(t text COLLATE "C");
|
2011-02-01 03:33:55 +01:00
|
|
|
insert into test2 values ('abcdef');
|
|
|
|
insert into test2 values ('quark');
|
2013-04-09 07:05:55 +02:00
|
|
|
insert into test2 values (' z foo bar');
|
Further fix pg_trgm's extraction of trigrams from regular expressions.
Commit 9e43e8714 turns out to have been insufficient: not only is it
necessary to track tentative parent links while considering a set of
arc removals, but it's necessary to track tentative flag additions
as well. This is because we always merge arc target states into
arc source states; therefore, when considering a merge of the final
state with some other, it is the other state that will acquire a new
TSTATE_FIN bit. If there's another arc for the same color trigram
that would cause merging of that state with the initial state, we
failed to recognize the problem. The test cases for the prior commit
evidently only exercised situations where a tentative merge with the
initial state occurs before one with the final state. If it goes the
other way around, we'll happily merge the initial and final states,
either producing a broken final graph that would never match anything,
or triggering the Assert added by the prior commit.
It's tempting to consider switching the merge direction when the merge
involves the final state, but I lack the time to analyze that idea in
detail. Instead just keep track of the flag changes that would result
from proposed merges, in the same way that the prior commit tracked
proposed parent links.
Along the way, add some more debugging support, because I'm not entirely
confident that this is the last bug here. And tweak matters so that
the transformed.dot file uses small integers rather than pointer values
to identify states; that makes it more readable if you're just eyeballing
it rather than fooling with Graphviz. And rename a couple of identically
named struct fields to reduce confusion.
Per report from Corey Csuhta. Add a test case based on his example.
(Note: this case does not trigger the bug under 9.3, apparently because
its different measurement of costs causes it to stop merging states before
it hits the failure. I spent some time trying to find a variant that would
fail in 9.3, without success; but I'm sure such cases exist.)
Like the previous patch, back-patch to 9.3 where this code was added.
Report: https://postgr.es/m/E2B01A4B-4530-406B-8D17-2F67CF9A16BA@csuhta.com
2017-04-14 20:52:03 +02:00
|
|
|
insert into test2 values ('/123/-45/');
|
2020-11-15 06:52:12 +01:00
|
|
|
insert into test2 values ('line 1');
|
|
|
|
insert into test2 values ('%line 2');
|
|
|
|
insert into test2 values ('line 3%');
|
|
|
|
insert into test2 values ('%line 4%');
|
|
|
|
insert into test2 values ('%li%ne 5%');
|
|
|
|
insert into test2 values ('li_e 6');
|
2011-02-01 03:33:55 +01:00
|
|
|
-- Index test2 with GIN trigram ops and disable sequential scans before
-- the explain / pattern-matching queries that follow.
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
|
|
|
|
set enable_seqscan=off;
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t like '%BCD%';
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t ilike '%BCD%';
|
|
|
|
select * from test2 where t like '%BCD%';
|
|
|
|
select * from test2 where t like '%bcd%';
|
2012-08-20 19:24:52 +02:00
|
|
|
select * from test2 where t like E'%\\bcd%';
|
2011-02-01 03:33:55 +01:00
|
|
|
select * from test2 where t ilike '%BCD%';
|
|
|
|
select * from test2 where t ilike 'qua%';
|
2013-04-09 07:05:55 +02:00
|
|
|
select * from test2 where t like '%z foo bar%';
|
|
|
|
select * from test2 where t like ' z foo%';
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t ~ '[abc]{3}';
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t ~* 'DEF';
|
|
|
|
select * from test2 where t ~ '[abc]{3}';
|
|
|
|
select * from test2 where t ~ 'a[bc]+d';
|
|
|
|
select * from test2 where t ~ '(abc)*$';
|
|
|
|
select * from test2 where t ~* 'DEF';
|
|
|
|
select * from test2 where t ~ 'dEf';
|
|
|
|
select * from test2 where t ~* '^q';
|
|
|
|
select * from test2 where t ~* '[abc]{3}[def]{3}';
|
|
|
|
select * from test2 where t ~* 'ab[a-z]{3}';
|
|
|
|
select * from test2 where t ~* '(^| )qua';
|
|
|
|
select * from test2 where t ~ 'q.*rk$';
|
|
|
|
select * from test2 where t ~ 'q';
|
|
|
|
select * from test2 where t ~ '[a-z]{3}';
|
|
|
|
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
|
|
|
|
select * from test2 where t ~ 'z foo bar';
|
|
|
|
select * from test2 where t ~ ' z foo bar';
|
|
|
|
select * from test2 where t ~ ' z foo bar';
|
|
|
|
select * from test2 where t ~ ' z foo';
|
2017-04-13 23:18:35 +02:00
|
|
|
select * from test2 where t ~ 'qua(?!foo)';
|
Further fix pg_trgm's extraction of trigrams from regular expressions.
Commit 9e43e8714 turns out to have been insufficient: not only is it
necessary to track tentative parent links while considering a set of
arc removals, but it's necessary to track tentative flag additions
as well. This is because we always merge arc target states into
arc source states; therefore, when considering a merge of the final
state with some other, it is the other state that will acquire a new
TSTATE_FIN bit. If there's another arc for the same color trigram
that would cause merging of that state with the initial state, we
failed to recognize the problem. The test cases for the prior commit
evidently only exercised situations where a tentative merge with the
initial state occurs before one with the final state. If it goes the
other way around, we'll happily merge the initial and final states,
either producing a broken final graph that would never match anything,
or triggering the Assert added by the prior commit.
It's tempting to consider switching the merge direction when the merge
involves the final state, but I lack the time to analyze that idea in
detail. Instead just keep track of the flag changes that would result
from proposed merges, in the same way that the prior commit tracked
proposed parent links.
Along the way, add some more debugging support, because I'm not entirely
confident that this is the last bug here. And tweak matters so that
the transformed.dot file uses small integers rather than pointer values
to identify states; that makes it more readable if you're just eyeballing
it rather than fooling with Graphviz. And rename a couple of identically
named struct fields to reduce confusion.
Per report from Corey Csuhta. Add a test case based on his example.
(Note: this case does not trigger the bug under 9.3, apparently because
its different measurement of costs causes it to stop merging states before
it hits the failure. I spent some time trying to find a variant that would
fail in 9.3, without success; but I'm sure such cases exist.)
Like the previous patch, back-patch to 9.3 where this code was added.
Report: https://postgr.es/m/E2B01A4B-4530-406B-8D17-2F67CF9A16BA@csuhta.com
2017-04-14 20:52:03 +02:00
|
|
|
select * from test2 where t ~ '/\d+/-\d';
|
2020-11-15 06:52:12 +01:00
|
|
|
-- test = operator
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t = 'abcdef';
|
|
|
|
select * from test2 where t = 'abcdef';
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t = '%line%';
|
|
|
|
select * from test2 where t = '%line%';
|
|
|
|
select * from test2 where t = 'li_e 1';
|
|
|
|
select * from test2 where t = '%line 2';
|
|
|
|
select * from test2 where t = 'line 3%';
|
|
|
|
select * from test2 where t = '%line 3%';
|
|
|
|
select * from test2 where t = '%line 4%';
|
|
|
|
select * from test2 where t = '%line 5%';
|
|
|
|
select * from test2 where t = '%li_ne 5%';
|
|
|
|
select * from test2 where t = '%li%ne 5%';
|
|
|
|
select * from test2 where t = 'line 6';
|
|
|
|
select * from test2 where t = 'li_e 6';
|
2011-02-01 03:33:55 +01:00
|
|
|
-- Replace the GIN index on test2 with a GiST trigram index; the same
-- pattern-matching and equality queries are then repeated against it.
drop index test2_idx_gin;
|
2017-04-13 23:18:35 +02:00
|
|
|
|
|
2011-02-01 03:33:55 +01:00
|
|
|
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
|
|
|
|
set enable_seqscan=off;
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t like '%BCD%';
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t ilike '%BCD%';
|
|
|
|
select * from test2 where t like '%BCD%';
|
|
|
|
select * from test2 where t like '%bcd%';
|
2012-08-20 19:24:52 +02:00
|
|
|
select * from test2 where t like E'%\\bcd%';
|
2011-02-01 03:33:55 +01:00
|
|
|
select * from test2 where t ilike '%BCD%';
|
|
|
|
select * from test2 where t ilike 'qua%';
|
2013-04-10 19:30:14 +02:00
|
|
|
select * from test2 where t like '%z foo bar%';
|
|
|
|
select * from test2 where t like ' z foo%';
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t ~ '[abc]{3}';
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t ~* 'DEF';
|
|
|
|
select * from test2 where t ~ '[abc]{3}';
|
|
|
|
select * from test2 where t ~ 'a[bc]+d';
|
|
|
|
select * from test2 where t ~ '(abc)*$';
|
|
|
|
select * from test2 where t ~* 'DEF';
|
|
|
|
select * from test2 where t ~ 'dEf';
|
|
|
|
select * from test2 where t ~* '^q';
|
|
|
|
select * from test2 where t ~* '[abc]{3}[def]{3}';
|
|
|
|
select * from test2 where t ~* 'ab[a-z]{3}';
|
|
|
|
select * from test2 where t ~* '(^| )qua';
|
|
|
|
select * from test2 where t ~ 'q.*rk$';
|
|
|
|
select * from test2 where t ~ 'q';
|
|
|
|
select * from test2 where t ~ '[a-z]{3}';
|
|
|
|
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
|
|
|
|
select * from test2 where t ~ 'z foo bar';
|
|
|
|
select * from test2 where t ~ ' z foo bar';
|
|
|
|
select * from test2 where t ~ ' z foo bar';
|
|
|
|
select * from test2 where t ~ ' z foo';
|
2017-04-13 23:18:35 +02:00
|
|
|
select * from test2 where t ~ 'qua(?!foo)';
|
Further fix pg_trgm's extraction of trigrams from regular expressions.
Commit 9e43e8714 turns out to have been insufficient: not only is it
necessary to track tentative parent links while considering a set of
arc removals, but it's necessary to track tentative flag additions
as well. This is because we always merge arc target states into
arc source states; therefore, when considering a merge of the final
state with some other, it is the other state that will acquire a new
TSTATE_FIN bit. If there's another arc for the same color trigram
that would cause merging of that state with the initial state, we
failed to recognize the problem. The test cases for the prior commit
evidently only exercised situations where a tentative merge with the
initial state occurs before one with the final state. If it goes the
other way around, we'll happily merge the initial and final states,
either producing a broken final graph that would never match anything,
or triggering the Assert added by the prior commit.
It's tempting to consider switching the merge direction when the merge
involves the final state, but I lack the time to analyze that idea in
detail. Instead just keep track of the flag changes that would result
from proposed merges, in the same way that the prior commit tracked
proposed parent links.
Along the way, add some more debugging support, because I'm not entirely
confident that this is the last bug here. And tweak matters so that
the transformed.dot file uses small integers rather than pointer values
to identify states; that makes it more readable if you're just eyeballing
it rather than fooling with Graphviz. And rename a couple of identically
named struct fields to reduce confusion.
Per report from Corey Csuhta. Add a test case based on his example.
(Note: this case does not trigger the bug under 9.3, apparently because
its different measurement of costs causes it to stop merging states before
it hits the failure. I spent some time trying to find a variant that would
fail in 9.3, without success; but I'm sure such cases exist.)
Like the previous patch, back-patch to 9.3 where this code was added.
Report: https://postgr.es/m/E2B01A4B-4530-406B-8D17-2F67CF9A16BA@csuhta.com
2017-04-14 20:52:03 +02:00
|
|
|
select * from test2 where t ~ '/\d+/-\d';
|
2020-11-15 06:52:12 +01:00
|
|
|
-- test = operator
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t = 'abcdef';
|
|
|
|
select * from test2 where t = 'abcdef';
|
|
|
|
explain (costs off)
|
|
|
|
select * from test2 where t = '%line%';
|
|
|
|
select * from test2 where t = '%line%';
|
|
|
|
select * from test2 where t = 'li_e 1';
|
|
|
|
select * from test2 where t = '%line 2';
|
|
|
|
select * from test2 where t = 'line 3%';
|
|
|
|
select * from test2 where t = '%line 3%';
|
|
|
|
select * from test2 where t = '%line 4%';
|
|
|
|
select * from test2 where t = '%line 5%';
|
|
|
|
select * from test2 where t = '%li_ne 5%';
|
|
|
|
select * from test2 where t = '%li%ne 5%';
|
|
|
|
select * from test2 where t = 'line 6';
|
|
|
|
select * from test2 where t = 'li_e 6';
|
2016-06-20 16:49:19 +02:00
|
|
|
|
|
|
|
-- Check similarity threshold (bug #14202)
-- Fixture: a temp table holding two city names, 10000 rows each, with an
-- unnamed GiST trigram index on the city column.
|
|
|
|
|
|
|
|
CREATE TEMP TABLE restaurants (city text);
|
|
|
|
INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000);
|
|
|
|
INSERT INTO restaurants SELECT 'Szczecin' FROM generate_series(1, 10000);
|
|
|
|
CREATE INDEX ON restaurants USING gist(city gist_trgm_ops);
|
|
|
|
|
|
|
|
-- Similarity of the two names (for reference).
|
|
|
|
SELECT similarity('Szczecin', 'Warsaw');
|
|
|
|
|
|
|
|
-- Should get only 'Warsaw' for either setting of set_limit.
-- The plan is shown once; the same DISTINCT query is then run after
-- set_limit(0.3) and again after set_limit(0.5), with show_limit()
-- included in each result row.
|
|
|
|
EXPLAIN (COSTS OFF)
|
|
|
|
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
|
|
|
|
FROM restaurants WHERE city % 'Warsaw';
|
|
|
|
SELECT set_limit(0.3);
|
|
|
|
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
|
|
|
|
FROM restaurants WHERE city % 'Warsaw';
|
|
|
|
SELECT set_limit(0.5);
|
|
|
|
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
|
|
|
|
FROM restaurants WHERE city % 'Warsaw';
|