postgresql/contrib/pg_trgm/sql/pg_trgm.sql
Tom Lane 9e43e8714c Fix contrib/pg_trgm's extraction of trigrams from regular expressions.
The logic for removing excess trigrams from the result was faulty.
It intends to avoid merging the initial and final states of the NFA,
which is necessary, but in testing whether removal of a specific trigram
would cause that, it failed to consider the combined effects of all the
state merges that that trigram's removal would cause.  This could result
in a broken final graph that would never match anything, leading to GIN
or GiST indexscans not finding anything.

To fix, add a "tentParent" field that is used only within this loop,
and set it to show state merges that we are tentatively going to do.
While examining a particular arc, we must chase up through tentParent
links as well as regular parent links (the former can only appear atop
the latter), and we must account for state init/fin flag merges that
haven't actually been done yet.

To simplify the latter, combine the separate init and fin bool fields
into a bitmap flags field.  I also chose to get rid of the "children"
state list, which seems entirely inessential.

Per bug #14563 from Alexey Isayko, which the added test cases are based on.
Back-patch to 9.3 where this code was added.

Report: https://postgr.es/m/20170222111446.1256.67547@wrigleys.postgresql.org
Discussion: https://postgr.es/m/8816.1487787594@sss.pgh.pa.us
2017-02-22 15:04:26 -05:00

145 lines
5.9 KiB
SQL

CREATE EXTENSION pg_trgm;
-- Check whether any of our opclasses fail amvalidate
SELECT amname, opcname
FROM pg_opclass opc LEFT JOIN pg_am am ON am.oid = opcmethod
WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid);
select show_trgm('');
select show_trgm('(*&^$@%@');
select show_trgm('a b c');
select show_trgm(' a b c ');
select show_trgm('aA bB cC');
select show_trgm(' aA bB cC ');
select show_trgm('a b C0*%^');
select similarity('wow','WOWa ');
select similarity('wow',' WOW ');
select similarity('---', '####---');
CREATE TABLE test_trgm(t text COLLATE "C");
\copy test_trgm from 'data/trgm.data'
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
explain (costs off)
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
drop index trgm_idx;
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
explain (costs off)
select * from test2 where t like '%BCD%';
explain (costs off)
select * from test2 where t ilike '%BCD%';
select * from test2 where t like '%BCD%';
select * from test2 where t like '%bcd%';
select * from test2 where t like E'%\\bcd%';
select * from test2 where t ilike '%BCD%';
select * from test2 where t ilike 'qua%';
select * from test2 where t like '%z foo bar%';
select * from test2 where t like ' z foo%';
explain (costs off)
select * from test2 where t ~ '[abc]{3}';
explain (costs off)
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ '[abc]{3}';
select * from test2 where t ~ 'a[bc]+d';
select * from test2 where t ~ '(abc)*$';
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ 'dEf';
select * from test2 where t ~* '^q';
select * from test2 where t ~* '[abc]{3}[def]{3}';
select * from test2 where t ~* 'ab[a-z]{3}';
select * from test2 where t ~* '(^| )qua';
select * from test2 where t ~ 'q.*rk$';
select * from test2 where t ~ 'q';
select * from test2 where t ~ '[a-z]{3}';
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
select * from test2 where t ~ 'z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';
drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
explain (costs off)
select * from test2 where t like '%BCD%';
explain (costs off)
select * from test2 where t ilike '%BCD%';
select * from test2 where t like '%BCD%';
select * from test2 where t like '%bcd%';
select * from test2 where t like E'%\\bcd%';
select * from test2 where t ilike '%BCD%';
select * from test2 where t ilike 'qua%';
select * from test2 where t like '%z foo bar%';
select * from test2 where t like ' z foo%';
explain (costs off)
select * from test2 where t ~ '[abc]{3}';
explain (costs off)
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ '[abc]{3}';
select * from test2 where t ~ 'a[bc]+d';
select * from test2 where t ~ '(abc)*$';
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ 'dEf';
select * from test2 where t ~* '^q';
select * from test2 where t ~* '[abc]{3}[def]{3}';
select * from test2 where t ~* 'ab[a-z]{3}';
select * from test2 where t ~* '(^| )qua';
select * from test2 where t ~ 'q.*rk$';
select * from test2 where t ~ 'q';
select * from test2 where t ~ '[a-z]{3}';
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
select * from test2 where t ~ 'z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';
-- Check similarity threshold (bug #14202)
CREATE TEMP TABLE restaurants (city text);
INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000);
INSERT INTO restaurants SELECT 'Szczecin' FROM generate_series(1, 10000);
CREATE INDEX ON restaurants USING gist(city gist_trgm_ops);
-- Similarity of the two names (for reference).
SELECT similarity('Szczecin', 'Warsaw');
-- Should get only 'Warsaw' for either setting of set_limit.
EXPLAIN (COSTS OFF)
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
FROM restaurants WHERE city % 'Warsaw';
SELECT set_limit(0.3);
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
FROM restaurants WHERE city % 'Warsaw';
SELECT set_limit(0.5);
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
FROM restaurants WHERE city % 'Warsaw';