postgresql/src/test/regress/expected/tsearch.out

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

3008 lines
77 KiB
Plaintext
Raw Normal View History

-- directory paths are passed to us in environment variables
\getenv abs_srcdir PG_ABS_SRCDIR
--
-- Sanity checks for text search catalogs
--
-- NB: we assume the oidjoins test will have caught any dangling links,
-- that is OID or REGPROC fields that are not zero and do not match some
-- row in the linked-to table. However, if we want to enforce that a link
-- field can't be 0, we have to check it here.
-- Find unexpected zero link entries
SELECT oid, prsname
FROM pg_ts_parser
WHERE prsnamespace = 0 OR prsstart = 0 OR prstoken = 0 OR prsend = 0 OR
-- prsheadline is optional
prslextype = 0;
oid | prsname
-----+---------
(0 rows)
SELECT oid, dictname
FROM pg_ts_dict
WHERE dictnamespace = 0 OR dictowner = 0 OR dicttemplate = 0;
oid | dictname
-----+----------
(0 rows)
SELECT oid, tmplname
FROM pg_ts_template
WHERE tmplnamespace = 0 OR tmpllexize = 0; -- tmplinit is optional
oid | tmplname
-----+----------
(0 rows)
SELECT oid, cfgname
FROM pg_ts_config
WHERE cfgnamespace = 0 OR cfgowner = 0 OR cfgparser = 0;
oid | cfgname
-----+---------
(0 rows)
SELECT mapcfg, maptokentype, mapseqno
FROM pg_ts_config_map
WHERE mapcfg = 0 OR mapdict = 0;
mapcfg | maptokentype | mapseqno
--------+--------------+----------
(0 rows)
-- Look for pg_ts_config_map entries that aren't one of parser's token types
SELECT * FROM
( SELECT oid AS cfgid, (ts_token_type(cfgparser)).tokid AS tokid
FROM pg_ts_config ) AS tt
RIGHT JOIN pg_ts_config_map AS m
ON (tt.cfgid=m.mapcfg AND tt.tokid=m.maptokentype)
WHERE
tt.cfgid IS NULL OR tt.tokid IS NULL;
cfgid | tokid | mapcfg | maptokentype | mapseqno | mapdict
-------+-------+--------+--------------+----------+---------
(0 rows)
-- Load some test data
CREATE TABLE test_tsvector(
t text,
a tsvector
);
\set filename :abs_srcdir '/data/tsearch.data'
COPY test_tsvector FROM :'filename';
ANALYZE test_tsvector;
-- test basic text search behavior without indexes, then with
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
Fix full text search to handle NOT above a phrase search correctly. Queries such as '!(foo<->bar)' failed to find matching rows when implemented as a GiST or GIN index search. That's because of failing to handle phrase searches as tri-valued when considering a query without any position information for the target tsvector. We can only say that the phrase operator might match, not that it does match; and therefore its NOT also might match. The previous coding incorrectly inverted the approximate phrase result to decide that there was certainly no match. To fix, we need to make TS_phrase_execute return a real ternary result, and then bubble that up accurately in TS_execute. As long as we have to do that anyway, we can simplify the baroque things TS_phrase_execute was doing internally to manage tri-valued searching with only a bool as explicit result. For now, I left the externally-visible result of TS_execute as a plain bool. There do not appear to be any outside callers that need to distinguish a three-way result, given that they passed in a flag saying what to do in the absence of position data. This might need to change someday, but we wouldn't want to back-patch such a change. Although tsginidx.c has its own TS_execute_ternary implementation for use at upper index levels, that sadly managed to get this case wrong as well :-(. Fixing it is a lot easier fortunately. Per bug #16388 from Charles Offenbacher. Back-patch to 9.6 where phrase search was introduced. Discussion: https://postgr.es/m/16388-98cffba38d0b7e6e@postgresql.org
2020-04-27 18:21:04 +02:00
SELECT count(*) FROM test_tsvector WHERE a @@ 'pl <-> yh';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'yh <-> pl';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'qe <2> qt';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> yh';
count
-------
3
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> !yh';
count
-------
432
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!yh <-> pl';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
count
-------
508
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
count
-------
56
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
count
-------
58
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
count
-------
452
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
count
-------
450
(1 row)
create index wowidx on test_tsvector using gist (a);
SET enable_seqscan=OFF;
SET enable_indexscan=ON;
SET enable_bitmapscan=OFF;
explain (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
QUERY PLAN
-------------------------------------------------------
Aggregate
-> Index Scan using wowidx on test_tsvector
Index Cond: (a @@ '''wr'' | ''qh'''::tsquery)
(3 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
Fix full text search to handle NOT above a phrase search correctly. Queries such as '!(foo<->bar)' failed to find matching rows when implemented as a GiST or GIN index search. That's because of failing to handle phrase searches as tri-valued when considering a query without any position information for the target tsvector. We can only say that the phrase operator might match, not that it does match; and therefore its NOT also might match. The previous coding incorrectly inverted the approximate phrase result to decide that there was certainly no match. To fix, we need to make TS_phrase_execute return a real ternary result, and then bubble that up accurately in TS_execute. As long as we have to do that anyway, we can simplify the baroque things TS_phrase_execute was doing internally to manage tri-valued searching with only a bool as explicit result. For now, I left the externally-visible result of TS_execute as a plain bool. There do not appear to be any outside callers that need to distinguish a three-way result, given that they passed in a flag saying what to do in the absence of position data. This might need to change someday, but we wouldn't want to back-patch such a change. Although tsginidx.c has its own TS_execute_ternary implementation for use at upper index levels, that sadly managed to get this case wrong as well :-(. Fixing it is a lot easier fortunately. Per bug #16388 from Charles Offenbacher. Back-patch to 9.6 where phrase search was introduced. Discussion: https://postgr.es/m/16388-98cffba38d0b7e6e@postgresql.org
2020-04-27 18:21:04 +02:00
SELECT count(*) FROM test_tsvector WHERE a @@ 'pl <-> yh';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'yh <-> pl';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'qe <2> qt';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> yh';
count
-------
3
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> !yh';
count
-------
432
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!yh <-> pl';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
count
-------
508
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
count
-------
56
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
count
-------
58
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
count
-------
452
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
count
-------
450
(1 row)
SET enable_indexscan=OFF;
SET enable_bitmapscan=ON;
explain (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
QUERY PLAN
-------------------------------------------------------------
Aggregate
-> Bitmap Heap Scan on test_tsvector
Recheck Cond: (a @@ '''wr'' | ''qh'''::tsquery)
-> Bitmap Index Scan on wowidx
Index Cond: (a @@ '''wr'' | ''qh'''::tsquery)
(5 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
Implement operator class parameters PostgreSQL provides set of template index access methods, where opclasses have much freedom in the semantics of indexing. These index AMs are GiST, GIN, SP-GiST and BRIN. There opclasses define representation of keys, operations on them and supported search strategies. So, it's natural that opclasses may be faced some tradeoffs, which require user-side decision. This commit implements opclass parameters allowing users to set some values, which tell opclass how to index the particular dataset. This commit doesn't introduce new storage in system catalog. Instead it uses pg_attribute.attoptions, which is used for table column storage options but unused for index attributes. In order to evade changing signature of each opclass support function, we implement unified way to pass options to opclass support functions. Options are set to fn_expr as the constant bytea expression. It's possible due to the fact that opclass support functions are executed outside of expressions, so fn_expr is unused for them. This commit comes with some examples of opclass options usage. We parametrize signature length in GiST. That applies to multiple opclasses: tsvector_ops, gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and gist_hstore_ops. Also we parametrize maximum number of integer ranges for gist__int_ops. However, the main future usage of this feature is expected to be json, where users would be able to specify which way to index particular json parts. Catversion is bumped. Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru Author: Nikita Glukhov, revised by me Reviwed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
Fix full text search to handle NOT above a phrase search correctly. Queries such as '!(foo<->bar)' failed to find matching rows when implemented as a GiST or GIN index search. That's because of failing to handle phrase searches as tri-valued when considering a query without any position information for the target tsvector. We can only say that the phrase operator might match, not that it does match; and therefore its NOT also might match. The previous coding incorrectly inverted the approximate phrase result to decide that there was certainly no match. To fix, we need to make TS_phrase_execute return a real ternary result, and then bubble that up accurately in TS_execute. As long as we have to do that anyway, we can simplify the baroque things TS_phrase_execute was doing internally to manage tri-valued searching with only a bool as explicit result. For now, I left the externally-visible result of TS_execute as a plain bool. There do not appear to be any outside callers that need to distinguish a three-way result, given that they passed in a flag saying what to do in the absence of position data. This might need to change someday, but we wouldn't want to back-patch such a change. Although tsginidx.c has its own TS_execute_ternary implementation for use at upper index levels, that sadly managed to get this case wrong as well :-(. Fixing it is a lot easier fortunately. Per bug #16388 from Charles Offenbacher. Back-patch to 9.6 where phrase search was introduced. Discussion: https://postgr.es/m/16388-98cffba38d0b7e6e@postgresql.org
2020-04-27 18:21:04 +02:00
SELECT count(*) FROM test_tsvector WHERE a @@ 'pl <-> yh';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'yh <-> pl';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'qe <2> qt';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> yh';
count
-------
3
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> !yh';
count
-------
432
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!yh <-> pl';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
count
-------
508
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
count
-------
56
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
count
-------
58
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
count
-------
452
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
count
-------
450
(1 row)
Implement operator class parameters PostgreSQL provides set of template index access methods, where opclasses have much freedom in the semantics of indexing. These index AMs are GiST, GIN, SP-GiST and BRIN. There opclasses define representation of keys, operations on them and supported search strategies. So, it's natural that opclasses may be faced some tradeoffs, which require user-side decision. This commit implements opclass parameters allowing users to set some values, which tell opclass how to index the particular dataset. This commit doesn't introduce new storage in system catalog. Instead it uses pg_attribute.attoptions, which is used for table column storage options but unused for index attributes. In order to evade changing signature of each opclass support function, we implement unified way to pass options to opclass support functions. Options are set to fn_expr as the constant bytea expression. It's possible due to the fact that opclass support functions are executed outside of expressions, so fn_expr is unused for them. This commit comes with some examples of opclass options usage. We parametrize signature length in GiST. That applies to multiple opclasses: tsvector_ops, gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and gist_hstore_ops. Also we parametrize maximum number of integer ranges for gist__int_ops. However, the main future usage of this feature is expected to be json, where users would be able to specify which way to index particular json parts. Catversion is bumped. Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru Author: Nikita Glukhov, revised by me Reviwed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
-- Test siglen parameter of GiST tsvector_ops
CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(foo=1));
ERROR: unrecognized parameter "foo"
CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=0));
ERROR: value 0 out of bounds for option "siglen"
DETAIL: Valid values are between "1" and "2024".
CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=2048));
ERROR: value 2048 out of bounds for option "siglen"
DETAIL: Valid values are between "1" and "2024".
CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=100,foo='bar'));
ERROR: unrecognized parameter "foo"
CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=100, siglen = 200));
ERROR: parameter "siglen" specified more than once
CREATE INDEX wowidx2 ON test_tsvector USING gist (a tsvector_ops(siglen=1));
\d test_tsvector
Table "public.test_tsvector"
Column | Type | Collation | Nullable | Default
--------+----------+-----------+----------+---------
t | text | | |
a | tsvector | | |
Indexes:
"wowidx" gist (a)
"wowidx2" gist (a tsvector_ops (siglen='1'))
DROP INDEX wowidx;
EXPLAIN (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
QUERY PLAN
-------------------------------------------------------------
Aggregate
-> Bitmap Heap Scan on test_tsvector
Recheck Cond: (a @@ '''wr'' | ''qh'''::tsquery)
-> Bitmap Index Scan on wowidx2
Index Cond: (a @@ '''wr'' | ''qh'''::tsquery)
(5 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
Fix full text search to handle NOT above a phrase search correctly. Queries such as '!(foo<->bar)' failed to find matching rows when implemented as a GiST or GIN index search. That's because of failing to handle phrase searches as tri-valued when considering a query without any position information for the target tsvector. We can only say that the phrase operator might match, not that it does match; and therefore its NOT also might match. The previous coding incorrectly inverted the approximate phrase result to decide that there was certainly no match. To fix, we need to make TS_phrase_execute return a real ternary result, and then bubble that up accurately in TS_execute. As long as we have to do that anyway, we can simplify the baroque things TS_phrase_execute was doing internally to manage tri-valued searching with only a bool as explicit result. For now, I left the externally-visible result of TS_execute as a plain bool. There do not appear to be any outside callers that need to distinguish a three-way result, given that they passed in a flag saying what to do in the absence of position data. This might need to change someday, but we wouldn't want to back-patch such a change. Although tsginidx.c has its own TS_execute_ternary implementation for use at upper index levels, that sadly managed to get this case wrong as well :-(. Fixing it is a lot easier fortunately. Per bug #16388 from Charles Offenbacher. Back-patch to 9.6 where phrase search was introduced. Discussion: https://postgr.es/m/16388-98cffba38d0b7e6e@postgresql.org
2020-04-27 18:21:04 +02:00
SELECT count(*) FROM test_tsvector WHERE a @@ 'pl <-> yh';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'yh <-> pl';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'qe <2> qt';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> yh';
count
-------
3
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> !yh';
count
-------
432
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!yh <-> pl';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
count
-------
508
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
count
-------
56
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
count
-------
58
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
count
-------
452
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
count
-------
450
(1 row)
Implement operator class parameters PostgreSQL provides set of template index access methods, where opclasses have much freedom in the semantics of indexing. These index AMs are GiST, GIN, SP-GiST and BRIN. There opclasses define representation of keys, operations on them and supported search strategies. So, it's natural that opclasses may be faced some tradeoffs, which require user-side decision. This commit implements opclass parameters allowing users to set some values, which tell opclass how to index the particular dataset. This commit doesn't introduce new storage in system catalog. Instead it uses pg_attribute.attoptions, which is used for table column storage options but unused for index attributes. In order to evade changing signature of each opclass support function, we implement unified way to pass options to opclass support functions. Options are set to fn_expr as the constant bytea expression. It's possible due to the fact that opclass support functions are executed outside of expressions, so fn_expr is unused for them. This commit comes with some examples of opclass options usage. We parametrize signature length in GiST. That applies to multiple opclasses: tsvector_ops, gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and gist_hstore_ops. Also we parametrize maximum number of integer ranges for gist__int_ops. However, the main future usage of this feature is expected to be json, where users would be able to specify which way to index particular json parts. Catversion is bumped. Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru Author: Nikita Glukhov, revised by me Reviwed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
DROP INDEX wowidx2;
CREATE INDEX wowidx ON test_tsvector USING gist (a tsvector_ops(siglen=484));
\d test_tsvector
Table "public.test_tsvector"
Column | Type | Collation | Nullable | Default
--------+----------+-----------+----------+---------
t | text | | |
a | tsvector | | |
Indexes:
"wowidx" gist (a tsvector_ops (siglen='484'))
EXPLAIN (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
QUERY PLAN
-------------------------------------------------------------
Aggregate
-> Bitmap Heap Scan on test_tsvector
Recheck Cond: (a @@ '''wr'' | ''qh'''::tsquery)
-> Bitmap Index Scan on wowidx
Index Cond: (a @@ '''wr'' | ''qh'''::tsquery)
(5 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
Fix full text search to handle NOT above a phrase search correctly. Queries such as '!(foo<->bar)' failed to find matching rows when implemented as a GiST or GIN index search. That's because of failing to handle phrase searches as tri-valued when considering a query without any position information for the target tsvector. We can only say that the phrase operator might match, not that it does match; and therefore its NOT also might match. The previous coding incorrectly inverted the approximate phrase result to decide that there was certainly no match. To fix, we need to make TS_phrase_execute return a real ternary result, and then bubble that up accurately in TS_execute. As long as we have to do that anyway, we can simplify the baroque things TS_phrase_execute was doing internally to manage tri-valued searching with only a bool as explicit result. For now, I left the externally-visible result of TS_execute as a plain bool. There do not appear to be any outside callers that need to distinguish a three-way result, given that they passed in a flag saying what to do in the absence of position data. This might need to change someday, but we wouldn't want to back-patch such a change. Although tsginidx.c has its own TS_execute_ternary implementation for use at upper index levels, that sadly managed to get this case wrong as well :-(. Fixing it is a lot easier fortunately. Per bug #16388 from Charles Offenbacher. Back-patch to 9.6 where phrase search was introduced. Discussion: https://postgr.es/m/16388-98cffba38d0b7e6e@postgresql.org
2020-04-27 18:21:04 +02:00
SELECT count(*) FROM test_tsvector WHERE a @@ 'pl <-> yh';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'yh <-> pl';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'qe <2> qt';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> yh';
count
-------
3
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> !yh';
count
-------
432
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!yh <-> pl';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
count
-------
508
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
count
-------
56
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
count
-------
58
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
count
-------
452
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
count
-------
450
(1 row)
RESET enable_seqscan;
RESET enable_indexscan;
RESET enable_bitmapscan;
DROP INDEX wowidx;
CREATE INDEX wowidx ON test_tsvector USING gin (a);
SET enable_seqscan=OFF;
-- GIN only supports bitmapscan, so no need to test plain indexscan
explain (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
QUERY PLAN
-------------------------------------------------------------
Aggregate
-> Bitmap Heap Scan on test_tsvector
Recheck Cond: (a @@ '''wr'' | ''qh'''::tsquery)
-> Bitmap Index Scan on wowidx
Index Cond: (a @@ '''wr'' | ''qh'''::tsquery)
(5 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
Fix full text search to handle NOT above a phrase search correctly. Queries such as '!(foo<->bar)' failed to find matching rows when implemented as a GiST or GIN index search. That's because of failing to handle phrase searches as tri-valued when considering a query without any position information for the target tsvector. We can only say that the phrase operator might match, not that it does match; and therefore its NOT also might match. The previous coding incorrectly inverted the approximate phrase result to decide that there was certainly no match. To fix, we need to make TS_phrase_execute return a real ternary result, and then bubble that up accurately in TS_execute. As long as we have to do that anyway, we can simplify the baroque things TS_phrase_execute was doing internally to manage tri-valued searching with only a bool as explicit result. For now, I left the externally-visible result of TS_execute as a plain bool. There do not appear to be any outside callers that need to distinguish a three-way result, given that they passed in a flag saying what to do in the absence of position data. This might need to change someday, but we wouldn't want to back-patch such a change. Although tsginidx.c has its own TS_execute_ternary implementation for use at upper index levels, that sadly managed to get this case wrong as well :-(. Fixing it is a lot easier fortunately. Per bug #16388 from Charles Offenbacher. Back-patch to 9.6 where phrase search was introduced. Discussion: https://postgr.es/m/16388-98cffba38d0b7e6e@postgresql.org
2020-04-27 18:21:04 +02:00
SELECT count(*) FROM test_tsvector WHERE a @@ 'pl <-> yh';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'yh <-> pl';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'qe <2> qt';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> yh';
count
-------
3
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!pl <-> !yh';
count
-------
432
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!yh <-> pl';
count
-------
1
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!qe <2> qt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(pl <-> yh)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(yh <-> pl)';
count
-------
508
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!(qe <2> qt)';
count
-------
507
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:A';
count
-------
56
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wd:D';
count
-------
58
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:A';
count
-------
452
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!wd:D';
count
-------
450
(1 row)
Avoid full scan of GIN indexes when possible The strategy of GIN index scan is driven by opclass-specific extract_query method. This method that needed search mode is GIN_SEARCH_MODE_ALL. This mode means that matching tuple may contain none of extracted entries. Simple example is '!term' tsquery, which doesn't need any term to exist in matching tsvector. In order to handle such scan key GIN calculates virtual entry, which contains all TIDs of all entries of attribute. In fact this is full scan of index attribute. And typically this is very slow, but allows to handle some queries correctly in GIN. However, current algorithm calculate such virtual entry for each GIN_SEARCH_MODE_ALL scan key even if they are multiple for the same attribute. This is clearly not optimal. This commit improves the situation by introduction of "exclude only" scan keys. Such scan keys are not capable to return set of matching TIDs. Instead, they are capable only to filter TIDs produced by normal scan keys. Therefore, each attribute should contain at least one normal scan key, while rest of them may be "exclude only" if search mode is GIN_SEARCH_MODE_ALL. The same optimization might be applied to the whole scan, not per-attribute. But that leads to NULL values elimination problem. There is trade-off between multiple possible ways to do this. We probably want to do this later using some cost-based decision algorithm. Discussion: https://postgr.es/m/CAOBaU_YGP5-BEt5Cc0%3DzMve92vocPzD%2BXiZgiZs1kjY0cj%3DXBg%40mail.gmail.com Author: Nikita Glukhov, Alexander Korotkov, Tom Lane, Julien Rouhaud Reviewed-by: Julien Rouhaud, Tomas Vondra, Tom Lane
2020-01-17 23:11:39 +01:00
-- Test optimization of non-empty GIN_SEARCH_MODE_ALL queries
EXPLAIN (COSTS OFF)
SELECT count(*) FROM test_tsvector WHERE a @@ '!qh';
QUERY PLAN
-----------------------------------------------------
Aggregate
-> Bitmap Heap Scan on test_tsvector
Recheck Cond: (a @@ '!''qh'''::tsquery)
-> Bitmap Index Scan on wowidx
Index Cond: (a @@ '!''qh'''::tsquery)
(5 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ '!qh';
count
-------
410
(1 row)
EXPLAIN (COSTS OFF)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr' AND a @@ '!qh';
QUERY PLAN
------------------------------------------------------------------------------------
Aggregate
-> Bitmap Heap Scan on test_tsvector
Recheck Cond: ((a @@ '''wr'''::tsquery) AND (a @@ '!''qh'''::tsquery))
-> Bitmap Index Scan on wowidx
Index Cond: ((a @@ '''wr'''::tsquery) AND (a @@ '!''qh'''::tsquery))
(5 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr' AND a @@ '!qh';
count
-------
60
(1 row)
RESET enable_seqscan;
INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH');
SELECT * FROM ts_stat('SELECT a FROM test_tsvector') ORDER BY ndoc DESC, nentry DESC, word LIMIT 10;
word | ndoc | nentry
------+------+--------
qq | 108 | 108
qt | 102 | 102
Fix full text search to handle NOT above a phrase search correctly. Queries such as '!(foo<->bar)' failed to find matching rows when implemented as a GiST or GIN index search. That's because of failing to handle phrase searches as tri-valued when considering a query without any position information for the target tsvector. We can only say that the phrase operator might match, not that it does match; and therefore its NOT also might match. The previous coding incorrectly inverted the approximate phrase result to decide that there was certainly no match. To fix, we need to make TS_phrase_execute return a real ternary result, and then bubble that up accurately in TS_execute. As long as we have to do that anyway, we can simplify the baroque things TS_phrase_execute was doing internally to manage tri-valued searching with only a bool as explicit result. For now, I left the externally-visible result of TS_execute as a plain bool. There do not appear to be any outside callers that need to distinguish a three-way result, given that they passed in a flag saying what to do in the absence of position data. This might need to change someday, but we wouldn't want to back-patch such a change. Although tsginidx.c has its own TS_execute_ternary implementation for use at upper index levels, that sadly managed to get this case wrong as well :-(. Fixing it is a lot easier fortunately. Per bug #16388 from Charles Offenbacher. Back-patch to 9.6 where phrase search was introduced. Discussion: https://postgr.es/m/16388-98cffba38d0b7e6e@postgresql.org
2020-04-27 18:21:04 +02:00
qe | 100 | 101
qh | 98 | 99
qw | 98 | 98
qa | 97 | 97
ql | 94 | 94
qs | 94 | 94
Fix full text search to handle NOT above a phrase search correctly. Queries such as '!(foo<->bar)' failed to find matching rows when implemented as a GiST or GIN index search. That's because of failing to handle phrase searches as tri-valued when considering a query without any position information for the target tsvector. We can only say that the phrase operator might match, not that it does match; and therefore its NOT also might match. The previous coding incorrectly inverted the approximate phrase result to decide that there was certainly no match. To fix, we need to make TS_phrase_execute return a real ternary result, and then bubble that up accurately in TS_execute. As long as we have to do that anyway, we can simplify the baroque things TS_phrase_execute was doing internally to manage tri-valued searching with only a bool as explicit result. For now, I left the externally-visible result of TS_execute as a plain bool. There do not appear to be any outside callers that need to distinguish a three-way result, given that they passed in a flag saying what to do in the absence of position data. This might need to change someday, but we wouldn't want to back-patch such a change. Although tsginidx.c has its own TS_execute_ternary implementation for use at upper index levels, that sadly managed to get this case wrong as well :-(. Fixing it is a lot easier fortunately. Per bug #16388 from Charles Offenbacher. Back-patch to 9.6 where phrase search was introduced. Discussion: https://postgr.es/m/16388-98cffba38d0b7e6e@postgresql.org
2020-04-27 18:21:04 +02:00
qr | 92 | 93
qi | 92 | 92
(10 rows)
SELECT * FROM ts_stat('SELECT a FROM test_tsvector', 'AB') ORDER BY ndoc DESC, nentry DESC, word;
word | ndoc | nentry
------+------+--------
DFG | 1 | 2
(1 row)
--dictionaries and to_tsvector
SELECT ts_lexize('english_stem', 'skies');
ts_lexize
-----------
{sky}
(1 row)
SELECT ts_lexize('english_stem', 'identity');
ts_lexize
-----------
{ident}
(1 row)
SELECT * FROM ts_token_type('default');
tokid | alias | description
-------+-----------------+------------------------------------------
1 | asciiword | Word, all ASCII
2 | word | Word, all letters
3 | numword | Word, letters and digits
4 | email | Email address
5 | url | URL
6 | host | Host
7 | sfloat | Scientific notation
8 | version | Version number
9 | hword_numpart | Hyphenated word part, letters and digits
10 | hword_part | Hyphenated word part, all letters
11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols
13 | tag | XML tag
14 | protocol | Protocol head
15 | numhword | Hyphenated word, letters and digits
16 | asciihword | Hyphenated word, all ASCII
17 | hword | Hyphenated word, all letters
18 | url_path | URL path
19 | file | File or path name
20 | float | Decimal notation
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | XML entity
(23 rows)
SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net teodor@123-stack.net 123_teodor@stack.net 123-teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty');
tokid | token
-------+--------------------------------------
22 | 345
12 |
1 | qwe
12 | @
19 | efd.r
12 | '
14 | http://
6 | www.com
12 | /
14 | http://
5 | aew.werc.ewr/?ad=qwe&dw
6 | aew.werc.ewr
18 | /?ad=qwe&dw
12 |
5 | 1aew.werc.ewr/?ad=qwe&dw
6 | 1aew.werc.ewr
18 | /?ad=qwe&dw
12 |
6 | 2aew.werc.ewr
12 |
14 | http://
5 | 3aew.werc.ewr/?ad=qwe&dw
6 | 3aew.werc.ewr
18 | /?ad=qwe&dw
12 |
14 | http://
6 | 4aew.werc.ewr
12 |
14 | http://
5 | 5aew.werc.ewr:8100/?
6 | 5aew.werc.ewr:8100
18 | /?
12 |
1 | ad
12 | =
1 | qwe
12 | &
1 | dw
12 |
5 | 6aew.werc.ewr:8100/?ad=qwe&dw
6 | 6aew.werc.ewr:8100
18 | /?ad=qwe&dw
12 |
5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
6 | 7aew.werc.ewr:8100
18 | /?ad=qwe&dw=%20%32
12 |
7 | +4.0e-10
12 |
1 | qwe
12 |
1 | qwe
12 |
1 | qwqwe
12 |
20 | 234.435
12 |
22 | 455
12 |
20 | 5.005
12 |
4 | teodor@stack.net
12 |
4 | teodor@123-stack.net
12 |
4 | 123_teodor@stack.net
12 |
4 | 123-teodor@stack.net
12 |
16 | qwe-wer
11 | qwe
12 | -
11 | wer
12 |
1 | asdf
12 |
13 | <fr>
1 | qwer
12 |
1 | jf
12 |
1 | sdjk
12 | <
1 | we
12 |
1 | hjwer
12 |
13 | <werrwe>
12 |
3 | ewr1
12 | >
3 | ewri2
12 |
13 | <a href="qwe<qwe>">
12 | +
|
19 | /usr/local/fff
12 |
19 | /awdf/dwqe/4325
12 |
19 | rewt/ewr
12 |
1 | wefjn
12 |
19 | /wqe-324/ewr
12 |
19 | gist.h
12 |
19 | gist.h.c
12 |
19 | gist.c
12 | .
1 | readline
12 |
20 | 4.2
12 |
20 | 4.2
12 | .
20 | 4.2
12 | ,
1 | readline
20 | -4.2
12 |
1 | readline
20 | -4.2
12 | .
22 | 234
12 | +
|
12 | <
1 | i
12 |
13 | <b>
12 |
1 | wow
12 |
12 | <
1 | jqw
12 |
12 | <>
1 | qwerty
(139 rows)
SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net teodor@123-stack.net 123_teodor@stack.net 123-teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty');
to_tsvector
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
'+4.0e-10':28 '-4.2':63,65 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':51 '/usr/local/fff':50 '/wqe-324/ewr':54 '123-teodor@stack.net':38 '123_teodor@stack.net':37 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':66 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':59,60,61 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':42 'dw':21 'efd.r':3 'ewr1':48 'ewri2':49 'gist.c':57 'gist.h':55 'gist.h.c':56 'hjwer':47 'jf':44 'jqw':69 'qwe':2,20,29,30,40 'qwe-wer':39 'qwer':43 'qwerti':70 'qwqwe':31 'readlin':58,62,64 'rewt/ewr':52 'sdjk':45 'teodor@123-stack.net':36 'teodor@stack.net':35 'wefjn':53 'wer':41 'wow':68 'www.com':4
(1 row)
SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net teodor@123-stack.net 123_teodor@stack.net 123-teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty'));
length
--------
56
(1 row)
-- ts_debug
SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def&#xa9;ghi&#245;jkl</myns:foo-bar_baz.blurfl>');
alias | description | token | dictionaries | dictionary | lexemes
-----------+-----------------+----------------------------+----------------+--------------+---------
tag | XML tag | <myns:foo-bar_baz.blurfl> | {} | |
asciiword | Word, all ASCII | abc | {english_stem} | english_stem | {abc}
entity | XML entity | &nm1; | {} | |
asciiword | Word, all ASCII | def | {english_stem} | english_stem | {def}
entity | XML entity | &#xa9; | {} | |
asciiword | Word, all ASCII | ghi | {english_stem} | english_stem | {ghi}
entity | XML entity | &#245; | {} | |
asciiword | Word, all ASCII | jkl | {english_stem} | english_stem | {jkl}
tag | XML tag | </myns:foo-bar_baz.blurfl> | {} | |
(9 rows)
-- check parsing of URLs
SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
alias | description | token | dictionaries | dictionary | lexemes
----------+---------------+----------------------------------------+--------------+------------+------------------------------------------
protocol | Protocol head | http:// | {} | |
url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx}
host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk}
url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx}
tag | XML tag | </span> | {} | |
(5 rows)
SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
alias | description | token | dictionaries | dictionary | lexemes
----------+---------------+----------------------------+--------------+------------+------------------------------
protocol | Protocol head | http:// | {} | |
url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw}
host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr}
url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw}
tag | XML tag | <span> | {} | |
(5 rows)
SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
alias | description | token | dictionaries | dictionary | lexemes
----------+---------------+----------------------+--------------+------------+------------------------
protocol | Protocol head | http:// | {} | |
url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?}
host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
url_path | URL path | /? | {simple} | simple | {/?}
(4 rows)
SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
alias | description | token | dictionaries | dictionary | lexemes
----------+-------------+------------------------+--------------+------------+--------------------------
url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx}
host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
url_path | URL path | /?xx | {simple} | simple | {/?xx}
(3 rows)
2017-09-25 17:55:24 +02:00
SELECT token, alias,
dictionaries, dictionaries is null as dnull, array_dims(dictionaries) as ddims,
lexemes, lexemes is null as lnull, array_dims(lexemes) as ldims
from ts_debug('english', 'a title');
token | alias | dictionaries | dnull | ddims | lexemes | lnull | ldims
-------+-----------+----------------+-------+-------+---------+-------+-------
a | asciiword | {english_stem} | f | [1:1] | {} | f |
| blank | {} | f | | | t |
title | asciiword | {english_stem} | f | [1:1] | {titl} | f | [1:1]
(3 rows)
-- to_tsquery
SELECT to_tsquery('english', 'qwe & sKies ');
to_tsquery
---------------
'qwe' & 'sky'
(1 row)
SELECT to_tsquery('simple', 'qwe & sKies ');
to_tsquery
-----------------
'qwe' & 'skies'
(1 row)
SELECT to_tsquery('english', '''the wether'':dc & '' sKies '':BC ');
to_tsquery
------------------------
'wether':CD & 'sky':BC
(1 row)
SELECT to_tsquery('english', 'asd&(and|fghj)');
to_tsquery
----------------
'asd' & 'fghj'
(1 row)
SELECT to_tsquery('english', '(asd&and)|fghj');
to_tsquery
----------------
'asd' | 'fghj'
(1 row)
SELECT to_tsquery('english', '(asd&!and)|fghj');
to_tsquery
----------------
'asd' | 'fghj'
(1 row)
SELECT to_tsquery('english', '(the|and&(i&1))&fghj');
to_tsquery
--------------
'1' & 'fghj'
(1 row)
SELECT plainto_tsquery('english', 'the and z 1))& fghj');
plainto_tsquery
--------------------
'z' & '1' & 'fghj'
(1 row)
SELECT plainto_tsquery('english', 'foo bar') && plainto_tsquery('english', 'asd');
?column?
-----------------------
'foo' & 'bar' & 'asd'
(1 row)
SELECT plainto_tsquery('english', 'foo bar') || plainto_tsquery('english', 'asd fg');
?column?
------------------------------
'foo' & 'bar' | 'asd' & 'fg'
(1 row)
SELECT plainto_tsquery('english', 'foo bar') || !!plainto_tsquery('english', 'asd fg');
?column?
-----------------------------------
'foo' & 'bar' | !( 'asd' & 'fg' )
(1 row)
SELECT plainto_tsquery('english', 'foo bar') && 'asd | fg';
?column?
----------------------------------
'foo' & 'bar' & ( 'asd' | 'fg' )
(1 row)
-- Check stop word deletion, a and s are stop-words
SELECT to_tsquery('english', '!(a & !b) & c');
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
to_tsquery
-------------
!!'b' & 'c'
(1 row)
SELECT to_tsquery('english', '!(a & !b)');
to_tsquery
------------
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
!!'b'
(1 row)
SELECT to_tsquery('english', '(1 <-> 2) <-> a');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '(1 <-> a) <-> 2');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '(a <-> 1) <-> 2');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', 'a <-> (1 <-> 2)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '1 <-> (a <-> 2)');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '1 <-> (2 <-> a)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '(1 <-> 2) <3> a');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '(1 <-> a) <3> 2');
to_tsquery
-------------
'1' <4> '2'
(1 row)
SELECT to_tsquery('english', '(a <-> 1) <3> 2');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', 'a <3> (1 <-> 2)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '1 <3> (a <-> 2)');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'1' <4> '2'
(1 row)
SELECT to_tsquery('english', '1 <3> (2 <-> a)');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', '(1 <3> 2) <-> a');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', '(1 <3> a) <-> 2');
to_tsquery
-------------
'1' <4> '2'
(1 row)
SELECT to_tsquery('english', '(a <3> 1) <-> 2');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', 'a <-> (1 <3> 2)');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', '1 <-> (a <3> 2)');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'1' <4> '2'
(1 row)
SELECT to_tsquery('english', '1 <-> (2 <3> a)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '((a <-> 1) <-> 2) <-> s');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '(2 <-> (a <-> 1)) <-> s');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <2> '1'
(1 row)
SELECT to_tsquery('english', '((1 <-> a) <-> 2) <-> s');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '(2 <-> (1 <-> a)) <-> s');
to_tsquery
-------------
'2' <-> '1'
(1 row)
SELECT to_tsquery('english', 's <-> ((a <-> 1) <-> 2)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', 's <-> (2 <-> (a <-> 1))');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <2> '1'
(1 row)
SELECT to_tsquery('english', 's <-> ((1 <-> a) <-> 2)');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', 's <-> (2 <-> (1 <-> a))');
to_tsquery
-------------
'2' <-> '1'
(1 row)
SELECT to_tsquery('english', '((a <-> 1) <-> s) <-> 2');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '(s <-> (a <-> 1)) <-> 2');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '((1 <-> a) <-> s) <-> 2');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', '(s <-> (1 <-> a)) <-> 2');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '2 <-> ((a <-> 1) <-> s)');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <2> '1'
(1 row)
SELECT to_tsquery('english', '2 <-> (s <-> (a <-> 1))');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <3> '1'
(1 row)
SELECT to_tsquery('english', '2 <-> ((1 <-> a) <-> s)');
to_tsquery
-------------
'2' <-> '1'
(1 row)
SELECT to_tsquery('english', '2 <-> (s <-> (1 <-> a))');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <2> '1'
(1 row)
SELECT to_tsquery('english', 'foo <-> (a <-> (the <-> bar))');
to_tsquery
-----------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'foo' <3> 'bar'
(1 row)
SELECT to_tsquery('english', '((foo <-> a) <-> the) <-> bar');
to_tsquery
-----------------
'foo' <3> 'bar'
(1 row)
SELECT to_tsquery('english', 'foo <-> a <-> the <-> bar');
to_tsquery
-----------------
'foo' <3> 'bar'
(1 row)
SELECT phraseto_tsquery('english', 'PostgreSQL can be extended by the user in many ways');
phraseto_tsquery
-----------------------------------------------------------
'postgresql' <3> 'extend' <3> 'user' <2> 'mani' <-> 'way'
(1 row)
SELECT ts_rank_cd(to_tsvector('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
'), to_tsquery('english', 'paint&water'));
ts_rank_cd
------------
0.05
(1 row)
SELECT ts_rank_cd(to_tsvector('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
'), to_tsquery('english', 'breath&motion&water'));
Change floating-point output format for improved performance. Previously, floating-point output was done by rounding to a specific decimal precision; by default, to 6 or 15 decimal digits (losing information) or as requested using extra_float_digits. Drivers that wanted exact float values, and applications like pg_dump that must preserve values exactly, set extra_float_digits=3 (or sometimes 2 for historical reasons, though this isn't enough for float4). Unfortunately, decimal rounded output is slow enough to become a noticable bottleneck when dealing with large result sets or COPY of large tables when many floating-point values are involved. Floating-point output can be done much faster when the output is not rounded to a specific decimal length, but rather is chosen as the shortest decimal representation that is closer to the original float value than to any other value representable in the same precision. The recently published Ryu algorithm by Ulf Adams is both relatively simple and remarkably fast. Accordingly, change float4out/float8out to output shortest decimal representations if extra_float_digits is greater than 0, and make that the new default. Applications that need rounded output can set extra_float_digits back to 0 or below, and take the resulting performance hit. We make one concession to portability for systems with buggy floating-point input: we do not output decimal values that fall exactly halfway between adjacent representable binary values (which would rely on the reader doing round-to-nearest-even correctly). This is known to be a problem at least for VS2013 on Windows. Our version of the Ryu code originates from https://github.com/ulfjack/ryu/ at commit c9c3fb1979, but with the following (significant) modifications: - Output format is changed to use fixed-point notation for small exponents, as printf would, and also to use lowercase 'e', a minimum of 2 exponent digits, and a mandatory sign on the exponent, to keep the formatting as close as possible to previous output. - The output of exact midpoint values is disabled as noted above. - The integer fast-path code is changed somewhat (since we have fixed-point output and the upstream did not). - Our project style has been largely applied to the code with the exception of C99 declaration-after-statement, which has been retained as an exception to our present policy. - Most of upstream's debugging and conditionals are removed, and we use our own configure tests to determine things like uint128 availability. Changing the float output format obviously affects a number of regression tests. This patch uses an explicit setting of extra_float_digits=0 for test output that is not expected to be exactly reproducible (e.g. due to numerical instability or differing algorithms for transcendental functions). Conversions from floats to numeric are unchanged by this patch. These may appear in index expressions and it is not yet clear whether any change should be made, so that can be left for another day. This patch assumes that the only supported floating point format is now IEEE format, and the documentation is updated to reflect that. Code by me, adapting the work of Ulf Adams and other contributors. References: https://dl.acm.org/citation.cfm?id=3192369 Reviewed-by: Tom Lane, Andres Freund, Donald Dong Discussion: https://postgr.es/m/87r2el1bx6.fsf@news-spur.riddles.org.uk
2019-02-13 16:20:33 +01:00
ts_rank_cd
-------------
0.008333334
(1 row)
SELECT ts_rank_cd(to_tsvector('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
'), to_tsquery('english', 'ocean'));
ts_rank_cd
------------
0.1
(1 row)
SELECT ts_rank_cd(to_tsvector('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
'), to_tsquery('english', 'painted <-> Ship'));
ts_rank_cd
------------
0.1
(1 row)
SELECT ts_rank_cd(strip(to_tsvector('both stripped')),
to_tsquery('both & stripped'));
ts_rank_cd
------------
0
(1 row)
SELECT ts_rank_cd(to_tsvector('unstripped') || strip(to_tsvector('stripped')),
to_tsquery('unstripped & stripped'));
ts_rank_cd
------------
0
(1 row)
--headline tests
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'paint&water'));
ts_headline
-----------------------------------------
<b>painted</b> Ocean. +
<b>Water</b>, <b>water</b>, every where+
And all the boards did shrink; +
<b>Water</b>, <b>water</b>, every
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'breath&motion&water'));
ts_headline
----------------------------------
<b>breath</b> nor <b>motion</b>,+
As idle as a painted Ship +
Upon a painted Ocean. +
<b>Water</b>, <b>water</b>
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean'));
ts_headline
----------------------------------
<b>Ocean</b>. +
Water, water, every where +
And all the boards did shrink;+
Water, water, every where
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'day & drink'));
ts_headline
------------------------------------
<b>day</b>, +
We stuck, nor breath nor motion,+
As idle as a painted Ship +
Upon a painted Ocean. +
Water, water, every where +
And all the boards did shrink; +
Water, water, every where, +
Nor any drop
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'day | drink'));
ts_headline
-----------------------------------------------------------
<b>Day</b> after <b>day</b>, <b>day</b> after <b>day</b>,+
We stuck, nor breath nor motion, +
As idle as a painted
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'day | !drink'));
ts_headline
-----------------------------------------------------------
<b>Day</b> after <b>day</b>, <b>day</b> after <b>day</b>,+
We stuck, nor breath nor motion, +
As idle as a painted
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'painted <-> Ship & drink'));
ts_headline
----------------------------------
<b>painted</b> <b>Ship</b> +
Upon a <b>painted</b> Ocean. +
Water, water, every where +
And all the boards did shrink;+
Water, water, every where, +
Nor any drop to <b>drink</b>
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'painted <-> Ship | drink'));
ts_headline
---------------------------------
<b>painted</b> <b>Ship</b> +
Upon a <b>painted</b> Ocean. +
Water, water, every where +
Fix default text search parser's ts_headline code for phrase queries. This code could produce very poor results when asked to highlight a string based on a query using phrase-match operators. The root cause is that hlCover(), which is supposed to find a minimal substring that matches the query, was written assuming that word position is not significant. I'm only 95% convinced that its algorithm was correct even for plain AND/OR queries; but it definitely fails completely for phrase matches, causing it to possibly not identify a cover string at all. Hence, rewrite hlCover() with a less-tense algorithm that just tries all the possible substrings, earlier and shorter ones first. (This is not as bad as it sounds performance-wise, because all of the string matching has been done already: the repeated tsquery match checks boil down to pointer comparisons.) Unfortunately, since that approach produces more candidate cover strings than before, it also exposes that there were bugs in the heuristics in mark_hl_words() for selecting a best cover string. Fixes there include: * Do not apply the ShortWord filter to words that appear in the query. * Remove a misguided optimization for quickly rejecting a cover. * Fix order-of-operation bug that could cause computation of a wrong figure of merit (poslen) when shortening a cover. * Change the preference rule so that candidate headlines that do not include their whole cover string (after MaxWords trimming) are lowest priority, since they may not actually satisfy the user's query. This results in some changes in existing regression test cases, but they all seem reasonable. Note in particular that the tests involving strings like "1 2 3" were previously being affected by the ShortWord filter, masking the normal matching behavior. Per bug #16345 from Augustinas Jokubauskas; the new test cases are based on that example. Back-patch to 9.6 where phrase search was added to tsquery. Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
2020-04-09 19:19:23 +02:00
And all the boards did shrink
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'painted <-> Ship | !drink'));
ts_headline
---------------------------------
<b>painted</b> <b>Ship</b> +
Upon a <b>painted</b> Ocean. +
Water, water, every where +
And all the boards did shrink
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', phraseto_tsquery('english', 'painted Ocean'));
ts_headline
----------------------------------
<b>painted</b> <b>Ocean</b>. +
Water, water, every where +
And all the boards did shrink;+
Water, water, every
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', phraseto_tsquery('english', 'idle as a painted Ship'));
ts_headline
---------------------------------------------
<b>idle</b> as a <b>painted</b> <b>Ship</b>+
Upon a <b>painted</b> Ocean. +
Water, water, every where +
And all the boards
(1 row)
Fix default text search parser's ts_headline code for phrase queries. This code could produce very poor results when asked to highlight a string based on a query using phrase-match operators. The root cause is that hlCover(), which is supposed to find a minimal substring that matches the query, was written assuming that word position is not significant. I'm only 95% convinced that its algorithm was correct even for plain AND/OR queries; but it definitely fails completely for phrase matches, causing it to possibly not identify a cover string at all. Hence, rewrite hlCover() with a less-tense algorithm that just tries all the possible substrings, earlier and shorter ones first. (This is not as bad as it sounds performance-wise, because all of the string matching has been done already: the repeated tsquery match checks boil down to pointer comparisons.) Unfortunately, since that approach produces more candidate cover strings than before, it also exposes that there were bugs in the heuristics in mark_hl_words() for selecting a best cover string. Fixes there include: * Do not apply the ShortWord filter to words that appear in the query. * Remove a misguided optimization for quickly rejecting a cover. * Fix order-of-operation bug that could cause computation of a wrong figure of merit (poslen) when shortening a cover. * Change the preference rule so that candidate headlines that do not include their whole cover string (after MaxWords trimming) are lowest priority, since they may not actually satisfy the user's query. This results in some changes in existing regression test cases, but they all seem reasonable. Note in particular that the tests involving strings like "1 2 3" were previously being affected by the ShortWord filter, masking the normal matching behavior. Per bug #16345 from Augustinas Jokubauskas; the new test cases are based on that example. Back-patch to 9.6 where phrase search was added to tsquery. Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
2020-04-09 19:19:23 +02:00
SELECT ts_headline('english',
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
'MaxWords=100, MinWords=1');
ts_headline
-------------------------------------------------------------------------------
<b>Lorem</b> ipsum <b>urna</b>. Nullam nullam <b>ullamcorper</b> <b>urna</b>
(1 row)
SELECT ts_headline('english',
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
phraseto_tsquery('english','ullamcorper urna'),
'MaxWords=100, MinWords=5');
ts_headline
-------------------------------------------------------------
<b>urna</b>. Nullam nullam <b>ullamcorper</b> <b>urna</b>.
(1 row)
SELECT ts_headline('english', '
<html>
<!-- some comment -->
<body>
Sea view wow <u>foo bar</u> <i>qq</i>
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
ff-bg
<script>
document.write(15);
</script>
</body>
</html>',
to_tsquery('english', 'sea&foo'), 'HighlightAll=true');
ts_headline
-----------------------------------------------------------------------------
+
<html> +
<!-- some comment --> +
<body> +
<b>Sea</b> view wow <u><b>foo</b> bar</u> <i>qq</i> +
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>+
ff-bg +
<script> +
document.write(15); +
</script> +
</body> +
</html>
(1 row)
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=2, MinWords=1');
ts_headline
-------------------
<b>1</b> <b>3</b>
(1 row)
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 & 3', 'MaxWords=4, MinWords=1');
Fix default text search parser's ts_headline code for phrase queries. This code could produce very poor results when asked to highlight a string based on a query using phrase-match operators. The root cause is that hlCover(), which is supposed to find a minimal substring that matches the query, was written assuming that word position is not significant. I'm only 95% convinced that its algorithm was correct even for plain AND/OR queries; but it definitely fails completely for phrase matches, causing it to possibly not identify a cover string at all. Hence, rewrite hlCover() with a less-tense algorithm that just tries all the possible substrings, earlier and shorter ones first. (This is not as bad as it sounds performance-wise, because all of the string matching has been done already: the repeated tsquery match checks boil down to pointer comparisons.) Unfortunately, since that approach produces more candidate cover strings than before, it also exposes that there were bugs in the heuristics in mark_hl_words() for selecting a best cover string. Fixes there include: * Do not apply the ShortWord filter to words that appear in the query. * Remove a misguided optimization for quickly rejecting a cover. * Fix order-of-operation bug that could cause computation of a wrong figure of merit (poslen) when shortening a cover. * Change the preference rule so that candidate headlines that do not include their whole cover string (after MaxWords trimming) are lowest priority, since they may not actually satisfy the user's query. This results in some changes in existing regression test cases, but they all seem reasonable. Note in particular that the tests involving strings like "1 2 3" were previously being affected by the ShortWord filter, masking the normal matching behavior. Per bug #16345 from Augustinas Jokubauskas; the new test cases are based on that example. Back-patch to 9.6 where phrase search was added to tsquery. Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
2020-04-09 19:19:23 +02:00
ts_headline
---------------------
<b>1</b> 2 <b>3</b>
(1 row)
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=4, MinWords=1');
ts_headline
-------------------
<b>1</b> <b>3</b>
(1 row)
--Check if headline fragments work
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean'), 'MaxFragments=1');
ts_headline
------------------------------------
after day, +
We stuck, nor breath nor motion,+
As idle as a painted Ship +
Upon a painted <b>Ocean</b>. +
Water, water, every where +
And all the boards did shrink; +
Water, water, every where, +
Nor any drop
(1 row)
--Check if more than one fragments are displayed
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
ts_headline
----------------------------------------------
after day, day after day, +
We <b>stuck</b>, nor breath nor motion, +
As idle as a painted Ship +
Upon a painted Ocean. +
Water, water, every where +
And all the boards did shrink; +
Water, water, every where ... drop to drink.+
S. T. <b>Coleridge</b>
(1 row)
--Fragments when there all query words are not in the document
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
ts_headline
------------------------------------
+
Day after day, day after day, +
We stuck, nor breath nor motion,+
As idle as
(1 row)
--FragmentDelimiter option
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
ts_headline
--------------------------------------------
after day, day after day, +
We <b>stuck</b>, nor breath nor motion, +
As idle as a painted Ship +
Upon a painted Ocean. +
Water, water, every where +
And all the boards did shrink; +
Water, water, every where***drop to drink.+
S. T. <b>Coleridge</b>
(1 row)
Fix default text search parser's ts_headline code for phrase queries. This code could produce very poor results when asked to highlight a string based on a query using phrase-match operators. The root cause is that hlCover(), which is supposed to find a minimal substring that matches the query, was written assuming that word position is not significant. I'm only 95% convinced that its algorithm was correct even for plain AND/OR queries; but it definitely fails completely for phrase matches, causing it to possibly not identify a cover string at all. Hence, rewrite hlCover() with a less-tense algorithm that just tries all the possible substrings, earlier and shorter ones first. (This is not as bad as it sounds performance-wise, because all of the string matching has been done already: the repeated tsquery match checks boil down to pointer comparisons.) Unfortunately, since that approach produces more candidate cover strings than before, it also exposes that there were bugs in the heuristics in mark_hl_words() for selecting a best cover string. Fixes there include: * Do not apply the ShortWord filter to words that appear in the query. * Remove a misguided optimization for quickly rejecting a cover. * Fix order-of-operation bug that could cause computation of a wrong figure of merit (poslen) when shortening a cover. * Change the preference rule so that candidate headlines that do not include their whole cover string (after MaxWords trimming) are lowest priority, since they may not actually satisfy the user's query. This results in some changes in existing regression test cases, but they all seem reasonable. Note in particular that the tests involving strings like "1 2 3" were previously being affected by the ShortWord filter, masking the normal matching behavior. Per bug #16345 from Augustinas Jokubauskas; the new test cases are based on that example. Back-patch to 9.6 where phrase search was added to tsquery. Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
2020-04-09 19:19:23 +02:00
--Fragments with phrase search
SELECT ts_headline('english',
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
'MaxFragments=100, MaxWords=100, MinWords=1');
ts_headline
-------------------------------------------------------------------------------
<b>Lorem</b> ipsum <b>urna</b>. Nullam nullam <b>ullamcorper</b> <b>urna</b>
(1 row)
2023-04-06 21:52:37 +02:00
-- Edge cases with empty query
SELECT ts_headline('english',
'', to_tsquery('english', ''));
2023-04-06 21:52:37 +02:00
NOTICE: text-search query doesn't contain lexemes: ""
ts_headline
-------------
(1 row)
SELECT ts_headline('english',
'foo bar', to_tsquery('english', ''));
2023-04-06 21:52:37 +02:00
NOTICE: text-search query doesn't contain lexemes: ""
ts_headline
-------------
foo bar
(1 row)
--Rewrite sub system
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
\set ECHO none
ALTER TABLE test_tsquery ADD COLUMN keyword tsquery;
UPDATE test_tsquery SET keyword = to_tsquery('english', txtkeyword);
ALTER TABLE test_tsquery ADD COLUMN sample tsquery;
UPDATE test_tsquery SET sample = to_tsquery('english', txtsample::text);
SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new <-> york';
count
-------
2
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new <-> york';
count
-------
3
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new <-> york';
count
-------
1
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new <-> york';
count
-------
4
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new <-> york';
count
-------
3
(1 row)
CREATE UNIQUE INDEX bt_tsq ON test_tsquery (keyword);
SET enable_seqscan=OFF;
SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new <-> york';
count
-------
2
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new <-> york';
count
-------
3
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new <-> york';
count
-------
1
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new <-> york';
count
-------
4
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new <-> york';
count
-------
3
(1 row)
RESET enable_seqscan;
SELECT ts_rewrite('foo & bar & qq & new & york', 'new & york'::tsquery, 'big & apple | nyc | new & york & city');
ts_rewrite
------------------------------------------------------------------------------
'foo' & 'bar' & 'qq' & ( 'city' & 'new' & 'york' | 'nyc' | 'big' & 'apple' )
(1 row)
SELECT ts_rewrite(ts_rewrite('new & !york ', 'york', '!jersey'),
'jersey', 'mexico');
ts_rewrite
--------------------
'new' & !!'mexico'
(1 row)
SELECT ts_rewrite('moscow', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite('moscow & hotel', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite('bar & qq & foo & (new <-> york)', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
-------------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
(1 row)
SELECT ts_rewrite( 'moscow', 'SELECT keyword, sample FROM test_tsquery');
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( 'moscow & hotel', 'SELECT keyword, sample FROM test_tsquery');
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( 'bar & qq & foo & (new <-> york)', 'SELECT keyword, sample FROM test_tsquery');
ts_rewrite
-------------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
(1 row)
SELECT ts_rewrite('1 & (2 <-> 3)', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
-------------
'2' <-> '4'
(1 row)
SELECT ts_rewrite('1 & (2 <2> 3)', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
-------------------
'1' & '2' <2> '3'
(1 row)
SELECT ts_rewrite('5 <-> (1 & (2 <-> 3))', 'SELECT keyword, sample FROM test_tsquery'::text );
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
ts_rewrite
-------------------------
'5' <-> ( '2' <-> '4' )
(1 row)
SELECT ts_rewrite('5 <-> (6 | 8)', 'SELECT keyword, sample FROM test_tsquery'::text );
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
ts_rewrite
-----------------------
'5' <-> ( '6' | '8' )
(1 row)
-- Check empty substitution
SELECT ts_rewrite(to_tsquery('5 & (6 | 5)'), to_tsquery('5'), to_tsquery(''));
NOTICE: text-search query doesn't contain lexemes: ""
ts_rewrite
------------
'6'
(1 row)
SELECT ts_rewrite(to_tsquery('!5'), to_tsquery('5'), to_tsquery(''));
NOTICE: text-search query doesn't contain lexemes: ""
ts_rewrite
------------
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword @> 'new';
keyword
------------------
'new' <-> 'york'
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword @> 'moscow';
keyword
----------
'moscow'
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword <@ 'new';
keyword
---------
(0 rows)
SELECT keyword FROM test_tsquery WHERE keyword <@ 'moscow';
keyword
----------
'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & qq & foo & (new <-> york)') AS query;
ts_rewrite
-------------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & qq & foo & (new <-> york)') AS query;
ts_rewrite
-------------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
(1 row)
CREATE INDEX qq ON test_tsquery USING gist (keyword tsquery_ops);
SET enable_seqscan=OFF;
SELECT keyword FROM test_tsquery WHERE keyword @> 'new';
keyword
------------------
'new' <-> 'york'
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword @> 'moscow';
keyword
----------
'moscow'
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword <@ 'new';
keyword
---------
(0 rows)
SELECT keyword FROM test_tsquery WHERE keyword <@ 'moscow';
keyword
----------
'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & qq & foo & (new <-> york)') AS query;
ts_rewrite
-------------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & qq & foo & (new <-> york)') AS query;
ts_rewrite
-------------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
(1 row)
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
SELECT ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
ts_rewrite
-----------------------------------------
( 'bar' | 'baz' ) <-> ( 'bar' | 'baz' )
(1 row)
SELECT to_tsvector('foo bar') @@
ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
?column?
----------
f
(1 row)
SELECT to_tsvector('bar baz') @@
ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
?column?
----------
t
(1 row)
RESET enable_seqscan;
--test GUC
SET default_text_search_config=simple;
SELECT to_tsvector('SKIES My booKs');
to_tsvector
----------------------------
'books':3 'my':2 'skies':1
(1 row)
SELECT plainto_tsquery('SKIES My booKs');
plainto_tsquery
--------------------------
'skies' & 'my' & 'books'
(1 row)
SELECT to_tsquery('SKIES & My | booKs');
to_tsquery
--------------------------
'skies' & 'my' | 'books'
(1 row)
SET default_text_search_config=english;
SELECT to_tsvector('SKIES My booKs');
to_tsvector
------------------
'book':3 'sky':1
(1 row)
SELECT plainto_tsquery('SKIES My booKs');
plainto_tsquery
-----------------
'sky' & 'book'
(1 row)
SELECT to_tsquery('SKIES & My | booKs');
to_tsquery
----------------
'sky' | 'book'
(1 row)
--trigger
CREATE TRIGGER tsvectorupdate
BEFORE UPDATE OR INSERT ON test_tsvector
FOR EACH ROW EXECUTE PROCEDURE tsvector_update_trigger(a, 'pg_catalog.english', t);
SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty');
count
-------
0
(1 row)
INSERT INTO test_tsvector (t) VALUES ('345 qwerty');
SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty');
count
-------
1
(1 row)
UPDATE test_tsvector SET t = null WHERE t = '345 qwerty';
SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty');
count
-------
0
(1 row)
INSERT INTO test_tsvector (t) VALUES ('345 qwerty');
SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty');
count
-------
1
(1 row)
Allow functions-in-FROM to be pulled up if they reduce to constants. This allows simplification of the plan tree in some common usage patterns: we can get rid of a join to the function RTE. In principle we could pull up any immutable expression, but restricting it to Consts avoids the risk that multiple evaluations of the expression might cost more than we can save. (Possibly this could be improved in future --- but we've more or less promised people that putting a function in FROM guarantees single evaluation, so we'd have to tread carefully.) To do this, we need to rearrange when eval_const_expressions() happens for expressions in function RTEs. I moved it to inline_set_returning_functions(), which already has to iterate over every function RTE, and in consequence renamed that function to preprocess_function_rtes(). A useful consequence is that inline_set_returning_function() no longer has to do this for itself, simplifying that code. In passing, break out pull_up_simple_subquery's code that knows where everything that needs pullup_replace_vars() processing is, so that the new pull_up_constant_function() routine can share it. We'd gotten away with one-and-a-half copies of that code so far, since pull_up_simple_values() could assume that a lot of cases didn't apply to it --- but I don't think pull_up_constant_function() can make any simplifying assumptions. Might as well make pull_up_simple_values() use it too. (Possibly this refactoring should go further: maybe we could share some of the code to fill in the pullup_replace_vars_context struct? For now, I left it that the callers fill that completely.) Note: the one existing test case that this patch changes has to be changed because inlining its function RTEs would destroy the point of the test, namely to check join order. Alexander Kuzmenkov and Aleksandr Parfenov, reviewed by Antonin Houska and Anastasia Lubennikova, and whacked around some more by me Discussion: https://postgr.es/m/402356c32eeb93d4fed01f66d6c7fe2d@postgrespro.ru
2019-08-02 00:50:22 +02:00
-- Test inlining of immutable constant functions
-- to_tsquery(text) is not immutable, so it won't be inlined
explain (costs off)
select * from test_tsquery, to_tsquery('new') q where txtsample @@ q;
QUERY PLAN
------------------------------------------------
Nested Loop
Join Filter: (test_tsquery.txtsample @@ q.q)
-> Function Scan on to_tsquery q
-> Seq Scan on test_tsquery
(4 rows)
-- to_tsquery(regconfig, text) is an immutable function.
-- That allows us to get rid of using function scan and join at all.
explain (costs off)
select * from test_tsquery, to_tsquery('english', 'new') q where txtsample @@ q;
QUERY PLAN
---------------------------------------------
Seq Scan on test_tsquery
Filter: (txtsample @@ '''new'''::tsquery)
(2 rows)
-- test finding items in GIN's pending list
create temp table pendtest (ts tsvector);
create index pendtest_idx on pendtest using gin(ts);
insert into pendtest values (to_tsvector('Lore ipsam'));
insert into pendtest values (to_tsvector('Lore ipsum'));
select * from pendtest where 'ipsu:*'::tsquery @@ ts;
ts
--------------------
'ipsum':2 'lore':1
(1 row)
select * from pendtest where 'ipsa:*'::tsquery @@ ts;
ts
--------------------
'ipsam':2 'lore':1
(1 row)
select * from pendtest where 'ips:*'::tsquery @@ ts;
ts
--------------------
'ipsam':2 'lore':1
'ipsum':2 'lore':1
(2 rows)
select * from pendtest where 'ipt:*'::tsquery @@ ts;
ts
----
(0 rows)
select * from pendtest where 'ipi:*'::tsquery @@ ts;
ts
----
(0 rows)
--check OP_PHRASE on index
create temp table phrase_index_test(fts tsvector);
insert into phrase_index_test values ('A fat cat has just eaten a rat.');
insert into phrase_index_test values (to_tsvector('english', 'A fat cat has just eaten a rat.'));
create index phrase_index_test_idx on phrase_index_test using gin(fts);
set enable_seqscan = off;
select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat cat');
fts
-----------------------------------
'cat':3 'eaten':6 'fat':2 'rat':8
(1 row)
set enable_seqscan = on;
-- test websearch_to_tsquery function
select websearch_to_tsquery('simple', 'I have a fat:*ABCD cat');
websearch_to_tsquery
---------------------------------------------
'i' & 'have' & 'a' & 'fat' & 'abcd' & 'cat'
(1 row)
select websearch_to_tsquery('simple', 'orange:**AABBCCDD');
websearch_to_tsquery
-----------------------
'orange' & 'aabbccdd'
(1 row)
select websearch_to_tsquery('simple', 'fat:A!cat:B|rat:C<');
websearch_to_tsquery
-----------------------------------------
'fat' & 'a' & 'cat' & 'b' & 'rat' & 'c'
(1 row)
select websearch_to_tsquery('simple', 'fat:A : cat:B');
websearch_to_tsquery
---------------------------
'fat' & 'a' & 'cat' & 'b'
(1 row)
select websearch_to_tsquery('simple', 'fat*rat');
websearch_to_tsquery
----------------------
'fat' <-> 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat-rat');
websearch_to_tsquery
-------------------------------
'fat-rat' <-> 'fat' <-> 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat_rat');
websearch_to_tsquery
----------------------
'fat' <-> 'rat'
(1 row)
-- weights are completely ignored
select websearch_to_tsquery('simple', 'abc : def');
websearch_to_tsquery
----------------------
'abc' & 'def'
(1 row)
select websearch_to_tsquery('simple', 'abc:def');
websearch_to_tsquery
----------------------
'abc' & 'def'
(1 row)
select websearch_to_tsquery('simple', 'a:::b');
websearch_to_tsquery
----------------------
'a' & 'b'
(1 row)
select websearch_to_tsquery('simple', 'abc:d');
websearch_to_tsquery
----------------------
'abc' & 'd'
(1 row)
select websearch_to_tsquery('simple', ':');
NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored
websearch_to_tsquery
----------------------
(1 row)
-- these operators are ignored
select websearch_to_tsquery('simple', 'abc & def');
websearch_to_tsquery
----------------------
'abc' & 'def'
(1 row)
select websearch_to_tsquery('simple', 'abc | def');
websearch_to_tsquery
----------------------
'abc' & 'def'
(1 row)
select websearch_to_tsquery('simple', 'abc <-> def');
websearch_to_tsquery
----------------------
'abc' & 'def'
(1 row)
select websearch_to_tsquery('simple', 'abc (pg or class)');
websearch_to_tsquery
------------------------
'abc' & 'pg' | 'class'
(1 row)
-- NOT is ignored in quotes
select websearch_to_tsquery('english', 'My brand new smartphone');
websearch_to_tsquery
-------------------------------
'brand' & 'new' & 'smartphon'
(1 row)
select websearch_to_tsquery('english', 'My brand "new smartphone"');
websearch_to_tsquery
---------------------------------
'brand' & 'new' <-> 'smartphon'
(1 row)
select websearch_to_tsquery('english', 'My brand "new -smartphone"');
websearch_to_tsquery
---------------------------------
'brand' & 'new' <-> 'smartphon'
(1 row)
-- test OR operator
select websearch_to_tsquery('simple', 'cat or rat');
websearch_to_tsquery
----------------------
'cat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'cat OR rat');
websearch_to_tsquery
----------------------
'cat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'cat "OR" rat');
websearch_to_tsquery
----------------------
'cat' & 'or' & 'rat'
(1 row)
select websearch_to_tsquery('simple', 'cat OR');
websearch_to_tsquery
----------------------
'cat' & 'or'
(1 row)
select websearch_to_tsquery('simple', 'OR rat');
websearch_to_tsquery
----------------------
'or' & 'rat'
(1 row)
select websearch_to_tsquery('simple', '"fat cat OR rat"');
websearch_to_tsquery
------------------------------------
'fat' <-> 'cat' <-> 'or' <-> 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat (cat OR rat');
websearch_to_tsquery
-----------------------
'fat' & 'cat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'or OR or');
websearch_to_tsquery
----------------------
'or' | 'or'
(1 row)
-- OR is an operator here ...
select websearch_to_tsquery('simple', '"fat cat"or"fat rat"');
websearch_to_tsquery
-----------------------------------
'fat' <-> 'cat' | 'fat' <-> 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat or(rat');
websearch_to_tsquery
----------------------
'fat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat or)rat');
websearch_to_tsquery
----------------------
'fat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat or&rat');
websearch_to_tsquery
----------------------
'fat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat or|rat');
websearch_to_tsquery
----------------------
'fat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat or!rat');
websearch_to_tsquery
----------------------
'fat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat or<rat');
websearch_to_tsquery
----------------------
'fat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat or>rat');
websearch_to_tsquery
----------------------
'fat' | 'rat'
(1 row)
select websearch_to_tsquery('simple', 'fat or ');
websearch_to_tsquery
----------------------
'fat' & 'or'
(1 row)
-- ... but not here
select websearch_to_tsquery('simple', 'abc orange');
websearch_to_tsquery
----------------------
'abc' & 'orange'
(1 row)
select websearch_to_tsquery('simple', 'abc OR1234');
websearch_to_tsquery
----------------------
'abc' & 'or1234'
(1 row)
select websearch_to_tsquery('simple', 'abc or-abc');
websearch_to_tsquery
-------------------------------------
'abc' & 'or-abc' <-> 'or' <-> 'abc'
(1 row)
select websearch_to_tsquery('simple', 'abc OR_abc');
websearch_to_tsquery
------------------------
'abc' & 'or' <-> 'abc'
(1 row)
-- test quotes
select websearch_to_tsquery('english', '"pg_class pg');
websearch_to_tsquery
---------------------------
'pg' <-> 'class' <-> 'pg'
(1 row)
select websearch_to_tsquery('english', 'pg_class pg"');
websearch_to_tsquery
-------------------------
'pg' <-> 'class' & 'pg'
(1 row)
select websearch_to_tsquery('english', '"pg_class pg"');
websearch_to_tsquery
---------------------------
'pg' <-> 'class' <-> 'pg'
(1 row)
select websearch_to_tsquery('english', '"pg_class : pg"');
websearch_to_tsquery
---------------------------
'pg' <-> 'class' <-> 'pg'
(1 row)
select websearch_to_tsquery('english', 'abc "pg_class pg"');
websearch_to_tsquery
-----------------------------------
'abc' & 'pg' <-> 'class' <-> 'pg'
(1 row)
select websearch_to_tsquery('english', '"pg_class pg" def');
websearch_to_tsquery
-----------------------------------
'pg' <-> 'class' <-> 'pg' & 'def'
(1 row)
select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');
websearch_to_tsquery
----------------------------------------------------
'abc' & 'pg' <-> 'pg' <-> 'class' <-> 'pg' & 'def'
(1 row)
select websearch_to_tsquery('english', ' or "pg pg_class pg" or ');
websearch_to_tsquery
------------------------------------
'pg' <-> 'pg' <-> 'class' <-> 'pg'
(1 row)
select websearch_to_tsquery('english', '""pg pg_class pg""');
websearch_to_tsquery
--------------------------------
'pg' & 'pg' <-> 'class' & 'pg'
(1 row)
select websearch_to_tsquery('english', 'abc """"" def');
websearch_to_tsquery
----------------------
'abc' & 'def'
(1 row)
select websearch_to_tsquery('english', 'cat -"fat rat"');
websearch_to_tsquery
------------------------------
'cat' & !( 'fat' <-> 'rat' )
(1 row)
select websearch_to_tsquery('english', 'cat -"fat rat" cheese');
websearch_to_tsquery
----------------------------------------
'cat' & !( 'fat' <-> 'rat' ) & 'chees'
(1 row)
select websearch_to_tsquery('english', 'abc "def -"');
websearch_to_tsquery
----------------------
'abc' & 'def'
(1 row)
select websearch_to_tsquery('english', 'abc "def :"');
websearch_to_tsquery
----------------------
'abc' & 'def'
(1 row)
select websearch_to_tsquery('english', '"A fat cat" has just eaten a -rat.');
websearch_to_tsquery
------------------------------------
'fat' <-> 'cat' & 'eaten' & !'rat'
(1 row)
select websearch_to_tsquery('english', '"A fat cat" has just eaten OR !rat.');
websearch_to_tsquery
-----------------------------------
'fat' <-> 'cat' & 'eaten' | 'rat'
(1 row)
select websearch_to_tsquery('english', '"A fat cat" has just (+eaten OR -rat)');
websearch_to_tsquery
------------------------------------
'fat' <-> 'cat' & 'eaten' | !'rat'
(1 row)
select websearch_to_tsquery('english', 'this is ----fine');
websearch_to_tsquery
----------------------
!!!!'fine'
(1 row)
select websearch_to_tsquery('english', '(()) )))) this ||| is && -fine, "dear friend" OR good');
websearch_to_tsquery
----------------------------------------
!'fine' & 'dear' <-> 'friend' | 'good'
(1 row)
select websearch_to_tsquery('english', 'an old <-> cat " is fine &&& too');
websearch_to_tsquery
------------------------
'old' & 'cat' & 'fine'
(1 row)
select websearch_to_tsquery('english', '"A the" OR just on');
NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored
websearch_to_tsquery
----------------------
(1 row)
select websearch_to_tsquery('english', '"a fat cat" ate a rat');
websearch_to_tsquery
---------------------------------
'fat' <-> 'cat' & 'ate' & 'rat'
(1 row)
select to_tsvector('english', 'A fat cat ate a rat') @@
websearch_to_tsquery('english', '"a fat cat" ate a rat');
?column?
----------
t
(1 row)
select to_tsvector('english', 'A fat grey cat ate a rat') @@
websearch_to_tsquery('english', '"a fat cat" ate a rat');
?column?
----------
f
(1 row)
-- cases handled by gettoken_tsvector()
select websearch_to_tsquery('''');
NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored
websearch_to_tsquery
----------------------
(1 row)
select websearch_to_tsquery('''abc''''def''');
websearch_to_tsquery
----------------------
'abc' <-> 'def'
(1 row)
select websearch_to_tsquery('\abc');
websearch_to_tsquery
----------------------
'abc'
(1 row)
select websearch_to_tsquery('\');
NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored
websearch_to_tsquery
----------------------
(1 row)