postgresql/src/test/regress/expected/tsearch.out

1675 lines
48 KiB
Plaintext
Raw Normal View History

--
-- Sanity checks for text search catalogs
--
-- NB: we assume the oidjoins test will have caught any dangling links,
-- that is OID or REGPROC fields that are not zero and do not match some
-- row in the linked-to table. However, if we want to enforce that a link
-- field can't be 0, we have to check it here.
-- Find unexpected zero link entries
SELECT oid, prsname
FROM pg_ts_parser
WHERE prsnamespace = 0 OR prsstart = 0 OR prstoken = 0 OR prsend = 0 OR
-- prsheadline is optional
prslextype = 0;
oid | prsname
-----+---------
(0 rows)
SELECT oid, dictname
FROM pg_ts_dict
WHERE dictnamespace = 0 OR dictowner = 0 OR dicttemplate = 0;
oid | dictname
-----+----------
(0 rows)
SELECT oid, tmplname
FROM pg_ts_template
WHERE tmplnamespace = 0 OR tmpllexize = 0; -- tmplinit is optional
oid | tmplname
-----+----------
(0 rows)
SELECT oid, cfgname
FROM pg_ts_config
WHERE cfgnamespace = 0 OR cfgowner = 0 OR cfgparser = 0;
oid | cfgname
-----+---------
(0 rows)
SELECT mapcfg, maptokentype, mapseqno
FROM pg_ts_config_map
WHERE mapcfg = 0 OR mapdict = 0;
mapcfg | maptokentype | mapseqno
--------+--------------+----------
(0 rows)
-- Look for pg_ts_config_map entries that aren't one of parser's token types
SELECT * FROM
( SELECT oid AS cfgid, (ts_token_type(cfgparser)).tokid AS tokid
FROM pg_ts_config ) AS tt
RIGHT JOIN pg_ts_config_map AS m
ON (tt.cfgid=m.mapcfg AND tt.tokid=m.maptokentype)
WHERE
tt.cfgid IS NULL OR tt.tokid IS NULL;
cfgid | tokid | mapcfg | maptokentype | mapseqno | mapdict
-------+-------+--------+--------------+----------+---------
(0 rows)
-- test basic text search behavior without indexes, then with
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
create index wowidx on test_tsvector using gist (a);
SET enable_seqscan=OFF;
SET enable_indexscan=ON;
SET enable_bitmapscan=OFF;
explain (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
QUERY PLAN
-------------------------------------------------------
Aggregate
-> Index Scan using wowidx on test_tsvector
Index Cond: (a @@ '''wr'' | ''qh'''::tsquery)
(3 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
SET enable_indexscan=OFF;
SET enable_bitmapscan=ON;
explain (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
QUERY PLAN
-------------------------------------------------------------
Aggregate
-> Bitmap Heap Scan on test_tsvector
Recheck Cond: (a @@ '''wr'' | ''qh'''::tsquery)
-> Bitmap Index Scan on wowidx
Index Cond: (a @@ '''wr'' | ''qh'''::tsquery)
(5 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
RESET enable_seqscan;
RESET enable_indexscan;
RESET enable_bitmapscan;
DROP INDEX wowidx;
CREATE INDEX wowidx ON test_tsvector USING gin (a);
SET enable_seqscan=OFF;
-- GIN only supports bitmapscan, so no need to test plain indexscan
explain (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
QUERY PLAN
-------------------------------------------------------------
Aggregate
-> Bitmap Heap Scan on test_tsvector
Recheck Cond: (a @@ '''wr'' | ''qh'''::tsquery)
-> Bitmap Index Scan on wowidx
Index Cond: (a @@ '''wr'' | ''qh'''::tsquery)
(5 rows)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh';
count
-------
17
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt';
count
-------
6
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
count
-------
98
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
count
-------
23
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
count
-------
39
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
count
-------
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme';
count
-------
0
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme';
count
-------
508
(1 row)
RESET enable_seqscan;
INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH');
SELECT * FROM ts_stat('SELECT a FROM test_tsvector') ORDER BY ndoc DESC, nentry DESC, word LIMIT 10;
word | ndoc | nentry
------+------+--------
qq | 108 | 108
qt | 102 | 102
qe | 100 | 100
qh | 98 | 98
qw | 98 | 98
qa | 97 | 97
ql | 94 | 94
qs | 94 | 94
qi | 92 | 92
qr | 92 | 92
(10 rows)
SELECT * FROM ts_stat('SELECT a FROM test_tsvector', 'AB') ORDER BY ndoc DESC, nentry DESC, word;
word | ndoc | nentry
------+------+--------
DFG | 1 | 2
(1 row)
--dictionaries and to_tsvector
SELECT ts_lexize('english_stem', 'skies');
ts_lexize
-----------
{sky}
(1 row)
SELECT ts_lexize('english_stem', 'identity');
ts_lexize
-----------
{ident}
(1 row)
SELECT * FROM ts_token_type('default');
tokid | alias | description
-------+-----------------+------------------------------------------
1 | asciiword | Word, all ASCII
2 | word | Word, all letters
3 | numword | Word, letters and digits
4 | email | Email address
5 | url | URL
6 | host | Host
7 | sfloat | Scientific notation
8 | version | Version number
9 | hword_numpart | Hyphenated word part, letters and digits
10 | hword_part | Hyphenated word part, all letters
11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols
13 | tag | XML tag
14 | protocol | Protocol head
15 | numhword | Hyphenated word, letters and digits
16 | asciihword | Hyphenated word, all ASCII
17 | hword | Hyphenated word, all letters
18 | url_path | URL path
19 | file | File or path name
20 | float | Decimal notation
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | XML entity
(23 rows)
SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net teodor@123-stack.net 123_teodor@stack.net 123-teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty');
tokid | token
-------+--------------------------------------
22 | 345
12 |
1 | qwe
12 | @
19 | efd.r
12 | '
14 | http://
6 | www.com
12 | /
14 | http://
5 | aew.werc.ewr/?ad=qwe&dw
6 | aew.werc.ewr
18 | /?ad=qwe&dw
12 |
5 | 1aew.werc.ewr/?ad=qwe&dw
6 | 1aew.werc.ewr
18 | /?ad=qwe&dw
12 |
6 | 2aew.werc.ewr
12 |
14 | http://
5 | 3aew.werc.ewr/?ad=qwe&dw
6 | 3aew.werc.ewr
18 | /?ad=qwe&dw
12 |
14 | http://
6 | 4aew.werc.ewr
12 |
14 | http://
5 | 5aew.werc.ewr:8100/?
6 | 5aew.werc.ewr:8100
18 | /?
12 |
1 | ad
12 | =
1 | qwe
12 | &
1 | dw
12 |
5 | 6aew.werc.ewr:8100/?ad=qwe&dw
6 | 6aew.werc.ewr:8100
18 | /?ad=qwe&dw
12 |
5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
6 | 7aew.werc.ewr:8100
18 | /?ad=qwe&dw=%20%32
12 |
7 | +4.0e-10
12 |
1 | qwe
12 |
1 | qwe
12 |
1 | qwqwe
12 |
20 | 234.435
12 |
22 | 455
12 |
20 | 5.005
12 |
4 | teodor@stack.net
12 |
4 | teodor@123-stack.net
12 |
4 | 123_teodor@stack.net
12 |
4 | 123-teodor@stack.net
12 |
16 | qwe-wer
11 | qwe
12 | -
11 | wer
12 |
1 | asdf
12 |
13 | <fr>
1 | qwer
12 |
1 | jf
12 |
1 | sdjk
12 | <
1 | we
12 |
1 | hjwer
12 |
13 | <werrwe>
12 |
3 | ewr1
12 | >
3 | ewri2
12 |
13 | <a href="qwe<qwe>">
12 | +
|
19 | /usr/local/fff
12 |
19 | /awdf/dwqe/4325
12 |
19 | rewt/ewr
12 |
1 | wefjn
12 |
19 | /wqe-324/ewr
12 |
19 | gist.h
12 |
19 | gist.h.c
12 |
19 | gist.c
12 | .
1 | readline
12 |
20 | 4.2
12 |
20 | 4.2
12 | .
20 | 4.2
12 | ,
1 | readline
20 | -4.2
12 |
1 | readline
20 | -4.2
12 | .
22 | 234
12 | +
|
12 | <
1 | i
12 |
13 | <b>
12 |
1 | wow
12 |
12 | <
1 | jqw
12 |
12 | <>
1 | qwerty
(139 rows)
SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net teodor@123-stack.net 123_teodor@stack.net 123-teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty');
to_tsvector
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
'+4.0e-10':28 '-4.2':63,65 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':51 '/usr/local/fff':50 '/wqe-324/ewr':54 '123-teodor@stack.net':38 '123_teodor@stack.net':37 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':66 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':59,60,61 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':42 'dw':21 'efd.r':3 'ewr1':48 'ewri2':49 'gist.c':57 'gist.h':55 'gist.h.c':56 'hjwer':47 'jf':44 'jqw':69 'qwe':2,20,29,30,40 'qwe-wer':39 'qwer':43 'qwerti':70 'qwqwe':31 'readlin':58,62,64 'rewt/ewr':52 'sdjk':45 'teodor@123-stack.net':36 'teodor@stack.net':35 'wefjn':53 'wer':41 'wow':68 'www.com':4
(1 row)
SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net teodor@123-stack.net 123_teodor@stack.net 123-teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty'));
length
--------
56
(1 row)
-- ts_debug
SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def&#xa9;ghi&#245;jkl</myns:foo-bar_baz.blurfl>');
alias | description | token | dictionaries | dictionary | lexemes
-----------+-----------------+----------------------------+----------------+--------------+---------
tag | XML tag | <myns:foo-bar_baz.blurfl> | {} | |
asciiword | Word, all ASCII | abc | {english_stem} | english_stem | {abc}
entity | XML entity | &nm1; | {} | |
asciiword | Word, all ASCII | def | {english_stem} | english_stem | {def}
entity | XML entity | &#xa9; | {} | |
asciiword | Word, all ASCII | ghi | {english_stem} | english_stem | {ghi}
entity | XML entity | &#245; | {} | |
asciiword | Word, all ASCII | jkl | {english_stem} | english_stem | {jkl}
tag | XML tag | </myns:foo-bar_baz.blurfl> | {} | |
(9 rows)
-- check parsing of URLs
SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
alias | description | token | dictionaries | dictionary | lexemes
----------+---------------+----------------------------------------+--------------+------------+------------------------------------------
protocol | Protocol head | http:// | {} | |
url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx}
host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk}
url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx}
tag | XML tag | </span> | {} | |
(5 rows)
SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
alias | description | token | dictionaries | dictionary | lexemes
----------+---------------+----------------------------+--------------+------------+------------------------------
protocol | Protocol head | http:// | {} | |
url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw}
host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr}
url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw}
tag | XML tag | <span> | {} | |
(5 rows)
SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
alias | description | token | dictionaries | dictionary | lexemes
----------+---------------+----------------------+--------------+------------+------------------------
protocol | Protocol head | http:// | {} | |
url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?}
host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
url_path | URL path | /? | {simple} | simple | {/?}
(4 rows)
SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
alias | description | token | dictionaries | dictionary | lexemes
----------+-------------+------------------------+--------------+------------+--------------------------
url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx}
host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
url_path | URL path | /?xx | {simple} | simple | {/?xx}
(3 rows)
2017-09-25 17:55:24 +02:00
SELECT token, alias,
dictionaries, dictionaries is null as dnull, array_dims(dictionaries) as ddims,
lexemes, lexemes is null as lnull, array_dims(lexemes) as ldims
from ts_debug('english', 'a title');
token | alias | dictionaries | dnull | ddims | lexemes | lnull | ldims
-------+-----------+----------------+-------+-------+---------+-------+-------
a | asciiword | {english_stem} | f | [1:1] | {} | f |
| blank | {} | f | | | t |
title | asciiword | {english_stem} | f | [1:1] | {titl} | f | [1:1]
(3 rows)
-- to_tsquery
SELECT to_tsquery('english', 'qwe & sKies ');
to_tsquery
---------------
'qwe' & 'sky'
(1 row)
SELECT to_tsquery('simple', 'qwe & sKies ');
to_tsquery
-----------------
'qwe' & 'skies'
(1 row)
SELECT to_tsquery('english', '''the wether'':dc & '' sKies '':BC ');
to_tsquery
------------------------
'wether':CD & 'sky':BC
(1 row)
SELECT to_tsquery('english', 'asd&(and|fghj)');
to_tsquery
----------------
'asd' & 'fghj'
(1 row)
SELECT to_tsquery('english', '(asd&and)|fghj');
to_tsquery
----------------
'asd' | 'fghj'
(1 row)
SELECT to_tsquery('english', '(asd&!and)|fghj');
to_tsquery
----------------
'asd' | 'fghj'
(1 row)
SELECT to_tsquery('english', '(the|and&(i&1))&fghj');
to_tsquery
--------------
'1' & 'fghj'
(1 row)
SELECT plainto_tsquery('english', 'the and z 1))& fghj');
plainto_tsquery
--------------------
'z' & '1' & 'fghj'
(1 row)
SELECT plainto_tsquery('english', 'foo bar') && plainto_tsquery('english', 'asd');
?column?
-----------------------
'foo' & 'bar' & 'asd'
(1 row)
SELECT plainto_tsquery('english', 'foo bar') || plainto_tsquery('english', 'asd fg');
?column?
------------------------------
'foo' & 'bar' | 'asd' & 'fg'
(1 row)
SELECT plainto_tsquery('english', 'foo bar') || !!plainto_tsquery('english', 'asd fg');
?column?
-----------------------------------
'foo' & 'bar' | !( 'asd' & 'fg' )
(1 row)
SELECT plainto_tsquery('english', 'foo bar') && 'asd | fg';
?column?
----------------------------------
'foo' & 'bar' & ( 'asd' | 'fg' )
(1 row)
-- Check stop word deletion, a and s are stop-words
SELECT to_tsquery('english', '!(a & !b) & c');
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
to_tsquery
-------------
!!'b' & 'c'
(1 row)
SELECT to_tsquery('english', '!(a & !b)');
to_tsquery
------------
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
!!'b'
(1 row)
SELECT to_tsquery('english', '(1 <-> 2) <-> a');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '(1 <-> a) <-> 2');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '(a <-> 1) <-> 2');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', 'a <-> (1 <-> 2)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '1 <-> (a <-> 2)');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '1 <-> (2 <-> a)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '(1 <-> 2) <3> a');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '(1 <-> a) <3> 2');
to_tsquery
-------------
'1' <4> '2'
(1 row)
SELECT to_tsquery('english', '(a <-> 1) <3> 2');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', 'a <3> (1 <-> 2)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '1 <3> (a <-> 2)');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'1' <4> '2'
(1 row)
SELECT to_tsquery('english', '1 <3> (2 <-> a)');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', '(1 <3> 2) <-> a');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', '(1 <3> a) <-> 2');
to_tsquery
-------------
'1' <4> '2'
(1 row)
SELECT to_tsquery('english', '(a <3> 1) <-> 2');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', 'a <-> (1 <3> 2)');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', '1 <-> (a <3> 2)');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'1' <4> '2'
(1 row)
SELECT to_tsquery('english', '1 <-> (2 <3> a)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '((a <-> 1) <-> 2) <-> s');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '(2 <-> (a <-> 1)) <-> s');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <2> '1'
(1 row)
SELECT to_tsquery('english', '((1 <-> a) <-> 2) <-> s');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '(2 <-> (1 <-> a)) <-> s');
to_tsquery
-------------
'2' <-> '1'
(1 row)
SELECT to_tsquery('english', 's <-> ((a <-> 1) <-> 2)');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', 's <-> (2 <-> (a <-> 1))');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <2> '1'
(1 row)
SELECT to_tsquery('english', 's <-> ((1 <-> a) <-> 2)');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', 's <-> (2 <-> (1 <-> a))');
to_tsquery
-------------
'2' <-> '1'
(1 row)
SELECT to_tsquery('english', '((a <-> 1) <-> s) <-> 2');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '(s <-> (a <-> 1)) <-> 2');
to_tsquery
-------------
'1' <-> '2'
(1 row)
SELECT to_tsquery('english', '((1 <-> a) <-> s) <-> 2');
to_tsquery
-------------
'1' <3> '2'
(1 row)
SELECT to_tsquery('english', '(s <-> (1 <-> a)) <-> 2');
to_tsquery
-------------
'1' <2> '2'
(1 row)
SELECT to_tsquery('english', '2 <-> ((a <-> 1) <-> s)');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <2> '1'
(1 row)
SELECT to_tsquery('english', '2 <-> (s <-> (a <-> 1))');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <3> '1'
(1 row)
SELECT to_tsquery('english', '2 <-> ((1 <-> a) <-> s)');
to_tsquery
-------------
'2' <-> '1'
(1 row)
SELECT to_tsquery('english', '2 <-> (s <-> (1 <-> a))');
to_tsquery
-------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'2' <2> '1'
(1 row)
SELECT to_tsquery('english', 'foo <-> (a <-> (the <-> bar))');
to_tsquery
-----------------
Fix handling of phrase operator removal while removing tsquery stopwords. The distance of a removed phrase operator should propagate up to a parent phrase operator if there is one, but this only worked correctly in left-deep trees. Throwing in a few parentheses confused it completely, as indeed was illustrated by bizarre results in existing regression test cases. To fix, track unaccounted-for distances that should propagate to the left and to the right of the current node, rather than trying to make it work with only one returned distance. Also make some adjustments to behave as well as we can for cases of intermixed phrase and regular (AND/OR) operators. I don't think it's possible to be 100% correct for that without a rethinking of the tsquery representation; for example, maybe we should just not drop stopword nodes at all underneath phrase operators. But this is better than it was, and changing tsquery representation wouldn't be safely back-patchable. While at it, I simplified the API of the clean_fakeval_intree function a bit by getting rid of the "char *result" output parameter; that wasn't doing anything that wasn't redundant with whether the result node is NULL or not, and testing for NULL seems a lot clearer/safer. This is part of a larger project to fix various infelicities in the phrase-search implementation, but this part seems comittable on its own. Back-patch to 9.6 where phrase operators were introduced. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-19 19:49:45 +01:00
'foo' <3> 'bar'
(1 row)
SELECT to_tsquery('english', '((foo <-> a) <-> the) <-> bar');
to_tsquery
-----------------
'foo' <3> 'bar'
(1 row)
SELECT to_tsquery('english', 'foo <-> a <-> the <-> bar');
to_tsquery
-----------------
'foo' <3> 'bar'
(1 row)
SELECT phraseto_tsquery('english', 'PostgreSQL can be extended by the user in many ways');
phraseto_tsquery
-----------------------------------------------------------
'postgresql' <3> 'extend' <3> 'user' <2> 'mani' <-> 'way'
(1 row)
SELECT ts_rank_cd(to_tsvector('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
'), to_tsquery('english', 'paint&water'));
ts_rank_cd
------------
0.05
(1 row)
SELECT ts_rank_cd(to_tsvector('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
'), to_tsquery('english', 'breath&motion&water'));
ts_rank_cd
------------
0.00833333
(1 row)
SELECT ts_rank_cd(to_tsvector('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
'), to_tsquery('english', 'ocean'));
ts_rank_cd
------------
0.1
(1 row)
SELECT ts_rank_cd(to_tsvector('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
'), to_tsquery('english', 'painted <-> Ship'));
ts_rank_cd
------------
0.1
(1 row)
SELECT ts_rank_cd(strip(to_tsvector('both stripped')),
to_tsquery('both & stripped'));
ts_rank_cd
------------
0
(1 row)
SELECT ts_rank_cd(to_tsvector('unstripped') || strip(to_tsvector('stripped')),
to_tsquery('unstripped & stripped'));
ts_rank_cd
------------
0
(1 row)
--headline tests
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'paint&water'));
ts_headline
-----------------------------------------
<b>painted</b> Ocean. +
<b>Water</b>, <b>water</b>, every where+
And all the boards did shrink; +
<b>Water</b>, <b>water</b>, every
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'breath&motion&water'));
ts_headline
----------------------------------
<b>breath</b> nor <b>motion</b>,+
As idle as a painted Ship +
Upon a painted Ocean. +
<b>Water</b>, <b>water</b>
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean'));
ts_headline
----------------------------------
<b>Ocean</b>. +
Water, water, every where +
And all the boards did shrink;+
Water, water, every where
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', phraseto_tsquery('english', 'painted Ocean'));
ts_headline
----------------------------------
<b>painted</b> <b>Ocean</b>. +
Water, water, every where +
And all the boards did shrink;+
Water, water, every
(1 row)
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', phraseto_tsquery('english', 'idle as a painted Ship'));
ts_headline
---------------------------------------------
<b>idle</b> as a <b>painted</b> <b>Ship</b>+
Upon a <b>painted</b> Ocean. +
Water, water, every where +
And all the boards
(1 row)
SELECT ts_headline('english', '
<html>
<!-- some comment -->
<body>
Sea view wow <u>foo bar</u> <i>qq</i>
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
ff-bg
<script>
document.write(15);
</script>
</body>
</html>',
to_tsquery('english', 'sea&foo'), 'HighlightAll=true');
ts_headline
-----------------------------------------------------------------------------
+
<html> +
<!-- some comment --> +
<body> +
<b>Sea</b> view wow <u><b>foo</b> bar</u> <i>qq</i> +
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>+
ff-bg +
<script> +
document.write(15); +
</script> +
</body> +
</html>
(1 row)
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=2, MinWords=1');
ts_headline
-------------------
<b>1</b> <b>3</b>
(1 row)
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 & 3', 'MaxWords=4, MinWords=1');
ts_headline
------------------------------
<b>1</b> 2 <b>3</b> <b>1</b>
(1 row)
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=4, MinWords=1');
ts_headline
-------------------
<b>1</b> <b>3</b>
(1 row)
--Check if headline fragments work
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean'), 'MaxFragments=1');
ts_headline
------------------------------------
after day, +
We stuck, nor breath nor motion,+
As idle as a painted Ship +
Upon a painted <b>Ocean</b>. +
Water, water, every where +
And all the boards did shrink; +
Water, water, every where, +
Nor any drop
(1 row)
--Check if more than one fragments are displayed
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
ts_headline
----------------------------------------------
after day, day after day, +
We <b>stuck</b>, nor breath nor motion, +
As idle as a painted Ship +
Upon a painted Ocean. +
Water, water, every where +
And all the boards did shrink; +
Water, water, every where ... drop to drink.+
S. T. <b>Coleridge</b>
(1 row)
--Fragments when there all query words are not in the document
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
ts_headline
------------------------------------
+
Day after day, day after day, +
We stuck, nor breath nor motion,+
As idle as
(1 row)
--FragmentDelimiter option
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
ts_headline
--------------------------------------------
after day, day after day, +
We <b>stuck</b>, nor breath nor motion, +
As idle as a painted Ship +
Upon a painted Ocean. +
Water, water, every where +
And all the boards did shrink; +
Water, water, every where***drop to drink.+
S. T. <b>Coleridge</b>
(1 row)
--Rewrite sub system
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
\set ECHO none
ALTER TABLE test_tsquery ADD COLUMN keyword tsquery;
UPDATE test_tsquery SET keyword = to_tsquery('english', txtkeyword);
ALTER TABLE test_tsquery ADD COLUMN sample tsquery;
UPDATE test_tsquery SET sample = to_tsquery('english', txtsample::text);
SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new & york';
count
-------
2
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new & york';
count
-------
3
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new & york';
count
-------
1
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new & york';
count
-------
4
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new & york';
count
-------
3
(1 row)
CREATE UNIQUE INDEX bt_tsq ON test_tsquery (keyword);
SET enable_seqscan=OFF;
SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new & york';
count
-------
2
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new & york';
count
-------
3
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new & york';
count
-------
1
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new & york';
count
-------
4
(1 row)
SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new & york';
count
-------
3
(1 row)
RESET enable_seqscan;
SELECT ts_rewrite('foo & bar & qq & new & york', 'new & york'::tsquery, 'big & apple | nyc | new & york & city');
ts_rewrite
------------------------------------------------------------------------------
'foo' & 'bar' & 'qq' & ( 'city' & 'new' & 'york' | 'nyc' | 'big' & 'apple' )
(1 row)
SELECT ts_rewrite(ts_rewrite('new & !york ', 'york', '!jersey'),
'jersey', 'mexico');
ts_rewrite
--------------------
'new' & !!'mexico'
(1 row)
SELECT ts_rewrite('moscow', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite('moscow & hotel', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite('bar & new & qq & foo & york', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
---------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
(1 row)
SELECT ts_rewrite( 'moscow', 'SELECT keyword, sample FROM test_tsquery');
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( 'moscow & hotel', 'SELECT keyword, sample FROM test_tsquery');
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( 'bar & new & qq & foo & york', 'SELECT keyword, sample FROM test_tsquery');
ts_rewrite
---------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
(1 row)
SELECT ts_rewrite('1 & (2 <-> 3)', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
-------------
'2' <-> '4'
(1 row)
SELECT ts_rewrite('1 & (2 <2> 3)', 'SELECT keyword, sample FROM test_tsquery'::text );
ts_rewrite
-------------------
'1' & '2' <2> '3'
(1 row)
SELECT ts_rewrite('5 <-> (1 & (2 <-> 3))', 'SELECT keyword, sample FROM test_tsquery'::text );
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
ts_rewrite
-------------------------
'5' <-> ( '2' <-> '4' )
(1 row)
SELECT ts_rewrite('5 <-> (6 | 8)', 'SELECT keyword, sample FROM test_tsquery'::text );
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
ts_rewrite
-----------------------
'5' <-> ( '6' | '8' )
(1 row)
-- Check empty substitution
SELECT ts_rewrite(to_tsquery('5 & (6 | 5)'), to_tsquery('5'), to_tsquery(''));
NOTICE: text-search query doesn't contain lexemes: ""
ts_rewrite
------------
'6'
(1 row)
SELECT ts_rewrite(to_tsquery('!5'), to_tsquery('5'), to_tsquery(''));
NOTICE: text-search query doesn't contain lexemes: ""
ts_rewrite
------------
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword @> 'new';
keyword
----------------
'new' & 'york'
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword @> 'moscow';
keyword
----------
'moscow'
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword <@ 'new';
keyword
---------
(0 rows)
SELECT keyword FROM test_tsquery WHERE keyword <@ 'moscow';
keyword
----------
'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query;
ts_rewrite
---------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query;
ts_rewrite
---------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
(1 row)
CREATE INDEX qq ON test_tsquery USING gist (keyword tsquery_ops);
SET enable_seqscan=OFF;
SELECT keyword FROM test_tsquery WHERE keyword @> 'new';
keyword
----------------
'new' & 'york'
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword @> 'moscow';
keyword
----------
'moscow'
(1 row)
SELECT keyword FROM test_tsquery WHERE keyword <@ 'new';
keyword
---------
(0 rows)
SELECT keyword FROM test_tsquery WHERE keyword <@ 'moscow';
keyword
----------
'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query;
ts_rewrite
---------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
ts_rewrite
---------------------
'moskva' | 'moscow'
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow & hotel') AS query;
ts_rewrite
-----------------------------------
'hotel' & ( 'moskva' | 'moscow' )
(1 row)
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query;
ts_rewrite
---------------------------------------------------------------------------------
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
(1 row)
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
SELECT ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
ts_rewrite
-----------------------------------------
( 'bar' | 'baz' ) <-> ( 'bar' | 'baz' )
(1 row)
SELECT to_tsvector('foo bar') @@
ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
?column?
----------
f
(1 row)
SELECT to_tsvector('bar baz') @@
ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
?column?
----------
t
(1 row)
RESET enable_seqscan;
--test GUC
SET default_text_search_config=simple;
SELECT to_tsvector('SKIES My booKs');
to_tsvector
----------------------------
'books':3 'my':2 'skies':1
(1 row)
SELECT plainto_tsquery('SKIES My booKs');
plainto_tsquery
--------------------------
'skies' & 'my' & 'books'
(1 row)
SELECT to_tsquery('SKIES & My | booKs');
to_tsquery
--------------------------
'skies' & 'my' | 'books'
(1 row)
SET default_text_search_config=english;
SELECT to_tsvector('SKIES My booKs');
to_tsvector
------------------
'book':3 'sky':1
(1 row)
SELECT plainto_tsquery('SKIES My booKs');
plainto_tsquery
-----------------
'sky' & 'book'
(1 row)
SELECT to_tsquery('SKIES & My | booKs');
to_tsquery
----------------
'sky' | 'book'
(1 row)
--trigger
CREATE TRIGGER tsvectorupdate
BEFORE UPDATE OR INSERT ON test_tsvector
FOR EACH ROW EXECUTE PROCEDURE tsvector_update_trigger(a, 'pg_catalog.english', t);
SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty');
count
-------
0
(1 row)
INSERT INTO test_tsvector (t) VALUES ('345 qwerty');
SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty');
count
-------
1
(1 row)
UPDATE test_tsvector SET t = null WHERE t = '345 qwerty';
SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty');
count
-------
0
(1 row)
INSERT INTO test_tsvector (t) VALUES ('345 qwerty');
SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty');
count
-------
1
(1 row)
-- test finding items in GIN's pending list
create temp table pendtest (ts tsvector);
create index pendtest_idx on pendtest using gin(ts);
insert into pendtest values (to_tsvector('Lore ipsam'));
insert into pendtest values (to_tsvector('Lore ipsum'));
select * from pendtest where 'ipsu:*'::tsquery @@ ts;
ts
--------------------
'ipsum':2 'lore':1
(1 row)
select * from pendtest where 'ipsa:*'::tsquery @@ ts;
ts
--------------------
'ipsam':2 'lore':1
(1 row)
select * from pendtest where 'ips:*'::tsquery @@ ts;
ts
--------------------
'ipsam':2 'lore':1
'ipsum':2 'lore':1
(2 rows)
select * from pendtest where 'ipt:*'::tsquery @@ ts;
ts
----
(0 rows)
select * from pendtest where 'ipi:*'::tsquery @@ ts;
ts
----
(0 rows)
--check OP_PHRASE on index
create temp table phrase_index_test(fts tsvector);
insert into phrase_index_test values ('A fat cat has just eaten a rat.');
insert into phrase_index_test values (to_tsvector('english', 'A fat cat has just eaten a rat.'));
create index phrase_index_test_idx on phrase_index_test using gin(fts);
set enable_seqscan = off;
select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat cat');
fts
-----------------------------------
'cat':3 'eaten':6 'fat':2 'rat':8
(1 row)
set enable_seqscan = on;