From bb8f629c7aa31e3d3197b9cebf80f0986a6fbf47 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Wed, 29 Aug 2007 20:37:14 +0000 Subject: [PATCH] Move full text search operators, functions, and data type sections into the main documentation, out of its own text search chapter. --- doc/src/sgml/datatype.sgml | 145 +++- doc/src/sgml/func.sgml | 916 ++++++++++++++++++++++++- doc/src/sgml/textsearch.sgml | 1222 +++------------------------------- 3 files changed, 1147 insertions(+), 1136 deletions(-) diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index f1d882a11e..4c8c0c56ca 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -1,4 +1,4 @@ - + Data Types @@ -234,6 +234,18 @@ date and time, including time zone + + tsquery + + full text search query + + + + tsvector + + full text search document + + uuid @@ -3264,6 +3276,137 @@ a0eebc999c0b4ef8bb6d6bb9bd380a11 + + Full Text Search + + + + + tsvector + + + + tsvector + + + + tsvector is a data type that represents a document and is + optimized for full text searching. In the simplest case, + tsvector is a sorted list of lexemes, so even without indexes + full text searches perform better than standard ~ and + LIKE operations: + + +SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector; + tsvector +---------------------------------------------------- + 'a' 'on' 'and' 'ate' 'cat' 'fat' 'mat' 'rat' 'sat' + + + Notice, that space is also a lexeme: + + +SELECT 'space '' '' is a lexeme'::tsvector; + tsvector +---------------------------------- + 'a' 'is' ' ' 'space' 'lexeme' + + + Each lexeme, optionally, can have positional information which is used for + proximity ranking: + + +SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector; + tsvector +------------------------------------------------------------------------------- + 'a':1,6,10 'on':5 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4 + + + Each lexeme position also can be labeled as A, + B, C, D, + where D is the default. These labels can be used to group + lexemes into different importance or + rankings, for example to reflect document structure. + Actual values can be assigned at search time and used during the calculation + of the document rank. This is very useful for controlling search results. + + + + The concatenation operator, e.g. tsvector || tsvector, + can "construct" a document from several parts. The order is important if + tsvector contains positional information. Of course, + it is also possible to build a document using different tables: + + +SELECT 'fat:1 cat:2'::tsvector || 'fat:1 rat:2'::tsvector; + ?column? +--------------------------- + 'cat':2 'fat':1,3 'rat':4 + +SELECT 'fat:1 rat:2'::tsvector || 'fat:1 cat:2'::tsvector; + ?column? +--------------------------- + 'cat':4 'fat':1,3 'rat':2 + + + + + + + + + + tsquery + + + + tsquery + + + + tsquery is a data type for textual queries which supports + the boolean operators & (AND), | (OR), + and parentheses. A tsquery consists of lexemes + (optionally labeled by letters) with boolean operators in between: + + +SELECT 'fat & cat'::tsquery; + tsquery +--------------- + 'fat' & 'cat' +SELECT 'fat:ab & cat'::tsquery; + tsquery +------------------ + 'fat':AB & 'cat' + + + Labels can be used to restrict the search region, which allows the + development of different search engines using the same full text index. + + + + tsqueries can be concatenated using && (AND) + and || (OR) operators: + + +SELECT 'a & b'::tsquery && 'c | d'::tsquery; + ?column? +--------------------------- + 'a' & 'b' & ( 'c' | 'd' ) + +SELECT 'a & b'::tsquery || 'c|d'::tsquery; + ?column? +--------------------------- + 'a' & 'b' | ( 'c' | 'd' ) + + + + + + + + + <acronym>XML</> Type diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index bc9abc689c..9ec780b4c9 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1,4 +1,4 @@ - + Functions and Operators @@ -7551,6 +7551,920 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple + + Full Text Search Operators and Functions + + + This section outlines all the functions and operators that are available + for full text searching. + + + + Full text search vectors and queries both use lexemes, but for different + purposes. A tsvector represents the lexemes (tokens) parsed + out of a document, with an optional position. A tsquery + specifies a boolean condition using lexemes. + + + + All of the following functions that accept a configuration argument can + use a textual configuration name to select a configuration. If the option + is omitted the configuration specified by + default_text_search_config is used. For more information on + configuration, see . + + + + Search + + The operator @@ is used to perform full text + searches: + + + + + + + + TSVECTOR @@ TSQUERY + + + + + + TSVECTOR @@ TSQUERY + TSQUERY @@ TSVECTOR + + + + + + Returns true if TSQUERY is contained + in TSVECTOR, and false if not: + + +SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'cat & rat'::tsquery; + ?column? +---------- + t + +SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'fat & cow'::tsquery; + ?column? +---------- + f + + + + + + + + + + TEXT @@ TSQUERY + + + + + text @@ tsquery + + + + + + Returns true if TSQUERY is contained + in TEXT, and false if not: + + +SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & rat'::tsquery; + ?column? +---------- + t + +SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & cow'::tsquery; + ?column? +---------- + f + + + + + + + + + TEXT @@ TEXT + + + + + + text @@ text + + + + + + Returns true if the right + argument (the query) is contained in the left argument, and + false otherwise: + + +SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat rat'; + ?column? +---------- + t + +SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat cow'; + ?column? +---------- + f + + + + + + + + + + For index support of full text operators consult . + + + + + + tsvector + + + + + + + to_tsvector + + + + + to_tsvector(config_name, document TEXT) returns TSVECTOR + + + + + + Parses a document into tokens, reduces the tokens to lexemes, and returns a + tsvector which lists the lexemes together with their positions in the document + in lexicographic order. + + + + + + + + + strip + + + + + strip(vector TSVECTOR) returns TSVECTOR + + + + + + Returns a vector which lists the same lexemes as the given vector, but + which lacks any information about where in the document each lexeme + appeared. While the returned vector is useless for relevance ranking it + will usually be much smaller. + + + + + + + + + setweight + + + + + setweight(vector TSVECTOR, letter) returns TSVECTOR + + + + + + This function returns a copy of the input vector in which every location + has been labeled with either the letter A, + B, or C, or the default label + D (which is the default for new vectors + and as such is usually not displayed). These labels are retained + when vectors are concatenated, allowing words from different parts of a + document to be weighted differently by ranking functions. + + + + + + + + tsvector concatenation + + + + + vector1 || vector2 + tsvector_concat(vector1 TSVECTOR, vector2 TSVECTOR) returns TSVECTOR + + + + + + Returns a vector which combines the lexemes and positional information of + the two vectors given as arguments. Positional weight labels (described + in the previous paragraph) are retained during the concatenation. This + has at least two uses. First, if some sections of your document need to be + parsed with different configurations than others, you can parse them + separately and then concatenate the resulting vectors. Second, you can + weigh words from one section of your document differently than the others + by parsing the sections into separate vectors and assigning each vector + a different position label with the setweight() + function. You can then concatenate them into a single vector and provide + a weights argument to the ts_rank() function that assigns + different weights to positions with different labels. + + + + + + + + length(tsvector) + + + + + length(vector TSVECTOR) returns INT4 + + + + + + Returns the number of lexemes stored in the vector. + + + + + + + + text::tsvector + + + + + text::TSVECTOR returns TSVECTOR + + + + + + Directly casting text to a tsvector allows you + to directly inject lexemes into a vector with whatever positions and + positional weights you choose to specify. The text should be formatted to + match the way a vector is displayed by SELECT. + + + + + + + + + trigger + for updating a derived tsvector column + + + + + tsvector_update_trigger(tsvector_column_name, config_name, text_column_name , ... ) + tsvector_update_trigger_column(tsvector_column_name, config_column_name, text_column_name , ... ) + + + + + + Two built-in trigger functions are available to automatically update a + tsvector column from one or more textual columns. An example + of their use is: + + +CREATE TABLE tblMessages ( + strMessage text, + tsv tsvector +); + +CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE +ON tblMessages FOR EACH ROW EXECUTE PROCEDURE +tsvector_update_trigger(tsv, 'pg_catalog.english', strMessage); + + + Having created this trigger, any change in strMessage + will be automatically reflected into tsv. + + + + Both triggers require you to specify the text search configuration to + be used to perform the conversion. For + tsvector_update_trigger, the configuration name is simply + given as the second trigger argument. It must be schema-qualified as + shown above, so that the trigger behavior will not change with changes + in search_path. For + tsvector_update_trigger_column, the second trigger argument + is the name of another table column, which must be of type + regconfig. This allows a per-row selection of configuration + to be made. + + + + + + + + ts_stat + + + + + ts_stat(sqlquery text , weights text ) returns SETOF statinfo + + + + + + Here statinfo is a type, defined as: + + +CREATE TYPE statinfo AS (word text, ndoc integer, nentry integer); + + + and sqlquery is a text value containing a SQL query + which returns a single tsvector column. ts_stat + executes the query and returns statistics about the resulting + tsvector data, i.e., the number of documents, ndoc, + and the total number of words in the collection, nentry. It is + useful for checking your configuration and to find stop word candidates. For + example, to find the ten most frequent words: + + +SELECT * FROM ts_stat('SELECT vector from apod') +ORDER BY ndoc DESC, nentry DESC, word +LIMIT 10; + + + Optionally, one can specify weights to obtain + statistics about words with a specific weight: + + +SELECT * FROM ts_stat('SELECT vector FROM apod','a') +ORDER BY ndoc DESC, nentry DESC, word +LIMIT 10; + + + + + + + + + + Btree operations for tsvector + + + + + TSVECTOR < TSVECTOR + TSVECTOR <= TSVECTOR + TSVECTOR = TSVECTOR + TSVECTOR >= TSVECTOR + TSVECTOR > TSVECTOR + + + + + + All btree operations are defined for the tsvector type. + tsvectors are compared with each other using + lexicographical ordering. + + + + + + + + + + + tsquery + + + + + + + + to_tsquery + + + + + to_tsquery(config_name, querytext text) returns TSQUERY + + + + + + Accepts querytext, which should consist of single tokens + separated by the boolean operators & (and), | + (or) and ! (not), which can be grouped using parentheses. + In other words, to_tsquery expects already parsed text. + Each token is reduced to a lexeme using the specified or current configuration. + A weight class can be assigned to each lexeme entry to restrict the search region + (see setweight for an explanation). For example: + + +'fat:a & rats' + + + The to_tsquery function can also accept a text + string. In this case querytext should + be quoted. This may be useful, for example, to use with a thesaurus + dictionary. In the example below, a thesaurus contains rule supernovae + stars : sn: + + +SELECT to_tsquery('''supernovae stars'' & !crab'); + to_tsquery +--------------- + 'sn' & !'crab' + + + Without quotes to_tsquery will generate a syntax error. + + + + + + + + + + + plainto_tsquery + + + + + plainto_tsquery(config_name, querytext text) returns TSQUERY + + + + + + Transforms unformatted text querytext to tsquery. + It is the same as to_tsquery but accepts text + without quotes and will call the parser to break it into tokens. + plainto_tsquery assumes the & boolean + operator between words and does not recognize weight classes. + + + + + + + + + + querytree + + + + + querytree(query TSQUERY) returns TEXT + + + + + + This returns the query used for searching an index. It can be used to test + for an empty query. The SELECT below returns NULL, + which corresponds to an empty query since GIN indexes do not support queries with negation + + (a full index scan is inefficient): + + +SELECT querytree(to_tsquery('!defined')); + querytree +----------- + + + + + + + + + + text::tsquery casting + + + + + text::TSQUERY returns TSQUERY + + + + + + Directly casting text to a tsquery + allows you to directly inject lexemes into a query using whatever positions + and positional weight flags you choose to specify. The text should be + formatted to match the way a vector is displayed by + SELECT. + + + + + + + + + numnode + + + + + numnode(query TSQUERY) returns INTEGER + + + + + + This returns the number of nodes in a query tree. This function can be + used to determine if query is meaningful + (returns > 0), or contains only stop words (returns 0): + + +SELECT numnode(plainto_tsquery('the any')); +NOTICE: query contains only stopword(s) or does not contain lexeme(s), ignored + numnode +--------- + 0 + +SELECT numnode(plainto_tsquery('the table')); + numnode +--------- + 1 + +SELECT numnode(plainto_tsquery('long table')); + numnode +--------- + 3 + + + + + + + + + TSQUERY && TSQUERY + + + + + TSQUERY && TSQUERY returns TSQUERY + + + + + + Returns AND-ed TSQUERY + + + + + + + + TSQUERY || TSQUERY + + + + + TSQUERY || TSQUERY returns TSQUERY + + + + + + Returns OR-ed TSQUERY + + + + + + + + !! TSQUERY + + + + + !! TSQUERY returns TSQUERY + + + + + + negation of TSQUERY + + + + + + + + Btree operations for tsquery + + + + + TSQUERY < TSQUERY + TSQUERY <= TSQUERY + TSQUERY = TSQUERY + TSQUERY >= TSQUERY + TSQUERY > TSQUERY + + + + + + All btree operations are defined for the tsquery type. + tsqueries are compared to each other using lexicographical + ordering. + + + + + + + + Query Rewriting + + + Query rewriting is a set of functions and operators for the + tsquery data type. It allows control at search + query time without reindexing (the opposite of the + thesaurus). For example, you can expand the search using synonyms + (new york, big apple, nyc, + gotham) or narrow the search to direct the user to some hot + topic. + + + + The ts_rewrite() function changes the original query by + replacing part of the query with some other string of type tsquery, + as defined by the rewrite rule. Arguments to ts_rewrite() + can be names of columns of type tsquery. + + + +CREATE TABLE aliases (t TSQUERY PRIMARY KEY, s TSQUERY); +INSERT INTO aliases VALUES('a', 'c'); + + + + + + + + ts_rewrite + + + + + ts_rewrite (query TSQUERY, target TSQUERY, sample TSQUERY) returns TSQUERY + + + + + + +SELECT ts_rewrite('a & b'::tsquery, 'a'::tsquery, 'c'::tsquery); + ts_rewrite +------------ + 'b' & 'c' + + + + + + + + + + ts_rewrite(ARRAY[query TSQUERY, target TSQUERY, sample TSQUERY]) returns TSQUERY + + + + + + +SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases; + ts_rewrite +------------ + 'b' & 'c' + + + + + + + + + + ts_rewrite (query TSQUERY,'SELECT target ,sample FROM test'::text) returns TSQUERY + + + + + + +SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases'); + ts_rewrite +------------ + 'b' & 'c' + + + + + + + + + What if there are several instances of rewriting? For example, query + 'a & b' can be rewritten as + 'b & c' and 'cc'. + + +SELECT * FROM aliases; + t | s +-----------+------ + 'a' | 'c' + 'x' | 'z' + 'a' & 'b' | 'cc' + + + This ambiguity can be resolved by specifying a sort order: + + +SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t DESC'); + ts_rewrite + --------- + 'cc' + +SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t ASC'); + ts_rewrite +-------------- + 'b' & 'c' + + + + + Let's consider a real-life astronomical example. We'll expand query + supernovae using table-driven rewriting rules: + + +CREATE TABLE aliases (t tsquery primary key, s tsquery); +INSERT INTO aliases VALUES(to_tsquery('supernovae'), to_tsquery('supernovae|sn')); + +SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); + ?column? +------------------------------- +( 'supernova' | 'sn' ) & 'crab' + + + Notice, that we can change the rewriting rule online: + + +UPDATE aliases SET s=to_tsquery('supernovae|sn & !nebulae') WHERE t=to_tsquery('supernovae'); +SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); + ?column? +----------------------------------------------- + 'supernova' | 'sn' & !'nebula' ) & 'crab' + + + + + + Operators For tsquery + + + Rewriting can be slow for many rewriting rules since it checks every rule + for a possible hit. To filter out obvious non-candidate rules there are containment + operators for the tsquery type. In the example below, we select only those + rules which might contain the original query: + + +SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) +FROM aliases +WHERE 'a & b' @> t; + ts_rewrite +------------ + 'b' & 'c' + + + + + + Two operators are defined for tsquery: + + + + + + + + TSQUERY @> TSQUERY + + + + + TSQUERY @> TSQUERY + + + + + + Returns true if the right argument might be contained in left argument. + + + + + + + + tsquery <@ tsquery + + + + + TSQUERY <@ TSQUERY + + + + + + Returns true if the left argument might be contained in right argument. + + + + + + + + + + + Index For tsquery + + + To speed up operators <@ and @> for + tsquery one can use a GiST index with + a tsquery_ops opclass: + + +CREATE INDEX t_idx ON aliases USING gist (t tsquery_ops); + + + + + + + + + + XML Functions diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index afa4415d93..87e24a019d 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -4,7 +4,7 @@ - Introduction + Introduction Full Text Searching (or just text search) allows @@ -85,12 +85,12 @@ - Store preprocessed documents - optimized for searching. For example, represent each document as a sorted array + Store preprocessed documents optimized for + searching. For example, represent each document as a sorted array of lexemes. Along with lexemes it is desirable to store positional - information to use for proximity ranking, so that a - document which contains a more "dense" region of query words is assigned - a higher rank than one with scattered query words. + information to use for proximity ranking, so that + a document which contains a more "dense" region of query words is + assigned a higher rank than one with scattered query words. @@ -135,7 +135,7 @@ - A data type (), tsvector + A data type (), tsvector is provided, for storing preprocessed documents, along with a type tsquery for representing textual queries. Also, a full text search operator @@ is defined @@ -146,11 +146,11 @@ - What Is a <firstterm>Document</firstterm>? + What Is a <firstterm>Document</firstterm>? - - document - + + document + A document can be a simple text file stored in the file system. The full @@ -189,139 +189,8 @@ WHERE mid = did AND mid = 12; - - Data Types - - - - - tsvector - - - - tsvector - - - - tsvector is a data type that represents a document and is - optimized for full text searching. In the simplest case, - tsvector is a sorted list of lexemes, so even without indexes - full text searches perform better than standard ~ and - LIKE operations: - - -SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector; - tsvector ----------------------------------------------------- - 'a' 'on' 'and' 'ate' 'cat' 'fat' 'mat' 'rat' 'sat' - - - Notice, that space is also a lexeme: - - -SELECT 'space '' '' is a lexeme'::tsvector; - tsvector ----------------------------------- - 'a' 'is' ' ' 'space' 'lexeme' - - - Each lexeme, optionally, can have positional information which is used for - proximity ranking: - - -SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector; - tsvector -------------------------------------------------------------------------------- - 'a':1,6,10 'on':5 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4 - - - Each lexeme position also can be labeled as A, - B, C, D, - where D is the default. These labels can be used to group - lexemes into different importance or - rankings, for example to reflect document structure. - Actual values can be assigned at search time and used during the calculation - of the document rank. This is very useful for controlling search results. - - - - The concatenation operator, e.g. tsvector || tsvector, - can "construct" a document from several parts. The order is important if - tsvector contains positional information. Of course, - it is also possible to build a document using different tables: - - -SELECT 'fat:1 cat:2'::tsvector || 'fat:1 rat:2'::tsvector; - ?column? ---------------------------- - 'cat':2 'fat':1,3 'rat':4 - -SELECT 'fat:1 rat:2'::tsvector || 'fat:1 cat:2'::tsvector; - ?column? ---------------------------- - 'cat':4 'fat':1,3 'rat':2 - - - - - - - - - - tsquery - - - - tsquery - - - - tsquery is a data type for textual queries which supports - the boolean operators & (AND), | (OR), - and parentheses. A tsquery consists of lexemes - (optionally labeled by letters) with boolean operators in between: - - -SELECT 'fat & cat'::tsquery; - tsquery ---------------- - 'fat' & 'cat' -SELECT 'fat:ab & cat'::tsquery; - tsquery ------------------- - 'fat':AB & 'cat' - - - Labels can be used to restrict the search region, which allows the - development of different search engines using the same full text index. - - - - tsqueries can be concatenated using && (AND) - and || (OR) operators: - - -SELECT 'a & b'::tsquery && 'c | d'::tsquery; - ?column? ---------------------------- - 'a' & 'b' & ( 'c' | 'd' ) - -SELECT 'a & b'::tsquery || 'c|d'::tsquery; - ?column? ---------------------------- - 'a' & 'b' | ( 'c' | 'd' ) - - - - - - - - - - Performing Searches + Performing Searches Full text searching in PostgreSQL is based on @@ -362,10 +231,12 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t is equivalent to to_tsvector(x) @@ y. The form text @@ text is equivalent to to_tsvector(x) @@ plainto_tsquery(y). + contains a full list of full text + search operators and functions. - Configurations + Configurations configurations @@ -389,7 +260,7 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t - Tables and Indexes + Tables and Indexes The previous section described how to perform full text searches using @@ -398,7 +269,7 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t - Searching a Table + Searching a Table It is possible to do full text table search with no index. A simple query @@ -435,80 +306,80 @@ ORDER BY dlm DESC LIMIT 10; - Creating Indexes + Creating Indexes - - We can create a GIN () index to speed up the search: + + We can create a GIN () index to speed up the search: CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', body)); Notice that the 2-argument version of to_tsvector is - used. Only text search functions which specify a configuration name can - be used in expression indexes (). - This is because the index contents must be unaffected by . If they were affected, the - index contents might be inconsistent because different entries could - contain tsvectors that were created with different text search - configurations, and there would be no way to guess which was which. It - would be impossible to dump and restore such an index correctly. - + used. Only text search functions which specify a configuration name can + be used in expression indexes (). + This is because the index contents must be unaffected by . If they were affected, the + index contents might be inconsistent because different entries could + contain tsvectors that were created with different text search + configurations, and there would be no way to guess which was which. It + would be impossible to dump and restore such an index correctly. + + + + Because the two-argument version of to_tsvector was + used in the index above, only a query reference that uses the 2-argument + version of to_tsvector with the same configuration + name will use that index, i.e. WHERE 'a & b' @@ + to_svector('english', body) will use the index, but WHERE + 'a & b' @@ to_svector(body)) and WHERE 'a & b' @@ + body::tsvector will not. This guarantees that an index will be used + only with the same configuration used to create the index rows. + - Because the two-argument version of to_tsvector was - used in the index above, only a query reference that uses the 2-argument - version of to_tsvector with the same configuration - name will use that index, i.e. WHERE 'a & b' @@ - to_svector('english', body) will use the index, but WHERE - 'a & b' @@ to_svector(body)) and WHERE 'a & b' @@ - body::tsvector will not. This guarantees that an index will be used - only with the same configuration used to create the index rows. - - - - It is possible to setup more complex expression indexes where the - configuration name is specified by another column, e.g.: + It is possible to setup more complex expression indexes where the + configuration name is specified by another column, e.g.: CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector(config_name, body)); - where config_name is a column in the pgweb - table. This allows mixed configurations in the same index while - recording which configuration was used for each index row. - + where config_name is a column in the pgweb + table. This allows mixed configurations in the same index while + recording which configuration was used for each index row. + - - Indexes can even concatenate columns: + + Indexes can even concatenate columns: CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', title || body)); - + - - A more complex case is to create a separate tsvector column - to hold the output of to_tsvector(). This example is a - concatenation of title and body, - with ranking information. We assign different labels to them to encode - information about the origin of each word: + + A more complex case is to create a separate tsvector column + to hold the output of to_tsvector(). This example is a + concatenation of title and body, + with ranking information. We assign different labels to them to encode + information about the origin of each word: ALTER TABLE pgweb ADD COLUMN textsearch_index tsvector; UPDATE pgweb SET textsearch_index = - setweight(to_tsvector('english', coalesce(title,'')), 'A') || ' ' || - setweight(to_tsvector('english', coalesce(body,'')),'D'); + setweight(to_tsvector('english', coalesce(title,'')), 'A') || ' ' || + setweight(to_tsvector('english', coalesce(body,'')),'D'); - Then we create a GIN index to speed up the search: + Then we create a GIN index to speed up the search: CREATE INDEX textsearch_idx ON pgweb USING gin(textsearch_index); - After vacuuming, we are ready to perform a fast full text search: + After vacuuming, we are ready to perform a fast full text search: SELECT ts_rank_cd(textsearch_index, q) AS rank, title @@ -517,935 +388,20 @@ WHERE q @@ textsearch_index ORDER BY rank DESC LIMIT 10; - It is necessary to create a trigger to keep the new tsvector - column current anytime title or body changes. - Keep in mind that, just like with expression indexes, it is important to - specify the configuration name when creating text search data types - inside triggers so the column's contents are not affected by changes to - default_text_search_config. - - - - - - - - Operators and Functions - - - This section outlines all the functions and operators that are available - for full text searching. - - - - Full text search vectors and queries both use lexemes, but for different - purposes. A tsvector represents the lexemes (tokens) parsed - out of a document, with an optional position. A tsquery - specifies a boolean condition using lexemes. - - - - All of the following functions that accept a configuration argument can - use a textual configuration name to select a configuration. If the option - is omitted the configuration specified by - default_text_search_config is used. For more information on - configuration, see . - - - - Search - - The operator @@ is used to perform full text - searches: + It is necessary to create a trigger to keep the new tsvector + column current anytime title or body changes. + Keep in mind that, just like with expression indexes, it is important to + specify the configuration name when creating text search data types + inside triggers so the column's contents are not affected by changes to + default_text_search_config. - - - - - - TSVECTOR @@ TSQUERY - - - - - - TSVECTOR @@ TSQUERY - TSQUERY @@ TSVECTOR - - - - - - Returns true if TSQUERY is contained - in TSVECTOR, and false if not: - - -SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'cat & rat'::tsquery; - ?column? ----------- - t - -SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'fat & cow'::tsquery; - ?column? ----------- - f - - - - - - - - - - TEXT @@ TSQUERY - - - - - text @@ tsquery - - - - - - Returns true if TSQUERY is contained - in TEXT, and false if not: - - -SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & rat'::tsquery; - ?column? ----------- - t - -SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & cow'::tsquery; - ?column? ----------- - f - - - - - - - - - TEXT @@ TEXT - - - - - - text @@ text - - - - - - Returns true if the right - argument (the query) is contained in the left argument, and - false otherwise: - - -SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat rat'; - ?column? ----------- - t - -SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat cow'; - ?column? ----------- - f - - - - - - - - - - For index support of full text operators consult . - - - - - - - - tsvector - - - - - - - to_tsvector - - - - - to_tsvector(config_name, document TEXT) returns TSVECTOR - - - - - - Parses a document into tokens, reduces the tokens to lexemes, and returns a - tsvector which lists the lexemes together with their positions in the document - in lexicographic order. - - - - - - - - - strip - - - - - strip(vector TSVECTOR) returns TSVECTOR - - - - - - Returns a vector which lists the same lexemes as the given vector, but - which lacks any information about where in the document each lexeme - appeared. While the returned vector is useless for relevance ranking it - will usually be much smaller. - - - - - - - - - setweight - - - - - setweight(vector TSVECTOR, letter) returns TSVECTOR - - - - - - This function returns a copy of the input vector in which every location - has been labeled with either the letter A, - B, or C, or the default label - D (which is the default for new vectors - and as such is usually not displayed). These labels are retained - when vectors are concatenated, allowing words from different parts of a - document to be weighted differently by ranking functions. - - - - - - - - tsvector concatenation - - - - - vector1 || vector2 - tsvector_concat(vector1 TSVECTOR, vector2 TSVECTOR) returns TSVECTOR - - - - - - Returns a vector which combines the lexemes and positional information of - the two vectors given as arguments. Positional weight labels (described - in the previous paragraph) are retained during the concatenation. This - has at least two uses. First, if some sections of your document need to be - parsed with different configurations than others, you can parse them - separately and then concatenate the resulting vectors. Second, you can - weigh words from one section of your document differently than the others - by parsing the sections into separate vectors and assigning each vector - a different position label with the setweight() - function. You can then concatenate them into a single vector and provide - a weights argument to the ts_rank() function that assigns - different weights to positions with different labels. - - - - - - - - length(tsvector) - - - - - length(vector TSVECTOR) returns INT4 - - - - - - Returns the number of lexemes stored in the vector. - - - - - - - - text::tsvector - - - - - text::TSVECTOR returns TSVECTOR - - - - - - Directly casting text to a tsvector allows you - to directly inject lexemes into a vector with whatever positions and - positional weights you choose to specify. The text should be formatted to - match the way a vector is displayed by SELECT. - - - - - - - - - trigger - for updating a derived tsvector column - - - - - tsvector_update_trigger(tsvector_column_name, config_name, text_column_name , ... ) - tsvector_update_trigger_column(tsvector_column_name, config_column_name, text_column_name , ... ) - - - - - - Two built-in trigger functions are available to automatically update a - tsvector column from one or more textual columns. An example - of their use is: - - -CREATE TABLE tblMessages ( - strMessage text, - tsv tsvector -); - -CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE -ON tblMessages FOR EACH ROW EXECUTE PROCEDURE -tsvector_update_trigger(tsv, 'pg_catalog.english', strMessage); - - - Having created this trigger, any change in strMessage - will be automatically reflected into tsv. - - - - Both triggers require you to specify the text search configuration to - be used to perform the conversion. For - tsvector_update_trigger, the configuration name is simply - given as the second trigger argument. It must be schema-qualified as - shown above, so that the trigger behavior will not change with changes - in search_path. For - tsvector_update_trigger_column, the second trigger argument - is the name of another table column, which must be of type - regconfig. This allows a per-row selection of configuration - to be made. - - - - - - - - ts_stat - - - - - ts_stat(sqlquery text , weights text ) returns SETOF statinfo - - - - - - Here statinfo is a type, defined as: - - -CREATE TYPE statinfo AS (word text, ndoc integer, nentry integer); - - - and sqlquery is a text value containing a SQL query - which returns a single tsvector column. ts_stat - executes the query and returns statistics about the resulting - tsvector data, i.e., the number of documents, ndoc, - and the total number of words in the collection, nentry. It is - useful for checking your configuration and to find stop word candidates. For - example, to find the ten most frequent words: - - -SELECT * FROM ts_stat('SELECT vector from apod') -ORDER BY ndoc DESC, nentry DESC, word -LIMIT 10; - - - Optionally, one can specify weights to obtain - statistics about words with a specific weight: - - -SELECT * FROM ts_stat('SELECT vector FROM apod','a') -ORDER BY ndoc DESC, nentry DESC, word -LIMIT 10; - - - - - - - - - - Btree operations for tsvector - - - - - TSVECTOR < TSVECTOR - TSVECTOR <= TSVECTOR - TSVECTOR = TSVECTOR - TSVECTOR >= TSVECTOR - TSVECTOR > TSVECTOR - - - - - - All btree operations are defined for the tsvector type. - tsvectors are compared with each other using - lexicographical ordering. - - - - - - - - - - - tsquery - - - - - - - - to_tsquery - - - - - to_tsquery(config_name, querytext text) returns TSQUERY - - - - - - Accepts querytext, which should consist of single tokens - separated by the boolean operators & (and), | - (or) and ! (not), which can be grouped using parentheses. - In other words, to_tsquery expects already parsed text. - Each token is reduced to a lexeme using the specified or current configuration. - A weight class can be assigned to each lexeme entry to restrict the search region - (see setweight for an explanation). For example: - - -'fat:a & rats' - - - The to_tsquery function can also accept a text - string. In this case querytext should - be quoted. This may be useful, for example, to use with a thesaurus - dictionary. In the example below, a thesaurus contains rule supernovae - stars : sn: - - -SELECT to_tsquery('''supernovae stars'' & !crab'); - to_tsquery ---------------- - 'sn' & !'crab' - - - Without quotes to_tsquery will generate a syntax error. - - - - - - - - - - - plainto_tsquery - - - - - plainto_tsquery(config_name, querytext text) returns TSQUERY - - - - - - Transforms unformatted text querytext to tsquery. - It is the same as to_tsquery but accepts text - without quotes and will call the parser to break it into tokens. - plainto_tsquery assumes the & boolean - operator between words and does not recognize weight classes. - - - - - - - - - - querytree - - - - - querytree(query TSQUERY) returns TEXT - - - - - - This returns the query used for searching an index. It can be used to test - for an empty query. The SELECT below returns NULL, - which corresponds to an empty query since GIN indexes do not support queries with negation - - (a full index scan is inefficient): - - -SELECT querytree(to_tsquery('!defined')); - querytree ------------ - - - - - - - - - - text::tsquery casting - - - - - text::TSQUERY returns TSQUERY - - - - - - Directly casting text to a tsquery - allows you to directly inject lexemes into a query using whatever positions - and positional weight flags you choose to specify. The text should be - formatted to match the way a vector is displayed by - SELECT. - - - - - - - - - numnode - - - - - numnode(query TSQUERY) returns INTEGER - - - - - - This returns the number of nodes in a query tree. This function can be - used to determine if query is meaningful - (returns > 0), or contains only stop words (returns 0): - - -SELECT numnode(plainto_tsquery('the any')); -NOTICE: query contains only stopword(s) or does not contain lexeme(s), ignored - numnode ---------- - 0 - -SELECT numnode(plainto_tsquery('the table')); - numnode ---------- - 1 - -SELECT numnode(plainto_tsquery('long table')); - numnode ---------- - 3 - - - - - - - - - TSQUERY && TSQUERY - - - - - TSQUERY && TSQUERY returns TSQUERY - - - - - - Returns AND-ed TSQUERY - - - - - - - - TSQUERY || TSQUERY - - - - - TSQUERY || TSQUERY returns TSQUERY - - - - - - Returns OR-ed TSQUERY - - - - - - - - !! TSQUERY - - - - - !! TSQUERY returns TSQUERY - - - - - - negation of TSQUERY - - - - - - - - Btree operations for tsquery - - - - - TSQUERY < TSQUERY - TSQUERY <= TSQUERY - TSQUERY = TSQUERY - TSQUERY >= TSQUERY - TSQUERY > TSQUERY - - - - - - All btree operations are defined for the tsquery type. - tsqueries are compared to each other using lexicographical - ordering. - - - - - - - - Query Rewriting - - - Query rewriting is a set of functions and operators for the - tsquery data type. It allows control at search - query time without reindexing (the opposite of the - thesaurus). For example, you can expand the search using synonyms - (new york, big apple, nyc, - gotham) or narrow the search to direct the user to some hot - topic. - - - - The ts_rewrite() function changes the original query by - replacing part of the query with some other string of type tsquery, - as defined by the rewrite rule. Arguments to ts_rewrite() - can be names of columns of type tsquery. - - - -CREATE TABLE aliases (t TSQUERY PRIMARY KEY, s TSQUERY); -INSERT INTO aliases VALUES('a', 'c'); - - - - - - - - ts_rewrite - - - - - ts_rewrite (query TSQUERY, target TSQUERY, sample TSQUERY) returns TSQUERY - - - - - - -SELECT ts_rewrite('a & b'::tsquery, 'a'::tsquery, 'c'::tsquery); - ts_rewrite ------------- - 'b' & 'c' - - - - - - - - - - ts_rewrite(ARRAY[query TSQUERY, target TSQUERY, sample TSQUERY]) returns TSQUERY - - - - - - -SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases; - ts_rewrite ------------- - 'b' & 'c' - - - - - - - - - - ts_rewrite (query TSQUERY,'SELECT target ,sample FROM test'::text) returns TSQUERY - - - - - - -SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases'); - ts_rewrite ------------- - 'b' & 'c' - - - - - - - - - What if there are several instances of rewriting? For example, query - 'a & b' can be rewritten as - 'b & c' and 'cc'. - - -SELECT * FROM aliases; - t | s ------------+------ - 'a' | 'c' - 'x' | 'z' - 'a' & 'b' | 'cc' - - - This ambiguity can be resolved by specifying a sort order: - - -SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t DESC'); - ts_rewrite - --------- - 'cc' - -SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t ASC'); - ts_rewrite --------------- - 'b' & 'c' - - - - - Let's consider a real-life astronomical example. We'll expand query - supernovae using table-driven rewriting rules: - - -CREATE TABLE aliases (t tsquery primary key, s tsquery); -INSERT INTO aliases VALUES(to_tsquery('supernovae'), to_tsquery('supernovae|sn')); - -SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); - ?column? -------------------------------- -( 'supernova' | 'sn' ) & 'crab' - - - Notice, that we can change the rewriting rule online: - - -UPDATE aliases SET s=to_tsquery('supernovae|sn & !nebulae') WHERE t=to_tsquery('supernovae'); -SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); - ?column? ------------------------------------------------ - 'supernova' | 'sn' & !'nebula' ) & 'crab' - - - - - - Operators For tsquery - - - Rewriting can be slow for many rewriting rules since it checks every rule - for a possible hit. To filter out obvious non-candidate rules there are containment - operators for the tsquery type. In the example below, we select only those - rules which might contain the original query: - - -SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) -FROM aliases -WHERE 'a & b' @> t; - ts_rewrite ------------- - 'b' & 'c' - - - - - - Two operators are defined for tsquery: - - - - - - - - TSQUERY @> TSQUERY - - - - - TSQUERY @> TSQUERY - - - - - - Returns true if the right argument might be contained in left argument. - - - - - - - - tsquery <@ tsquery - - - - - TSQUERY <@ TSQUERY - - - - - - Returns true if the left argument might be contained in right argument. - - - - - - - - - - - Index For tsquery - - - To speed up operators <@ and @> for - tsquery one can use a GiST index with - a tsquery_ops opclass: - - -CREATE INDEX t_idx ON aliases USING gist (t tsquery_ops); - - - - - - Additional Controls + Additional Controls To implement full text searching there must be a function to create a @@ -1458,13 +414,13 @@ CREATE INDEX t_idx ON aliases USING gist (t tsquery_ops); - Parsing + Parsing Full text searching in PostgreSQL provides function to_tsvector, which converts a document to the tsvector data type. More details are available in , but for now consider a simple example: + linkend="functions-textsearch-tsvector">, but for now consider a simple example: SELECT to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'); @@ -1666,7 +622,7 @@ SELECT * FROM ts_token_type('default'); - Ranking Search Results + Ranking Search Results Ranking attempts to measure how relevant documents are to a particular @@ -1878,9 +834,8 @@ ORDER BY rnk DESC LIMIT 10; - - Highlighting Results + Highlighting Results headline @@ -2003,7 +958,7 @@ ORDER BY rank DESC LIMIT 10) AS foo; - Dictionaries + Dictionaries Dictionaries are used to eliminate words that should not be considered in a @@ -2139,7 +1094,7 @@ SELECT ts_lexize('english_stem', 'stars'); - Stop Words + Stop Words Stop words are words which are very common, appear in almost @@ -2220,9 +1175,8 @@ SELECT ts_lexize('public.simple_dict','The'); - - Synonym Dictionary + Synonym Dictionary This dictionary template is used to create dictionaries which replace a @@ -2257,7 +1211,7 @@ SELECT * FROM ts_debug('english','Paris'); - Thesaurus Dictionary + Thesaurus Dictionary A thesaurus dictionary (sometimes abbreviated as TZ) is @@ -2336,7 +1290,7 @@ the one a two : swsw2 - Thesaurus Configuration + Thesaurus Configuration To define a new thesaurus dictionary one can use the thesaurus template. @@ -2387,7 +1341,7 @@ ALTER TEXT SEARCH CONFIGURATION russian - Thesaurus Example + Thesaurus Example Consider a simple astronomical thesaurus thesaurus_astro, @@ -2462,7 +1416,7 @@ SELECT plainto_tsquery('supernova star'); - Ispell Dictionary + Ispell Dictionary The Ispell template dictionary for full text allows the @@ -2565,7 +1519,7 @@ SELECT ts_lexize('norwegian_ispell','sjokoladefabrikk'); - <application>Snowball</> Stemming Dictionary + <application>Snowball</> Stemming Dictionary The Snowball dictionary template is based on the project @@ -2594,7 +1548,7 @@ CREATE TEXT SEARCH DICTIONARY english_stem ( - Dictionary Testing + Dictionary Testing The ts_lexize function facilitates dictionary testing: @@ -2668,7 +1622,7 @@ SELECT plainto_tsquery('supernovae stars'); - Configuration Example + Configuration Example A full text configuration specifies all options necessary to transform a @@ -2828,7 +1782,7 @@ SHOW default_text_search_config; - Managing Multiple Configurations + Managing Multiple Configurations If you are using the same text search configuration for the entire cluster @@ -2859,7 +1813,7 @@ CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('french', title || body)); - GiST and GIN Index Types + GiST and GIN Index Types index @@ -3052,7 +2006,7 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a'); - Limitations + Limitations The current limitations of Full Text Searching are: @@ -3094,7 +2048,7 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a'); - <application>psql</> Support + <application>psql</> Support Information about full text searching objects can be obtained @@ -3287,7 +2241,7 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a'); - Debugging + Debugging Function ts_debug allows easy testing of your full text searching @@ -3390,7 +2344,7 @@ FROM ts_debug('public.english','The Brightest supernovaes'); - Example of Creating a Rule-Based Dictionary + Example of Creating a Rule-Based Dictionary The motivation for this example dictionary is to control the indexing of @@ -3641,7 +2595,7 @@ END; - Example of Creating a Parser + Example of Creating a Parser SQL command CREATE TEXT SEARCH PARSER creates