From 40c1d7c10e21aa95bfde957f52b287148394fe8a Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 22 Aug 2007 04:45:20 +0000 Subject: [PATCH] Text search doc updates --- first cut at syncing the existing docs with the final syntax decisions. --- doc/src/sgml/config.sgml | 22 +- doc/src/sgml/ref/psql-ref.sgml | 62 +++- doc/src/sgml/textsearch.sgml | 634 +++++++++++++++++---------------- 3 files changed, 409 insertions(+), 309 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 879e045577..5ef230a4fe 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1,4 +1,4 @@ - + Server Configuration @@ -4106,6 +4106,26 @@ SET XML OPTION { DOCUMENT | CONTENT }; + + default_text_search_config (string) + + default_text_search_config configuration parameter + + + + Selects the text search configuration that is used by those variants + of the text search functions that do not have an explicit argument + specifying the configuration. + See for further information. + The built-in default is pg_catalog.simple, but + initdb will initialize the + configuration file with a setting that corresponds to the + chosen lc_ctype locale, if a configuration + matching that locale can be identified. + + + + diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index 1bec1059e8..c01f1f5cc8 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -1,5 +1,5 @@ @@ -997,6 +997,66 @@ testdb=> + + \dF [ pattern ] + \dF+ [ pattern ] + + + Lists available text search configurations. + If pattern is specified, + only configurations whose names match the pattern are shown. + If the form \dF+ is used, a full description of + each configuration is shown, including the underlying text search + parser and the dictionary list for each parser token type. + + + + + + \dFd [ pattern ] + \dFd+ [ pattern ] + + + Lists available text search dictionaries. + If pattern is specified, + only dictionaries whose names match the pattern are shown. + If the form \dFd+ is used, additional information + is shown about each selected dictionary, including the underlying + text search template and the option values. + + + + + + \dFp [ pattern ] + \dFp+ [ pattern ] + + + Lists available text search parsers. + If pattern is specified, + only parsers whose names match the pattern are shown. + If the form \dFp+ is used, a full description of + each parser is shown, including the underlying functions and the + list of recognized token types. + + + + + + \dFt [ pattern ] + \dFt+ [ pattern ] + + + Lists available text search templates. + If pattern is specified, + only templates whose names match the pattern are shown. + If the form \dFt+ is used, additional information + is shown about each template, including the underlying function names. + + + + + \dg [ pattern ] diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index a6601d6edb..0d1ab50002 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -6,11 +6,11 @@ Introduction -Full Text Searching (text search) allows the -searching of documents that satisfy a query, and -optionally returns them in some order. The most common search is to find -all documents containing query terms and return them -in order of their similarity to the +Full Text Searching (or just text search) allows +identifying documents that satisfy a query, and +optionally sorting them by relevance to the query. 
The most common search +is to find all documents containing given query terms +and return them in order of their similarity to the query. Notions of query and similarity are very flexible and depend on the specific application. The simplest search considers query as a @@ -250,9 +250,9 @@ SELECT 'fat:1 rat:2'::tsvector || 'fat:1 cat:2'::tsvector; -Tsquery is a data type for textual queries which supports +tsquery is a data type for textual queries which supports the boolean operators & (AND), | (OR), -and parentheses. A Tsquery consists of lexemes +and parentheses. A tsquery consists of lexemes (optionally labeled by letters) with boolean operators in between: @@ -273,7 +273,7 @@ development of different search engines using the same full text index. tsqueries can be concatenated using && (AND) and || (OR) operators: -SELECT 'a & b'::tsquery && 'c|d'::tsquery; +SELECT 'a & b'::tsquery && 'c | d'::tsquery; ?column? --------------------------- 'a' & 'b' & ( 'c' | 'd' ) @@ -294,22 +294,24 @@ SELECT 'a & b'::tsquery || 'c|d'::tsquery; Performing Searches -Full text searching in PostgreSQL provides the -operator @@ for two data types: tsvector -(document) and tsquery (query). Also, this operator -supports TEXT, VARCHAR, and CHAR -data types so simple full text searches can be done, but without ranking -support: +Full text searching in PostgreSQL is based on +the operator @@, which tests whether a tsvector +(document) matches a tsquery (query). Also, this operator +supports text input, allowing explicit conversion of a text +string to tsvector to be skipped. The variants available +are: tsvector @@ tsquery tsquery @@ tsvector -TEXT | VARCHAR | CHAR @@ TEXT | tsquery +text @@ tsquery +text @@ text -The full text operator @@ returns true if -tsvector contains tsquery: +The match operator @@ returns true if +the tsvector matches the tsquery. It doesn't +matter which data type is written first: SELECT 'cat & rat'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::tsvector; ?column? @@ -320,12 +322,18 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t ---------- f - + +The form text @@ tsquery +is equivalent to to_tsvector(x) @@ y. +The form text @@ text +is equivalent to to_tsvector(x) @@ plainto_tsquery(y). +Note that the results of these forms will depend on the setting of . + - @@ -358,11 +366,11 @@ or body: SELECT title FROM pgweb -WHERE to_tsvector('english', textcat(title, body)) @@ to_tsquery('create & table') +WHERE to_tsvector('english', title || body) @@ to_tsquery('create & table') ORDER BY dlm DESC LIMIT 10; -dlm is the last-modified date in seconds since 1970 so we -used ORDER BY dlm LIMIT 10 to get the most recent +dlm is the last-modified date so we +used ORDER BY dlm LIMIT 10 to get the ten most recent matches. For clarity we omitted the coalesce function which prevents the unwanted effect of NULL concatenation. @@ -382,13 +390,13 @@ CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', body)); Notice that the 2-argument version of to_tsvector is used. Only text search functions which specify a configuration name can be used in expression indexes (). -Casting to a text search data type (::) is also unsupported. -This is because the index contents should be unaffected by -default_text_search_config. If they were affected, the index -contents might be inconsistent because they could contain -tsvectors that were created with different default text search -configurations. 
Recovering a table from a pg_dump would -also not recreate index tsvectors properly. +This is because the index contents must be unaffected by +. +If they were affected, the index +contents might be inconsistent because different entries could contain +tsvectors that were created with different text search +configurations, and there would be no way to guess which was which. +It would be impossible to dump and restore such an index correctly. @@ -406,9 +414,9 @@ only with the same configuration used to create the index rows. It is possible to setup more complex expression indexes where the configuration name is specified by another column, e.g.: -CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector(conf_name, body)); +CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector(config_name, body)); -where conf_name is a column in the pgweb +where config_name is a column in the pgweb table. This allows mixed configurations in the same index while recording which configuration was used for each index row. @@ -416,7 +424,7 @@ recording which configuration was used for each index row. Indexes can even concatenate columns: -CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', textcat(title, body))); +CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', title || body)); @@ -438,7 +446,7 @@ CREATE INDEX textsearch_idx ON pgweb USING gin(textsearch_index); After vacuuming, we are ready to perform a fast full text search: -SELECT rank_cd(textsearch_index, q) AS rank, title +SELECT ts_rank_cd(textsearch_index, q) AS rank, title FROM pgweb, to_tsquery('create & table') q WHERE q @@ textsearch_index ORDER BY rank DESC LIMIT 10; @@ -527,16 +535,14 @@ SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'fat & cow':: -TEXT @@ TSQUERY -VARCHAR @@ TSQUERY -CHAR @@ TSQUERY +text @@ tsquery Returns true if TSQUERY is contained -in TEXT/VARCHAR, and false if not: +in TEXT, and false if not: SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & rat'::tsquery; ?column? @@ -562,9 +568,7 @@ SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & cow'::tsqu -TEXT @@ TEXT -VARCHAR @@ TEXT -CHAR @@ TEXT +text @@ text @@ -612,7 +616,7 @@ For index support of full text operators consult conf_name, document TEXT) returns TSVECTOR +to_tsvector(config_name, document TEXT) returns TSVECTOR @@ -685,7 +689,7 @@ document to be weighted differently by ranking functions. vector1 || vector2 -concat(vector1 TSVECTOR, vector2 TSVECTOR) returns TSVECTOR +tsvector_concat(vector1 TSVECTOR, vector2 TSVECTOR) returns TSVECTOR @@ -701,7 +705,7 @@ weigh words from one section of your document differently than the others by parsing the sections into separate vectors and assigning each vector a different position label with the setweight() function. You can then concatenate them into a single vector and provide -a weights argument to the rank() function that assigns +a weights argument to the ts_rank() function that assigns different weights to positions with different labels. @@ -751,42 +755,51 @@ it yet) --> - trigger +for updating a derived tsvector column -tsvector_update_trigger(vector_column_name, filter_name, text_column_name , ... ) +tsvector_update_trigger(tsvector_column_name, config_name, text_column_name , ... ) +tsvector_update_trigger_column(tsvector_column_name, config_column_name, text_column_name , ... ) -The tsvector_update_trigger() trigger is used to -automatically update vector_column_name. -filter_name is the function name to preprocess -text_column_name. 
There can be many functions -and text columns specified in a -tsvector_update_trigger() trigger. If multiple -functions are specified, they apply to the following columns until the -next function appears. As an example of using a filter, function -dropatsymbol replaces all entries of the -@ sign with a space: +Two built-in trigger functions are available to automatically update a +tsvector column from one or more textual columns. An example +of their use is: -CREATE FUNCTION dropatsymbol(text) -RETURNS text -AS 'SELECT replace($1, ''@'', '' '');' -LANGUAGE SQL; +CREATE TABLE tblMessages ( + strMessage text, + tsv tsvector +); -CREATE TRIGGER tsvectorupdate BEFORE UPDATE OR INSERT +CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE ON tblMessages FOR EACH ROW EXECUTE PROCEDURE -tsvector_update_trigger(tsvector_column, dropatsymbol, strMessage); +tsvector_update_trigger(tsv, 'pg_catalog.english', strMessage); +Having created this trigger, any change in strMessage +will be automatically reflected into tsv. + + + +Both triggers require you to specify the text search configuration to +be used to perform the conversion. For +tsvector_update_trigger, the configuration name is simply +given as the second trigger argument. It must be schema-qualified as +shown above, so that the trigger behavior will not change with changes +in search_path. For +tsvector_update_trigger_column, the second trigger argument +is the name of another table column, which must be of type +regconfig. This allows a per-row selection of configuration +to be made. @@ -882,7 +895,7 @@ All btree operations are defined for the tsvector type. -to_tsquery(conf_name, querytext text) returns TSQUERY +to_tsquery(config_name, querytext text) returns TSQUERY @@ -925,7 +938,7 @@ Without quotes to_tsquery will generate a syntax error. 
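+For illustration, here is a sketch of the accepted form (output assumes
+the built-in english configuration); an unquoted multi-word input such
+as 'fat rats' would instead raise a syntax error:
+
+SELECT to_tsquery('english', 'fat & rats');
+   to_tsquery
+---------------
+ 'fat' & 'rat'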
-plainto_tsquery(conf_name, querytext text) returns TSQUERY +plainto_tsquery(config_name, querytext text) returns TSQUERY @@ -1418,32 +1431,32 @@ function ( ), which shows all details of the full text machinery: SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); - Alias | Description | Token | Dicts list | Lexized token --------+---------------+-------+----------------------+--------------------------- - lword | Latin word | a | {pg_catalog.en_stem} | pg_catalog.en_stem: {} - blank | Space symbols | | | - lword | Latin word | fat | {pg_catalog.en_stem} | pg_catalog.en_stem: {fat} - blank | Space symbols | | | - lword | Latin word | cat | {pg_catalog.en_stem} | pg_catalog.en_stem: {cat} - blank | Space symbols | | | - lword | Latin word | sat | {pg_catalog.en_stem} | pg_catalog.en_stem: {sat} - blank | Space symbols | | | - lword | Latin word | on | {pg_catalog.en_stem} | pg_catalog.en_stem: {} - blank | Space symbols | | | - lword | Latin word | a | {pg_catalog.en_stem} | pg_catalog.en_stem: {} - blank | Space symbols | | | - lword | Latin word | mat | {pg_catalog.en_stem} | pg_catalog.en_stem: {mat} - blank | Space symbols | | | - blank | Space symbols | - | | - lword | Latin word | it | {pg_catalog.en_stem} | pg_catalog.en_stem: {} - blank | Space symbols | | | - lword | Latin word | ate | {pg_catalog.en_stem} | pg_catalog.en_stem: {ate} - blank | Space symbols | | | - lword | Latin word | a | {pg_catalog.en_stem} | pg_catalog.en_stem: {} - blank | Space symbols | | | - lword | Latin word | fat | {pg_catalog.en_stem} | pg_catalog.en_stem: {fat} - blank | Space symbols | | | - lword | Latin word | rats | {pg_catalog.en_stem} | pg_catalog.en_stem: {rat} + Alias | Description | Token | Dictionaries | Lexized token +-------+---------------+-------+--------------+---------------- + lword | Latin word | a | {english} | english: {} + blank | Space symbols | | | + lword | Latin word | fat | {english} | english: {fat} + blank | Space symbols | | | + lword | Latin word | cat | {english} | english: {cat} + blank | Space symbols | | | + lword | Latin word | sat | {english} | english: {sat} + blank | Space symbols | | | + lword | Latin word | on | {english} | english: {} + blank | Space symbols | | | + lword | Latin word | a | {english} | english: {} + blank | Space symbols | | | + lword | Latin word | mat | {english} | english: {mat} + blank | Space symbols | | | + blank | Space symbols | - | | + lword | Latin word | it | {english} | english: {} + blank | Space symbols | | | + lword | Latin word | ate | {english} | english: {ate} + blank | Space symbols | | | + lword | Latin word | a | {english} | english: {} + blank | Space symbols | | | + lword | Latin word | fat | {english} | english: {fat} + blank | Space symbols | | | + lword | Latin word | rats | {english} | english: {rat} (24 rows) @@ -1485,7 +1498,7 @@ The following functions allow manual parsing control: -parse(parser, document TEXT) returns SETOF tokenout +ts_parse(parser, document TEXT) returns SETOF tokenout @@ -1496,7 +1509,7 @@ of records, one for each token produced by parsing. 
Each record includes a tokid giving its type and a token which gives its content: -SELECT * FROM parse('default','123 - a number'); +SELECT * FROM ts_parse('default','123 - a number'); tokid | token -------+-------- 22 | 123 @@ -1517,7 +1530,7 @@ SELECT * FROM parse('default','123 - a number'); -token_type(parser ) returns SETOF tokentype +ts_token_type(parser ) returns SETOF tokentype @@ -1530,7 +1543,7 @@ type the table gives the tokid which the token of that type, the alias which names the token type, and a short description: -SELECT * FROM token_type('default'); +SELECT * FROM ts_token_type('default'); tokid | alias | description -------+--------------+----------------------------------- 1 | lword | Latin word @@ -1598,12 +1611,12 @@ The two ranking functions currently available are: -rank +ts_rank -rank( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 +ts_rank( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 @@ -1630,12 +1643,12 @@ than words in the document body. -rank_cd +ts_rank_cd -rank_cd( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 +ts_rank_cd( weights float4[], vector TSVECTOR, query TSQUERY, normalization int4 ) returns float4 @@ -1699,7 +1712,7 @@ a cosmetic change, i.e., the ordering of the search results will not change. Several examples are shown below; note that the second example uses normalized ranking: -SELECT title, rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query) AS rnk +SELECT title, ts_rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query) AS rnk FROM apod, to_tsquery('neutrino|(dark & matter)') query WHERE query @@ textsearch ORDER BY rnk DESC LIMIT 10; @@ -1716,8 +1729,8 @@ ORDER BY rnk DESC LIMIT 10; Ice Fishing for Cosmic Neutrinos | 1.6 Weak Lensing Distorts the Universe | 0.818218 -SELECT title, rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query)/ -(rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query) + 1) AS rnk +SELECT title, ts_rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query)/ +(ts_rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query) + 1) AS rnk FROM apod, to_tsquery('neutrino|(dark & matter)') query WHERE query @@ textsearch ORDER BY rnk DESC LIMIT 10; @@ -1737,7 +1750,7 @@ ORDER BY rnk DESC LIMIT 10; -The first argument in rank_cd ('{0.1, 0.2, +The first argument in ts_rank_cd ('{0.1, 0.2, 0.4, 1.0}') is an optional parameter which specifies the weights for labels D, C, B, and A used in function @@ -1785,17 +1798,17 @@ implements such functionality. -headline( conf_name text, document text, query TSQUERY, options text ) returns text +ts_headline( config_name text, document text, query TSQUERY, options text ) returns text -The headline() function accepts a document along with +The ts_headline function accepts a document along with a query, and returns one or more ellipsis-separated excerpts from the document in which terms from the query are highlighted. The configuration used to parse the document can be specified by its -conf_name; if none is specified, the current +config_name; if none is specified, the current configuration is used. 
@@ -1840,13 +1853,13 @@ StartSel=<b>, StopSel=</b>, MaxWords=35, MinWords=15, ShortWord=3, H For example: -SELECT headline('a b c', 'c'::tsquery); +SELECT ts_headline('a b c', 'c'::tsquery); headline -------------- a b <b>c</b> -SELECT headline('a b c', 'c'::tsquery, 'StartSel=<,StopSel=>'); - headline ----------- +SELECT ts_headline('a b c', 'c'::tsquery, 'StartSel=<,StopSel=>'); + ts_headline +------------- a b <c> @@ -1860,8 +1873,8 @@ shown. SQL subselects can help here; below is an example: -SELECT id,headline(body,q), rank -FROM (SELECT id,body,q, rank_cd (ti,q) AS rank FROM apod, to_tsquery('stars') q +SELECT id,ts_headline(body,q), rank +FROM (SELECT id,body,q, ts_rank_cd (ti,q) AS rank FROM apod, to_tsquery('stars') q WHERE ti @@ q ORDER BY rank DESC LIMIT 10) AS foo; @@ -1869,8 +1882,8 @@ FROM (SELECT id,body,q, rank_cd (ti,q) AS rank FROM apod, to_tsquery('stars') q Note that the cascade dropping of the parser function -causes dropping of the headline used in the full text search -configuration conf_name. +causes dropping of the ts_headline used in the full text search +configuration config_name. @@ -1958,7 +1971,7 @@ linkend="textsearch-rule-dictionary-example">) as an example. -The ALTER TEXT SEARCH CONFIGURATION public.pg ADD +The ALTER TEXT SEARCH CONFIGURATION ADD MAPPING command binds specific types of lexemes and a set of dictionaries to process them. (Mappings can also be specified as part of configuration creation.) Lexemes are processed by a stack of dictionaries @@ -1979,12 +1992,12 @@ ALTER TEXT SEARCH CONFIGURATION astro_en ADD MAPPING FOR lword WITH astrosyn, en -Function lexize can be used to test dictionaries, +Function ts_lexize can be used to test dictionaries, for example: -SELECT lexize('en_stem', 'stars'); - lexize --------- +SELECT ts_lexize('en_stem', 'stars'); + ts_lexize +----------- {star} (1 row) @@ -2010,15 +2023,15 @@ SELECT to_tsvector('english','in the list of stop words'); The gaps between positions 1-3 and 3-5 are because of stop words, so ranks calculated for documents with and without stop words are quite different: -SELECT rank_cd ('{1,1,1,1}', to_tsvector('english','in the list of stop words'), to_tsquery('list & stop')); - rank_cd ---------- - 0.5 +SELECT ts_rank_cd ('{1,1,1,1}', to_tsvector('english','in the list of stop words'), to_tsquery('list & stop')); + ts_rank_cd +------------ + 0.5 -SELECT rank_cd ('{1,1,1,1}', to_tsvector('english','list stop words'), to_tsquery('list & stop')); - rank_cd ---------- - 1 +SELECT ts_rank_cd ('{1,1,1,1}', to_tsvector('english','list stop words'), to_tsquery('list & stop')); + ts_rank_cd +------------ + 1 @@ -2033,26 +2046,24 @@ behaviour is an attempt to decrease possible noise. Here is an example of a dictionary that returns the input word as lowercase -or NULL if it is a stop word; it also specifies the location -of the file of stop words. It uses the simple dictionary as +or NULL if it is a stop word; it also specifies the name +of a file of stop words. It uses the simple dictionary as a template: -CREATE TEXT SEARCH DICTIONARY public.simple_dict - TEMPLATE pg_catalog.simple - OPTION 'english.stop'; +CREATE TEXT SEARCH DICTIONARY public.simple_dict ( + TEMPLATE = pg_catalog.simple, + STOPWORDS = english +); -Relative paths in OPTION resolve relative to -share/. 
Now we can test our -dictionary: +Now we can test our dictionary: -SELECT lexize('public.simple_dict','YeS'); - lexize --------- +SELECT ts_lexize('public.simple_dict','YeS'); + ts_lexize +----------- {yes} -SELECT lexize('public.simple_dict','The'); - lexize --------- +SELECT ts_lexize('public.simple_dict','The'); + ts_lexize +----------- {} @@ -2066,7 +2077,7 @@ SELECT lexize('public.simple_dict','The'); This dictionary template is used to create dictionaries which replace a word with a synonym. Phrases are not supported (use the thesaurus -dictionary () if you need them). Synonym +dictionary () for that). A synonym dictionary can be used to overcome linguistic problems, for example, to prevent an English stemmer dictionary from reducing the word 'Paris' to 'pari'. In that case, it is enough to have a Paris @@ -2074,17 +2085,18 @@ paris line in the synonym dictionary and put it before the en_stem dictionary: SELECT * FROM ts_debug('english','Paris'); - Alias | Description | Token | Dicts list | Lexized token --------+-------------+-------+----------------------+---------------------------- - lword | Latin word | Paris | {pg_catalog.en_stem} | pg_catalog.en_stem: {pari} + Alias | Description | Token | Dictionaries | Lexized token +-------+-------------+-------+--------------+----------------- + lword | Latin word | Paris | {english} | english: {pari} (1 row) -ALTER TEXT SEARCH CONFIGURATION ADD MAPPING ON english FOR lword WITH synonym, en_stem; -ALTER TEXT SEARCH MAPPING -Time: 340.867 ms + +ALTER TEXT SEARCH CONFIGURATION english + ADD MAPPING FOR lword WITH synonym, en_stem; + SELECT * FROM ts_debug('english','Paris'); - Alias | Description | Token | Dicts list | Lexized token --------+-------------+-------+-----------------------------------------+----------------------------- - lword | Latin word | Paris | {pg_catalog.synonym,pg_catalog.en_stem} | pg_catalog.synonym: {paris} + Alias | Description | Token | Dictionaries | Lexized token +-------+-------------+-------+-------------------+------------------ + lword | Latin word | Paris | {synonym,en_stem} | synonym: {paris} (1 row) @@ -2171,9 +2183,11 @@ To define a new thesaurus dictionary one can use the thesaurus template. For example: -CREATE TEXT SEARCH DICTIONARY thesaurus_simple - TEMPLATE thesaurus_template - OPTION 'DictFile="dicts_data/thesaurus.txt.sample", Dictionary="en_stem"'; +CREATE TEXT SEARCH DICTIONARY thesaurus_simple ( + TEMPLATE = thesaurus, + DictFile = mythesaurus, + Dictionary = pg_catalog.en_stem +); Here: @@ -2181,12 +2195,15 @@ Here: thesaurus_simple is the thesaurus dictionary name -DictFile="/path/to/thesaurus_simple.txt" is the location of the thesaurus file +mythesaurus is the base name of the thesaurus file +(its full name will be $SHAREDIR/tsearch_data/mythesaurus.ths, +where $SHAREDIR means the installation shared-data directory, +often /usr/local/share). -Dictionary="en_stem" defines the dictionary (snowball +pg_catalog.en_stem is the dictionary (snowball English stemmer) to use for thesaurus normalization. Notice that the -en_stem dictionary has it is own configuration (for example, +en_stem dictionary has its own configuration (for example, stop words). 
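+To make the DictFile parameter concrete, here is a sketch of what
+mythesaurus.ths might contain; each line maps a sample phrase to its
+replacement (the first rule is hypothetical, the second is used again
+in the example below):
+
+supernovae stars : sn
+crab nebulae : crab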
@@ -2195,7 +2212,8 @@ Now it is possible to bind the thesaurus dictionary thesaurus_simpletokens, for example: -ALTER TEXT SEARCH russian ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_simple; +ALTER TEXT SEARCH CONFIGURATION russian + ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_simple; @@ -2214,15 +2232,17 @@ crab nebulae : crab Below we create a dictionary and bind some token types with an astronomical thesaurus and english stemmer: -CREATE TEXT SEARCH DICTIONARY thesaurus_astro OPTION - TEMPLATE thesaurus_template - 'DictFile="dicts_data/thesaurus_astro.txt", Dictionary="en_stem"'; -ALTER TEXT SEARCH CONFIGURATION russian ADD MAPPING FOR lword, lhword, lpart_hword - WITH thesaurus_astro, en_stem; +CREATE TEXT SEARCH DICTIONARY thesaurus_astro ( + TEMPLATE = thesaurus, + DictFile = thesaurus_astro, + Dictionary = en_stem +); +ALTER TEXT SEARCH CONFIGURATION russian + ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_astro, en_stem; -Now we can see how it works. Note that lexize cannot +Now we can see how it works. Note that ts_lexize cannot be used for testing the thesaurus (see description of -lexize), but we can use +ts_lexize), but we can use plainto_tsquery and to_tsvector which accept text arguments, not lexemes: @@ -2288,17 +2308,17 @@ conjugations of the search term bank, e.g. banking, banked, banks, banks', and bank's. -SELECT lexize('en_ispell','banking'); - lexize --------- +SELECT ts_lexize('en_ispell','banking'); + ts_lexize +----------- {bank} -SELECT lexize('en_ispell','bank''s'); - lexize --------- +SELECT ts_lexize('en_ispell','bank''s'); + ts_lexize +----------- {bank} -SELECT lexize('en_ispell','banked'); - lexize --------- +SELECT ts_lexize('en_ispell','banked'); + ts_lexize +----------- {bank} @@ -2306,38 +2326,26 @@ SELECT lexize('en_ispell','banked'); To create an ispell dictionary one should use the built-in -ispell_template dictionary and specify several +ispell dictionary and specify several parameters. -CREATE TEXT SEARCH DICTIONARY en_ispell - TEMPLATE ispell_template - OPTION 'DictFile="/usr/local/share/dicts/ispell/english.dict", - AffFile="/usr/local/share/dicts/ispell/english.aff", - StopFile="/usr/local/share/dicts/ispell/english.stop"'; +CREATE TEXT SEARCH DICTIONARY en_ispell ( + TEMPLATE = ispell, + DictFile = english, + AffFile = english, + StopWords = english +); -Here, DictFile, AffFile, StopFile -specify the location of the dictionary and stop words files. +Here, DictFile, AffFile, and StopWords +specify the names of the dictionary, affixes, and stop-words files. -Relative paths in OPTION resolve relative to -share/dicts_data: - -CREATE TEXT SEARCH DICTIONARY en_ispell - TEMPLATE ispell_template - OPTION 'DictFile="ispell/english.dict", - AffFile="ispell/english.aff", - StopFile="english.stop"'; - - - - -Ispell dictionaries usually recognize a restricted set of words so it +Ispell dictionaries usually recognize a restricted set of words so they should be used in conjunction with another broader dictionary; for example, a stemming dictionary, which recognizes everything. 
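+As a sketch (assuming the en_ispell dictionary created above and the
+built-in english configuration), list the narrow ispell dictionary
+first and the broad en_stem stemmer last, so that words ispell does
+not recognize fall through to the stemmer:
+
+ALTER TEXT SEARCH CONFIGURATION english
+    ALTER MAPPING FOR lword WITH en_ispell, en_stem;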
- @@ -2352,9 +2360,9 @@ compoundwords controlled z Several examples for the Norwegian language: -SELECT lexize('norwegian_ispell','overbuljongterningpakkmesterassistent'); +SELECT ts_lexize('norwegian_ispell','overbuljongterningpakkmesterassistent'); {over,buljong,terning,pakk,mester,assistent} -SELECT lexize('norwegian_ispell','sjokoladefabrikk'); +SELECT ts_lexize('norwegian_ispell','sjokoladefabrikk'); {sjokoladefabrikk,sjokolade,fabrikk} @@ -2374,27 +2382,18 @@ operations of Hunspell. <application>Snowball</> Stemming Dictionary -The Snowball template dictionary is based on the project -of Martin Porter, an inventor of the popular Porter's stemming algorithm +The Snowball dictionary template is based on the project +of Martin Porter, inventor of the popular Porter's stemming algorithm for the English language and now supported in many languages (see the Snowball site for more information). Full text searching contains a large number of stemmers for -many languages. The only option that is accepted by a snowball stemmer is the -location of a file with stop words. It can be defined using the -ALTER TEXT SEARCH DICTIONARY command. - - +many languages. A Snowball dictionary requires a language parameter to +identify which stemmer to use, and optionally can specify a stopword file name. +For example, -ALTER TEXT SEARCH DICTIONARY en_stem - SET OPTION 'StopFile=english-utf8.stop, Language=english'; - - - - -Relative paths in OPTION resolve relative -share/dicts/data: - -ALTER TEXT SEARCH DICTIONARY en_stem OPTION 'english.stop'; +ALTER TEXT SEARCH DICTIONARY en_stem ( + StopWords = english-utf8, Language = english +); @@ -2410,18 +2409,18 @@ before any other dictionary because a lexeme will not pass through its stemmer. Dictionary Testing -The lexize function facilitates dictionary testing: +The ts_lexize function facilitates dictionary testing: -lexize +ts_lexize -lexize( dict_name text, lexeme text) returns text[] +ts_lexize( dict_name text, lexeme text) returns text[] @@ -2433,13 +2432,13 @@ array if the lexeme is known to the dictionary but it is a stop word, or NULL if it is an unknown word. -SELECT lexize('en_stem', 'stars'); - lexize --------- +SELECT ts_lexize('en_stem', 'stars'); + ts_lexize +----------- {star} -SELECT lexize('en_stem', 'a'); - lexize --------- +SELECT ts_lexize('en_stem', 'a'); + ts_lexize +----------- {} @@ -2450,16 +2449,16 @@ SELECT lexize('en_stem', 'a'); -The lexize function expects a +The ts_lexize function expects a lexeme, not text. Below is an example: -SELECT lexize('thesaurus_astro','supernovae stars') is null; +SELECT ts_lexize('thesaurus_astro','supernovae stars') is null; ?column? ---------- t Thesaurus dictionary thesaurus_astro does know -supernovae stars, but lexize fails since it does not +supernovae stars, but ts_lexize fails since it does not parse the input text and considers it as a single lexeme. Use plainto_tsquery and to_tsvector to test thesaurus dictionaries: @@ -2489,23 +2488,24 @@ about full text searching objects (). -The GUC variable default_text_search_config -(optionally schema-qualified) defines the name of the current -active configuration. It can be defined in -postgresql.conf or using the SET command. +The configuration parameter + +specifies the name of the current default configuration, which is the +one used by text search functions when an explicit configuration +parameter is omitted. +It can be set in postgresql.conf, or set for an +individual session using the SET command. 
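+For example, a minimal sketch of changing and inspecting the setting
+for the current session:
+
+SET default_text_search_config = 'pg_catalog.english';
+SHOW default_text_search_config;
+ default_text_search_config
+-----------------------------
+ pg_catalog.english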
-Predefined full text searching objects are available in the +Several predefined text searching configurations are available in the pg_catalog schema. If you need a custom configuration -you can create a new full text searching object and modify it using SQL +you can create a new text searching configuration and modify it using SQL commands. -New full text searching objects are created in the current schema by default +New text searching objects are created in the current schema by default (usually the public schema), but a schema-qualified -name can be used to create objects in the specified schema. It is owned -by the current user and can be changed using the ALTER TEXT -SEARCH OWNER command. +name can be used to create objects in the specified schema. @@ -2515,55 +2515,61 @@ As an example, we will create a configuration BEGIN; -CREATE TEXT SEARCH CONFIGURATION public.pg LIKE english WITH MAP; +CREATE TEXT SEARCH CONFIGURATION public.pg ( COPY = english ); -We will use a PostgreSQL-specific synonym dictionary -and store it in the share/dicts_data directory. The -dictionary looks like: +We will use a PostgreSQL-specific synonym list +and store it in share/tsearch_data/pg_dict.syn. +The file contents look like: postgres pg pgsql pg postgresql pg +We define the dictionary like this: -CREATE TEXT SEARCH DICTIONARY pg_dict - TEMPLATE synonym - OPTION 'pg_dict.txt'; +CREATE TEXT SEARCH DICTIONARY pg_dict ( + TEMPLATE = synonym + SYNONYMS = pg_dict +); -Then register the ispell dictionary en_ispell using -the ispell_template template: +Then register the ispell dictionary +en_ispell using the ispell template: -CREATE TEXT SEARCH DICTIONARY en_ispell - TEMPLATE ispell_template - OPTION 'DictFile="english-utf8.dict", - AffFile="english-utf8.aff", - StopFile="english-utf8.stop"'; +CREATE TEXT SEARCH DICTIONARY en_ispell ( + TEMPLATE = ispell, + DictFile = english-utf8, + AffFile = english-utf8, + StopWords = english-utf8 +); -Use the same stop word list for the Snowball stemmer en_stem, -which is available by default: +We can use the same stop word list for the Snowball stemmer +en_stem, which is available by default: -ALTER TEXT SEARCH DICTIONARY en_stem SET OPTION 'english-utf8.stop'; +ALTER TEXT SEARCH DICTIONARY en_stem ( + StopWords = english-utf8 +); -Modify mappings for Latin words for configuration 'pg': +Now modify mappings for Latin words for configuration pg: -ALTER TEXT SEARCH CONFIGURATION pg ALTER MAPPING FOR lword, lhword, lpart_hword +ALTER TEXT SEARCH CONFIGURATION pg + ALTER MAPPING FOR lword, lhword, lpart_hword WITH pg_dict, en_ispell, en_stem; @@ -2572,7 +2578,8 @@ ALTER TEXT SEARCH CONFIGURATION pg ALTER MAPPING FOR lword, lhword, lpart_hword We do not index or search some tokens: -ALTER TEXT SEARCH CONFIGURATION pg DROP MAPPING FOR email, url, sfloat, uri, float; +ALTER TEXT SEARCH CONFIGURATION pg + DROP MAPPING FOR email, url, sfloat, uri, float; @@ -2582,7 +2589,7 @@ Now, we can test our configuration: SELECT * FROM ts_debug('public.pg', ' PostgreSQL, the highly scalable, SQL compliant, open source object-relational database management system, is now undergoing beta testing of the next -version of our software: PostgreSQL 8.2. +version of our software: PostgreSQL 8.3. '); COMMIT; @@ -2603,7 +2610,7 @@ are shown: path | character varying | not null body | character varying | title | character varying | - dlm | integer | + dlm | date | @@ -2644,15 +2651,15 @@ DATABASE ... SET. 
However, if you need to use several text search configurations in the same database you must be careful to reference the proper text search configuration. This can be done by either setting -default_text_search_conf in each session or supplying the -configuration name in every function call, e.g. to_tsquery('pg', -'friend'), to_tsvector('pg', col). If you are using an expression index, +default_text_search_config in each session or supplying the +configuration name in every function call, e.g. to_tsquery('french', +'friend'), to_tsvector('english', col). If you are using an expression index, you must also be sure to use the proper text search configuration every time an INSERT or UPDATE is executed because these will modify the index, or you can embed the configuration name into the expression index, e.g.: -CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('pg', textcat(title, body))); +CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('french', title || body)); And if you do that, make sure you specify the configuration name in the WHERE clause as well so the expression index will be used. @@ -2680,10 +2687,9 @@ Note that indexes are not mandatory for full text searching. - index -GIST +GIST, for text searching @@ -2695,6 +2701,8 @@ CREATE INDEX name ON table Creates a GiST (Generalized Search Tree)-based index. +The column can be of tsvector or +tsquery type. @@ -2716,9 +2724,7 @@ CREATE INDEX name ON table Creates a GIN (Generalized Inverted Index)-based index. -column is a -TSVECTOR, TEXT, -VARCHAR, or CHAR-type column. +The column must be of tsvector type. @@ -2728,10 +2734,11 @@ Creates a GIN (Generalized Inverted Index)-based index. -A GiST index is lossy, meaning it is necessary -to consult the heap to check for false results. -PostgreSQL does this automatically; see -Filter: in the example below: +A GiST index is lossy, meaning it is necessary +to check the actual table row to eliminate false matches. +PostgreSQL does this automatically; for +example, in the query plan below, the Filter: +line indicates the index output will be rechecked: EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); QUERY PLAN @@ -2788,7 +2795,8 @@ the number of unique words. There is one side-effect of the non-lossiness of a GIN index when using query labels/weights, like 'supernovae:a'. A GIN index has all the information necessary to determine a match, so the heap is -not accessed. However, if the query has label information it must access +not accessed. However, label information is not stored in the index, +so if the query involves label weights it must access the heap. Therefore, a special full text search operator @@@ was created which forces the use of the heap to get information about labels. GiST indexes are lossy so it always reads the heap and there is @@ -3073,24 +3081,25 @@ configuration. -ts_debug(conf_name, document TEXT) returns SETOF tsdebug +ts_debug(config_name, document TEXT) returns SETOF ts_debug ts_debug displays information about every token of document as produced by the parser and processed by the configured dictionaries using the configuration -specified by conf_name. +specified by config_name. 
-tsdebug type defined as: +ts_debug type defined as: -CREATE TYPE tsdebug AS ( - "Alias" text, - "Description" text, - "Token" text, - "Dicts list" text[], - "Lexized token" text +CREATE TYPE ts_debug AS ( + "Alias" text, + "Description" text, + "Token" text, + "Dictionaries" regdictionary[], + "Lexized token" text +); @@ -3101,13 +3110,17 @@ ispell dictionary for the English language. You can skip the test step and play with the standard english configuration. -CREATE TEXT SEARCH CONFIGURATION public.english LIKE pg_catalog.english WITH MAP AS DEFAULT; -CREATE TEXT SEARCH DICTIONARY en_ispell - TEMPLATE ispell_template - OPTION 'DictFile="/usr/local/share/dicts/ispell/english-utf8.dict", - AffFile="/usr/local/share/dicts/ispell/english-utf8.aff", - StopFile="/usr/local/share/dicts/english.stop"'; -ALTER TEXT SEARCH MAPPING ON public.english FOR lword WITH en_ispell,en_stem; +CREATE TEXT SEARCH CONFIGURATION public.english ( COPY = pg_catalog.english ); + +CREATE TEXT SEARCH DICTIONARY en_ispell ( + TEMPLATE = ispell, + DictFile = english-utf8, + AffFile = english-utf8, + StopWords = english +); + +ALTER TEXT SEARCH CONFIGURATION public.english + ALTER MAPPING FOR lword WITH en_ispell, en_stem; @@ -3211,9 +3224,9 @@ shortened numbers. Examples: -SELECT lexize('intdict', 11234567890); - lexize ----------- +SELECT ts_lexize('intdict', 11234567890); + ts_lexize +----------- {112345} @@ -3221,10 +3234,12 @@ SELECT lexize('intdict', 11234567890); Now, we want to ignore long integers: -ALTER TEXT SEARCH DICTIONARY intdict SET OPTION 'MAXLEN=6, REJECTLONG=TRUE'; -SELECT lexize('intdict', 11234567890); - lexize --------- +ALTER TEXT SEARCH DICTIONARY intdict ( + MAXLEN = 6, REJECTLONG = TRUE +); +SELECT ts_lexize('intdict', 11234567890); + ts_lexize +----------- {} @@ -3379,9 +3394,14 @@ AS 'MODULE_PATHNAME' LANGUAGE 'C' WITH (isstrict); -CREATE TEXT SEARCH DICTIONARY intdict - LEXIZE 'dlexize_intdict' INIT 'dinit_intdict' - OPTION 'MAXLEN=6,REJECTLONG = false'; +CREATE TEXT SEARCH TEMPLATE intdict_template ( + LEXIZE = dlexize_intdict, INIT = dinit_intdict +); + +CREATE TEXT SEARCH DICTIONARY intdict ( + TEMPLATE = intdict_template, + MAXLEN = 6, REJECTLONG = false +); COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'Dictionary for Integers'; @@ -3483,7 +3503,7 @@ Below is the source code of our test parser, organized as a contrib Testing: -SELECT * FROM parse('testparser','That''s my first own parser'); +SELECT * FROM ts_parse('testparser','That''s my first own parser'); tokid | token -------+-------- 3 | That's @@ -3499,7 +3519,7 @@ SELECT to_tsvector('testcfg','That''s my first own parser'); to_tsvector ------------------------------------------------- 'my':2 'own':4 'first':3 'parser':5 'that''s':1 -SELECT headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star')); +SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star')); headline ----------------------------------------------------------------- Supernovae <b>stars</b> are the brightest phenomena in galaxies @@ -3696,15 +3716,15 @@ AS 'MODULE_PATHNAME' LANGUAGE 'C' with (isstrict); -CREATE TEXT SEARCH PARSER testparser - START 'testprs_start' - GETTOKEN 'testprs_getlexeme' - END 'testprs_end' - LEXTYPES 'testprs_lextype' +CREATE TEXT SEARCH PARSER testparser ( + START = testprs_start, + GETTOKEN = testprs_getlexeme, + END = testprs_end, + LEXTYPES = testprs_lextype ; -CREATE TEXT SEARCH CONFIGURATION testcfg PARSER 'testparser'; 
-CREATE TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple; +CREATE TEXT SEARCH CONFIGURATION testcfg ( PARSER = testparser ); +ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple; END;