From 7627f64ba21a2734192c6832d01a9c38948872bc Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 9 Apr 2020 15:11:08 -0400 Subject: [PATCH] Doc: improve documentation about ts_headline() function. Now that I've had my nose in that code, I thought the docs about it left something to be desired. --- doc/src/sgml/textsearch.sgml | 110 +++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 50 deletions(-) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 3b54dd575d..765186544b 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1301,64 +1301,75 @@ ts_headline( config - StartSel, StopSel: the strings with - which to delimit query words appearing in the document, to distinguish - them from other excerpted words. You must double-quote these strings - if they contain spaces or commas. + MaxWords, MinWords (integers): + these numbers determine the longest and shortest headlines to output. + The default values are 35 and 15. - MaxWords, MinWords: these numbers - determine the longest and shortest headlines to output. + ShortWord (integer): words of this length or less + will be dropped at the start and end of a headline, unless they are + query terms. The default value of three eliminates common English + articles. - ShortWord: words of this length or less will be - dropped at the start and end of a headline. The default - value of three eliminates common English articles. - - - - - HighlightAll: Boolean flag; if + HighlightAll (boolean): if true the whole document will be used as the - headline, ignoring the preceding three parameters. + headline, ignoring the preceding three parameters. The default + is false. - MaxFragments: maximum number of text excerpts - or fragments to display. The default value of zero selects a - non-fragment-oriented headline generation method. A value greater than - zero selects fragment-based headline generation. 
This method - finds text fragments with as many query words as possible and - stretches those fragments around the query words. As a result - query words are close to the middle of each fragment and have words on - each side. Each fragment will be of at most MaxWords and - words of length ShortWord or less are dropped at the start - and end of each fragment. If not all query words are found in the - document, then a single fragment of the first MinWords - in the document will be displayed. + MaxFragments (integer): maximum number of text + fragments to display. The default value of zero selects a + non-fragment-based headline generation method. A value greater + than zero selects fragment-based headline generation (see below). - FragmentDelimiter: When more than one fragment is - displayed, the fragments will be separated by this string. + StartSel, StopSel (strings): + the strings with which to delimit query words appearing in the + document, to distinguish them from other excerpted words. The + default values are <b> and + </b>, which can be suitable + for HTML output. + + + + + FragmentDelimiter (string): When more than one + fragment is displayed, the fragments will be separated by this string. + The default is " ... ". These option names are recognized case-insensitively. - Any unspecified options receive these defaults: + You must double-quote string values if they contain spaces or commas. + - -StartSel=<b>, StopSel=</b>, -MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE, -MaxFragments=0, FragmentDelimiter=" ... " - + + In non-fragment-based headline + generation, ts_headline locates matches for the + given query and chooses a + single one to display, preferring matches that have more query words + within the allowed headline length. 
+ In fragment-based headline generation, ts_headline + locates the query matches and splits each match + into fragments of no more than MaxWords + words each, preferring fragments with more query words, and when + possible stretching fragments to include surrounding + words. The fragment-based mode is thus more useful when the query + matches span large sections of the document, or when it's desirable to + display multiple matches. + In either mode, if no query matches can be identified, then a single + fragment of the first MinWords words in the document + will be displayed. @@ -1370,25 +1381,24 @@ SELECT ts_headline('english', is to find all documents containing given query terms and return them in order of their similarity to the query.', - to_tsquery('query & similarity')); - ts_headline + to_tsquery('english', 'query & similarity')); + ts_headline ------------------------------------------------------------ - containing given <b>query</b> terms - and return them in order of their <b>similarity</b> to the + containing given <b>query</b> terms + + and return them in order of their <b>similarity</b> to the+ <b>query</b>. SELECT ts_headline('english', - 'The most common type of search -is to find all documents containing given query terms -and return them in order of their similarity to the -query.', - to_tsquery('query & similarity'), - 'StartSel = <, StopSel = >'); - ts_headline -------------------------------------------------------- - containing given <query> terms - and return them in order of their <similarity> to the - <query>. + 'Search terms may occur +many times in a document, +requiring ranking of the search matches to decide which +occurrences to display in the result.', + to_tsquery('english', 'search & term'), + 'MaxFragments=10, MaxWords=7, MinWords=3, StartSel=<<, StopSel=>>'); + ts_headline +------------------------------------------------------------ + <<Search>> <<terms>> may occur + + many times ... ranking of the <<search>> matches to decide