From 91be1d1906a740725147c4e2a81ccda10543b714 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 9 Apr 2020 13:19:23 -0400
Subject: [PATCH] Fix default text search parser's ts_headline code for phrase
 queries.

This code could produce very poor results when asked to highlight a
string based on a query using phrase-match operators.  The root cause
is that hlCover(), which is supposed to find a minimal substring that
matches the query, was written assuming that word position is not
significant.  I'm only 95% convinced that its algorithm was correct even
for plain AND/OR queries; but it definitely fails completely for phrase
matches, causing it to possibly not identify a cover string at all.

Hence, rewrite hlCover() with a less-tense algorithm that just tries
all the possible substrings, earlier and shorter ones first.  (This is
not as bad as it sounds performance-wise, because all of the string
matching has been done already: the repeated tsquery match checks
boil down to pointer comparisons.)

Unfortunately, since that approach produces more candidate cover
strings than before, it also exposes that there were bugs in the
heuristics in mark_hl_words() for selecting a best cover string.
Fixes there include:
* Do not apply the ShortWord filter to words that appear in the query.
* Remove a misguided optimization for quickly rejecting a cover.
* Fix order-of-operation bug that could cause computation of a
wrong figure of merit (poslen) when shortening a cover.
* Change the preference rule so that candidate headlines that do not
include their whole cover string (after MaxWords trimming) are lowest
priority, since they may not actually satisfy the user's query.

This results in some changes in existing regression test cases,
but they all seem reasonable.  Note in particular that the tests
involving strings like "1 2 3" were previously being affected by
the ShortWord filter, masking the normal matching behavior.

Per bug #16345 from Augustinas Jokubauskas; the new test cases are
based on that example.  Back-patch to 9.6 where phrase search was
added to tsquery.

Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
---
 src/backend/tsearch/wparser_def.c     | 183 +++++++++++++++-----------
 src/test/regress/expected/tsearch.out |  43 ++++--
 src/test/regress/sql/tsearch.sql      |  11 ++
 3 files changed, 146 insertions(+), 91 deletions(-)

diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index cbc22a71a2..c895a7827b 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -1942,9 +1942,10 @@ prsd_end(PG_FUNCTION_ARGS)
 #define INTERESTINGWORD(j) \
 	(prs->words[j].item && !prs->words[j].repeated)
 
-/* Don't want to end at a non-word or a short word */
+/* Don't want to end at a non-word or a short word, unless interesting */
 #define BADENDPOINT(j) \
-	(NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword)
+	((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
+	 !INTERESTINGWORD(j))
 
 typedef struct
 {
@@ -2003,75 +2004,97 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
 	return false;
 }
 
+/*
+ * hlFirstIndex: find first index >= pos containing any word used in query
+ *
+ * Returns -1 if no such index
+ */
+static int
+hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
+{
+	int			i;
 
+	/* For each word ... */
+	for (i = pos; i < prs->curwords; i++)
+	{
+		/* ... scan the query to see if this word matches any operand */
+		QueryItem  *item = GETQUERY(query);
+		int			j;
+
+		for (j = 0; j < query->size; j++)
+		{
+			if (item->type == QI_VAL &&
+				prs->words[i].item == &item->qoperand)
+				return i;
+			item++;
+		}
+	}
+	return -1;
+}
+
+/*
+ * hlCover: try to find a substring of prs' word list that satisfies query
+ *
+ * At entry, *p must be the first word index to consider (initialize this to
+ * zero, or to the next index after a previous successful search).
+ *
+ * On success, sets *p to first word index and *q to last word index of the
+ * cover substring, and returns true.
+ *
+ * The result is a minimal cover, in the sense that both *p and *q will be
+ * words used in the query.
+ */
 static bool
 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
 {
-	int			i,
-				j;
-	QueryItem  *item = GETQUERY(query);
-	int			pos = *p;
+	int			pmin,
+				pmax,
+				nextpmin,
+				nextpmax;
+	hlCheck		ch;
 
-	*q = -1;
-	*p = INT_MAX;
-
-	for (j = 0; j < query->size; j++)
+	/*
+	 * We look for the earliest, shortest substring of prs->words that
+	 * satisfies the query.  Both the pmin and pmax indices must be words
+	 * appearing in the query; there's no point in trying endpoints in between
+	 * such points.
+	 */
+	pmin = hlFirstIndex(prs, query, *p);
+	while (pmin >= 0)
 	{
-		if (item->type != QI_VAL)
+		/* This useless assignment just keeps stupider compilers quiet */
+		nextpmin = -1;
+		/* Consider substrings starting at pmin */
+		ch.words = &(prs->words[pmin]);
+		/* Consider the length-one substring first, then longer substrings */
+		pmax = pmin;
+		do
 		{
-			item++;
-			continue;
-		}
-		for (i = pos; i < prs->curwords; i++)
-		{
-			if (prs->words[i].item == &item->qoperand)
+			/* Try to match query against pmin .. pmax substring */
+			ch.len = pmax - pmin + 1;
+			if (TS_execute(GETQUERY(query), &ch,
+						   TS_EXEC_EMPTY, checkcondition_HL))
 			{
-				if (i > *q)
-					*q = i;
-				break;
+				*p = pmin;
+				*q = pmax;
+				return true;
 			}
+			/* Nope, so advance pmax to next feasible endpoint */
+			nextpmax = hlFirstIndex(prs, query, pmax + 1);
+
+			/*
+			 * If this is our first advance past pmin, then the result is also
+			 * the next feasible value of pmin; remember it to save a
+			 * redundant search.
+			 */
+			if (pmax == pmin)
+				nextpmin = nextpmax;
+			pmax = nextpmax;
 		}
-		item++;
+		while (pmax >= 0);
+		/* No luck here, so try next feasible startpoint */
+		pmin = nextpmin;
 	}
-
-	if (*q < 0)
-		return false;
-
-	item = GETQUERY(query);
-	for (j = 0; j < query->size; j++)
-	{
-		if (item->type != QI_VAL)
-		{
-			item++;
-			continue;
-		}
-		for (i = *q; i >= pos; i--)
-		{
-			if (prs->words[i].item == &item->qoperand)
-			{
-				if (i < *p)
-					*p = i;
-				break;
-			}
-		}
-		item++;
-	}
-
-	if (*p <= *q)
-	{
-		hlCheck		ch;
-
-		ch.words = &(prs->words[*p]);
-		ch.len = *q - *p + 1;
-		if (TS_execute(GETQUERY(query), &ch, TS_EXEC_EMPTY, checkcondition_HL))
-			return true;
-		else
-		{
-			(*p)++;
-			return hlCover(prs, query, p, q);
-		}
-	}
-
 	return false;
 }
 
@@ -2357,11 +2380,12 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
 	int			bestb = -1,
 				beste = -1;
 	int			bestlen = -1;
+	bool		bestcover = false;
 	int			pose,
 				posb,
 				poslen,
 				curlen;
-
+	bool		poscover;
 	int			i;
 
 	if (!highlightall)
@@ -2387,14 +2411,6 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
 				pose = i;
 			}
 
-			/* XXX this optimization seems unnecessary and wrong */
-			if (poslen < bestlen && !BADENDPOINT(beste))
-			{
-				/* better cover already found, so try next cover */
-				p++;
-				continue;
-			}
-
 			if (curlen < max_words)
 			{
 				/*
@@ -2449,29 +2465,38 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
 					i = q;
 				for (; curlen > min_words; i--)
 				{
+					if (!BADENDPOINT(i))
+						break;
 					if (!NONWORDTOKEN(prs->words[i].type))
 						curlen--;
 					if (INTERESTINGWORD(i))
 						poslen--;
-					pose = i;
-					if (!BADENDPOINT(i))
-						break;
+					pose = i - 1;
 				}
 			}
 
 			/*
-			 * Adopt this headline if it's the first, or if it has more
-			 * interesting words and isn't ending at a bad endpoint, or if it
-			 * replaces a bad endpoint with a good one (XXX even if it has
-			 * fewer interesting words?  Really?)
+			 * Check whether the proposed headline includes the original
+			 * cover; it might not if we trimmed it due to max_words.
 			 */
-			if (bestlen < 0 ||
-				(poslen > bestlen && !BADENDPOINT(pose)) ||
-				(!BADENDPOINT(pose) && BADENDPOINT(beste)))
+			poscover = (posb <= p && pose >= q);
+
+			/*
+			 * Adopt this headline if it's better than the last one, giving
+			 * highest priority to headlines including the cover, then to
+			 * headlines with more interesting words, then to headlines with
+			 * good stopping points.  (Since bestlen is initially -1, we will
+			 * certainly adopt the first headline.)
+			 */
+			if (poscover > bestcover ||
+				(poscover == bestcover && poslen > bestlen) ||
+				(poscover == bestcover && poslen == bestlen &&
+				 !BADENDPOINT(pose) && BADENDPOINT(beste)))
 			{
 				bestb = posb;
 				beste = pose;
 				bestlen = poslen;
+				bestcover = poscover;
 			}
 
 			/* move p to generate the next cover */
diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out
index b088ff0d4f..02e1b12b85 100644
--- a/src/test/regress/expected/tsearch.out
+++ b/src/test/regress/expected/tsearch.out
@@ -1090,12 +1090,12 @@ Water, water, every where,
   Nor any drop to drink.
 S. T. Coleridge (1772-1834)
 ', phraseto_tsquery('english', 'painted Ocean'));
-           ts_headline            
-----------------------------------
- <b>painted</b> <b>Ocean</b>.    +
- Water, water, every where       +
-   And all the boards did shrink;+
- Water, water, every
+              ts_headline              
+---------------------------------------
+ <b>painted</b> Ship                  +
+   Upon a <b>painted</b> <b>Ocean</b>.+
+ Water, water, every where            +
+   And all the boards did shrink
 (1 row)
 
 SELECT ts_headline('english', '
@@ -1117,6 +1117,15 @@ S. T. Coleridge (1772-1834)
    And all the boards
 (1 row)
 
+SELECT ts_headline('english',
+'Lorem ipsum urna.  Nullam nullam ullamcorper urna.',
+to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
+'MaxWords=100, MinWords=1');
+                                  ts_headline                                  
+-------------------------------------------------------------------------------
+ <b>Lorem</b> ipsum <b>urna</b>.  Nullam nullam <b>ullamcorper</b> <b>urna</b>
+(1 row)
+
 SELECT ts_headline('english', '
 <html>
 <!-- some comment -->
@@ -1153,15 +1162,15 @@ SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=2, MinWords
 (1 row)
 
 SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 & 3', 'MaxWords=4, MinWords=1');
-         ts_headline          
-------------------------------
- <b>1</b> 2 <b>3</b> <b>1</b>
+     ts_headline     
+---------------------
+ <b>1</b> 2 <b>3</b>
 (1 row)
 
 SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=4, MinWords=1');
-    ts_headline    
--------------------
- <b>1</b> <b>3</b>
+        ts_headline         
+----------------------------
+ <b>3</b> <b>1</b> <b>3</b>
 (1 row)
 
 --Check if headline fragments work
@@ -1256,6 +1265,16 @@ S. T. Coleridge (1772-1834)
  S. T. <b>Coleridge</b>
 (1 row)
 
+--Fragments with phrase search
+SELECT ts_headline('english',
+'Lorem ipsum urna.  Nullam nullam ullamcorper urna.',
+to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
+'MaxFragments=100, MaxWords=100, MinWords=1');
+                                  ts_headline                                  
+-------------------------------------------------------------------------------
+ <b>Lorem</b> ipsum <b>urna</b>.  Nullam nullam <b>ullamcorper</b> <b>urna</b>
+(1 row)
+
 --Rewrite sub system
 CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
 \set ECHO none
diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql
index 637bfb3012..6cadffd985 100644
--- a/src/test/regress/sql/tsearch.sql
+++ b/src/test/regress/sql/tsearch.sql
@@ -330,6 +330,11 @@ Water, water, every where,
 S. T. Coleridge (1772-1834)
 ', phraseto_tsquery('english', 'idle as a painted Ship'));
 
+SELECT ts_headline('english',
+'Lorem ipsum urna.  Nullam nullam ullamcorper urna.',
+to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
+'MaxWords=100, MinWords=1');
+
 SELECT ts_headline('english', '
 <html>
 <!-- some comment -->
@@ -400,6 +405,12 @@ Water, water, every where,
 S. T. Coleridge (1772-1834)
 ', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
 
+--Fragments with phrase search
+SELECT ts_headline('english',
+'Lorem ipsum urna.  Nullam nullam ullamcorper urna.',
+to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
+'MaxFragments=100, MaxWords=100, MinWords=1');
+
 --Rewrite sub system
 
 CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);