Cosmetic improvements for default text search parser's ts_headline code.

This code was woefully unreadable and under-commented. Try to improve matters by adding comments, using some macros to make complicated if-tests more readable, using boolean type where appropriate, etc. There are a couple of tiny coding improvements too, but this commit includes (I hope) no behavioral change. Nonetheless, back-patch as far as 9.6, because a followup bug-fixing commit depends on this. Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
2020-04-09 12:37:00 -04:00 · 2020-04-09 12:37:00 -04:00 · ff081d6bb8
parent f179e9f01b
commit ff081d6bb8
1 changed files with 176 additions and 97 deletions
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@ -1915,6 +1915,12 @@ prsd_end(PG_FUNCTION_ARGS)
 	PG_RETURN_VOID();
 }

+
+/*
+ * ts_headline support begins here
+ */
+
+/* token type classification macros */
 #define LEAVETOKEN(x)	( (x)==SPACE )
 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
@ -1926,23 +1932,54 @@ prsd_end(PG_FUNCTION_ARGS)
 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
 #define NOENDTOKEN(x)	( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )

+/*
+ * Macros useful in headline selection.  These rely on availability of
+ * "HeadlineParsedText *prs" describing some text, and "int shortword"
+ * describing the "short word" length parameter.
+ */
+
+/* Interesting words are non-repeated search terms */
+#define INTERESTINGWORD(j) \
+	(prs->words[j].item && !prs->words[j].repeated)
+
+/* Don't want to end at a non-word or a short word */
+#define BADENDPOINT(j) \
+	(NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword)
+
 typedef struct
 {
+	/* one cover (well, really one fragment) for mark_hl_fragments */
+	int32		startpos;		/* fragment's starting word index */
+	int32		endpos;			/* ending word index (inclusive) */
+	int32		poslen;			/* number of interesting words */
+	int32		curlen;			/* total number of words */
+	bool		chosen;			/* chosen? */
+	bool		excluded;		/* excluded? */
+} CoverPos;
+
+typedef struct
+{
+	/* callback data for checkcondition_HL */
 	HeadlineWordEntry *words;
 	int			len;
 } hlCheck;

+
+/*
+ * TS_execute callback for matching a tsquery operand to headline words
+ */
 static bool
 checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
 {
-	int			i;
 	hlCheck    *checkval = (hlCheck *) opaque;
+	int			i;

+	/* scan words array for marching items */
 	for (i = 0; i < checkval->len; i++)
 	{
 		if (checkval->words[i].item == val)
 		{
-			/* don't need to find all positions */
+			/* if data == NULL, don't need to report positions */
 			if (!data)
 				return true;

@ -2038,8 +2075,14 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
 	return false;
 }

+/*
+ * Apply suitable highlight marking to words selected by headline selector
+ *
+ * The words from startpos to endpos inclusive are marked per highlightall
+ */
 static void
-mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
+mark_fragment(HeadlineParsedText *prs, bool highlightall,
+			  int startpos, int endpos)
 {
 	int			i;

@ -2047,7 +2090,7 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
 	{
 		if (prs->words[i].item)
 			prs->words[i].selected = 1;
-		if (highlight == 0)
+		if (!highlightall)
 		{
 			if (HLIDREPLACE(prs->words[i].type))
 				prs->words[i].replace = 1;
@ -2064,16 +2107,15 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
 	}
 }

-typedef struct
-{
-	int32		startpos;
-	int32		endpos;
-	int32		poslen;
-	int32		curlen;
-	int16		in;
-	int16		excluded;
-} CoverPos;
-
+/*
+ * split a cover substring into fragments not longer than max_words
+ *
+ * At entry, *startpos and *endpos are the (remaining) bounds of the cover
+ * substring.  They are updated to hold the bounds of the next fragment.
+ *
+ * *curlen and *poslen are set to the fragment's length, in words and
+ * interesting words respectively.
+ */
 static void
 get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
 				  int *curlen, int *poslen, int max_words)
@ -2081,17 +2123,17 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
 	int			i;

 	/*
-	 * Objective: Generate a fragment of words between startpos and endpos
-	 * such that it has at most max_words and both ends has query words. If
-	 * the startpos and endpos are the endpoints of the cover and the cover
-	 * has fewer words than max_words, then this function should just return
-	 * the cover
+	 * Objective: select a fragment of words between startpos and endpos such
+	 * that it has at most max_words and both ends have query words. If the
+	 * startpos and endpos are the endpoints of the cover and the cover has
+	 * fewer words than max_words, then this function should just return the
+	 * cover
 	 */
 	/* first move startpos to an item */
 	for (i = *startpos; i <= *endpos; i++)
 	{
 		*startpos = i;
-		if (prs->words[i].item && !prs->words[i].repeated)
+		if (INTERESTINGWORD(i))
 			break;
 	}
 	/* cut endpos to have only max_words */
@ -2101,7 +2143,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
 	{
 		if (!NONWORDTOKEN(prs->words[i].type))
 			*curlen += 1;
-		if (prs->words[i].item && !prs->words[i].repeated)
+		if (INTERESTINGWORD(i))
 			*poslen += 1;
 	}
 	/* if the cover was cut then move back endpos to a query item */
@ -2111,7 +2153,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
 		for (i = *endpos; i >= *startpos; i--)
 		{
 			*endpos = i;
-			if (prs->words[i].item && !prs->words[i].repeated)
+			if (INTERESTINGWORD(i))
 				break;
 			if (!NONWORDTOKEN(prs->words[i].type))
 				*curlen -= 1;
@ -2119,8 +2161,14 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
 	}
 }

+/*
+ * Headline selector used when MaxFragments > 0
+ *
+ * Note: in this mode, highlightall is disregarded for phrase selection;
+ * it only controls presentation details.
+ */
 static void
-mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
+mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
 				  int shortword, int min_words,
 				  int max_words, int max_fragments)
 {
@ -2156,7 +2204,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,

 		/*
 		 * Break the cover into smaller fragments such that each fragment has
-		 * at most max_words. Also ensure that each end of the fragment is a
+		 * at most max_words. Also ensure that each end of each fragment is a
 		 * query word. This will allow us to stretch the fragment in either
 		 * direction
 		 */
@ -2173,12 +2221,13 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
 			covers[numcovers].endpos = endpos;
 			covers[numcovers].curlen = curlen;
 			covers[numcovers].poslen = poslen;
-			covers[numcovers].in = 0;
-			covers[numcovers].excluded = 0;
+			covers[numcovers].chosen = false;
+			covers[numcovers].excluded = false;
 			numcovers++;
 			startpos = endpos + 1;
 			endpos = q;
 		}
+
 		/* move p to generate the next cover */
 		p++;
 	}
@ -2196,9 +2245,10 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
 		 */
 		for (i = 0; i < numcovers; i++)
 		{
-			if (!covers[i].in && !covers[i].excluded &&
-				(maxitems < covers[i].poslen || (maxitems == covers[i].poslen
-												 && minwords > covers[i].curlen)))
+			if (!covers[i].chosen && !covers[i].excluded &&
+				(maxitems < covers[i].poslen ||
+				 (maxitems == covers[i].poslen &&
+				  minwords > covers[i].curlen)))
 			{
 				maxitems = covers[i].poslen;
 				minwords = covers[i].curlen;
@ -2208,7 +2258,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
 		/* if a cover was found mark it */
 		if (minI >= 0)
 		{
-			covers[minI].in = 1;
+			covers[minI].chosen = true;
 			/* adjust the size of cover */
 			startpos = covers[minI].startpos;
 			endpos = covers[minI].endpos;
@ -2235,8 +2285,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
 					}
 					posmarker = i;
 				}
-				/* cut back startpos till we find a non short token */
-				for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
+				/* cut back startpos till we find a good endpoint */
+				for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
 				{
 					if (!NONWORDTOKEN(prs->words[i].type))
 						curlen--;
@ -2250,8 +2300,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
 						curlen++;
 					posmarker = i;
 				}
-				/* cut back endpos till we find a non-short token */
-				for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
+				/* cut back endpos till we find a good endpoint */
+				for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
 				{
 					if (!NONWORDTOKEN(prs->words[i].type))
 						curlen--;
@ -2262,20 +2312,24 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
 			covers[minI].endpos = endpos;
 			covers[minI].curlen = curlen;
 			/* Mark the chosen fragments (covers) */
-			mark_fragment(prs, highlight, startpos, endpos);
+			mark_fragment(prs, highlightall, startpos, endpos);
 			num_f++;
 			/* exclude overlapping covers */
 			for (i = 0; i < numcovers; i++)
 			{
-				if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
-					covers[i].excluded = 1;
+				if (i != minI &&
+					((covers[i].startpos >= covers[minI].startpos &&
+					  covers[i].startpos <= covers[minI].endpos) ||
+					 (covers[i].endpos >= covers[minI].startpos &&
+					  covers[i].endpos <= covers[minI].endpos)))
+					covers[i].excluded = true;
 			}
 		}
 		else
 			break;
 	}

-	/* show at least min_words we have not marked anything */
+	/* show at least min_words if we have not marked anything */
 	if (num_f <= 0)
 	{
 		startpos = endpos = curlen = 0;
@ -2285,13 +2339,17 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
 				curlen++;
 			endpos = i;
 		}
-		mark_fragment(prs, highlight, startpos, endpos);
+		mark_fragment(prs, highlightall, startpos, endpos);
 	}
+
 	pfree(covers);
 }

+/*
+ * Headline selector used when MaxFragments == 0
+ */
 static void
-mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
+mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
 			  int shortword, int min_words, int max_words)
 {
 	int			p = 0,
@ -2299,66 +2357,81 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
 	int			bestb = -1,
 				beste = -1;
 	int			bestlen = -1;
-	int			pose = 0,
+	int			pose,
 				posb,
 				poslen,
 				curlen;

 	int			i;

-	if (highlight == 0)
+	if (!highlightall)
 	{
+		/* examine all covers, select a headline using the best one */
 		while (hlCover(prs, query, &p, &q))
 		{
-			/* find cover len in words */
+			/*
+			 * Count words (curlen) and interesting words (poslen) within
+			 * cover, but stop once we reach max_words.  This step doesn't
+			 * consider whether that's a good stopping point.  posb and pose
+			 * are set to the start and end indexes of the possible headline.
+			 */
 			curlen = 0;
 			poslen = 0;
+			posb = pose = p;
 			for (i = p; i <= q && curlen < max_words; i++)
 			{
 				if (!NONWORDTOKEN(prs->words[i].type))
 					curlen++;
-				if (prs->words[i].item && !prs->words[i].repeated)
+				if (INTERESTINGWORD(i))
 					poslen++;
 				pose = i;
 			}

-			if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+			/* XXX this optimization seems unnecessary and wrong */
+			if (poslen < bestlen && !BADENDPOINT(beste))
 			{
-				/* best already found, so try one more cover */
+				/* better cover already found, so try next cover */
 				p++;
 				continue;
 			}

-			posb = p;
 			if (curlen < max_words)
-			{					/* find good end */
+			{
+				/*
+				 * We have room to lengthen the headline, so search forward
+				 * until it's full or we find a good stopping point.  We'll
+				 * reconsider the word at "q", then move forward.
+				 */
 				for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
 				{
-					if (i != q)
+					if (i > q)
 					{
 						if (!NONWORDTOKEN(prs->words[i].type))
 							curlen++;
-						if (prs->words[i].item && !prs->words[i].repeated)
+						if (INTERESTINGWORD(i))
 							poslen++;
 					}
 					pose = i;
-					if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+					if (BADENDPOINT(i))
 						continue;
 					if (curlen >= min_words)
 						break;
 				}
-				if (curlen < min_words && i >= prs->curwords)
-				{				/* got end of text and our cover is shorter
-								 * than min_words */
+				if (curlen < min_words)
+				{
+					/*
+					 * Reached end of text and our headline is still shorter
+					 * than min_words, so try to extend it to the left.
+					 */
 					for (i = p - 1; i >= 0; i--)
 					{
 						if (!NONWORDTOKEN(prs->words[i].type))
 							curlen++;
-						if (prs->words[i].item && !prs->words[i].repeated)
+						if (INTERESTINGWORD(i))
 							poslen++;
 						if (curlen >= max_words)
 							break;
-						if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+						if (BADENDPOINT(i))
 							continue;
 						if (curlen >= min_words)
 							break;
@ -2367,34 +2440,48 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
 				}
 			}
 			else
-			{					/* shorter cover :((( */
+			{
+				/*
+				 * Can't make headline longer, so consider making it shorter
+				 * if needed to avoid a bad endpoint.
+				 */
 				if (i > q)
 					i = q;
 				for (; curlen > min_words; i--)
 				{
 					if (!NONWORDTOKEN(prs->words[i].type))
 						curlen--;
-					if (prs->words[i].item && !prs->words[i].repeated)
+					if (INTERESTINGWORD(i))
 						poslen--;
 					pose = i;
-					if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
-						continue;
-					break;
+					if (!BADENDPOINT(i))
+						break;
 				}
 			}

-			if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
-				(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
-				 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
+			/*
+			 * Adopt this headline if it's the first, or if it has more
+			 * interesting words and isn't ending at a bad endpoint, or if it
+			 * replaces a bad endpoint with a good one (XXX even if it has
+			 * fewer interesting words?  Really?)
+			 */
+			if (bestlen < 0 ||
+				(poslen > bestlen && !BADENDPOINT(pose)) ||
+				(!BADENDPOINT(pose) && BADENDPOINT(beste)))
 			{
 				bestb = posb;
 				beste = pose;
 				bestlen = poslen;
 			}

+			/* move p to generate the next cover */
 			p++;
 		}

+		/*
+		 * If we found nothing acceptable, select min_words words starting at
+		 * the beginning.
+		 */
 		if (bestlen < 0)
 		{
 			curlen = 0;
@ -2410,32 +2497,17 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
 	}
 	else
 	{
+		/* highlightall mode: headline is whole document */
 		bestb = 0;
 		beste = prs->curwords - 1;
 	}

-	for (i = bestb; i <= beste; i++)
-	{
-		if (prs->words[i].item)
-			prs->words[i].selected = 1;
-		if (highlight == 0)
-		{
-			if (HLIDREPLACE(prs->words[i].type))
-				prs->words[i].replace = 1;
-			else if (HLIDSKIP(prs->words[i].type))
-				prs->words[i].skip = 1;
-		}
-		else
-		{
-			if (XMLHLIDSKIP(prs->words[i].type))
-				prs->words[i].skip = 1;
-		}
-
-		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
-	}
-
+	mark_fragment(prs, highlightall, bestb, beste);
 }

+/*
+ * Default parser's prsheadline function
+ */
 Datum
 prsd_headline(PG_FUNCTION_ARGS)
 {
@ -2443,17 +2515,18 @@ prsd_headline(PG_FUNCTION_ARGS)
 	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
 	TSQuery		query = PG_GETARG_TSQUERY(2);

-	/* from opt + start and end tag */
+	/* default option values: */
 	int			min_words = 15;
 	int			max_words = 35;
 	int			shortword = 3;
 	int			max_fragments = 0;
-	int			highlight = 0;
+	bool		highlightall = false;
 	ListCell   *l;

-	/* config */
+	/* Extract configuration option values */
 	prs->startsel = NULL;
 	prs->stopsel = NULL;
+	prs->fragdelim = NULL;
 	foreach(l, prsoptions)
 	{
 		DefElem    *defel = (DefElem *) lfirst(l);
@ -2474,12 +2547,12 @@ prsd_headline(PG_FUNCTION_ARGS)
 		else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
 			prs->fragdelim = pstrdup(val);
 		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
-			highlight = (pg_strcasecmp(val, "1") == 0 ||
-						 pg_strcasecmp(val, "on") == 0 ||
-						 pg_strcasecmp(val, "true") == 0 ||
-						 pg_strcasecmp(val, "t") == 0 ||
-						 pg_strcasecmp(val, "y") == 0 ||
-						 pg_strcasecmp(val, "yes") == 0);
+			highlightall = (pg_strcasecmp(val, "1") == 0 ||
+							pg_strcasecmp(val, "on") == 0 ||
+							pg_strcasecmp(val, "true") == 0 ||
+							pg_strcasecmp(val, "t") == 0 ||
+							pg_strcasecmp(val, "y") == 0 ||
+							pg_strcasecmp(val, "yes") == 0);
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@ -2487,7 +2560,8 @@ prsd_headline(PG_FUNCTION_ARGS)
 							defel->defname)));
 	}

-	if (highlight == 0)
+	/* in HighlightAll mode these parameters are ignored */
+	if (!highlightall)
 	{
 		if (min_words >= max_words)
 			ereport(ERROR,
@ -2507,18 +2581,23 @@ prsd_headline(PG_FUNCTION_ARGS)
 					 errmsg("MaxFragments should be >= 0")));
 	}

+	/* Apply appropriate headline selector */
 	if (max_fragments == 0)
-		/* call the default headline generator */
-		mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+		mark_hl_words(prs, query, highlightall, shortword,
+					  min_words, max_words);
 	else
-		mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+		mark_hl_fragments(prs, query, highlightall, shortword,
+						  min_words, max_words, max_fragments);

+	/* Fill in default values for string options */
 	if (!prs->startsel)
 		prs->startsel = pstrdup("<b>");
 	if (!prs->stopsel)
 		prs->stopsel = pstrdup("</b>");
 	if (!prs->fragdelim)
 		prs->fragdelim = pstrdup(" ... ");
+
+	/* Caller will need these lengths, too */
 	prs->startsellen = strlen(prs->startsel);
 	prs->stopsellen = strlen(prs->stopsel);
 	prs->fragdelimlen = strlen(prs->fragdelim);