diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index f0c3441990..cbc22a71a2 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -1915,6 +1915,12 @@ prsd_end(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * ts_headline support begins here + */ + +/* token type classification macros */ #define LEAVETOKEN(x) ( (x)==SPACE ) #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) #define ENDPUNCTOKEN(x) ( (x)==SPACE ) @@ -1926,23 +1932,54 @@ prsd_end(PG_FUNCTION_ARGS) #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) ) #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) ) +/* + * Macros useful in headline selection. These rely on availability of + * "HeadlineParsedText *prs" describing some text, and "int shortword" + * describing the "short word" length parameter. + */ + +/* Interesting words are non-repeated search terms */ +#define INTERESTINGWORD(j) \ + (prs->words[j].item && !prs->words[j].repeated) + +/* Don't want to end at a non-word or a short word */ +#define BADENDPOINT(j) \ + (NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) + typedef struct { + /* one cover (well, really one fragment) for mark_hl_fragments */ + int32 startpos; /* fragment's starting word index */ + int32 endpos; /* ending word index (inclusive) */ + int32 poslen; /* number of interesting words */ + int32 curlen; /* total number of words */ + bool chosen; /* chosen? */ + bool excluded; /* excluded? */ +} CoverPos; + +typedef struct +{ + /* callback data for checkcondition_HL */ HeadlineWordEntry *words; int len; } hlCheck; + +/* + * TS_execute callback for matching a tsquery operand to headline words + */ static bool checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data) { - int i; hlCheck *checkval = (hlCheck *) opaque; + int i; + /* scan words array for marching items */ for (i = 0; i < checkval->len; i++) { if (checkval->words[i].item == val) { - /* don't need to find all positions */ + /* if data == NULL, don't need to report positions */ if (!data) return true; @@ -2038,8 +2075,14 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q) return false; } +/* + * Apply suitable highlight marking to words selected by headline selector + * + * The words from startpos to endpos inclusive are marked per highlightall + */ static void -mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos) +mark_fragment(HeadlineParsedText *prs, bool highlightall, + int startpos, int endpos) { int i; @@ -2047,7 +2090,7 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos) { if (prs->words[i].item) prs->words[i].selected = 1; - if (highlight == 0) + if (!highlightall) { if (HLIDREPLACE(prs->words[i].type)) prs->words[i].replace = 1; @@ -2064,16 +2107,15 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos) } } -typedef struct -{ - int32 startpos; - int32 endpos; - int32 poslen; - int32 curlen; - int16 in; - int16 excluded; -} CoverPos; - +/* + * split a cover substring into fragments not longer than max_words + * + * At entry, *startpos and *endpos are the (remaining) bounds of the cover + * substring. They are updated to hold the bounds of the next fragment. + * + * *curlen and *poslen are set to the fragment's length, in words and + * interesting words respectively. + */ static void get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int *curlen, int *poslen, int max_words) @@ -2081,17 +2123,17 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, int i; /* - * Objective: Generate a fragment of words between startpos and endpos - * such that it has at most max_words and both ends has query words. If - * the startpos and endpos are the endpoints of the cover and the cover - * has fewer words than max_words, then this function should just return - * the cover + * Objective: select a fragment of words between startpos and endpos such + * that it has at most max_words and both ends have query words. If the + * startpos and endpos are the endpoints of the cover and the cover has + * fewer words than max_words, then this function should just return the + * cover */ /* first move startpos to an item */ for (i = *startpos; i <= *endpos; i++) { *startpos = i; - if (prs->words[i].item && !prs->words[i].repeated) + if (INTERESTINGWORD(i)) break; } /* cut endpos to have only max_words */ @@ -2101,7 +2143,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, { if (!NONWORDTOKEN(prs->words[i].type)) *curlen += 1; - if (prs->words[i].item && !prs->words[i].repeated) + if (INTERESTINGWORD(i)) *poslen += 1; } /* if the cover was cut then move back endpos to a query item */ @@ -2111,7 +2153,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, for (i = *endpos; i >= *startpos; i--) { *endpos = i; - if (prs->words[i].item && !prs->words[i].repeated) + if (INTERESTINGWORD(i)) break; if (!NONWORDTOKEN(prs->words[i].type)) *curlen -= 1; @@ -2119,8 +2161,14 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, } } +/* + * Headline selector used when MaxFragments > 0 + * + * Note: in this mode, highlightall is disregarded for phrase selection; + * it only controls presentation details. + */ static void -mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, +mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall, int shortword, int min_words, int max_words, int max_fragments) { @@ -2156,7 +2204,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, /* * Break the cover into smaller fragments such that each fragment has - * at most max_words. Also ensure that each end of the fragment is a + * at most max_words. Also ensure that each end of each fragment is a * query word. This will allow us to stretch the fragment in either * direction */ @@ -2173,12 +2221,13 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, covers[numcovers].endpos = endpos; covers[numcovers].curlen = curlen; covers[numcovers].poslen = poslen; - covers[numcovers].in = 0; - covers[numcovers].excluded = 0; + covers[numcovers].chosen = false; + covers[numcovers].excluded = false; numcovers++; startpos = endpos + 1; endpos = q; } + /* move p to generate the next cover */ p++; } @@ -2196,9 +2245,10 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, */ for (i = 0; i < numcovers; i++) { - if (!covers[i].in && !covers[i].excluded && - (maxitems < covers[i].poslen || (maxitems == covers[i].poslen - && minwords > covers[i].curlen))) + if (!covers[i].chosen && !covers[i].excluded && + (maxitems < covers[i].poslen || + (maxitems == covers[i].poslen && + minwords > covers[i].curlen))) { maxitems = covers[i].poslen; minwords = covers[i].curlen; @@ -2208,7 +2258,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, /* if a cover was found mark it */ if (minI >= 0) { - covers[minI].in = 1; + covers[minI].chosen = true; /* adjust the size of cover */ startpos = covers[minI].startpos; endpos = covers[minI].endpos; @@ -2235,8 +2285,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, } posmarker = i; } - /* cut back startpos till we find a non short token */ - for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++) + /* cut back startpos till we find a good endpoint */ + for (i = posmarker; i < startpos && BADENDPOINT(i); i++) { if (!NONWORDTOKEN(prs->words[i].type)) curlen--; @@ -2250,8 +2300,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, curlen++; posmarker = i; } - /* cut back endpos till we find a non-short token */ - for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--) + /* cut back endpos till we find a good endpoint */ + for (i = posmarker; i > endpos && BADENDPOINT(i); i--) { if (!NONWORDTOKEN(prs->words[i].type)) curlen--; @@ -2262,20 +2312,24 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, covers[minI].endpos = endpos; covers[minI].curlen = curlen; /* Mark the chosen fragments (covers) */ - mark_fragment(prs, highlight, startpos, endpos); + mark_fragment(prs, highlightall, startpos, endpos); num_f++; /* exclude overlapping covers */ for (i = 0; i < numcovers; i++) { - if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos))) - covers[i].excluded = 1; + if (i != minI && + ((covers[i].startpos >= covers[minI].startpos && + covers[i].startpos <= covers[minI].endpos) || + (covers[i].endpos >= covers[minI].startpos && + covers[i].endpos <= covers[minI].endpos))) + covers[i].excluded = true; } } else break; } - /* show at least min_words we have not marked anything */ + /* show at least min_words if we have not marked anything */ if (num_f <= 0) { startpos = endpos = curlen = 0; @@ -2285,13 +2339,17 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, curlen++; endpos = i; } - mark_fragment(prs, highlight, startpos, endpos); + mark_fragment(prs, highlightall, startpos, endpos); } + pfree(covers); } +/* + * Headline selector used when MaxFragments == 0 + */ static void -mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, +mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall, int shortword, int min_words, int max_words) { int p = 0, @@ -2299,66 +2357,81 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, int bestb = -1, beste = -1; int bestlen = -1; - int pose = 0, + int pose, posb, poslen, curlen; int i; - if (highlight == 0) + if (!highlightall) { + /* examine all covers, select a headline using the best one */ while (hlCover(prs, query, &p, &q)) { - /* find cover len in words */ + /* + * Count words (curlen) and interesting words (poslen) within + * cover, but stop once we reach max_words. This step doesn't + * consider whether that's a good stopping point. posb and pose + * are set to the start and end indexes of the possible headline. + */ curlen = 0; poslen = 0; + posb = pose = p; for (i = p; i <= q && curlen < max_words; i++) { if (!NONWORDTOKEN(prs->words[i].type)) curlen++; - if (prs->words[i].item && !prs->words[i].repeated) + if (INTERESTINGWORD(i)) poslen++; pose = i; } - if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)) + /* XXX this optimization seems unnecessary and wrong */ + if (poslen < bestlen && !BADENDPOINT(beste)) { - /* best already found, so try one more cover */ + /* better cover already found, so try next cover */ p++; continue; } - posb = p; if (curlen < max_words) - { /* find good end */ + { + /* + * We have room to lengthen the headline, so search forward + * until it's full or we find a good stopping point. We'll + * reconsider the word at "q", then move forward. + */ for (i = i - 1; i < prs->curwords && curlen < max_words; i++) { - if (i != q) + if (i > q) { if (!NONWORDTOKEN(prs->words[i].type)) curlen++; - if (prs->words[i].item && !prs->words[i].repeated) + if (INTERESTINGWORD(i)) poslen++; } pose = i; - if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword) + if (BADENDPOINT(i)) continue; if (curlen >= min_words) break; } - if (curlen < min_words && i >= prs->curwords) - { /* got end of text and our cover is shorter - * than min_words */ + if (curlen < min_words) + { + /* + * Reached end of text and our headline is still shorter + * than min_words, so try to extend it to the left. + */ for (i = p - 1; i >= 0; i--) { if (!NONWORDTOKEN(prs->words[i].type)) curlen++; - if (prs->words[i].item && !prs->words[i].repeated) + if (INTERESTINGWORD(i)) poslen++; if (curlen >= max_words) break; - if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword) + if (BADENDPOINT(i)) continue; if (curlen >= min_words) break; @@ -2367,34 +2440,48 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, } } else - { /* shorter cover :((( */ + { + /* + * Can't make headline longer, so consider making it shorter + * if needed to avoid a bad endpoint. + */ if (i > q) i = q; for (; curlen > min_words; i--) { if (!NONWORDTOKEN(prs->words[i].type)) curlen--; - if (prs->words[i].item && !prs->words[i].repeated) + if (INTERESTINGWORD(i)) poslen--; pose = i; - if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword) - continue; - break; + if (!BADENDPOINT(i)) + break; } } - if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) || - (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) && - (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))) + /* + * Adopt this headline if it's the first, or if it has more + * interesting words and isn't ending at a bad endpoint, or if it + * replaces a bad endpoint with a good one (XXX even if it has + * fewer interesting words? Really?) + */ + if (bestlen < 0 || + (poslen > bestlen && !BADENDPOINT(pose)) || + (!BADENDPOINT(pose) && BADENDPOINT(beste))) { bestb = posb; beste = pose; bestlen = poslen; } + /* move p to generate the next cover */ p++; } + /* + * If we found nothing acceptable, select min_words words starting at + * the beginning. + */ if (bestlen < 0) { curlen = 0; @@ -2410,32 +2497,17 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, } else { + /* highlightall mode: headline is whole document */ bestb = 0; beste = prs->curwords - 1; } - for (i = bestb; i <= beste; i++) - { - if (prs->words[i].item) - prs->words[i].selected = 1; - if (highlight == 0) - { - if (HLIDREPLACE(prs->words[i].type)) - prs->words[i].replace = 1; - else if (HLIDSKIP(prs->words[i].type)) - prs->words[i].skip = 1; - } - else - { - if (XMLHLIDSKIP(prs->words[i].type)) - prs->words[i].skip = 1; - } - - prs->words[i].in = (prs->words[i].repeated) ? 0 : 1; - } - + mark_fragment(prs, highlightall, bestb, beste); } +/* + * Default parser's prsheadline function + */ Datum prsd_headline(PG_FUNCTION_ARGS) { @@ -2443,17 +2515,18 @@ prsd_headline(PG_FUNCTION_ARGS) List *prsoptions = (List *) PG_GETARG_POINTER(1); TSQuery query = PG_GETARG_TSQUERY(2); - /* from opt + start and end tag */ + /* default option values: */ int min_words = 15; int max_words = 35; int shortword = 3; int max_fragments = 0; - int highlight = 0; + bool highlightall = false; ListCell *l; - /* config */ + /* Extract configuration option values */ prs->startsel = NULL; prs->stopsel = NULL; + prs->fragdelim = NULL; foreach(l, prsoptions) { DefElem *defel = (DefElem *) lfirst(l); @@ -2474,12 +2547,12 @@ prsd_headline(PG_FUNCTION_ARGS) else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0) prs->fragdelim = pstrdup(val); else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0) - highlight = (pg_strcasecmp(val, "1") == 0 || - pg_strcasecmp(val, "on") == 0 || - pg_strcasecmp(val, "true") == 0 || - pg_strcasecmp(val, "t") == 0 || - pg_strcasecmp(val, "y") == 0 || - pg_strcasecmp(val, "yes") == 0); + highlightall = (pg_strcasecmp(val, "1") == 0 || + pg_strcasecmp(val, "on") == 0 || + pg_strcasecmp(val, "true") == 0 || + pg_strcasecmp(val, "t") == 0 || + pg_strcasecmp(val, "y") == 0 || + pg_strcasecmp(val, "yes") == 0); else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -2487,7 +2560,8 @@ prsd_headline(PG_FUNCTION_ARGS) defel->defname))); } - if (highlight == 0) + /* in HighlightAll mode these parameters are ignored */ + if (!highlightall) { if (min_words >= max_words) ereport(ERROR, @@ -2507,18 +2581,23 @@ prsd_headline(PG_FUNCTION_ARGS) errmsg("MaxFragments should be >= 0"))); } + /* Apply appropriate headline selector */ if (max_fragments == 0) - /* call the default headline generator */ - mark_hl_words(prs, query, highlight, shortword, min_words, max_words); + mark_hl_words(prs, query, highlightall, shortword, + min_words, max_words); else - mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments); + mark_hl_fragments(prs, query, highlightall, shortword, + min_words, max_words, max_fragments); + /* Fill in default values for string options */ if (!prs->startsel) prs->startsel = pstrdup(""); if (!prs->stopsel) prs->stopsel = pstrdup(""); if (!prs->fragdelim) prs->fragdelim = pstrdup(" ... "); + + /* Caller will need these lengths, too */ prs->startsellen = strlen(prs->startsel); prs->stopsellen = strlen(prs->stopsel); prs->fragdelimlen = strlen(prs->fragdelim);