From 2a0083ede88d278922fcfddeaf60d84a7cff6a5c Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Fri, 17 Oct 2008 18:05:19 +0000 Subject: [PATCH] Improve headeline generation. Now headline can contain several fragments a-la Google. Sushant Sinha --- doc/src/sgml/textsearch.sgml | 27 +- src/backend/tsearch/ts_parse.c | 27 +- src/backend/tsearch/wparser_def.c | 379 ++++++++++++++++++++++---- src/include/tsearch/ts_public.h | 4 +- src/test/regress/expected/tsearch.out | 92 +++++++ src/test/regress/sql/tsearch.sql | 52 ++++ 6 files changed, 518 insertions(+), 63 deletions(-) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 45a9f5a389..ac8b75512e 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -1098,6 +1098,29 @@ ORDER BY rank DESC LIMIT 10; value of three eliminates the English articles. + + + MaxFragments: maximum number of text excerpts + or fragments that match the query words. It also triggers a + different headline generation function than the default one. This + function finds text fragments with as many query words as possible and + stretches those fragments around the query words. As a result + query words are close to the middle of each fragment and have words on + each side. Each fragment will be of at most MaxWords and will not + have words of size less than or equal to ShortWord at the start or + end of a fragment. If not all query words are found in the document, + then a single fragment of MinWords will be displayed. + + + + + FragmentDelimiter: When more than one fragment is + displayed, the fragments will be separated by this delimiter. This + option is effective only if MaxFragments is greater than 1 and there is + more than one fragment to be displayed. This option has no effect on the + default headline generation function. 
+ + HighlightAll: Boolean flag; if @@ -1109,7 +1132,7 @@ ORDER BY rank DESC LIMIT 10; Any unspecified options receive these defaults: -StartSel=<b>, StopSel=</b>, MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE +StartSel=<b>, StopSel=</b>, MaxFragments=0, FragmentDelimiter=" ... ", MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c index 0634f54a71..6202eb444d 100644 --- a/src/backend/tsearch/ts_parse.c +++ b/src/backend/tsearch/ts_parse.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.8 2008/05/16 16:31:01 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.9 2008/10/17 18:05:19 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -583,8 +583,11 @@ text * generateHeadline(HeadlineParsedText *prs) { text *out; - int len = 128; char *ptr; + int len = 128; + int numfragments = 0; + int2 infrag = 0; + HeadlineWordEntry *wrd = prs->words; out = (text *) palloc(len); @@ -592,7 +595,7 @@ generateHeadline(HeadlineParsedText *prs) while (wrd - prs->words < prs->curwords) { - while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len) + while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len) { int dist = ptr - ((char *) out); @@ -603,6 +606,20 @@ generateHeadline(HeadlineParsedText *prs) if (wrd->in && !wrd->repeated) { + if (!infrag) + { + + /* start of a new fragment */ + infrag = 1; + numfragments ++; + /* add a fragment delimitor if this is after the first one */ + if (numfragments > 1) + { + memcpy(ptr, prs->fragdelim, prs->fragdelimlen); + ptr += prs->fragdelimlen; + } + + } if (wrd->replace) { *ptr = ' '; @@ -625,7 +642,11 @@ generateHeadline(HeadlineParsedText *prs) } } else if (!wrd->repeated) + { + if (infrag) + infrag = 0; pfree(wrd->word); + } wrd++; } diff --git 
a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 07e71d9b3f..e365541247 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.16 2008/10/17 17:27:46 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.17 2008/10/17 18:05:19 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -1684,18 +1684,247 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q) return false; } -Datum -prsd_headline(PG_FUNCTION_ARGS) +static void +mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos) { - HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0); - List *prsoptions = (List *) PG_GETARG_POINTER(1); - TSQuery query = PG_GETARG_TSQUERY(2); + int i; - /* from opt + start and and tag */ - int min_words = 15; - int max_words = 35; - int shortword = 3; + for (i = startpos; i <= endpos; i++) + { + if (prs->words[i].item) + prs->words[i].selected = 1; + if (highlight == 0) + { + if (HLIDIGNORE(prs->words[i].type)) + prs->words[i].replace = 1; + } + else + { + if (XMLHLIDIGNORE(prs->words[i].type)) + prs->words[i].replace = 1; + } + prs->words[i].in = (prs->words[i].repeated) ? 0 : 1; + } +} + +typedef struct +{ + int4 startpos; + int4 endpos; + int4 poslen; + int4 curlen; + int2 in; + int2 excluded; +} CoverPos; + +static void +get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, + int *curlen, int *poslen, int max_words) +{ + int i; + /* Objective: Generate a fragment of words between startpos and endpos + * such that it has at most max_words and both ends has query words. 
+ * If the startpos and endpos are the endpoints of the cover and the + * cover has fewer words than max_words, then this function should + * just return the cover + */ + /* first move startpos to an item */ + for(i = *startpos; i <= *endpos; i++) + { + *startpos = i; + if (prs->words[i].item && !prs->words[i].repeated) + break; + } + /* cut endpos to have only max_words */ + *curlen = 0; + *poslen = 0; + for(i = *startpos; i <= *endpos && *curlen < max_words; i++) + { + if (!NONWORDTOKEN(prs->words[i].type)) + *curlen += 1; + if (prs->words[i].item && !prs->words[i].repeated) + *poslen += 1; + } + /* if the cover was cut then move back endpos to a query item */ + if (*endpos > i) + { + *endpos = i; + for(i = *endpos; i >= *startpos; i --) + { + *endpos = i; + if (prs->words[i].item && !prs->words[i].repeated) + break; + if (!NONWORDTOKEN(prs->words[i].type)) + *curlen -= 1; + } + } +} + +static void +mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, + int shortword, int min_words, + int max_words, int max_fragments) +{ + int4 poslen, curlen, i, f, num_f = 0; + int4 stretch, maxstretch, posmarker; + + int4 startpos = 0, + endpos = 0, + p = 0, + q = 0; + + int4 numcovers = 0, + maxcovers = 32; + + int4 minI, minwords, maxitems; + CoverPos *covers; + + covers = palloc(maxcovers * sizeof(CoverPos)); + + /* get all covers */ + while (hlCover(prs, query, &p, &q)) + { + startpos = p; + endpos = q; + + /* Break the cover into smaller fragments such that each fragment + * has at most max_words. Also ensure that each end of the fragment + * is a query word. 
This will allow us to stretch the fragment in + * either direction + */ + + while (startpos <= endpos) + { + get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words); + if (numcovers >= maxcovers) + { + maxcovers *= 2; + covers = repalloc(covers, sizeof(CoverPos) * maxcovers); + } + covers[numcovers].startpos = startpos; + covers[numcovers].endpos = endpos; + covers[numcovers].curlen = curlen; + covers[numcovers].poslen = poslen; + covers[numcovers].in = 0; + covers[numcovers].excluded = 0; + numcovers ++; + startpos = endpos + 1; + endpos = q; + } + /* move p to generate the next cover */ + p++; + } + + /* choose best covers */ + for (f = 0; f < max_fragments; f++) + { + maxitems = 0; + minwords = 0x7fffffff; + minI = -1; + /* Choose the cover that contains max items. + * In case of tie choose the one with smaller + * number of words. + */ + for (i = 0; i < numcovers; i ++) + { + if (!covers[i].in && !covers[i].excluded && + (maxitems < covers[i].poslen || (maxitems == covers[i].poslen + && minwords > covers[i].curlen))) + { + maxitems = covers[i].poslen; + minwords = covers[i].curlen; + minI = i; + } + } + /* if a cover was found mark it */ + if (minI >= 0) + { + covers[minI].in = 1; + /* adjust the size of cover */ + startpos = covers[minI].startpos; + endpos = covers[minI].endpos; + curlen = covers[minI].curlen; + /* stretch the cover if cover size is lower than max_words */ + if (curlen < max_words) + { + /* divide the stretch on both sides of cover */ + maxstretch = (max_words - curlen)/2; + /* first stretch the startpos + * stop stretching if + * 1. we hit the beginning of document + * 2. exceed maxstretch + * 3. 
we hit an already marked fragment + */ + stretch = 0; + posmarker = startpos; + for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--) + { + if (!NONWORDTOKEN(prs->words[i].type)) + { + curlen ++; + stretch ++; + } + posmarker = i; + } + /* cut back startpos till we find a non short token */ + for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen --; + } + startpos = i; + /* now stretch the endpos as much as possible*/ + posmarker = endpos; + for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen ++; + posmarker = i; + } + /* cut back endpos till we find a non-short token */ + for ( i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen --; + } + endpos = i; + } + covers[minI].startpos = startpos; + covers[minI].endpos = endpos; + covers[minI].curlen = curlen; + /* Mark the chosen fragments (covers) */ + mark_fragment(prs, highlight, startpos, endpos); + num_f ++; + /* exclude overlapping covers */ + for (i = 0; i < numcovers; i ++) + { + if (i != minI && ( (covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos))) + covers[i].excluded = 1; + } + } + else + break; + } + + /* show at least min_words we have not marked anything*/ + if (num_f <= 0) + { + startpos = endpos = curlen = 0; + for (i = 0; i < prs->curwords && curlen < min_words; i++) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen++; + endpos = i; + } + mark_fragment(prs, highlight, startpos, endpos); + } + pfree(covers); +} +static void +mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, + int shortword, int min_words, int max_words) +{ int p = 
0, q = 0; int bestb = -1, @@ -1707,56 +1936,9 @@ prsd_headline(PG_FUNCTION_ARGS) curlen; int i; - int highlight = 0; - ListCell *l; - - /* config */ - prs->startsel = NULL; - prs->stopsel = NULL; - foreach(l, prsoptions) - { - DefElem *defel = (DefElem *) lfirst(l); - char *val = defGetString(defel); - - if (pg_strcasecmp(defel->defname, "MaxWords") == 0) - max_words = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "MinWords") == 0) - min_words = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) - shortword = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "StartSel") == 0) - prs->startsel = pstrdup(val); - else if (pg_strcasecmp(defel->defname, "StopSel") == 0) - prs->stopsel = pstrdup(val); - else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0) - highlight = (pg_strcasecmp(val, "1") == 0 || - pg_strcasecmp(val, "on") == 0 || - pg_strcasecmp(val, "true") == 0 || - pg_strcasecmp(val, "t") == 0 || - pg_strcasecmp(val, "y") == 0 || - pg_strcasecmp(val, "yes") == 0); - else - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized headline parameter: \"%s\"", - defel->defname))); - } if (highlight == 0) { - if (min_words >= max_words) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("MinWords should be less than MaxWords"))); - if (min_words <= 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("MinWords should be positive"))); - if (shortword < 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("ShortWord should be >= 0"))); - while (hlCover(prs, query, &p, &q)) { /* find cover len in words */ @@ -1877,12 +2059,95 @@ prsd_headline(PG_FUNCTION_ARGS) prs->words[i].in = (prs->words[i].repeated) ? 
0 : 1; } +} + +Datum +prsd_headline(PG_FUNCTION_ARGS) +{ + HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0); + List *prsoptions = (List *) PG_GETARG_POINTER(1); + TSQuery query = PG_GETARG_TSQUERY(2); + + /* from opt + start and and tag */ + int min_words = 15; + int max_words = 35; + int shortword = 3; + int max_fragments = 0; + int highlight = 0; + ListCell *l; + + /* config */ + prs->startsel = NULL; + prs->stopsel = NULL; + foreach(l, prsoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + char *val = defGetString(defel); + + if (pg_strcasecmp(defel->defname, "MaxWords") == 0) + max_words = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "MinWords") == 0) + min_words = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) + shortword = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0) + max_fragments = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "StartSel") == 0) + prs->startsel = pstrdup(val); + else if (pg_strcasecmp(defel->defname, "StopSel") == 0) + prs->stopsel = pstrdup(val); + else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0) + prs->fragdelim = pstrdup(val); + else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0) + highlight = (pg_strcasecmp(val, "1") == 0 || + pg_strcasecmp(val, "on") == 0 || + pg_strcasecmp(val, "true") == 0 || + pg_strcasecmp(val, "t") == 0 || + pg_strcasecmp(val, "y") == 0 || + pg_strcasecmp(val, "yes") == 0); + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized headline parameter: \"%s\"", + defel->defname))); + } + + if (highlight == 0) + { + if (min_words >= max_words) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MinWords should be less than MaxWords"))); + if (min_words <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MinWords should be positive"))); + if 
(shortword < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ShortWord should be >= 0"))); + if (max_fragments < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MaxFragments should be >= 0"))); + } + + if (max_fragments == 0) + /* call the default headline generator */ + mark_hl_words(prs, query, highlight, shortword, min_words, max_words); + else + mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments); + if (!prs->startsel) prs->startsel = pstrdup(""); if (!prs->stopsel) prs->stopsel = pstrdup(""); + if (!prs->fragdelim) + prs->fragdelim = pstrdup(" ... "); prs->startsellen = strlen(prs->startsel); prs->stopsellen = strlen(prs->stopsel); + prs->fragdelimlen = strlen(prs->fragdelim); PG_RETURN_POINTER(prs); } + diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h index 5e3723fa8e..b07e8d6cb5 100644 --- a/src/include/tsearch/ts_public.h +++ b/src/include/tsearch/ts_public.h @@ -6,7 +6,7 @@ * * Copyright (c) 1998-2008, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.10 2008/06/18 18:42:54 momjian Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.11 2008/10/17 18:05:19 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -52,8 +52,10 @@ typedef struct int4 curwords; char *startsel; char *stopsel; + char *fragdelim; int2 startsellen; int2 stopsellen; + int2 fragdelimlen; } HeadlineParsedText; /* diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 468a623e97..39dbaf67b5 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -632,6 +632,98 @@ to_tsquery('english', 'sea&foo'), 'HighlightAll=true'); (1 row) +--Check if headline fragments work +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + 
Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', to_tsquery('english', 'ocean'), 'MaxFragments=1'); + ts_headline +------------------------------------ + after day, + We stuck, nor breath nor motion, + As idle as a painted Ship + Upon a painted Ocean. + Water, water, every where + And all the boards did shrink; + Water, water, every where, + Nor any drop +(1 row) + +--Check if more than one fragments are displayed +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2'); + ts_headline +---------------------------------------------- + after day, day after day, + We stuck, nor breath nor motion, + As idle as a painted Ship + Upon a painted Ocean. + Water, water, every where + And all the boards did shrink; + Water, water, every where ... drop to drink. + S. T. Coleridge +(1 row) + +--Fragments when there all query words are not in the document +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1'); + ts_headline +------------------------------------ + + Day after day, day after day, + We stuck, nor breath nor motion, + As idle as +(1 row) + +--FragmentDelimiter option +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. 
+Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***'); + ts_headline +-------------------------------------------- + after day, day after day, + We stuck, nor breath nor motion, + As idle as a painted Ship + Upon a painted Ocean. + Water, water, every where + And all the boards did shrink; + Water, water, every where***drop to drink. + S. T. Coleridge +(1 row) + --Rewrite sub system CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT); \set ECHO none diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index dc7427d3b1..f15d79318e 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -208,6 +208,58 @@ ff-bg ', to_tsquery('english', 'sea&foo'), 'HighlightAll=true'); +--Check if headline fragments work +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', to_tsquery('english', 'ocean'), 'MaxFragments=1'); + +--Check if more than one fragments are displayed +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2'); + +--Fragments when there all query words are not in the document +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. 
+Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1'); + +--FragmentDelimiter option +SELECT ts_headline('english', ' +Day after day, day after day, + We stuck, nor breath nor motion, +As idle as a painted Ship + Upon a painted Ocean. +Water, water, every where + And all the boards did shrink; +Water, water, every where, + Nor any drop to drink. +S. T. Coleridge (1772-1834) +', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***'); + --Rewrite sub system CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);