Improve headline generation. Now a headline can contain

several fragments a-la Google.

Sushant Sinha <sushant354@gmail.com>
This commit is contained in:
Teodor Sigaev 2008-10-17 18:05:19 +00:00
parent 906b7e5f6c
commit 2a0083ede8
6 changed files with 518 additions and 63 deletions

View File

@ -1,4 +1,4 @@
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.45 2008/09/23 09:20:34 heikki Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.46 2008/10/17 18:05:19 teodor Exp $ -->
<chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title>
@ -1098,6 +1098,29 @@ ORDER BY rank DESC LIMIT 10;
value of three eliminates the English articles.
</para>
</listitem>
<listitem>
<para>
<literal>MaxFragments</literal>: maximum number of text excerpts
or fragments to display that match the query words. A nonzero value
also selects a different headline generation function than the
default one. This function finds text fragments with as many query
words as possible and stretches those fragments around the query
words. As a result, query words are close to the middle of each
fragment and have words on each side. Each fragment will contain at
most <literal>MaxWords</literal> words, and words of length
<literal>ShortWord</literal> or less are dropped at the start and end
of a fragment. If not all query words are found in the document, then
a single fragment of the first <literal>MinWords</literal> words in
the document will be displayed.
</para>
</listitem>
<listitem>
<para>
<literal>FragmentDelimiter</literal>: When more than one fragment is
displayed, the fragments are separated by this delimiter. This
option takes effect only if MaxFragments is greater than 1 and there is
more than one fragment to be displayed. This option has no effect on the
default headline generation function.
</para>
</listitem>
<listitem>
<para>
<literal>HighlightAll</literal>: Boolean flag; if
@ -1109,7 +1132,7 @@ ORDER BY rank DESC LIMIT 10;
Any unspecified options receive these defaults:
<programlisting>
StartSel=&lt;b&gt;, StopSel=&lt;/b&gt;, MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
StartSel=&lt;b&gt;, StopSel=&lt;/b&gt;, MaxFragments=0, FragmentDelimiter=" ... ", MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
</programlisting>
</para>

View File

@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.8 2008/05/16 16:31:01 tgl Exp $
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.9 2008/10/17 18:05:19 teodor Exp $
*
*-------------------------------------------------------------------------
*/
@ -583,8 +583,11 @@ text *
generateHeadline(HeadlineParsedText *prs)
{
text *out;
int len = 128;
char *ptr;
int len = 128;
int numfragments = 0;
int2 infrag = 0;
HeadlineWordEntry *wrd = prs->words;
out = (text *) palloc(len);
@ -592,7 +595,7 @@ generateHeadline(HeadlineParsedText *prs)
while (wrd - prs->words < prs->curwords)
{
while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
{
int dist = ptr - ((char *) out);
@ -603,6 +606,20 @@ generateHeadline(HeadlineParsedText *prs)
if (wrd->in && !wrd->repeated)
{
if (!infrag)
{
/* start of a new fragment */
infrag = 1;
numfragments ++;
/* add a fragment delimiter if this is after the first one */
if (numfragments > 1)
{
memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
ptr += prs->fragdelimlen;
}
}
if (wrd->replace)
{
*ptr = ' ';
@ -625,7 +642,11 @@ generateHeadline(HeadlineParsedText *prs)
}
}
else if (!wrd->repeated)
{
if (infrag)
infrag = 0;
pfree(wrd->word);
}
wrd++;
}

View File

@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.16 2008/10/17 17:27:46 teodor Exp $
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.17 2008/10/17 18:05:19 teodor Exp $
*
*-------------------------------------------------------------------------
*/
@ -1684,18 +1684,247 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
return false;
}
Datum
prsd_headline(PG_FUNCTION_ARGS)
/*
 * mark_fragment
 *
 * Flag every entry of prs->words in the inclusive index range
 * [startpos, endpos] as belonging to the headline:
 *   - entries with a query item get "selected" set;
 *   - entries whose token type matches HLIDIGNORE (when highlight == 0) or
 *     XMLHLIDIGNORE (otherwise) get "replace" set;
 *   - "in" is set to 1 unless the entry is flagged as repeated.
 *
 * The caller must supply indexes that are valid for prs->words; no bounds
 * checking is done here.
 */
static void
mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
{
/*
 * NOTE(review): the prs/prsoptions/query and min_words/max_words/shortword
 * declarations below look like removed prsd_headline lines that the diff
 * view interleaved with this function -- confirm against the real file.
 */
HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
List *prsoptions = (List *) PG_GETARG_POINTER(1);
TSQuery query = PG_GETARG_TSQUERY(2);
int i;
/* from opt + start and and tag */
int min_words = 15;
int max_words = 35;
int shortword = 3;
/* visit each word of the fragment, both end points included */
for (i = startpos; i <= endpos; i++)
{
/* words that carry a query item are flagged as selected */
if (prs->words[i].item)
prs->words[i].selected = 1;
/* the set of token types flagged for replacement depends on highlight */
if (highlight == 0)
{
if (HLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1;
}
else
{
if (XMLHLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1;
}
/* repeated entries are never marked as part of the output */
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}
}
/*
 * CoverPos: one candidate headline fragment collected by mark_hl_fragments.
 */
typedef struct
{
/* index into prs->words of the first word of the fragment */
int4 startpos;
/* index into prs->words of the last word of the fragment (inclusive) */
int4 endpos;
/* number of non-repeated query items counted in the fragment */
int4 poslen;
/* number of tokens in the fragment that are not NONWORDTOKEN */
int4 curlen;
/* set to 1 once the fragment has been chosen for the headline */
int2 in;
/* set to 1 when the fragment overlaps an already chosen fragment */
int2 excluded;
} CoverPos;
/*
 * get_next_fragment
 *
 * Carve the next fragment out of the word range [*startpos, *endpos] of a
 * cover.  The goal is a fragment of at most max_words words that begins and
 * ends on a query word; when the given range already is a cover shorter
 * than max_words it is handed back as is.
 *
 * In/out:  *startpos, *endpos  - range to examine; updated to the fragment.
 * Out:     *curlen             - count of tokens that are not NONWORDTOKEN.
 *          *poslen             - count of non-repeated query items seen
 *                                while scanning forward.
 */
static void
get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
				  int *curlen, int *poslen, int max_words)
{
	int			pos;
	int			nwords = 0;
	int			nitems = 0;

	/* Step 1: slide the start forward until it sits on a query item. */
	pos = *startpos;
	while (pos <= *endpos)
	{
		*startpos = pos;
		if (prs->words[pos].item && !prs->words[pos].repeated)
			break;
		pos++;
	}

	/* Step 2: scan forward, stopping once max_words words were counted. */
	pos = *startpos;
	while (pos <= *endpos && nwords < max_words)
	{
		if (!NONWORDTOKEN(prs->words[pos].type))
			nwords++;
		if (prs->words[pos].item && !prs->words[pos].repeated)
			nitems++;
		pos++;
	}
	*poslen = nitems;

	/*
	 * Step 3: if the scan stopped short of the end, the cover was cut; walk
	 * the end point backwards until it rests on a query item again.
	 */
	if (pos < *endpos)
	{
		*endpos = pos;
		while (pos >= *startpos)
		{
			*endpos = pos;
			if (prs->words[pos].item && !prs->words[pos].repeated)
				break;
			if (!NONWORDTOKEN(prs->words[pos].type))
				nwords--;
			pos--;
		}
	}
	*curlen = nwords;
}
/*
 * mark_hl_fragments
 *
 * Headline generator used when MaxFragments is nonzero.
 *
 * 1. Collect every cover reported by hlCover() and split each one, via
 *    get_next_fragment(), into candidate fragments of at most max_words
 *    words.
 * 2. Up to max_fragments times, pick the not-yet-used candidate with the
 *    most query items (ties: fewest words), stretch it on both sides up to
 *    max_words, trim short/non-end tokens from the stretched ends, and mark
 *    it with mark_fragment().  Candidates overlapping the marked range are
 *    excluded from later rounds.
 * 3. If nothing was marked, mark the leading words of the document until
 *    min_words words have been counted.
 *
 * prs           - parsed text whose words[] flags are updated in place
 * query         - query handed to hlCover()
 * highlight     - passed through to mark_fragment()
 * shortword     - stretched end tokens of this length or less are trimmed
 * min_words     - size of the fallback fragment
 * max_words     - upper bound on words per fragment
 * max_fragments - upper bound on the number of fragments marked
 */
static void
mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
				  int shortword, int min_words,
				  int max_words, int max_fragments)
{
	int4		poslen,
				curlen,
				i,
				f,
				num_f = 0;
	int4		stretch,
				maxstretch,
				posmarker;

	int4		startpos = 0,
				endpos = 0,
				p = 0,
				q = 0;

	int4		numcovers = 0,
				maxcovers = 32;

	int4		minI,
				minwords,
				maxitems;
	CoverPos   *covers;

	covers = palloc(maxcovers * sizeof(CoverPos));

	/* get all covers */
	while (hlCover(prs, query, &p, &q))
	{
		startpos = p;
		endpos = q;

		/*
		 * Break the cover into smaller fragments such that each fragment has
		 * at most max_words. Also ensure that each end of the fragment is a
		 * query word. This will allow us to stretch the fragment in either
		 * direction.
		 */
		while (startpos <= endpos)
		{
			get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
			if (numcovers >= maxcovers)
			{
				/* grow the candidate array geometrically */
				maxcovers *= 2;
				covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
			}
			covers[numcovers].startpos = startpos;
			covers[numcovers].endpos = endpos;
			covers[numcovers].curlen = curlen;
			covers[numcovers].poslen = poslen;
			covers[numcovers].in = 0;
			covers[numcovers].excluded = 0;
			numcovers++;

			/* continue with the remainder of this cover */
			startpos = endpos + 1;
			endpos = q;
		}
		/* move p to generate the next cover */
		p++;
	}

	/* choose best covers */
	for (f = 0; f < max_fragments; f++)
	{
		maxitems = 0;
		minwords = 0x7fffffff;
		minI = -1;

		/*
		 * Choose the cover that contains max items. In case of tie choose
		 * the one with smaller number of words.
		 */
		for (i = 0; i < numcovers; i++)
		{
			if (!covers[i].in && !covers[i].excluded &&
				(maxitems < covers[i].poslen ||
				 (maxitems == covers[i].poslen &&
				  minwords > covers[i].curlen)))
			{
				maxitems = covers[i].poslen;
				minwords = covers[i].curlen;
				minI = i;
			}
		}

		/* no usable candidate left: stop looking for more fragments */
		if (minI < 0)
			break;

		/* a cover was found; mark it */
		covers[minI].in = 1;

		/* adjust the size of cover */
		startpos = covers[minI].startpos;
		endpos = covers[minI].endpos;
		curlen = covers[minI].curlen;

		/* stretch the cover if cover size is lower than max_words */
		if (curlen < max_words)
		{
			/* divide the stretch on both sides of cover */
			maxstretch = (max_words - curlen) / 2;

			/*
			 * First stretch the startpos; stop stretching if
			 *	1. we hit the beginning of document
			 *	2. exceed maxstretch
			 *	3. we hit an already marked fragment
			 */
			stretch = 0;
			posmarker = startpos;
			for (i = startpos - 1;
				 i >= 0 && stretch < maxstretch && !prs->words[i].in;
				 i--)
			{
				if (!NONWORDTOKEN(prs->words[i].type))
				{
					curlen++;
					stretch++;
				}
				posmarker = i;
			}

			/* cut back startpos till we find a non-short token */
			for (i = posmarker;
				 i < startpos &&
				 (NOENDTOKEN(prs->words[i].type) ||
				  prs->words[i].len <= shortword);
				 i++)
			{
				if (!NONWORDTOKEN(prs->words[i].type))
					curlen--;
			}
			startpos = i;

			/* now stretch the endpos as much as possible */
			posmarker = endpos;
			for (i = endpos + 1;
				 i < prs->curwords && curlen < max_words && !prs->words[i].in;
				 i++)
			{
				if (!NONWORDTOKEN(prs->words[i].type))
					curlen++;
				posmarker = i;
			}

			/* cut back endpos till we find a non-short token */
			for (i = posmarker;
				 i > endpos &&
				 (NOENDTOKEN(prs->words[i].type) ||
				  prs->words[i].len <= shortword);
				 i--)
			{
				if (!NONWORDTOKEN(prs->words[i].type))
					curlen--;
			}
			endpos = i;
		}
		covers[minI].startpos = startpos;
		covers[minI].endpos = endpos;
		covers[minI].curlen = curlen;

		/* Mark the chosen fragments (covers) */
		mark_fragment(prs, highlight, startpos, endpos);
		num_f++;

		/*
		 * Exclude overlapping covers.  A candidate overlaps the chosen
		 * fragment if its start or its end lies inside the fragment, or if
		 * it strictly encloses the fragment.  The enclosing case must be
		 * tested explicitly: neither end point of such a candidate falls
		 * inside the fragment, yet choosing it later would mark the same
		 * words a second time.
		 */
		for (i = 0; i < numcovers; i++)
		{
			if (i != minI &&
				((covers[i].startpos >= startpos &&
				  covers[i].startpos <= endpos) ||
				 (covers[i].endpos >= startpos &&
				  covers[i].endpos <= endpos) ||
				 (covers[i].startpos < startpos &&
				  covers[i].endpos > endpos)))
				covers[i].excluded = 1;
		}
	}

	/* show at least min_words if we have not marked anything */
	if (num_f <= 0)
	{
		startpos = endpos = curlen = 0;
		for (i = 0; i < prs->curwords && curlen < min_words; i++)
		{
			if (!NONWORDTOKEN(prs->words[i].type))
				curlen++;
			endpos = i;
		}
		mark_fragment(prs, highlight, startpos, endpos);
	}
	pfree(covers);
}
static void
mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
int shortword, int min_words, int max_words)
{
int p = 0,
q = 0;
int bestb = -1,
@ -1707,56 +1936,9 @@ prsd_headline(PG_FUNCTION_ARGS)
curlen;
int i;
int highlight = 0;
ListCell *l;
/* config */
prs->startsel = NULL;
prs->stopsel = NULL;
foreach(l, prsoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
char *val = defGetString(defel);
if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
max_words = pg_atoi(val, sizeof(int32), 0);
else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
min_words = pg_atoi(val, sizeof(int32), 0);
else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
shortword = pg_atoi(val, sizeof(int32), 0);
else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
prs->startsel = pstrdup(val);
else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
prs->stopsel = pstrdup(val);
else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
highlight = (pg_strcasecmp(val, "1") == 0 ||
pg_strcasecmp(val, "on") == 0 ||
pg_strcasecmp(val, "true") == 0 ||
pg_strcasecmp(val, "t") == 0 ||
pg_strcasecmp(val, "y") == 0 ||
pg_strcasecmp(val, "yes") == 0);
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized headline parameter: \"%s\"",
defel->defname)));
}
if (highlight == 0)
{
if (min_words >= max_words)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be less than MaxWords")));
if (min_words <= 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be positive")));
if (shortword < 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ShortWord should be >= 0")));
while (hlCover(prs, query, &p, &q))
{
/* find cover len in words */
@ -1877,12 +2059,95 @@ prsd_headline(PG_FUNCTION_ARGS)
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}
}
/*
 * prsd_headline
 *
 * Headline entry point of the default parser.
 *
 * Arguments (fmgr):
 *	0 - HeadlineParsedText *: parsed document; its words[] flags and the
 *		startsel/stopsel/fragdelim strings (with lengths) are filled in
 *	1 - List * of DefElem: headline options
 *	2 - TSQuery: the query
 *
 * Recognized options (case-insensitive): MaxWords, MinWords, ShortWord,
 * MaxFragments, StartSel, StopSel, FragmentDelimiter, HighlightAll.  Any
 * other option name raises ERRCODE_INVALID_PARAMETER_VALUE.  The numeric
 * range checks are applied only when HighlightAll is off.
 *
 * MaxFragments = 0 selects the default generator (mark_hl_words); any other
 * value selects the fragment-based one (mark_hl_fragments).
 *
 * Returns the same HeadlineParsedText pointer it was given.
 */
Datum
prsd_headline(PG_FUNCTION_ARGS)
{
	HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
	TSQuery		query = PG_GETARG_TSQUERY(2);

	/* option defaults, overridden from prsoptions below */
	int			min_words = 15;
	int			max_words = 35;
	int			shortword = 3;
	int			max_fragments = 0;
	int			highlight = 0;
	ListCell   *l;

	/*
	 * Reset all three string options so that the "not specified" tests
	 * further down do not depend on how the caller initialized *prs.
	 * fragdelim is checked for NULL exactly like startsel and stopsel, so
	 * it has to be reset along with them.
	 */
	prs->startsel = NULL;
	prs->stopsel = NULL;
	prs->fragdelim = NULL;

	foreach(l, prsoptions)
	{
		DefElem    *defel = (DefElem *) lfirst(l);
		char	   *val = defGetString(defel);

		if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
			max_words = pg_atoi(val, sizeof(int32), 0);
		else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
			min_words = pg_atoi(val, sizeof(int32), 0);
		else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
			shortword = pg_atoi(val, sizeof(int32), 0);
		else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
			max_fragments = pg_atoi(val, sizeof(int32), 0);
		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
			prs->startsel = pstrdup(val);
		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
			prs->stopsel = pstrdup(val);
		else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
			prs->fragdelim = pstrdup(val);
		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
			highlight = (pg_strcasecmp(val, "1") == 0 ||
						 pg_strcasecmp(val, "on") == 0 ||
						 pg_strcasecmp(val, "true") == 0 ||
						 pg_strcasecmp(val, "t") == 0 ||
						 pg_strcasecmp(val, "y") == 0 ||
						 pg_strcasecmp(val, "yes") == 0);
		else
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized headline parameter: \"%s\"",
							defel->defname)));
	}

	/* the word-count limits are only validated when HighlightAll is off */
	if (highlight == 0)
	{
		if (min_words >= max_words)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("MinWords should be less than MaxWords")));
		if (min_words <= 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("MinWords should be positive")));
		if (shortword < 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("ShortWord should be >= 0")));
		if (max_fragments < 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("MaxFragments should be >= 0")));
	}

	if (max_fragments == 0)
		/* call the default headline generator */
		mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
	else
		mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);

	/* fill in defaults for any string option that was not given */
	if (!prs->startsel)
		prs->startsel = pstrdup("<b>");
	if (!prs->stopsel)
		prs->stopsel = pstrdup("</b>");
	if (!prs->fragdelim)
		prs->fragdelim = pstrdup(" ... ");

	/* cache the string lengths alongside the strings */
	prs->startsellen = strlen(prs->startsel);
	prs->stopsellen = strlen(prs->stopsel);
	prs->fragdelimlen = strlen(prs->fragdelim);

	PG_RETURN_POINTER(prs);
}

View File

@ -6,7 +6,7 @@
*
* Copyright (c) 1998-2008, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.10 2008/06/18 18:42:54 momjian Exp $
* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.11 2008/10/17 18:05:19 teodor Exp $
*
*-------------------------------------------------------------------------
*/
@ -52,8 +52,10 @@ typedef struct
int4 curwords;
char *startsel;
char *stopsel;
char *fragdelim;
int2 startsellen;
int2 stopsellen;
int2 fragdelimlen;
} HeadlineParsedText;
/*

View File

@ -632,6 +632,98 @@ to_tsquery('english', 'sea&foo'), 'HighlightAll=true');
</html>
(1 row)
--Check if headline fragments work
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean'), 'MaxFragments=1');
ts_headline
------------------------------------
after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted <b>Ocean</b>.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop
(1 row)
--Check if more than one fragments are displayed
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
ts_headline
----------------------------------------------
after day, day after day,
We <b>stuck</b>, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where ... drop to drink.
S. T. <b>Coleridge</b>
(1 row)
--Fragments when there all query words are not in the document
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
ts_headline
------------------------------------
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as
(1 row)
--FragmentDelimiter option
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
ts_headline
--------------------------------------------
after day, day after day,
We <b>stuck</b>, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where***drop to drink.
S. T. <b>Coleridge</b>
(1 row)
--Rewrite sub system
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
\set ECHO none

View File

@ -208,6 +208,58 @@ ff-bg
</html>',
to_tsquery('english', 'sea&foo'), 'HighlightAll=true');
--Check if headline fragments work
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean'), 'MaxFragments=1');
--Check if more than one fragments are displayed
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
--Fragments when there all query words are not in the document
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
--FragmentDelimiter option
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
--Rewrite sub system
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);