Add a rank/(rank+1) normalization option to ts_rank().

While the usefulness of this seems a bit marginal, if it's useful enough to
be shown in the manual then we probably ought to support doing it without
double evaluation of the ts_rank function.  Per my proposal earlier today.
This commit is contained in:
Tom Lane 2007-11-14 23:43:27 +00:00
parent 5858990f87
commit 866bad9543
2 changed files with 32 additions and 15 deletions

View File

@ -1,4 +1,4 @@
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.34 2007/11/14 23:43:27 tgl Exp $ -->
<chapter id="textsearch"> <chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title> <title id="textsearch-title">Full Text Search</title>
@ -940,6 +940,7 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
<listitem> <listitem>
<para> <para>
4 divides the rank by the mean harmonic distance between extents 4 divides the rank by the mean harmonic distance between extents
(this is implemented only by <function>ts_rank_cd</>)
</para> </para>
</listitem> </listitem>
<listitem> <listitem>
@ -953,17 +954,24 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
of unique words in document of unique words in document
</para> </para>
</listitem> </listitem>
<listitem>
<para>
32 divides the rank by itself + 1
</para>
</listitem>
</itemizedlist> </itemizedlist>
If more than one flag bit is specified, the transformations are
applied in the order listed.
</para> </para>
<para> <para>
It is important to note that the ranking functions do not use any global It is important to note that the ranking functions do not use any global
information so it is impossible to produce a fair normalization to 1% or information, so it is impossible to produce a fair normalization to 1% or
100%, as sometimes desired. However, a simple technique like 100% as sometimes desired. Normalization option 32
<literal>rank/(rank+1)</literal> can be applied. Of course, this is just (<literal>rank/(rank+1)</literal>) can be applied to scale all ranks
a cosmetic change, i.e., the ordering of the search results will not into the range zero to one, but of course this is just a cosmetic change;
change. it will not affect the ordering of the search results.
</para> </para>
<para> <para>
@ -991,7 +999,7 @@ ORDER BY rank DESC LIMIT 10;
This is the same example using normalized ranking: This is the same example using normalized ranking:
<programlisting> <programlisting>
SELECT title, ts_rank_cd(textsearch, query)/(ts_rank_cd(textsearch, query) + 1) AS rank SELECT title, ts_rank_cd(textsearch, query, 32 /* rank/(rank+1) */ ) AS rank
FROM apod, to_tsquery('neutrino|(dark &amp; matter)') query FROM apod, to_tsquery('neutrino|(dark &amp; matter)') query
WHERE query @@ textsearch WHERE query @@ textsearch
ORDER BY rank DESC LIMIT 10; ORDER BY rank DESC LIMIT 10;

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.8 2007/09/20 18:10:57 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.9 2007/11/14 23:43:27 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -25,13 +25,14 @@ static float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] ) #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
#define RANK_NO_NORM 0x00 #define RANK_NO_NORM 0x00
#define RANK_NORM_LOGLENGTH 0x01 #define RANK_NORM_LOGLENGTH 0x01
#define RANK_NORM_LENGTH 0x02 #define RANK_NORM_LENGTH 0x02
#define RANK_NORM_EXTDIST 0x04 #define RANK_NORM_EXTDIST 0x04
#define RANK_NORM_UNIQ 0x08 #define RANK_NORM_UNIQ 0x08
#define RANK_NORM_LOGUNIQ 0x10 #define RANK_NORM_LOGUNIQ 0x10
#define DEF_NORM_METHOD RANK_NO_NORM #define RANK_NORM_RDIVRPLUS1 0x20
#define DEF_NORM_METHOD RANK_NO_NORM
static float calc_rank_or(float *w, TSVector t, TSQuery q); static float calc_rank_or(float *w, TSVector t, TSQuery q);
static float calc_rank_and(float *w, TSVector t, TSQuery q); static float calc_rank_and(float *w, TSVector t, TSQuery q);
@ -348,12 +349,17 @@ calc_rank(float *w, TSVector t, TSQuery q, int4 method)
res /= (float) len; res /= (float) len;
} }
/* RANK_NORM_EXTDIST not applicable */
if ((method & RANK_NORM_UNIQ) && t->size > 0) if ((method & RANK_NORM_UNIQ) && t->size > 0)
res /= (float) (t->size); res /= (float) (t->size);
if ((method & RANK_NORM_LOGUNIQ) && t->size > 0) if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
res /= log((double) (t->size + 1)) / log(2.0); res /= log((double) (t->size + 1)) / log(2.0);
if (method & RANK_NORM_RDIVRPLUS1)
res /= (res + 1);
return res; return res;
} }
@ -762,7 +768,7 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
Wdoc /= (double) len; Wdoc /= (double) len;
} }
if ((method & RANK_NORM_EXTDIST) && SumDist > 0) if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
Wdoc /= ((double) NExtent) / SumDist; Wdoc /= ((double) NExtent) / SumDist;
if ((method & RANK_NORM_UNIQ) && txt->size > 0) if ((method & RANK_NORM_UNIQ) && txt->size > 0)
@ -771,6 +777,9 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0) if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
Wdoc /= log((double) (txt->size + 1)) / log(2.0); Wdoc /= log((double) (txt->size + 1)) / log(2.0);
if (method & RANK_NORM_RDIVRPLUS1)
Wdoc /= (Wdoc + 1);
pfree(doc); pfree(doc);
pfree( qr.operandexist ); pfree( qr.operandexist );