Add a rank/(rank+1) normalization option to ts_rank().

While the usefulness of this seems a bit marginal, if it's useful enough to be shown in the manual then we probably ought to support doing it without double evaluation of the ts_rank function. Per my proposal earlier today.
2007-11-14 23:43:27 +00:00 · 2007-11-14 23:43:27 +00:00 · 866bad9543
parent 5858990f87
commit 866bad9543
2 changed files with 32 additions and 15 deletions
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.34 2007/11/14 23:43:27 tgl Exp $ -->
 <chapter id="textsearch">
 <title id="textsearch-title">Full Text Search</title>
@ -940,6 +940,7 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
     <listitem>
      <para>
       4 divides the rank by the mean harmonic distance between extents
       (this is implemented only by <function>ts_rank_cd</>)
      </para>
     </listitem>
     <listitem>
@ -953,17 +954,24 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
       of unique words in document
      </para>
     </listitem>
     <listitem>
      <para>
       32 divides the rank by itself + 1
      </para>
     </listitem>
    </itemizedlist>
    If more than one flag bit is specified, the transformations are
    applied in the order listed.
   </para>
   <para>
    It is important to note that the ranking functions do not use any global
-    information so it is impossible to produce a fair normalization to 1% or
+    information, so it is impossible to produce a fair normalization to 1% or
-    100%, as sometimes desired.  However, a simple technique like
+    100% as sometimes desired.  Normalization option 32
-    <literal>rank/(rank+1)</literal> can be applied.  Of course, this is just
+    (<literal>rank/(rank+1)</literal>) can be applied to scale all ranks
-    a cosmetic change, i.e., the ordering of the search results will not
+    into the range zero to one, but of course this is just a cosmetic change;
-    change.
+    it will not affect the ordering of the search results.
   </para>
   <para>
@ -991,7 +999,7 @@ ORDER BY rank DESC LIMIT 10;
    This is the same example using normalized ranking:
 <programlisting>
-SELECT title, ts_rank_cd(textsearch, query)/(ts_rank_cd(textsearch, query) + 1) AS rank
+SELECT title, ts_rank_cd(textsearch, query, 32 /* rank/(rank+1) */ ) AS rank
 FROM apod, to_tsquery('neutrino|(dark &amp; matter)') query
 WHERE  query @@ textsearch
 ORDER BY rank DESC LIMIT 10;
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.8 2007/09/20 18:10:57 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.9 2007/11/14 23:43:27 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -25,13 +25,14 @@ static float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
 #define wpos(wep)	( w[ WEP_GETWEIGHT(wep) ] )
-#define RANK_NO_NORM		0x00
+#define RANK_NO_NORM			0x00
 #define RANK_NORM_LOGLENGTH		0x01
-#define RANK_NORM_LENGTH	0x02
+#define RANK_NORM_LENGTH		0x02
-#define RANK_NORM_EXTDIST	0x04
+#define RANK_NORM_EXTDIST		0x04
-#define RANK_NORM_UNIQ		0x08
+#define RANK_NORM_UNIQ			0x08
-#define RANK_NORM_LOGUNIQ	0x10
+#define RANK_NORM_LOGUNIQ		0x10
-#define DEF_NORM_METHOD		RANK_NO_NORM
+#define RANK_NORM_RDIVRPLUS1	0x20
 #define DEF_NORM_METHOD			RANK_NO_NORM
 static float calc_rank_or(float *w, TSVector t, TSQuery q);
 static float calc_rank_and(float *w, TSVector t, TSQuery q);
@ -348,12 +349,17 @@ calc_rank(float *w, TSVector t, TSQuery q, int4 method)
 			res /= (float) len;
 	}
 	/* RANK_NORM_EXTDIST not applicable */
 	if ((method & RANK_NORM_UNIQ) && t->size > 0)
 		res /= (float) (t->size);
 	if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
 		res /= log((double) (t->size + 1)) / log(2.0);
 	if (method & RANK_NORM_RDIVRPLUS1)
 		res /= (res + 1);
 	return res;
 }
@ -762,7 +768,7 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
 			Wdoc /= (double) len;
 	}
-	if ((method & RANK_NORM_EXTDIST) && SumDist > 0)
+	if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
 		Wdoc /= ((double) NExtent) / SumDist;
 	if ((method & RANK_NORM_UNIQ) && txt->size > 0)
@ -771,6 +777,9 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
 	if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
 		Wdoc /= log((double) (txt->size + 1)) / log(2.0);
 	if (method & RANK_NORM_RDIVRPLUS1)
 		Wdoc /= (Wdoc + 1);
 	pfree(doc);
 	pfree( qr.operandexist );