Add KNNGIST support to contrib/pg_trgm.

Teodor Sigaev, with some revision by Tom
This commit is contained in:
Tom Lane 2010-12-04 00:16:21 -05:00
parent b576757d7e
commit b525bf771e
9 changed files with 214 additions and 43 deletions

View File

@ -1187,6 +1187,13 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333 qwertyu0988 | 0.333333
(1 row) (1 row)
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
?column? | t
----------+-------------
0.411765 | qwertyu0988
0.5 | qwertyu0987
(2 rows)
create index trgm_idx on test_trgm using gist (t gist_trgm_ops); create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
set enable_seqscan=off; set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
@ -2315,6 +2322,22 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333 qwertyu0988 | 0.333333
(1 row) (1 row)
explain (costs off)
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
QUERY PLAN
---------------------------------------------------
Limit
-> Index Scan using trgm_idx on test_trgm
Order By: (t <-> 'q0987wertyu0988'::text)
(3 rows)
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
?column? | t
----------+-------------
0.411765 | qwertyu0988
0.5 | qwertyu0987
(2 rows)
drop index trgm_idx; drop index trgm_idx;
create index trgm_idx on test_trgm using gin (t gin_trgm_ops); create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
set enable_seqscan=off; set enable_seqscan=off;

View File

@ -26,7 +26,7 @@ LANGUAGE C STRICT IMMUTABLE;
CREATE OR REPLACE FUNCTION similarity_op(text,text) CREATE OR REPLACE FUNCTION similarity_op(text,text)
RETURNS bool RETURNS bool
AS 'MODULE_PATHNAME' AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; LANGUAGE C STRICT STABLE; -- stable because depends on trgm_limit
CREATE OPERATOR % ( CREATE OPERATOR % (
LEFTARG = text, LEFTARG = text,
@ -37,6 +37,18 @@ CREATE OPERATOR % (
JOIN = contjoinsel JOIN = contjoinsel
); );
-- Distance between two text values; implements the <-> operator.
-- The C implementation returns 1.0 minus similarity(text,text).
-- Marked IMMUTABLE (unlike similarity_op, which is STABLE because it
-- depends on trgm_limit): the C code does not consult trgm_limit.
CREATE OR REPLACE FUNCTION similarity_dist(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Trigram distance operator: text <-> text, computed by similarity_dist.
-- It is declared as its own commutator, i.e. a <-> b and b <-> a are
-- treated as interchangeable.  gist_trgm_ops registers it as strategy 2
-- FOR ORDER BY, which is what lets ORDER BY t <-> 'x' use the GiST index.
CREATE OPERATOR <-> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = similarity_dist,
COMMUTATOR = '<->'
);
-- gist key -- gist key
CREATE OR REPLACE FUNCTION gtrgm_in(cstring) CREATE OR REPLACE FUNCTION gtrgm_in(cstring)
RETURNS gtrgm RETURNS gtrgm
@ -60,6 +72,11 @@ RETURNS bool
AS 'MODULE_PATHNAME' AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT; LANGUAGE C IMMUTABLE STRICT;
-- GiST distance support function (registered as FUNCTION 8 of
-- gist_trgm_ops).  Arguments, in order: the index entry (internal),
-- the query text, the operator strategy number, and the subtype oid
-- (the C implementation does not read the subtype).
-- Returns float8, whereas the <-> operator itself returns float4.
CREATE OR REPLACE FUNCTION gtrgm_distance(internal,text,int,oid)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gtrgm_compress(internal) CREATE OR REPLACE FUNCTION gtrgm_compress(internal)
RETURNS internal RETURNS internal
AS 'MODULE_PATHNAME' AS 'MODULE_PATHNAME'
@ -95,6 +112,7 @@ CREATE OPERATOR CLASS gist_trgm_ops
FOR TYPE text USING gist FOR TYPE text USING gist
AS AS
OPERATOR 1 % (text, text), OPERATOR 1 % (text, text),
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal), FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal), FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal), FUNCTION 3 gtrgm_compress (internal),
@ -102,6 +120,7 @@ AS
FUNCTION 5 gtrgm_penalty (internal, internal, internal), FUNCTION 5 gtrgm_penalty (internal, internal, internal),
FUNCTION 6 gtrgm_picksplit (internal, internal), FUNCTION 6 gtrgm_picksplit (internal, internal),
FUNCTION 7 gtrgm_same (gtrgm, gtrgm, internal), FUNCTION 7 gtrgm_same (gtrgm, gtrgm, internal),
FUNCTION 8 gtrgm_distance (internal, text, int, oid),
STORAGE gtrgm; STORAGE gtrgm;
-- support functions for gin -- support functions for gin

View File

@ -26,6 +26,7 @@ CREATE TABLE test_trgm(t text);
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t; select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
create index trgm_idx on test_trgm using gist (t gist_trgm_ops); create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
set enable_seqscan=off; set enable_seqscan=off;
@ -33,6 +34,9 @@ set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t; select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
explain (costs off)
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
drop index trgm_idx; drop index trgm_idx;
create index trgm_idx on test_trgm using gin (t gin_trgm_ops); create index trgm_idx on test_trgm using gin (t gin_trgm_ops);

View File

@ -4,12 +4,10 @@
#ifndef __TRGM_H__ #ifndef __TRGM_H__
#define __TRGM_H__ #define __TRGM_H__
#include "postgres.h"
#include "access/gist.h" #include "access/gist.h"
#include "access/itup.h" #include "access/itup.h"
#include "utils/builtins.h"
#include "storage/bufpage.h" #include "storage/bufpage.h"
#include "utils/builtins.h"
/* options */ /* options */
#define LPADDING 2 #define LPADDING 2
@ -18,6 +16,10 @@
#define IGNORECASE #define IGNORECASE
#define DIVUNION #define DIVUNION
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
typedef char trgm[3]; typedef char trgm[3];
@ -89,4 +91,4 @@ extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen); TRGM *generate_trgm(char *str, int slen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2); float4 cnt_sml(TRGM *trg1, TRGM *trg2);
#endif #endif /* __TRGM_H__ */

View File

@ -1,6 +1,8 @@
/* /*
* contrib/pg_trgm/trgm_gin.c * contrib/pg_trgm/trgm_gin.c
*/ */
#include "postgres.h"
#include "trgm.h" #include "trgm.h"
#include "access/gin.h" #include "access/gin.h"
@ -10,6 +12,7 @@
#include "utils/array.h" #include "utils/array.h"
#include "utils/builtins.h" #include "utils/builtins.h"
PG_FUNCTION_INFO_V1(gin_extract_trgm); PG_FUNCTION_INFO_V1(gin_extract_trgm);
Datum gin_extract_trgm(PG_FUNCTION_ARGS); Datum gin_extract_trgm(PG_FUNCTION_ARGS);

View File

@ -1,15 +1,19 @@
/* /*
* contrib/pg_trgm/trgm_gist.c * contrib/pg_trgm/trgm_gist.c
*/ */
#include "postgres.h"
#include "trgm.h" #include "trgm.h"
#include "access/gist.h" #include "access/gist.h"
#include "access/itup.h" #include "access/itup.h"
#include "access/skey.h"
#include "access/tuptoaster.h" #include "access/tuptoaster.h"
#include "storage/bufpage.h" #include "storage/bufpage.h"
#include "utils/array.h" #include "utils/array.h"
#include "utils/builtins.h" #include "utils/builtins.h"
PG_FUNCTION_INFO_V1(gtrgm_in); PG_FUNCTION_INFO_V1(gtrgm_in);
Datum gtrgm_in(PG_FUNCTION_ARGS); Datum gtrgm_in(PG_FUNCTION_ARGS);
@ -25,6 +29,9 @@ Datum gtrgm_decompress(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_consistent); PG_FUNCTION_INFO_V1(gtrgm_consistent);
Datum gtrgm_consistent(PG_FUNCTION_ARGS); Datum gtrgm_consistent(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_distance);
Datum gtrgm_distance(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_union); PG_FUNCTION_INFO_V1(gtrgm_union);
Datum gtrgm_union(PG_FUNCTION_ARGS); Datum gtrgm_union(PG_FUNCTION_ARGS);
@ -159,18 +166,35 @@ gtrgm_decompress(PG_FUNCTION_ARGS)
} }
} }
/*
 * cnt_sml_sign_common
 *		Sum, over every trigram of the query, the signature bit selected by
 *		that trigram's hash value.
 *
 * qtrg is the query's trigram array; sign is the signature bit vector of a
 * GiST key.  The result is the accumulated GETBIT() values, which the
 * callers compare against ARRNELEM(qtrg) (presumably GETBIT yields 0 or 1,
 * making this a count of matching trigrams -- see trgm.h).
 */
static int4
cnt_sml_sign_common(TRGM *qtrg, BITVECP sign)
{
	trgm	   *trigrams = GETARR(qtrg);
	int4		ntrigrams = ARRNELEM(qtrg);
	int4		nmatched = 0;
	int4		hashsrc = 0;	/* stays zero outside the bytes CPTRGM writes */
	int4		i;

	for (i = 0; i < ntrigrams; i++)
	{
		/* load the i-th trigram into the int4 that HASHVAL() hashes */
		CPTRGM(((char *) &hashsrc), trigrams + i);
		nmatched += GETBIT(sign, HASHVAL(hashsrc));
	}

	return nmatched;
}
Datum Datum
gtrgm_consistent(PG_FUNCTION_ARGS) gtrgm_consistent(PG_FUNCTION_ARGS)
{ {
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
text *query = PG_GETARG_TEXT_P(1); text *query = PG_GETARG_TEXT_P(1);
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
/* StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); */
/* Oid subtype = PG_GETARG_OID(3); */ /* Oid subtype = PG_GETARG_OID(3); */
bool *recheck = (bool *) PG_GETARG_POINTER(4); bool *recheck = (bool *) PG_GETARG_POINTER(4);
TRGM *key = (TRGM *) DatumGetPointer(entry->key); TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg; TRGM *qtrg;
bool res = false; bool res;
char *cache = (char *) fcinfo->flinfo->fn_extra; char *cache = (char *) fcinfo->flinfo->fn_extra;
/* All cases served by this function are exact */ /* All cases served by this function are exact */
@ -193,41 +217,97 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query))); qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
if (GIST_LEAF(entry)) switch (strategy)
{ /* all leafs contains orig trgm */ {
float4 tmpsml = cnt_sml(key, qtrg); case SimilarityStrategyNumber:
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */ /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false; res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false;
} }
else if (ISALLTRUE(key)) else if (ISALLTRUE(key))
{ /* non-leaf contains signature */ { /* non-leaf contains signature */
res = true; res = true;
} }
else else
{ /* non-leaf contains signature */ { /* non-leaf contains signature */
int4 count = 0; int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key));
int4 k, int4 len = ARRNELEM(qtrg);
len = ARRNELEM(qtrg);
trgm *ptr = GETARR(qtrg);
BITVECP sign = GETSIGN(key);
int4 tmp = 0;
for (k = 0; k < len; k++) if (len == 0)
{ res = false;
CPTRGM(((char *) &tmp), ptr + k); else
count += GETBIT(sign, HASHVAL(tmp)); res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
} }
#ifdef DIVUNION break;
res = (len == count) ? true : ((((((float4) count) / ((float4) (len - count)))) >= trgm_limit) ? true : false); default:
#else elog(ERROR, "unrecognized strategy number: %d", strategy);
res = (len == 0) ? false : ((((((float4) count) / ((float4) len))) >= trgm_limit) ? true : false); res = false; /* keep compiler quiet */
#endif break;
} }
PG_RETURN_BOOL(res); PG_RETURN_BOOL(res);
} }
/*
 * gtrgm_distance
 *		GiST distance support function for the trigram operator class.
 *
 * Arguments: (0) the GISTENTRY being examined, (1) the query text,
 * (2) the operator strategy number, (3) the subtype oid (unused).
 * Returns a float8 distance between the entry's key and the query.
 * Only DistanceStrategyNumber is handled; any other strategy raises
 * an error.
 */
Datum
gtrgm_distance(PG_FUNCTION_ARGS)
{
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
text *query = PG_GETARG_TEXT_P(1);
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
/* Oid subtype = PG_GETARG_OID(3); */
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
float8 res;
char *cache = (char *) fcinfo->flinfo->fn_extra;
/*
 * fn_extra caches a copy of the query text followed, at a MAXALIGN'd
 * offset, by the query's trigram array.  Rebuild the cache when it is
 * missing or when the stored query differs from the current one.
 */
if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
{
qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
/* discard the stale cache before allocating the replacement */
if (cache)
pfree(cache);
/* allocate in fn_mcxt, the context attached to the function lookup info */
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
memcpy(cache, query, VARSIZE(query));
memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
}
/* always use the cached copy of the trigram array from here on */
qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case DistanceStrategyNumber:
if (GIST_LEAF(entry))
{ /* leaf entries contain the original trigrams */
/* distance is one minus the cnt_sml() result for key and query */
res = 1.0 - cnt_sml(key, qtrg);
}
else if (ISALLTRUE(key))
{ /* non-leaf contains an all-true signature */
/* NOTE(review): 0.0 presumably serves as the lowest bound, since an all-true signature cannot rule out any match -- confirm */
res = 0.0;
}
else
{ /* non-leaf contains signature */
int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key));
int4 len = ARRNELEM(qtrg);
/*
 * One minus the fraction of query trigrams found in the signature.
 * A query with no trigrams yields -1.0 rather than dividing by zero.
 */
res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len);
}
break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = 0; /* keep compiler quiet */
break;
}
PG_RETURN_FLOAT8(res);
}
static int4 static int4
unionkey(BITVECP sbase, TRGM *add) unionkey(BITVECP sbase, TRGM *add)
{ {

View File

@ -1,11 +1,16 @@
/* /*
* contrib/pg_trgm/trgm_op.c * contrib/pg_trgm/trgm_op.c
*/ */
#include "trgm.h" #include "postgres.h"
#include <ctype.h> #include <ctype.h>
#include "utils/array.h"
#include "trgm.h"
#include "catalog/pg_type.h" #include "catalog/pg_type.h"
#include "tsearch/ts_locale.h" #include "tsearch/ts_locale.h"
#include "utils/array.h"
PG_MODULE_MAGIC; PG_MODULE_MAGIC;
@ -359,16 +364,25 @@ similarity(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(res); PG_RETURN_FLOAT4(res);
} }
PG_FUNCTION_INFO_V1(similarity_dist);
Datum		similarity_dist(PG_FUNCTION_ARGS);

/*
 * similarity_dist
 *		SQL-callable distance between two text arguments: one minus the
 *		value returned by similarity() for the same pair.
 */
Datum
similarity_dist(PG_FUNCTION_ARGS)
{
	Datum		lhs = PG_GETARG_DATUM(0);
	Datum		rhs = PG_GETARG_DATUM(1);
	float4		sml;

	/* delegate to similarity() so both functions share one computation */
	sml = DatumGetFloat4(DirectFunctionCall2(similarity, lhs, rhs));

	PG_RETURN_FLOAT4(1.0 - sml);
}
PG_FUNCTION_INFO_V1(similarity_op); PG_FUNCTION_INFO_V1(similarity_op);
Datum similarity_op(PG_FUNCTION_ARGS); Datum similarity_op(PG_FUNCTION_ARGS);
Datum Datum
similarity_op(PG_FUNCTION_ARGS) similarity_op(PG_FUNCTION_ARGS)
{ {
float4 res = DatumGetFloat4(DirectFunctionCall2( float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
similarity,
PG_GETARG_DATUM(0), PG_GETARG_DATUM(0),
PG_GETARG_DATUM(1) PG_GETARG_DATUM(1)));
));
PG_RETURN_BOOL(res >= trgm_limit); PG_RETURN_BOOL(res >= trgm_limit);
} }

View File

@ -19,6 +19,8 @@ DROP FUNCTION gtrgm_compress(internal);
DROP FUNCTION gtrgm_consistent(internal,text,int,oid,internal); DROP FUNCTION gtrgm_consistent(internal,text,int,oid,internal);
DROP FUNCTION gtrgm_distance(internal,text,int,oid);
DROP TYPE gtrgm CASCADE; DROP TYPE gtrgm CASCADE;
DROP OPERATOR CLASS gin_trgm_ops USING gin; DROP OPERATOR CLASS gin_trgm_ops USING gin;
@ -33,6 +35,10 @@ DROP OPERATOR % (text, text);
DROP FUNCTION similarity_op(text,text); DROP FUNCTION similarity_op(text,text);
DROP OPERATOR <-> (text, text);
DROP FUNCTION similarity_dist(text,text);
DROP FUNCTION similarity(text,text); DROP FUNCTION similarity(text,text);
DROP FUNCTION show_trgm(text); DROP FUNCTION show_trgm(text);

View File

@ -117,6 +117,14 @@
<function>set_limit</>. <function>set_limit</>.
</entry> </entry>
</row> </row>
<row>
<entry><type>text</> <literal>&lt;-&gt;</literal> <type>text</></entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</> between the arguments, that is
one minus the <function>similarity()</> value.
</entry>
</row>
</tbody> </tbody>
</tgroup> </tgroup>
</table> </table>
@ -129,7 +137,7 @@
The <filename>pg_trgm</filename> module provides GiST and GIN index The <filename>pg_trgm</filename> module provides GiST and GIN index
operator classes that allow you to create an index over a text column for operator classes that allow you to create an index over a text column for
the purpose of very fast similarity searches. These index types support the purpose of very fast similarity searches. These index types support
the <literal>%</> similarity operator (and no other operators, so you may the above-described similarity operators (and no other operators, so you may
want a regular B-tree index too). want a regular B-tree index too).
</para> </para>
@ -161,6 +169,18 @@ SELECT t, similarity(t, '<replaceable>word</>') AS sml
sets. sets.
</para> </para>
<para>
A variant of the above query is
<programlisting>
SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes. It will usually beat the first formulation when only
a small number of the closest matches is wanted.
</para>
<para> <para>
The choice between GiST and GIN indexing depends on the relative The choice between GiST and GIN indexing depends on the relative
performance characteristics of GiST and GIN, which are discussed elsewhere. performance characteristics of GiST and GIN, which are discussed elsewhere.