From b525bf771e31a2254f28bf25c6ed7987d64c8afb Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 4 Dec 2010 00:16:21 -0500 Subject: [PATCH] Add KNNGIST support to contrib/pg_trgm. Teodor Sigaev, with some revision by Tom --- contrib/pg_trgm/expected/pg_trgm.out | 23 +++++ contrib/pg_trgm/pg_trgm.sql.in | 21 +++- contrib/pg_trgm/sql/pg_trgm.sql | 4 + contrib/pg_trgm/trgm.h | 10 +- contrib/pg_trgm/trgm_gin.c | 3 + contrib/pg_trgm/trgm_gist.c | 142 ++++++++++++++++++++------ contrib/pg_trgm/trgm_op.c | 26 +++-- contrib/pg_trgm/uninstall_pg_trgm.sql | 6 ++ doc/src/sgml/pgtrgm.sgml | 22 +++- 9 files changed, 214 insertions(+), 43 deletions(-) diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index 9838534729..0532a78e4a 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -1187,6 +1187,13 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198 qwertyu0988 | 0.333333 (1 row) +select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2; + ?column? | t +----------+------------- + 0.411765 | qwertyu0988 + 0.5 | qwertyu0987 +(2 rows) + create index trgm_idx on test_trgm using gist (t gist_trgm_ops); set enable_seqscan=off; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; @@ -2315,6 +2322,22 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198 qwertyu0988 | 0.333333 (1 row) +explain (costs off) +select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2; + QUERY PLAN +--------------------------------------------------- + Limit + -> Index Scan using trgm_idx on test_trgm + Order By: (t <-> 'q0987wertyu0988'::text) +(3 rows) + +select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2; + ?column? | t +----------+------------- + 0.411765 | qwertyu0988 + 0.5 | qwertyu0987 +(2 rows) + drop index trgm_idx; create index trgm_idx on test_trgm using gin (t gin_trgm_ops); set enable_seqscan=off; diff --git a/contrib/pg_trgm/pg_trgm.sql.in b/contrib/pg_trgm/pg_trgm.sql.in index cce6cd9872..3e116e8306 100644 --- a/contrib/pg_trgm/pg_trgm.sql.in +++ b/contrib/pg_trgm/pg_trgm.sql.in @@ -26,7 +26,7 @@ LANGUAGE C STRICT IMMUTABLE; CREATE OR REPLACE FUNCTION similarity_op(text,text) RETURNS bool AS 'MODULE_PATHNAME' -LANGUAGE C STRICT STABLE; +LANGUAGE C STRICT STABLE; -- stable because depends on trgm_limit CREATE OPERATOR % ( LEFTARG = text, @@ -37,6 +37,18 @@ CREATE OPERATOR % ( JOIN = contjoinsel ); +CREATE OR REPLACE FUNCTION similarity_dist(text,text) +RETURNS float4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; + +CREATE OPERATOR <-> ( + LEFTARG = text, + RIGHTARG = text, + PROCEDURE = similarity_dist, + COMMUTATOR = '<->' +); + -- gist key CREATE OR REPLACE FUNCTION gtrgm_in(cstring) RETURNS gtrgm @@ -60,6 +72,11 @@ RETURNS bool AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; +CREATE OR REPLACE FUNCTION gtrgm_distance(internal,text,int,oid) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT; + CREATE OR REPLACE FUNCTION gtrgm_compress(internal) RETURNS internal AS 'MODULE_PATHNAME' @@ -95,6 +112,7 @@ CREATE OPERATOR CLASS gist_trgm_ops FOR TYPE text USING gist AS OPERATOR 1 % (text, text), + OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops, FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal), FUNCTION 2 gtrgm_union (bytea, internal), FUNCTION 3 gtrgm_compress (internal), @@ -102,6 +120,7 @@ AS FUNCTION 5 gtrgm_penalty (internal, internal, internal), FUNCTION 6 gtrgm_picksplit (internal, internal), FUNCTION 7 gtrgm_same (gtrgm, gtrgm, internal), + FUNCTION 8 gtrgm_distance (internal, text, int, oid), STORAGE gtrgm; -- support functions for gin diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index 20d86b8a0d..5e5539c005 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -26,6 +26,7 @@ CREATE TABLE test_trgm(t text); select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t; +select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2; create index trgm_idx on test_trgm using gist (t gist_trgm_ops); set enable_seqscan=off; @@ -33,6 +34,9 @@ set enable_seqscan=off; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t; +explain (costs off) +select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2; +select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2; drop index trgm_idx; create index trgm_idx on test_trgm using gin (t gin_trgm_ops); diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index 85826733f5..1cc812554f 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -4,12 +4,10 @@ #ifndef __TRGM_H__ #define __TRGM_H__ -#include "postgres.h" - #include "access/gist.h" #include "access/itup.h" -#include "utils/builtins.h" #include "storage/bufpage.h" +#include "utils/builtins.h" /* options */ #define LPADDING 2 @@ -18,6 +16,10 @@ #define IGNORECASE #define DIVUNION +/* operator strategy numbers */ +#define SimilarityStrategyNumber 1 +#define DistanceStrategyNumber 2 + typedef char trgm[3]; @@ -89,4 +91,4 @@ extern float4 trgm_limit; TRGM *generate_trgm(char *str, int slen); float4 cnt_sml(TRGM *trg1, TRGM *trg2); -#endif +#endif /* __TRGM_H__ */ diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c index 3ce0b2deb5..a5a94ca675 100644 --- a/contrib/pg_trgm/trgm_gin.c +++ b/contrib/pg_trgm/trgm_gin.c @@ -1,6 +1,8 @@ /* * contrib/pg_trgm/trgm_gin.c */ +#include "postgres.h" + #include "trgm.h" #include "access/gin.h" @@ -10,6 +12,7 @@ #include "utils/array.h" #include "utils/builtins.h" + PG_FUNCTION_INFO_V1(gin_extract_trgm); Datum gin_extract_trgm(PG_FUNCTION_ARGS); diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index 567b2f878f..d9f3d40c17 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -1,15 +1,19 @@ /* * contrib/pg_trgm/trgm_gist.c */ +#include "postgres.h" + #include "trgm.h" #include "access/gist.h" #include "access/itup.h" +#include "access/skey.h" #include "access/tuptoaster.h" #include "storage/bufpage.h" #include "utils/array.h" #include "utils/builtins.h" + PG_FUNCTION_INFO_V1(gtrgm_in); Datum gtrgm_in(PG_FUNCTION_ARGS); @@ -25,6 +29,9 @@ Datum gtrgm_decompress(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(gtrgm_consistent); Datum gtrgm_consistent(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(gtrgm_distance); +Datum gtrgm_distance(PG_FUNCTION_ARGS); + PG_FUNCTION_INFO_V1(gtrgm_union); Datum gtrgm_union(PG_FUNCTION_ARGS); @@ -159,18 +166,35 @@ gtrgm_decompress(PG_FUNCTION_ARGS) } } +static int4 +cnt_sml_sign_common(TRGM *qtrg, BITVECP sign) +{ + int4 count = 0; + int4 k, + len = ARRNELEM(qtrg); + trgm *ptr = GETARR(qtrg); + int4 tmp = 0; + + for (k = 0; k < len; k++) + { + CPTRGM(((char *) &tmp), ptr + k); + count += GETBIT(sign, HASHVAL(tmp)); + } + + return count; +} + Datum gtrgm_consistent(PG_FUNCTION_ARGS) { GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); text *query = PG_GETARG_TEXT_P(1); - - /* StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); */ + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); /* Oid subtype = PG_GETARG_OID(3); */ bool *recheck = (bool *) PG_GETARG_POINTER(4); TRGM *key = (TRGM *) DatumGetPointer(entry->key); TRGM *qtrg; - bool res = false; + bool res; char *cache = (char *) fcinfo->flinfo->fn_extra; /* All cases served by this function are exact */ @@ -193,41 +217,97 @@ gtrgm_consistent(PG_FUNCTION_ARGS) qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query))); - if (GIST_LEAF(entry)) - { /* all leafs contains orig trgm */ - float4 tmpsml = cnt_sml(key, qtrg); + switch (strategy) + { + case SimilarityStrategyNumber: + if (GIST_LEAF(entry)) + { /* all leafs contains orig trgm */ + float4 tmpsml = cnt_sml(key, qtrg); - /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */ - res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false; - } - else if (ISALLTRUE(key)) - { /* non-leaf contains signature */ - res = true; - } - else - { /* non-leaf contains signature */ - int4 count = 0; - int4 k, - len = ARRNELEM(qtrg); - trgm *ptr = GETARR(qtrg); - BITVECP sign = GETSIGN(key); - int4 tmp = 0; + /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */ + res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false; + } + else if (ISALLTRUE(key)) + { /* non-leaf contains signature */ + res = true; + } + else + { /* non-leaf contains signature */ + int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key)); + int4 len = ARRNELEM(qtrg); - for (k = 0; k < len; k++) - { - CPTRGM(((char *) &tmp), ptr + k); - count += GETBIT(sign, HASHVAL(tmp)); - } -#ifdef DIVUNION - res = (len == count) ? true : ((((((float4) count) / ((float4) (len - count)))) >= trgm_limit) ? true : false); -#else - res = (len == 0) ? false : ((((((float4) count) / ((float4) len))) >= trgm_limit) ? true : false); -#endif + if (len == 0) + res = false; + else + res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false; + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + res = false; /* keep compiler quiet */ + break; } PG_RETURN_BOOL(res); } +Datum +gtrgm_distance(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + text *query = PG_GETARG_TEXT_P(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + /* Oid subtype = PG_GETARG_OID(3); */ + TRGM *key = (TRGM *) DatumGetPointer(entry->key); + TRGM *qtrg; + float8 res; + char *cache = (char *) fcinfo->flinfo->fn_extra; + + if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0) + { + qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); + + if (cache) + pfree(cache); + + fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg)); + cache = (char *) fcinfo->flinfo->fn_extra; + + memcpy(cache, query, VARSIZE(query)); + memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg)); + } + + qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query))); + + switch (strategy) + { + case DistanceStrategyNumber: + if (GIST_LEAF(entry)) + { /* all leafs contains orig trgm */ + res = 1.0 - cnt_sml(key, qtrg); + } + else if (ISALLTRUE(key)) + { /* all leafs contains orig trgm */ + res = 0.0; + } + else + { /* non-leaf contains signature */ + int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key)); + int4 len = ARRNELEM(qtrg); + + res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len); + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + res = 0; /* keep compiler quiet */ + break; + } + + PG_RETURN_FLOAT8(res); +} + static int4 unionkey(BITVECP sbase, TRGM *add) { diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index e15c826e18..b97e9912ba 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -1,11 +1,16 @@ /* * contrib/pg_trgm/trgm_op.c */ -#include "trgm.h" +#include "postgres.h" + #include -#include "utils/array.h" + +#include "trgm.h" + #include "catalog/pg_type.h" #include "tsearch/ts_locale.h" +#include "utils/array.h" + PG_MODULE_MAGIC; @@ -359,16 +364,25 @@ similarity(PG_FUNCTION_ARGS) PG_RETURN_FLOAT4(res); } +PG_FUNCTION_INFO_V1(similarity_dist); +Datum similarity_dist(PG_FUNCTION_ARGS); +Datum +similarity_dist(PG_FUNCTION_ARGS) +{ + float4 res = DatumGetFloat4(DirectFunctionCall2(similarity, + PG_GETARG_DATUM(0), + PG_GETARG_DATUM(1))); + PG_RETURN_FLOAT4(1.0 - res); +} + PG_FUNCTION_INFO_V1(similarity_op); Datum similarity_op(PG_FUNCTION_ARGS); Datum similarity_op(PG_FUNCTION_ARGS) { - float4 res = DatumGetFloat4(DirectFunctionCall2( - similarity, + float4 res = DatumGetFloat4(DirectFunctionCall2(similarity, PG_GETARG_DATUM(0), - PG_GETARG_DATUM(1) - )); + PG_GETARG_DATUM(1))); PG_RETURN_BOOL(res >= trgm_limit); } diff --git a/contrib/pg_trgm/uninstall_pg_trgm.sql b/contrib/pg_trgm/uninstall_pg_trgm.sql index 6706dd133e..bc8f1fa983 100644 --- a/contrib/pg_trgm/uninstall_pg_trgm.sql +++ b/contrib/pg_trgm/uninstall_pg_trgm.sql @@ -19,6 +19,8 @@ DROP FUNCTION gtrgm_compress(internal); DROP FUNCTION gtrgm_consistent(internal,text,int,oid,internal); +DROP FUNCTION gtrgm_distance(internal,text,int,oid); + DROP TYPE gtrgm CASCADE; DROP OPERATOR CLASS gin_trgm_ops USING gin; @@ -33,6 +35,10 @@ DROP OPERATOR % (text, text); DROP FUNCTION similarity_op(text,text); +DROP OPERATOR <-> (text, text); + +DROP FUNCTION similarity_dist(text,text); + DROP FUNCTION similarity(text,text); DROP FUNCTION show_trgm(text); diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml index 376ab85823..1b75f0292b 100644 --- a/doc/src/sgml/pgtrgm.sgml +++ b/doc/src/sgml/pgtrgm.sgml @@ -117,6 +117,14 @@ set_limit. + + text <-> text + real + + Returns the distance between the arguments, that is + one minus the similarity() value. + + @@ -129,7 +137,7 @@ The pg_trgm module provides GiST and GIN index operator classes that allow you to create an index over a text column for the purpose of very fast similarity searches. These index types support - the % similarity operator (and no other operators, so you may + the above-described similarity operators (and no other operators, so you may want a regular B-tree index too). @@ -161,6 +169,18 @@ SELECT t, similarity(t, 'word') AS sml sets. + + A variant of the above query is + +SELECT t, t <-> 'word' AS dist + FROM test_trgm + ORDER BY dist LIMIT 10; + + This can be implemented quite efficiently by GiST indexes, but not + by GIN indexes. It will usually beat the first formulation when only + a small number of the closest matches is wanted. + + The choice between GiST and GIN indexing depends on the relative performance characteristics of GiST and GIN, which are discussed elsewhere.