Add strict_word_similarity to pg_trgm module

strict_word_similarity is similar to existing word_similarity function but
it takes into account word boundaries to compute similarity.

Author: Alexander Korotkov
Review by: David Steele, Liudmila Mantrova, me
Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
This commit is contained in:
Teodor Sigaev 2018-03-21 14:57:42 +03:00
parent f20b328534
commit be8a7a6866
10 changed files with 1461 additions and 61 deletions

View File

@ -4,11 +4,12 @@ MODULE_big = pg_trgm
OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES) OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)
EXTENSION = pg_trgm EXTENSION = pg_trgm
DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \ DATA = pg_trgm--1.3--1.4.sql \
pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
PGFILEDESC = "pg_trgm - trigram matching" PGFILEDESC = "pg_trgm - trigram matching"
REGRESS = pg_trgm pg_word_trgm REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
ifdef USE_PGXS ifdef USE_PGXS
PG_CONFIG = pg_config PG_CONFIG = pg_config

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,68 @@
/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
-- Strict word similarity score of the first argument within the second.
-- This entry point returns the raw score and does not compare it against
-- any threshold setting, so it is declared IMMUTABLE.
CREATE FUNCTION strict_word_similarity(text, text)
    RETURNS float4
    LANGUAGE C
    IMMUTABLE STRICT PARALLEL SAFE
    AS 'MODULE_PATHNAME';
-- Boolean predicate backing the <<% operator.
-- STABLE rather than IMMUTABLE: the result depends on the
-- pg_trgm.strict_word_similarity_threshold setting.
CREATE FUNCTION strict_word_similarity_op(text, text)
    RETURNS bool
    LANGUAGE C
    STABLE STRICT PARALLEL SAFE
    AS 'MODULE_PATHNAME';
-- Boolean predicate backing the %>> operator (arguments in the opposite
-- order to strict_word_similarity_op).
-- STABLE rather than IMMUTABLE: the result depends on the
-- pg_trgm.strict_word_similarity_threshold setting.
CREATE FUNCTION strict_word_similarity_commutator_op(text, text)
    RETURNS bool
    LANGUAGE C
    STABLE STRICT PARALLEL SAFE
    AS 'MODULE_PATHNAME';
-- text <<% text: strict word similarity predicate, implemented by
-- strict_word_similarity_op.  Its commutator is %>>; selectivity is
-- estimated with contsel / contjoinsel.
CREATE OPERATOR <<% (
    PROCEDURE  = strict_word_similarity_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '%>>',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);
-- text %>> text: commutator of <<%, implemented by
-- strict_word_similarity_commutator_op.  Selectivity is estimated with
-- contsel / contjoinsel.
CREATE OPERATOR %>> (
    PROCEDURE  = strict_word_similarity_commutator_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<<%',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);
-- Distance function backing the <<<-> operator: one minus the strict word
-- similarity of the arguments.  No threshold is consulted for the result,
-- so it is declared IMMUTABLE.
CREATE FUNCTION strict_word_similarity_dist_op(text, text)
    RETURNS float4
    LANGUAGE C
    IMMUTABLE STRICT PARALLEL SAFE
    AS 'MODULE_PATHNAME';
-- Argument-swapped counterpart of strict_word_similarity_dist_op; backs the
-- <->>> operator.
CREATE FUNCTION strict_word_similarity_dist_commutator_op(text, text)
    RETURNS float4
    LANGUAGE C
    IMMUTABLE STRICT PARALLEL SAFE
    AS 'MODULE_PATHNAME';
-- text <<<-> text: strict word similarity distance, implemented by
-- strict_word_similarity_dist_op.  Its commutator is <->>>.
CREATE OPERATOR <<<-> (
    PROCEDURE  = strict_word_similarity_dist_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<->>>'
);
-- text <->>> text: commutator of <<<->, implemented by
-- strict_word_similarity_dist_commutator_op.
CREATE OPERATOR <->>> (
    PROCEDURE  = strict_word_similarity_dist_commutator_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<<<->'
);
-- Register the strict word similarity operators with the GiST trigram
-- operator family:
--   strategy 9  - %>> as a search operator
--   strategy 10 - <->>> as an ordering operator, sorted via float_ops
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
    OPERATOR  9  %>> (text, text),
    OPERATOR 10  <->>> (text, text) FOR ORDER BY pg_catalog.float_ops;
-- Register %>> as search strategy 9 in the GIN trigram operator family.
-- Only the boolean search operator is added here; no ordering operator is
-- registered for GIN.
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
    OPERATOR 9 %>> (text, text);

View File

@ -1,5 +1,5 @@
# pg_trgm extension # pg_trgm extension
comment = 'text similarity measurement and index searching based on trigrams' comment = 'text similarity measurement and index searching based on trigrams'
default_version = '1.3' default_version = '1.4'
module_pathname = '$libdir/pg_trgm' module_pathname = '$libdir/pg_trgm'
relocatable = true relocatable = true

View File

@ -0,0 +1,42 @@
-- Regression script for strict_word_similarity() and the <<%, %>> and <->>>
-- operators.  The same queries are run after dropping trgm_idx2, then with a
-- GiST index, then with a GIN index, and finally with lowered thresholds.
-- Presumably trgm_idx2 and test_trgm2 are left behind by an earlier test in
-- the schedule -- neither is created here before first use.
DROP INDEX trgm_idx2;
-- NOTE(review): this loads into test_trgm3, but every query below reads
-- test_trgm2 and nothing in this script creates test_trgm3.  Looks like a
-- typo or a leftover line; confirm against the expected output before
-- changing it.
\copy test_trgm3 from 'data/trgm2.data'
-- Pass 1: trgm_idx2 has been dropped.
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
-- Pass 2: GiST index, sequential scans disabled.
create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
-- Show the plan chosen for the distance-ordered query, then run it.
explain (costs off)
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
-- Pass 3: GIN index.  Only the boolean operators are exercised here.
drop index trgm_idx2;
create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
-- enable_seqscan was already turned off above; this repeats the setting.
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
-- Pass 4: lower the strict word similarity threshold to 0.4.
set "pg_trgm.strict_word_similarity_threshold" to 0.4;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
-- Pass 5: lower the threshold further to 0.2.
set "pg_trgm.strict_word_similarity_threshold" to 0.2;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;

View File

@ -6,6 +6,7 @@
#include "access/gist.h" #include "access/gist.h"
#include "access/itup.h" #include "access/itup.h"
#include "access/stratnum.h"
#include "storage/bufpage.h" #include "storage/bufpage.h"
/* /*
@ -26,14 +27,16 @@
#define DIVUNION #define DIVUNION
/* operator strategy numbers */ /* operator strategy numbers */
#define SimilarityStrategyNumber 1 #define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2 #define DistanceStrategyNumber 2
#define LikeStrategyNumber 3 #define LikeStrategyNumber 3
#define ILikeStrategyNumber 4 #define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5 #define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6 #define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7 #define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8 #define WordDistanceStrategyNumber 8
#define StrictWordSimilarityStrategyNumber 9
#define StrictWordDistanceStrategyNumber 10
typedef char trgm[3]; typedef char trgm[3];
@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;
extern double similarity_threshold; extern double similarity_threshold;
extern double word_similarity_threshold; extern double word_similarity_threshold;
extern double strict_word_similarity_threshold;
extern double index_strategy_get_limit(StrategyNumber strategy);
extern uint32 trgm2int(trgm *ptr); extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen); extern void compact_trigram(trgm *tptr, char *str, int bytelen);
extern TRGM *generate_trgm(char *str, int slen); extern TRGM *generate_trgm(char *str, int slen);

View File

@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val)); trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
break; break;
case ILikeStrategyNumber: case ILikeStrategyNumber:
@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ? case StrictWordSimilarityStrategyNumber:
similarity_threshold : word_similarity_threshold; nlimit = index_strategy_get_limit(strategy);
/* Count the matches */ /* Count the matches */
ntrue = 0; ntrue = 0;
@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ? case StrictWordSimilarityStrategyNumber:
similarity_threshold : word_similarity_threshold; nlimit = index_strategy_get_limit(strategy);
/* Count the matches */ /* Count the matches */
ntrue = 0; ntrue = 0;

View File

@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query), qtrg = generate_trgm(VARDATA(query),
querysize - VARHDRSZ); querysize - VARHDRSZ);
break; break;
@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
/* Similarity search is exact. Word similarity search is inexact */ case StrictWordSimilarityStrategyNumber:
*recheck = (strategy == WordSimilarityStrategyNumber); /* Similarity search is exact. (Strict) word similarity search is inexact */
nlimit = (strategy == SimilarityStrategyNumber) ? *recheck = (strategy != SimilarityStrategyNumber);
similarity_threshold : word_similarity_threshold;
nlimit = index_strategy_get_limit(strategy);
if (GIST_LEAF(entry)) if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */ { /* all leafs contains orig trgm */
@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
{ {
case DistanceStrategyNumber: case DistanceStrategyNumber:
case WordDistanceStrategyNumber: case WordDistanceStrategyNumber:
*recheck = strategy == WordDistanceStrategyNumber; case StrictWordDistanceStrategyNumber:
/* Only plain trigram distance is exact */
*recheck = (strategy != DistanceStrategyNumber);
if (GIST_LEAF(entry)) if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */ { /* all leafs contains orig trgm */

View File

@ -18,6 +18,7 @@ PG_MODULE_MAGIC;
/* GUC variables */ /* GUC variables */
double similarity_threshold = 0.3f; double similarity_threshold = 0.3f;
double word_similarity_threshold = 0.6f; double word_similarity_threshold = 0.6f;
double strict_word_similarity_threshold = 0.5f;
void _PG_init(void); void _PG_init(void);
@ -26,12 +27,17 @@ PG_FUNCTION_INFO_V1(show_limit);
PG_FUNCTION_INFO_V1(show_trgm); PG_FUNCTION_INFO_V1(show_trgm);
PG_FUNCTION_INFO_V1(similarity); PG_FUNCTION_INFO_V1(similarity);
PG_FUNCTION_INFO_V1(word_similarity); PG_FUNCTION_INFO_V1(word_similarity);
PG_FUNCTION_INFO_V1(strict_word_similarity);
PG_FUNCTION_INFO_V1(similarity_dist); PG_FUNCTION_INFO_V1(similarity_dist);
PG_FUNCTION_INFO_V1(similarity_op); PG_FUNCTION_INFO_V1(similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_op); PG_FUNCTION_INFO_V1(word_similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_commutator_op); PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_op); PG_FUNCTION_INFO_V1(word_similarity_dist_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op); PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
/* Trigram with position */ /* Trigram with position */
typedef struct typedef struct
@ -40,6 +46,17 @@ typedef struct
int index; int index;
} pos_trgm; } pos_trgm;
/* Trigram bound type */
typedef uint8 TrgmBound;
#define TRGM_BOUND_LEFT (0x01) /* trigram is left bound of word */
#define TRGM_BOUND_RIGHT (0x02) /* trigram is right bound of word */
/* Word similarity flags */
#define WORD_SIMILARITY_CHECK_ONLY (0x01) /* if set then only check existence
* of similar search pattern in text */
#define WORD_SIMILARITY_STRICT (0x02) /* force bounds of extent to match
* word bounds */
/* /*
* Module load callback * Module load callback
*/ */
@ -71,6 +88,18 @@ _PG_init(void)
NULL, NULL,
NULL, NULL,
NULL); NULL);
DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
"Sets the threshold used by the <<%% operator.",
"Valid range is 0.0 .. 1.0.",
&strict_word_similarity_threshold,
0.5,
0.0,
1.0,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
} }
/* /*
@ -95,6 +124,29 @@ set_limit(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(similarity_threshold); PG_RETURN_FLOAT4(similarity_threshold);
} }
/*
 * Map an index scan strategy number to the similarity threshold that
 * governs it.
 *
 * Only the three boolean similarity strategies have a threshold; any other
 * strategy number is reported through elog(ERROR).
 */
double
index_strategy_get_limit(StrategyNumber strategy)
{
	if (strategy == SimilarityStrategyNumber)
		return similarity_threshold;

	if (strategy == WordSimilarityStrategyNumber)
		return word_similarity_threshold;

	if (strategy == StrictWordSimilarityStrategyNumber)
		return strict_word_similarity_threshold;

	elog(ERROR, "unrecognized strategy number: %d", strategy);

	return 0.0;					/* keep compiler quiet */
}
/* /*
* Deprecated function. * Deprecated function.
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function. * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
@ -235,11 +287,12 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
* *
* trg: where to return the array of trigrams. * trg: where to return the array of trigrams.
* str: source string, of length slen bytes. * str: source string, of length slen bytes.
* bounds: where to return bounds of trigrams (if needed).
* *
* Returns length of the generated array. * Returns length of the generated array.
*/ */
static int static int
generate_trgm_only(trgm *trg, char *str, int slen) generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
{ {
trgm *tptr; trgm *tptr;
char *buf; char *buf;
@ -282,11 +335,13 @@ generate_trgm_only(trgm *trg, char *str, int slen)
buf[LPADDING + bytelen] = ' '; buf[LPADDING + bytelen] = ' ';
buf[LPADDING + bytelen + 1] = ' '; buf[LPADDING + bytelen + 1] = ' ';
/* /* Calculate trigrams marking their bounds if needed */
* count trigrams if (bounds)
*/ bounds[tptr - trg] |= TRGM_BOUND_LEFT;
tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING, tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
charlen + LPADDING + RPADDING); charlen + LPADDING + RPADDING);
if (bounds)
bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
} }
pfree(buf); pfree(buf);
@ -328,7 +383,7 @@ generate_trgm(char *str, int slen)
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
trg->flag = ARRKEY; trg->flag = ARRKEY;
len = generate_trgm_only(GETARR(trg), str, slen); len = generate_trgm_only(GETARR(trg), str, slen, NULL);
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
if (len == 0) if (len == 0)
@ -413,8 +468,8 @@ comp_ptrgm(const void *v1, const void *v2)
* ulen1: count of unique trigrams of array "trg1". * ulen1: count of unique trigrams of array "trg1".
* len2: length of array "trg2" and array "trg2indexes". * len2: length of array "trg2" and array "trg2indexes".
* len: length of the array "found". * len: length of the array "found".
* check_only: if true then only check existence of similar search pattern in * flags: set of boolean flags parametrizing similarity calculation.
* text. * bounds: whether each trigram is left/right bound of word.
* *
* Returns word similarity. * Returns word similarity.
*/ */
@ -424,16 +479,32 @@ iterate_word_similarity(int *trg2indexes,
int ulen1, int ulen1,
int len2, int len2,
int len, int len,
bool check_only) uint8 flags,
TrgmBound *bounds)
{ {
int *lastpos, int *lastpos,
i, i,
ulen2 = 0, ulen2 = 0,
count = 0, count = 0,
upper = -1, upper = -1,
lower = -1; lower;
float4 smlr_cur, float4 smlr_cur,
smlr_max = 0.0f; smlr_max = 0.0f;
double threshold;
Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
/* Select appropriate threshold */
threshold = (flags & WORD_SIMILARITY_STRICT) ?
strict_word_similarity_threshold :
word_similarity_threshold;
/*
* Consider first trigram as initial lower bound for strict word similarity,
* or initialize it later with first trigram present for plain word
* similarity.
*/
lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
/* Memorise last position of each trigram */ /* Memorise last position of each trigram */
lastpos = (int *) palloc(sizeof(int) * len); lastpos = (int *) palloc(sizeof(int) * len);
@ -456,8 +527,13 @@ iterate_word_similarity(int *trg2indexes,
lastpos[trgindex] = i; lastpos[trgindex] = i;
} }
/* Adjust upper bound if this trigram is present in required substring */ /*
if (found[trgindex]) * Adjust upper bound if trigram is upper bound of word for strict
* word similarity, or if trigram is present in required substring for
* plain word similarity
*/
if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
: found[trgindex])
{ {
int prev_lower, int prev_lower,
tmp_ulen2, tmp_ulen2,
@ -479,24 +555,35 @@ iterate_word_similarity(int *trg2indexes,
prev_lower = lower; prev_lower = lower;
for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++) for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
{ {
float smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2); float smlr_tmp;
int tmp_trgindex; int tmp_trgindex;
if (smlr_tmp > smlr_cur)
{
smlr_cur = smlr_tmp;
ulen2 = tmp_ulen2;
lower = tmp_lower;
count = tmp_count;
}
/* /*
* if we only check that word similarity is greater than * Adjust lower bound only if trigram is lower bound of word
* pg_trgm.word_similarity_threshold we do not need to * for strict word similarity, or consider every trigram as
* calculate a maximum similarity. * lower bound for plain word similarity.
*/ */
if (check_only && smlr_cur >= word_similarity_threshold) if (!(flags & WORD_SIMILARITY_STRICT)
break; || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
{
smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
if (smlr_tmp > smlr_cur)
{
smlr_cur = smlr_tmp;
ulen2 = tmp_ulen2;
lower = tmp_lower;
count = tmp_count;
}
/*
* If we only check that word similarity is greater than
* threshold we do not need to calculate a maximum
* similarity.
*/
if ((flags & WORD_SIMILARITY_CHECK_ONLY)
&& smlr_cur >= threshold)
break;
}
tmp_trgindex = trg2indexes[tmp_lower]; tmp_trgindex = trg2indexes[tmp_lower];
if (lastpos[tmp_trgindex] == tmp_lower) if (lastpos[tmp_trgindex] == tmp_lower)
@ -511,10 +598,9 @@ iterate_word_similarity(int *trg2indexes,
/* /*
* if we only check that word similarity is greater than * if we only check that word similarity is greater than
* pg_trgm.word_similarity_threshold we do not need to calculate a * threshold we do not need to calculate a maximum similarity.
* maximum similarity
*/ */
if (check_only && smlr_max >= word_similarity_threshold) if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
break; break;
for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++) for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
@ -547,14 +633,13 @@ iterate_word_similarity(int *trg2indexes,
* *
* str1: search pattern string, of length slen1 bytes. * str1: search pattern string, of length slen1 bytes.
* str2: text in which we are looking for a word, of length slen2 bytes. * str2: text in which we are looking for a word, of length slen2 bytes.
* check_only: if true then only check existence of similar search pattern in * flags: set of boolean flags parametrizing similarity calculation.
* text.
* *
* Returns word similarity. * Returns word similarity.
*/ */
static float4 static float4
calc_word_similarity(char *str1, int slen1, char *str2, int slen2, calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
bool check_only) uint8 flags)
{ {
bool *found; bool *found;
pos_trgm *ptrg; pos_trgm *ptrg;
@ -568,15 +653,20 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
ulen1; ulen1;
int *trg2indexes; int *trg2indexes;
float4 result; float4 result;
TrgmBound *bounds;
protect_out_of_mem(slen1 + slen2); protect_out_of_mem(slen1 + slen2);
/* Make positional trigrams */ /* Make positional trigrams */
trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3); trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3); trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
if (flags & WORD_SIMILARITY_STRICT)
bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
else
bounds = NULL;
len1 = generate_trgm_only(trg1, str1, slen1); len1 = generate_trgm_only(trg1, str1, slen1, NULL);
len2 = generate_trgm_only(trg2, str2, slen2); len2 = generate_trgm_only(trg2, str2, slen2, bounds);
ptrg = make_positional_trgm(trg1, len1, trg2, len2); ptrg = make_positional_trgm(trg1, len1, trg2, len2);
len = len1 + len2; len = len1 + len2;
@ -622,7 +712,7 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
/* Run iterative procedure to find maximum similarity with word */ /* Run iterative procedure to find maximum similarity with word */
result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len, result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
check_only); flags, bounds);
pfree(trg2indexes); pfree(trg2indexes);
pfree(found); pfree(found);
@ -1081,7 +1171,23 @@ word_similarity(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
false); 0);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_FLOAT4(res);
}
Datum
strict_word_similarity(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1); PG_FREE_IF_COPY(in2, 1);
@ -1117,7 +1223,7 @@ word_similarity_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
true); WORD_SIMILARITY_CHECK_ONLY);
PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1); PG_FREE_IF_COPY(in2, 1);
@ -1133,7 +1239,7 @@ word_similarity_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
true); WORD_SIMILARITY_CHECK_ONLY);
PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1); PG_FREE_IF_COPY(in2, 1);
@ -1149,7 +1255,7 @@ word_similarity_dist_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
false); 0);
PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1); PG_FREE_IF_COPY(in2, 1);
@ -1165,7 +1271,71 @@ word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
false); 0);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_FLOAT4(1.0 - res);
}
Datum
strict_word_similarity_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
}
Datum
strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
}
/*
 * SQL-callable distance: one minus the strict word similarity of argument 0
 * (the search pattern) within argument 1 (the text).
 */
Datum
strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
{
	text	   *pattern = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);
	float4		sml;

	/* Full computation: no CHECK_ONLY flag, so the maximum is found */
	sml = calc_word_similarity(VARDATA_ANY(pattern),
							   VARSIZE_ANY_EXHDR(pattern),
							   VARDATA_ANY(target),
							   VARSIZE_ANY_EXHDR(target),
							   WORD_SIMILARITY_STRICT);

	PG_FREE_IF_COPY(pattern, 0);
	PG_FREE_IF_COPY(target, 1);

	PG_RETURN_FLOAT4(1.0 - sml);
}
Datum
strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1); PG_FREE_IF_COPY(in2, 1);

View File

@ -105,6 +105,17 @@
the explanation below. the explanation below.
</entry> </entry>
</row> </row>
<row>
<entry>
<function>strict_word_similarity(text, text)</function>
<indexterm><primary>strict_word_similarity</primary></indexterm>
</entry>
<entry><type>real</type></entry>
<entry>
Same as <function>word_similarity(text, text)</function>, but forces
extent boundaries to match word boundaries.
</entry>
</row>
<row> <row>
<entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry> <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
<entry><type>real</type></entry> <entry><type>real</type></entry>
@ -157,6 +168,29 @@
a part of the word. a part of the word.
</para> </para>
<para>
At the same time, <function>strict_word_similarity(text, text)</function>
has to select an extent that matches word boundaries. In the example above,
<function>strict_word_similarity(text, text)</function> would select the
extent <literal>{" w"," wo","wor","ord","rds","ds "}</literal>, which
corresponds to the whole word <literal>'words'</literal>.
<programlisting>
# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
strict_word_similarity | similarity
------------------------+------------
0.571429 | 0.571429
(1 row)
</programlisting>
</para>
<para>
Thus, the <function>strict_word_similarity(text, text)</function> function
is useful for finding similar subsets of whole words, while
<function>word_similarity(text, text)</function> is more suitable for
searching similar parts of words.
</para>
<table id="pgtrgm-op-table"> <table id="pgtrgm-op-table">
<title><filename>pg_trgm</filename> Operators</title> <title><filename>pg_trgm</filename> Operators</title>
<tgroup cols="3"> <tgroup cols="3">
@ -196,6 +230,24 @@
Commutator of the <literal>&lt;%</literal> operator. Commutator of the <literal>&lt;%</literal> operator.
</entry> </entry>
</row> </row>
<row>
<entry><type>text</type> <literal>&lt;&lt;%</literal> <type>text</type></entry>
<entry><type>boolean</type></entry>
<entry>
Returns <literal>true</literal> if its second argument has a continuous
extent of an ordered trigram set that matches word boundaries,
and its similarity to the trigram set of the first argument is greater
than the current strict word similarity threshold set by the
<varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
</entry>
</row>
<row>
<entry><type>text</type> <literal>%&gt;&gt;</literal> <type>text</type></entry>
<entry><type>boolean</type></entry>
<entry>
Commutator of the <literal>&lt;&lt;%</literal> operator.
</entry>
</row>
<row> <row>
<entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry> <entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry>
<entry><type>real</type></entry> <entry><type>real</type></entry>
@ -223,6 +275,25 @@
Commutator of the <literal>&lt;&lt;-&gt;</literal> operator. Commutator of the <literal>&lt;&lt;-&gt;</literal> operator.
</entry> </entry>
</row> </row>
<row>
<entry>
<type>text</type> <literal>&lt;&lt;&lt;-&gt;</literal> <type>text</type>
</entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</quote> between the arguments, that is
one minus the <function>strict_word_similarity()</function> value.
</entry>
</row>
<row>
<entry>
<type>text</type> <literal>&lt;-&gt;&gt;&gt;</literal> <type>text</type>
</entry>
<entry><type>real</type></entry>
<entry>
Commutator of the <literal>&lt;&lt;&lt;-&gt;</literal> operator.
</entry>
</row>
</tbody> </tbody>
</tgroup> </tgroup>
</table> </table>
@ -322,12 +393,19 @@ SELECT t, t &lt;-&gt; '<replaceable>word</replaceable>' AS dist
<para> <para>
Also you can use an index on the <structfield>t</structfield> column for word Also you can use an index on the <structfield>t</structfield> column for word
similarity. For example: similarity or strict word similarity. Typical queries are:
<programlisting> <programlisting>
SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
FROM test_trgm FROM test_trgm
WHERE '<replaceable>word</replaceable>' &lt;% t WHERE '<replaceable>word</replaceable>' &lt;% t
ORDER BY sml DESC, t; ORDER BY sml DESC, t;
</programlisting>
and
<programlisting>
SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
FROM test_trgm
WHERE '<replaceable>word</replaceable>' &lt;&lt;% t
ORDER BY sml DESC, t;
</programlisting> </programlisting>
This will return all values in the text column for which there is a This will return all values in the text column for which there is a
continuous extent in the corresponding ordered trigram set that is continuous extent in the corresponding ordered trigram set that is
@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
</para> </para>
<para> <para>
A variant of the above query is Possible variants of the above queries are:
<programlisting> <programlisting>
SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist
FROM test_trgm FROM test_trgm
ORDER BY dist LIMIT 10; ORDER BY dist LIMIT 10;
</programlisting>
and
<programlisting>
SELECT t, '<replaceable>word</replaceable>' &lt;&lt;&lt;-&gt; t AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting> </programlisting>
This can be implemented quite efficiently by GiST indexes, but not This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes. by GIN indexes.