Add strict_word_similarity to pg_trgm module

strict_word_similarity is similar to existing word_similarity function but
it takes into account word boundaries to compute similarity.

Author: Alexander Korotkov
Review by: David Steele, Liudmila Mantrova, me
Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
This commit is contained in:
Teodor Sigaev 2018-03-21 14:57:42 +03:00
parent f20b328534
commit be8a7a6866
10 changed files with 1461 additions and 61 deletions

View File

@ -4,11 +4,12 @@ MODULE_big = pg_trgm
OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)
EXTENSION = pg_trgm
DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
DATA = pg_trgm--1.3--1.4.sql \
pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
PGFILEDESC = "pg_trgm - trigram matching"
REGRESS = pg_trgm pg_word_trgm
REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
ifdef USE_PGXS
PG_CONFIG = pg_config

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,68 @@
/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
CREATE FUNCTION strict_word_similarity(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
CREATE FUNCTION strict_word_similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.word_similarity_threshold
CREATE FUNCTION strict_word_similarity_commutator_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.word_similarity_threshold
CREATE OPERATOR <<% (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = strict_word_similarity_op,
COMMUTATOR = '%>>',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE OPERATOR %>> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = strict_word_similarity_commutator_op,
COMMUTATOR = '<<%',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE FUNCTION strict_word_similarity_dist_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
CREATE FUNCTION strict_word_similarity_dist_commutator_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
CREATE OPERATOR <<<-> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = strict_word_similarity_dist_op,
COMMUTATOR = '<->>>'
);
CREATE OPERATOR <->>> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = strict_word_similarity_dist_commutator_op,
COMMUTATOR = '<<<->'
);
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 9 %>> (text, text),
OPERATOR 10 <->>> (text, text) FOR ORDER BY pg_catalog.float_ops;
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 9 %>> (text, text);

View File

@ -1,5 +1,5 @@
# pg_trgm extension
comment = 'text similarity measurement and index searching based on trigrams'
default_version = '1.3'
default_version = '1.4'
module_pathname = '$libdir/pg_trgm'
relocatable = true

View File

@ -0,0 +1,42 @@
DROP INDEX trgm_idx2;
\copy test_trgm3 from 'data/trgm2.data'
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
explain (costs off)
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
drop index trgm_idx2;
create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
set "pg_trgm.strict_word_similarity_threshold" to 0.4;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
set "pg_trgm.strict_word_similarity_threshold" to 0.2;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;

View File

@ -6,6 +6,7 @@
#include "access/gist.h"
#include "access/itup.h"
#include "access/stratnum.h"
#include "storage/bufpage.h"
/*
@ -26,14 +27,16 @@
#define DIVUNION
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
#define LikeStrategyNumber 3
#define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
#define LikeStrategyNumber 3
#define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8
#define StrictWordSimilarityStrategyNumber 9
#define StrictWordDistanceStrategyNumber 10
typedef char trgm[3];
@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;
extern double similarity_threshold;
extern double word_similarity_threshold;
extern double strict_word_similarity_threshold;
extern double index_strategy_get_limit(StrategyNumber strategy);
extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
extern TRGM *generate_trgm(char *str, int slen);

View File

@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
break;
case ILikeStrategyNumber:
@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
case StrictWordSimilarityStrategyNumber:
nlimit = index_strategy_get_limit(strategy);
/* Count the matches */
ntrue = 0;
@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
case StrictWordSimilarityStrategyNumber:
nlimit = index_strategy_get_limit(strategy);
/* Count the matches */
ntrue = 0;

View File

@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query),
querysize - VARHDRSZ);
break;
@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
/* Similarity search is exact. Word similarity search is inexact */
*recheck = (strategy == WordSimilarityStrategyNumber);
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
case StrictWordSimilarityStrategyNumber:
/* Similarity search is exact. (Strict) word similarity search is inexact */
*recheck = (strategy != SimilarityStrategyNumber);
nlimit = index_strategy_get_limit(strategy);
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
{
case DistanceStrategyNumber:
case WordDistanceStrategyNumber:
*recheck = strategy == WordDistanceStrategyNumber;
case StrictWordDistanceStrategyNumber:
/* Only plain trigram distance is exact */
*recheck = (strategy != DistanceStrategyNumber);
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */

View File

@ -18,6 +18,7 @@ PG_MODULE_MAGIC;
/* GUC variables */
double similarity_threshold = 0.3f;
double word_similarity_threshold = 0.6f;
double strict_word_similarity_threshold = 0.5f;
void _PG_init(void);
@ -26,12 +27,17 @@ PG_FUNCTION_INFO_V1(show_limit);
PG_FUNCTION_INFO_V1(show_trgm);
PG_FUNCTION_INFO_V1(similarity);
PG_FUNCTION_INFO_V1(word_similarity);
PG_FUNCTION_INFO_V1(strict_word_similarity);
PG_FUNCTION_INFO_V1(similarity_dist);
PG_FUNCTION_INFO_V1(similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
/* Trigram with position */
typedef struct
@ -40,6 +46,17 @@ typedef struct
int index;
} pos_trgm;
/* Trigram bound type */
typedef uint8 TrgmBound;
#define TRGM_BOUND_LEFT (0x01) /* trigram is left bound of word */
#define TRGM_BOUND_RIGHT (0x02) /* trigram is right bound of word */
/* Word similarity flags */
#define WORD_SIMILARITY_CHECK_ONLY (0x01) /* if set then only check existence
* of similar search pattern in text */
#define WORD_SIMILARITY_STRICT (0x02) /* force bounds of extent to match
* word bounds */
/*
* Module load callback
*/
@ -71,6 +88,18 @@ _PG_init(void)
NULL,
NULL,
NULL);
DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
"Sets the threshold used by the <<%% operator.",
"Valid range is 0.0 .. 1.0.",
&strict_word_similarity_threshold,
0.5,
0.0,
1.0,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
}
/*
@ -95,6 +124,29 @@ set_limit(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(similarity_threshold);
}
/*
* Get similarity threshold for given index scan strategy number.
*/
double
index_strategy_get_limit(StrategyNumber strategy)
{
switch (strategy)
{
case SimilarityStrategyNumber:
return similarity_threshold;
case WordSimilarityStrategyNumber:
return word_similarity_threshold;
case StrictWordSimilarityStrategyNumber:
return strict_word_similarity_threshold;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
break;
}
return 0.0; /* keep compiler quiet */
}
/*
* Deprecated function.
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
@ -235,11 +287,12 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
*
* trg: where to return the array of trigrams.
* str: source string, of length slen bytes.
* bounds: where to return bounds of trigrams (if needed).
*
* Returns length of the generated array.
*/
static int
generate_trgm_only(trgm *trg, char *str, int slen)
generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
{
trgm *tptr;
char *buf;
@ -282,11 +335,13 @@ generate_trgm_only(trgm *trg, char *str, int slen)
buf[LPADDING + bytelen] = ' ';
buf[LPADDING + bytelen + 1] = ' ';
/*
* count trigrams
*/
/* Calculate trigrams marking their bounds if needed */
if (bounds)
bounds[tptr - trg] |= TRGM_BOUND_LEFT;
tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
charlen + LPADDING + RPADDING);
if (bounds)
bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
}
pfree(buf);
@ -328,7 +383,7 @@ generate_trgm(char *str, int slen)
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
trg->flag = ARRKEY;
len = generate_trgm_only(GETARR(trg), str, slen);
len = generate_trgm_only(GETARR(trg), str, slen, NULL);
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
if (len == 0)
@ -413,8 +468,8 @@ comp_ptrgm(const void *v1, const void *v2)
* ulen1: count of unique trigrams of array "trg1".
* len2: length of array "trg2" and array "trg2indexes".
* len: length of the array "found".
* check_only: if true then only check existence of similar search pattern in
* text.
* lags: set of boolean flags parametrizing similarity calculation.
* bounds: whether each trigram is left/right bound of word.
*
* Returns word similarity.
*/
@ -424,16 +479,32 @@ iterate_word_similarity(int *trg2indexes,
int ulen1,
int len2,
int len,
bool check_only)
uint8 flags,
TrgmBound *bounds)
{
int *lastpos,
i,
ulen2 = 0,
count = 0,
upper = -1,
lower = -1;
lower;
float4 smlr_cur,
smlr_max = 0.0f;
double threshold;
Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
/* Select appropriate threshold */
threshold = (flags & WORD_SIMILARITY_STRICT) ?
strict_word_similarity_threshold :
word_similarity_threshold;
/*
* Consider first trigram as initial lower bount for strict word similarity,
* or initialize it later with first trigram present for plain word
* similarity.
*/
lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
/* Memorise last position of each trigram */
lastpos = (int *) palloc(sizeof(int) * len);
@ -456,8 +527,13 @@ iterate_word_similarity(int *trg2indexes,
lastpos[trgindex] = i;
}
/* Adjust upper bound if this trigram is present in required substring */
if (found[trgindex])
/*
* Adjust upper bound if trigram is upper bound of word for strict
* word similarity, or if trigram is present in required substring for
* plain word similarity
*/
if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
: found[trgindex])
{
int prev_lower,
tmp_ulen2,
@ -479,24 +555,35 @@ iterate_word_similarity(int *trg2indexes,
prev_lower = lower;
for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
{
float smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
float smlr_tmp;
int tmp_trgindex;
if (smlr_tmp > smlr_cur)
{
smlr_cur = smlr_tmp;
ulen2 = tmp_ulen2;
lower = tmp_lower;
count = tmp_count;
}
/*
* if we only check that word similarity is greater than
* pg_trgm.word_similarity_threshold we do not need to
* calculate a maximum similarity.
* Adjust lower bound only if trigram is lower bound of word
* for strict word similarity, or consider every trigram as
* lower bound for plain word similarity.
*/
if (check_only && smlr_cur >= word_similarity_threshold)
break;
if (!(flags & WORD_SIMILARITY_STRICT)
|| (bounds[tmp_lower] & TRGM_BOUND_LEFT))
{
smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
if (smlr_tmp > smlr_cur)
{
smlr_cur = smlr_tmp;
ulen2 = tmp_ulen2;
lower = tmp_lower;
count = tmp_count;
}
/*
* If we only check that word similarity is greater than
* threshold we do not need to calculate a maximum
* similarity.
*/
if ((flags & WORD_SIMILARITY_CHECK_ONLY)
&& smlr_cur >= threshold)
break;
}
tmp_trgindex = trg2indexes[tmp_lower];
if (lastpos[tmp_trgindex] == tmp_lower)
@ -511,10 +598,9 @@ iterate_word_similarity(int *trg2indexes,
/*
* if we only check that word similarity is greater than
* pg_trgm.word_similarity_threshold we do not need to calculate a
* maximum similarity
* threshold we do not need to calculate a maximum similarity.
*/
if (check_only && smlr_max >= word_similarity_threshold)
if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
break;
for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
@ -547,14 +633,13 @@ iterate_word_similarity(int *trg2indexes,
*
* str1: search pattern string, of length slen1 bytes.
* str2: text in which we are looking for a word, of length slen2 bytes.
* check_only: if true then only check existence of similar search pattern in
* text.
* flags: set of boolean flags parametrizing similarity calculation.
*
* Returns word similarity.
*/
static float4
calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
bool check_only)
uint8 flags)
{
bool *found;
pos_trgm *ptrg;
@ -568,15 +653,20 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
ulen1;
int *trg2indexes;
float4 result;
TrgmBound *bounds;
protect_out_of_mem(slen1 + slen2);
/* Make positional trigrams */
trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
if (flags & WORD_SIMILARITY_STRICT)
bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
else
bounds = NULL;
len1 = generate_trgm_only(trg1, str1, slen1);
len2 = generate_trgm_only(trg2, str2, slen2);
len1 = generate_trgm_only(trg1, str1, slen1, NULL);
len2 = generate_trgm_only(trg2, str2, slen2, bounds);
ptrg = make_positional_trgm(trg1, len1, trg2, len2);
len = len1 + len2;
@ -622,7 +712,7 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
/* Run iterative procedure to find maximum similarity with word */
result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
check_only);
flags, bounds);
pfree(trg2indexes);
pfree(found);
@ -1081,7 +1171,23 @@ word_similarity(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
false);
0);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_FLOAT4(res);
}
Datum
strict_word_similarity(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@ -1117,7 +1223,7 @@ word_similarity_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
true);
WORD_SIMILARITY_CHECK_ONLY);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@ -1133,7 +1239,7 @@ word_similarity_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
true);
WORD_SIMILARITY_CHECK_ONLY);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@ -1149,7 +1255,7 @@ word_similarity_dist_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
false);
0);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
@ -1165,7 +1271,71 @@ word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
false);
0);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_FLOAT4(1.0 - res);
}
Datum
strict_word_similarity_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
}
Datum
strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
}
Datum
strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_FLOAT4(1.0 - res);
}
Datum
strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);

View File

@ -105,6 +105,17 @@
the explanation below.
</entry>
</row>
<row>
<entry>
<function>strict_word_similarity(text, text)</function>
<indexterm><primary>strict_word_similarity</primary></indexterm>
</entry>
<entry><type>real</type></entry>
<entry>
Same as <function>word_similarity(text, text)</function>, but forces
extent boundaries to match word boundaries.
</entry>
</row>
<row>
<entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
<entry><type>real</type></entry>
@ -157,6 +168,29 @@
a part of the word.
</para>
<para>
At the same time, <function>strict_word_similarity(text, text)</function>
has to select an extent that matches word boundaries. In the example above,
<function>strict_word_similarity(text, text)</function> would select the
extent <literal>{" w"," wo","wor","ord","rds", ds "}</literal>, which
corresponds to the whole word <literal>'words'</literal>.
<programlisting>
# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
strict_word_similarity | similarity
------------------------+------------
0.571429 | 0.571429
(1 row)
</programlisting>
</para>
<para>
Thus, the <function>strict_word_similarity(text, text)</function> function
is useful for finding similar subsets of whole words, while
<function>word_similarity(text, text)</function> is more suitable for
searching similar parts of words.
</para>
<table id="pgtrgm-op-table">
<title><filename>pg_trgm</filename> Operators</title>
<tgroup cols="3">
@ -196,6 +230,24 @@
Commutator of the <literal>&lt;%</literal> operator.
</entry>
</row>
<row>
<entry><type>text</type> <literal>&lt;&lt;%</literal> <type>text</type></entry>
<entry><type>boolean</type></entry>
<entry>
Returns <literal>true</literal> if its second argument has a continuous
extent of an ordered trigram set that matches word boundaries,
and its similarity to the trigram set of the first argument is greater
than the current strict word similarity threshold set by the
<varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
</entry>
</row>
<row>
<entry><type>text</type> <literal>%&gt;&gt;</literal> <type>text</type></entry>
<entry><type>boolean</type></entry>
<entry>
Commutator of the <literal>&lt;&lt;%</literal> operator.
</entry>
</row>
<row>
<entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry>
<entry><type>real</type></entry>
@ -223,6 +275,25 @@
Commutator of the <literal>&lt;&lt;-&gt;</literal> operator.
</entry>
</row>
<row>
<entry>
<type>text</type> <literal>&lt;&lt;&lt;-&gt;</literal> <type>text</type>
</entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</quote> between the arguments, that is
one minus the <function>strict_word_similarity()</function> value.
</entry>
</row>
<row>
<entry>
<type>text</type> <literal>&lt;-&gt;&gt;&gt;</literal> <type>text</type>
</entry>
<entry><type>real</type></entry>
<entry>
Commutator of the <literal>&lt;&lt;&lt;-&gt;</literal> operator.
</entry>
</row>
</tbody>
</tgroup>
</table>
@ -322,12 +393,19 @@ SELECT t, t &lt;-&gt; '<replaceable>word</replaceable>' AS dist
<para>
Also you can use an index on the <structfield>t</structfield> column for word
similarity. For example:
similarity or strict word similarity. Typical queries are:
<programlisting>
SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
FROM test_trgm
WHERE '<replaceable>word</replaceable>' &lt;% t
ORDER BY sml DESC, t;
</programlisting>
and
<programlisting>
SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
FROM test_trgm
WHERE '<replaceable>word</replaceable>' &lt;&lt;% t
ORDER BY sml DESC, t;
</programlisting>
This will return all values in the text column for which there is a
continuous extent in the corresponding ordered trigram set that is
@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
</para>
<para>
A variant of the above query is
Possible variants of the above queries are:
<programlisting>
SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
and
<programlisting>
SELECT t, '<replaceable>word</replaceable>' &lt;&lt;&lt;-&gt; t AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes.