From 4c7feb16116540091e9d3766350c70dbb3c525d7 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Wed, 21 Mar 2018 14:37:51 +0300 Subject: [PATCH] Rework word_similarity documentation, make it close to actual algorithm. Previously, word_similarity was documented as returning the similarity of the closest word in a string, but it actually returns the similarity of a substring. Also fix mistyped comments. Author: Alexander Korotkov Reviewed by: David Steele, Liudmila Mantrova Discussions: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com https://www.postgresql.org/message-id/flat/f43b242d-000c-f4c8-cb8b-d37e9752cd93%40postgrespro.ru --- contrib/pg_trgm/trgm_op.c | 4 +-- doc/src/sgml/pgtrgm.sgml | 56 +++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 368e7c8941..32adecc9b8 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -456,7 +456,7 @@ iterate_word_similarity(int *trg2indexes, lastpos[trgindex] = i; } - /* Adjust lower bound if this trigram is present in required substring */ + /* Adjust upper bound if this trigram is present in required substring */ if (found[trgindex]) { int prev_lower, @@ -473,7 +473,7 @@ iterate_word_similarity(int *trg2indexes, smlr_cur = CALCSML(count, ulen1, ulen2); - /* Also try to adjust upper bound for greater similarity */ + /* Also try to adjust lower bound for greater similarity */ tmp_count = count; tmp_ulen2 = ulen2; prev_lower = lower; diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml index 775a7b8be7..2613c7a3d5 100644 --- a/doc/src/sgml/pgtrgm.sgml +++ b/doc/src/sgml/pgtrgm.sgml @@ -99,12 +99,10 @@ real - Returns a number that indicates how similar the first string - to the most similar word of the second string. The function searches in - the second string a most similar word not a most similar substring. 
The - range of the result is zero (indicating that the two strings are - completely dissimilar) to one (indicating that the first string is - identical to one of the words of the second string). + Returns a number that indicates the greatest similarity between + the set of trigrams in the first string and any continuous extent + of an ordered set of trigrams in the second string. For details, see + the explanation below. @@ -131,6 +129,34 @@ + + Consider the following example: + + +# SELECT word_similarity('word', 'two words'); + word_similarity +----------------- + 0.8 +(1 row) + + + In the first string, the set of trigrams is + {" w"," wo","ord","wor","rd "}. + In the second string, the ordered set of trigrams is + {" t"," tw","two","wo "," w"," wo","wor","ord","rds","ds "}. + The most similar extent of an ordered set of trigrams in the second string + is {" w"," wo","wor","ord"}, and the similarity is + 0.8. + + + + This function returns a value that can be approximately understood as the + greatest similarity between the first string and any substring of the second + string. However, this function does not add padding to the boundaries of + the extent. Thus, a whole word match gets a higher score than a match with + a part of the word. + + <filename>pg_trgm</filename> Operators @@ -156,10 +182,11 @@ text <% text boolean - Returns true if its first argument has the similar word in - the second argument and they have a similarity that is greater than the - current word similarity threshold set by - pg_trgm.word_similarity_threshold parameter. + Returns true if the similarity between the trigram + set in the first argument and a continuous extent of an ordered trigram + set in the second argument is greater than the current word similarity + threshold set by pg_trgm.word_similarity_threshold + parameter. 
@@ -302,10 +329,11 @@ SELECT t, word_similarity('word', t) AS sml WHERE 'word' <% t ORDER BY sml DESC, t; - This will return all values in the text column that have a word - which sufficiently similar to word, sorted from best - match to worst. The index will be used to make this a fast operation - even over very large data sets. + This will return all values in the text column for which there is a + continuous extent in the corresponding ordered trigram set that is + sufficiently similar to the trigram set of word, + sorted from best match to worst. The index will be used to make this + a fast operation even over very large data sets.