From be8a7a6866276b228b4ffaa3003e1dc2dd1d140a Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Wed, 21 Mar 2018 14:57:42 +0300 Subject: [PATCH] Add strict_word_similarity to pg_trgm module strict_word_similarity is similar to existing word_similarity function but it takes into account word boundaries to compute similarity. Author: Alexander Korotkov Review by: David Steele, Liudmila Mantrova, me Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com --- contrib/pg_trgm/Makefile | 5 +- .../pg_trgm/expected/pg_strict_word_trgm.out | 1025 +++++++++++++++++ contrib/pg_trgm/pg_trgm--1.3--1.4.sql | 68 ++ contrib/pg_trgm/pg_trgm.control | 2 +- contrib/pg_trgm/sql/pg_strict_word_trgm.sql | 42 + contrib/pg_trgm/trgm.h | 21 +- contrib/pg_trgm/trgm_gin.c | 9 +- contrib/pg_trgm/trgm_gist.c | 14 +- contrib/pg_trgm/trgm_op.c | 248 +++- doc/src/sgml/pgtrgm.sgml | 88 +- 10 files changed, 1461 insertions(+), 61 deletions(-) create mode 100644 contrib/pg_trgm/expected/pg_strict_word_trgm.out create mode 100644 contrib/pg_trgm/pg_trgm--1.3--1.4.sql create mode 100644 contrib/pg_trgm/sql/pg_strict_word_trgm.sql diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile index 212a89039a..dfecc2a37f 100644 --- a/contrib/pg_trgm/Makefile +++ b/contrib/pg_trgm/Makefile @@ -4,11 +4,12 @@ MODULE_big = pg_trgm OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES) EXTENSION = pg_trgm -DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \ +DATA = pg_trgm--1.3--1.4.sql \ + pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \ pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql PGFILEDESC = "pg_trgm - trigram matching" -REGRESS = pg_trgm pg_word_trgm +REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pg_trgm/expected/pg_strict_word_trgm.out b/contrib/pg_trgm/expected/pg_strict_word_trgm.out new file mode 100644 index 
0000000000..43898a3b98 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_strict_word_trgm.out @@ -0,0 +1,1025 @@ +DROP INDEX trgm_idx2; +\copy test_trgm3 from 'data/trgm2.data' +ERROR: relation "test_trgm3" does not exist +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; + t | sml +-------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 +(17 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; + t | sml +------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 +(4 rows) + +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; + t | sml +-------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 +(17 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; + t | sml +------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 +(4 rows) + +select t <->>> 'Alaikallupoddakulam', t from 
test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7; + ?column? | t +----------+-------------------------- + 0 | Alaikallupoddakulam + 0.25 | Alaikallupodda Alankulam + 0.32 | Alaikalluppodda Kulam + 0.615385 | Mulaikallu Kulam + 0.724138 | Koraikalapu Kulam + 0.75 | Vaikaliththevakulam + 0.766667 | Karaivaikal Kulam +(7 rows) + +create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops); +set enable_seqscan=off; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; + t | sml +-------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 +(17 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; + t | sml +------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 +(4 rows) + +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; + t | sml +-------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 +(17 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, 
t; + t | sml +------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 +(4 rows) + +explain (costs off) +select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7; + QUERY PLAN +--------------------------------------------------------- + Limit + -> Index Scan using trgm_idx2 on test_trgm2 + Order By: (t <->>> 'Alaikallupoddakulam'::text) +(3 rows) + +select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7; + ?column? | t +----------+-------------------------- + 0 | Alaikallupoddakulam + 0.25 | Alaikallupodda Alankulam + 0.32 | Alaikalluppodda Kulam + 0.615385 | Mulaikallu Kulam + 0.724138 | Koraikalapu Kulam + 0.75 | Vaikaliththevakulam + 0.766667 | Karaivaikal Kulam +(7 rows) + +drop index trgm_idx2; +create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops); +set enable_seqscan=off; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; + t | sml +-------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 +(17 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; + t | sml +------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 +(4 rows) + +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; + t | sml 
+-------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 +(17 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; + t | sml +------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 +(4 rows) + +set "pg_trgm.strict_word_similarity_threshold" to 0.4; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; + t | sml +-------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 + Zabaykal | 0.454545 + Air Bakal-kecil | 0.444444 + Bakal | 0.444444 + Bakal Batu | 0.444444 + Bakal Dos | 0.444444 + Bakal Julu | 0.444444 + Bakal Khel | 0.444444 + Bakal Lama | 0.444444 + Bakal Tres | 0.444444 + Bakal Uno | 0.444444 + Daang Bakal | 0.444444 + Desa Bakal | 0.444444 + Eat Bakal | 0.444444 + Gunung Bakal | 0.444444 + Sidi Bakal | 0.444444 + Stantsiya Bakal | 0.444444 + Sungai Bakal | 0.444444 + Talang Bakal | 0.444444 + Uruk Bakal | 0.444444 + Zaouia Oulad Bakal | 0.444444 + Baykalovskiy | 0.428571 + Baykalovskiy Rayon | 0.428571 + Baikal | 0.4 + Baikal Airfield | 0.4 + Baikal Business Centre | 0.4 
+ Baikal Hotel Moscow | 0.4 + Baikal Listvyanka Hotel | 0.4 + Baikal Mountains | 0.4 + Baikal Plaza | 0.4 + Bajkal | 0.4 + Bankal | 0.4 + Bankal School | 0.4 + Barkal | 0.4 + Jabal Barkal | 0.4 + Lake Baikal | 0.4 + Oulad el Bakkal | 0.4 + Sidi Mohammed Bakkal | 0.4 +(54 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; + t | sml +------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 + Kabikala | 0.461538 +(5 rows) + +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; + t | sml +-------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 + Zabaykal | 0.454545 + Air Bakal-kecil | 0.444444 + Bakal | 0.444444 + Bakal Batu | 0.444444 + Bakal Dos | 0.444444 + Bakal Julu | 0.444444 + Bakal Khel | 0.444444 + Bakal Lama | 0.444444 + Bakal Tres | 0.444444 + Bakal Uno | 0.444444 + Daang Bakal | 0.444444 + Desa Bakal | 0.444444 + Eat Bakal | 0.444444 + Gunung Bakal | 0.444444 + Sidi Bakal | 0.444444 + Stantsiya Bakal | 0.444444 + Sungai Bakal | 0.444444 + Talang Bakal | 0.444444 + Uruk Bakal | 0.444444 + Zaouia Oulad Bakal | 0.444444 + Baykalovskiy | 0.428571 + Baykalovskiy Rayon | 0.428571 + Baikal | 0.4 + Baikal Airfield | 0.4 + Baikal Business Centre | 0.4 + Baikal Hotel Moscow | 0.4 + Baikal Listvyanka Hotel | 0.4 + Baikal Mountains | 0.4 + Baikal Plaza | 0.4 + Bajkal | 0.4 + Bankal | 0.4 + Bankal School | 0.4 + Barkal | 0.4 + Jabal Barkal | 0.4 + Lake Baikal | 0.4 + 
Oulad el Bakkal | 0.4 + Sidi Mohammed Bakkal | 0.4 +(54 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; + t | sml +------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 + Kabikala | 0.461538 +(5 rows) + +set "pg_trgm.strict_word_similarity_threshold" to 0.2; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; + t | sml +-----------------------------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 + Zabaykal | 0.454545 + Air Bakal-kecil | 0.444444 + Bakal | 0.444444 + Bakal Batu | 0.444444 + Bakal Dos | 0.444444 + Bakal Julu | 0.444444 + Bakal Khel | 0.444444 + Bakal Lama | 0.444444 + Bakal Tres | 0.444444 + Bakal Uno | 0.444444 + Daang Bakal | 0.444444 + Desa Bakal | 0.444444 + Eat Bakal | 0.444444 + Gunung Bakal | 0.444444 + Sidi Bakal | 0.444444 + Stantsiya Bakal | 0.444444 + Sungai Bakal | 0.444444 + Talang Bakal | 0.444444 + Uruk Bakal | 0.444444 + Zaouia Oulad Bakal | 0.444444 + Baykalovskiy | 0.428571 + Baykalovskiy Rayon | 0.428571 + Baikal | 0.4 + Baikal Airfield | 0.4 + Baikal Business Centre | 0.4 + Baikal Hotel Moscow | 0.4 + Baikal Listvyanka Hotel | 0.4 + Baikal Mountains | 0.4 + Baikal Plaza | 0.4 + Bajkal | 0.4 + Bankal | 0.4 + Bankal School | 0.4 + Barkal | 0.4 + Jabal Barkal | 0.4 + Lake Baikal | 0.4 + Oulad el Bakkal | 0.4 + Sidi Mohammed Bakkal | 0.4 + Bay of Backaland | 0.375 + Boikalakalawa Bay | 0.375 + Waikalabubu Bay | 0.375 + Bairkal 
| 0.363636 + Bairkal Dhora | 0.363636 + Bairkal Jabal | 0.363636 + Batikal | 0.363636 + Bakaleyka | 0.307692 + Bakkalmal | 0.307692 + Bikal | 0.3 + Al Barkali | 0.285714 + Zabaykalka | 0.285714 + Baidal | 0.272727 + Baihal | 0.272727 + Baipal | 0.272727 + Bakala | 0.272727 + Bakala Koupi | 0.272727 + Bakale | 0.272727 + Bakali | 0.272727 + Bakall | 0.272727 + Bakaly | 0.272727 + Bakaly TV Mast | 0.272727 + Buur Bakale | 0.272727 + Gory Bakaly | 0.272727 + Kusu-Bakali | 0.272727 + Kwala Bakala | 0.272727 + Mbay Bakala | 0.272727 + Ngao Bakala | 0.272727 + Sidi Mohammed el Bakali | 0.272727 + Sopka Bakaly | 0.272727 + Sungai Bakala | 0.272727 + Urochishche Bakaly | 0.272727 + Alue Bakkala | 0.25 + Azib el Bakkali | 0.25 + Ba Kaliin | 0.25 + Baikaluobbal | 0.25 + Bakalam | 0.25 + Bakalan | 0.25 + Bakalan Barat | 0.25 + Bakalan Dua | 0.25 + Bakalan Kidul | 0.25 + Bakalan Kulon | 0.25 + Bakalan Lor | 0.25 + Bakalan River | 0.25 + Bakalan Tengah | 0.25 + Bakalan Wetan | 0.25 + Bakalao Asibi Point | 0.25 + Bakalao Point | 0.25 + Bakalar Air Force Base (historical) | 0.25 + Bakalar Lake | 0.25 + Bakalar Library | 0.25 + Bakalda | 0.25 + Bakaldy | 0.25 + Bakaley | 0.25 + Bakalha | 0.25 + Bakalia Char | 0.25 + Bakalka | 0.25 + Bakalod Island | 0.25 + Bakalou | 0.25 + Bakalua | 0.25 + Bakalum | 0.25 + Bakkala Cemetery | 0.25 + Bankali | 0.25 + Barkala | 0.25 + Barkala Park | 0.25 + Barkala Rao | 0.25 + Barkala Reserved Forest | 0.25 + Barkald | 0.25 + Barkald stasjon | 0.25 + Barkale | 0.25 + Barkali | 0.25 + Baukala | 0.25 + Buur Bakaley | 0.25 + Columbus Bakalar Municipal Airport | 0.25 + Dakshin Bakalia | 0.25 + Danau Bakalan | 0.25 + Desa Bakalan | 0.25 + Gunung Bakalan | 0.25 + Kali Bakalan | 0.25 + Khrebet Batkali | 0.25 + Kordon Barkalo | 0.25 + Krajan Bakalan | 0.25 + Ovrag Bakalda | 0.25 + Pulau Bakalan | 0.25 + Selat Bakalan | 0.25 + Teluk Bakalan | 0.25 + Tukad Bakalan | 0.25 + Urochishche Batkali | 0.25 + Babakale | 0.230769 + Babakalo | 0.230769 + Bagkalen | 
0.230769 + Bakalalan Airport | 0.230769 + Bakalang | 0.230769 + Bakalarr | 0.230769 + Bakalawa | 0.230769 + Bakaldum | 0.230769 + Bakaleko | 0.230769 + Bakalica | 0.230769 + Bakalino | 0.230769 + Bakalite | 0.230769 + Bakalovo | 0.230769 + Bakalsen | 0.230769 + Bakaltua Bank | 0.230769 + Bakalukalu | 0.230769 + Bakalukalu Shan | 0.230769 + Bakkalia | 0.230769 + Bankalol | 0.230769 + Barkaleh | 0.230769 + Barkalne | 0.230769 + Barkalow Hollow | 0.230769 + Bawkalut | 0.230769 + Bawkalut Chaung | 0.230769 + Clifton T Barkalow Elementary School | 0.230769 + Efrejtor Bakalovo | 0.230769 + Efreytor-Bakalovo | 0.230769 + Gora Barkalyu | 0.230769 + Ile Bakalibu | 0.230769 + Khor Bakallii | 0.230769 + Nehalla Bankalah Reserved Forest | 0.230769 + Ragha Bakalzai | 0.230769 + Tanjung Batikala | 0.230769 + Teluk Bakalang | 0.230769 + Urochishche Bakalovo | 0.230769 + Banjar Kubakal | 0.222222 + Darreh Pumba Kal | 0.222222 + Zabaykalovskiy | 0.222222 + Aparthotel Adagio Premium Dubai Al Barsha | 0.214286 + Babakalia | 0.214286 + Bahkalleh | 0.214286 + Baikalovo | 0.214286 + Bakalaale | 0.214286 + Bakalabwa Pans | 0.214286 + Bakalaeng | 0.214286 + Bakalauri | 0.214286 + Bakalbhar | 0.214286 + Bakalbuah | 0.214286 + Bakalerek | 0.214286 + Bakalinga | 0.214286 + Bakalipur | 0.214286 + Bakaljaya | 0.214286 + Bakalnica | 0.214286 + Bakalongo | 0.214286 + Bakalovka | 0.214286 + Bakalrejo | 0.214286 + Bakkalale | 0.214286 + Bambakala | 0.214286 + Bambakalo | 0.214286 + Barkalare | 0.214286 + Barkalden | 0.214286 + Barkallou | 0.214286 + Barkalova | 0.214286 + Baskalino | 0.214286 + Baskaltsi | 0.214286 + Desa Bakalrejo | 0.214286 + Doubletree By Hilton Dubai Al Barsha Hotel and Res | 0.214286 + Doubletree By Hilton Hotel and Apartments Dubai Al Barsha | 0.214286 + Doubletree Res.Dubai-Al Barsha | 0.214286 + Gora Barkalova | 0.214286 + Holiday Inn Dubai Al Barsha | 0.214286 + Novotel Dubai Al Barsha | 0.214286 + Park Inn By Radisson Dubai Al Barsha | 0.214286 + Ramee Rose Hotel Dubai 
Al Barsha | 0.214286 + Ras Barkallah | 0.214286 + Salu Bakalaeng | 0.214286 + Tanjung Bakalinga | 0.214286 + Tubu Bakalekuk | 0.214286 + Baikalakko | 0.2 + Bakalauri1 | 0.2 + Bakalauri2 | 0.2 + Bakalauri3 | 0.2 + Bakalauri4 | 0.2 + Bakalauri5 | 0.2 + Bakalauri6 | 0.2 + Bakalauri7 | 0.2 + Bakalauri8 | 0.2 + Bakalauri9 | 0.2 + Bakaldalam | 0.2 + Bakaldukuh | 0.2 + Bakaloolay | 0.2 + Bakalovina | 0.2 + Bakalpokok | 0.2 + Bakalshile | 0.2 + Bakalukudu | 0.2 + Bambakalia | 0.2 + Barkaladja Pool | 0.2 + Barkalovka | 0.2 + Bavkalasis | 0.2 + Gora Bakalyadyr | 0.2 + Kampong Bakaladong | 0.2 + Urochishche Bakalarnyn-Ayasy | 0.2 + Urochishche Bakaldikha | 0.2 +(245 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; + t | sml +----------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 + Kabikala | 0.461538 + Ntombankala School | 0.375 + Nehalla Bankalah Reserved Forest | 0.357143 + Jabba Kalai | 0.333333 + Kambakala | 0.333333 + Ker Samba Kalla | 0.333333 + Bankal | 0.307692 + Bankal School | 0.307692 + Kanampumba-Kalawa | 0.307692 + Bankali | 0.285714 + Mwalaba-Kalamba | 0.285714 + Tumba-Kalamba | 0.285714 + Darreh Pumba Kal | 0.272727 + Bankalol | 0.266667 + Dabakala | 0.266667 + Purba Kalaujan | 0.266667 + Kali Purbakala | 0.263158 + Dalabakala | 0.25 + Demba Kali | 0.25 + Gagaba Kalo | 0.25 + Golba Kalo | 0.25 + Habakkala | 0.25 + Kali Bakalan | 0.25 + Kimbakala | 0.25 + Kombakala | 0.25 + Jaba Kalle | 0.235294 + Kaikalahun Indian Reserve 25 | 0.235294 + Kwala Bakala | 0.235294 + Gereba Kaler | 0.230769 + Goth Soba Kaloi | 0.230769 + Guba Kaldo | 0.230769 + Gulba Kalle | 0.230769 + Guba Kalgalaksha | 0.222222 + Kalibakalako | 0.222222 + Ba Kaliin | 0.214286 + Bakala | 0.214286 + Bakala Koupi | 0.214286 + Bikala | 0.214286 + Bikala Madila | 0.214286 + Bugor Arba-Kalgan | 0.214286 + Bumba-Kaloki | 0.214286 + Guba 
Kalita | 0.214286 + Kamba-Kalele | 0.214286 + Mbay Bakala | 0.214286 + Ngao Bakala | 0.214286 + Sungai Bakala | 0.214286 + Fayzabadkala | 0.210526 + Gora Fayzabadkala | 0.210526 + Alue Bakkala | 0.2 + Bakkala Cemetery | 0.2 + Barkala | 0.2 + Barkala Park | 0.2 + Barkala Rao | 0.2 + Barkala Reserved Forest | 0.2 + Baukala | 0.2 + Beikala | 0.2 + Bomba-Kalende | 0.2 + Bumba-Kalumba | 0.2 + Haikala | 0.2 + Kahambikalela | 0.2 + Kaikalapettai | 0.2 + Kaikale | 0.2 + Laikala | 0.2 + Maikala Range | 0.2 + Matamba-Kalenga | 0.2 + Matamba-Kalenge | 0.2 + Naikala | 0.2 + Tumba-Kalumba | 0.2 + Tumba-Kalunga | 0.2 + Waikala | 0.2 +(74 rows) + +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; + t | sml +-----------------------------------------------------------+---------- + Baykal | 1 + Boloto Baykal | 1 + Boloto Malyy Baykal | 1 + Kolkhoz Krasnyy Baykal | 1 + Ozero Baykal | 1 + Polevoy Stan Baykal | 1 + Port Baykal | 1 + Prud Novyy Baykal | 1 + Sanatoriy Baykal | 1 + Stantsiya Baykal | 1 + Zaliv Baykal | 1 + Baykalo-Amurskaya Zheleznaya Doroga | 0.666667 + Baykalovo | 0.545455 + Baykalsko | 0.545455 + Maloye Baykalovo | 0.545455 + Baykalikha | 0.5 + Baykalovsk | 0.5 + Zabaykal | 0.454545 + Air Bakal-kecil | 0.444444 + Bakal | 0.444444 + Bakal Batu | 0.444444 + Bakal Dos | 0.444444 + Bakal Julu | 0.444444 + Bakal Khel | 0.444444 + Bakal Lama | 0.444444 + Bakal Tres | 0.444444 + Bakal Uno | 0.444444 + Daang Bakal | 0.444444 + Desa Bakal | 0.444444 + Eat Bakal | 0.444444 + Gunung Bakal | 0.444444 + Sidi Bakal | 0.444444 + Stantsiya Bakal | 0.444444 + Sungai Bakal | 0.444444 + Talang Bakal | 0.444444 + Uruk Bakal | 0.444444 + Zaouia Oulad Bakal | 0.444444 + Baykalovskiy | 0.428571 + Baykalovskiy Rayon | 0.428571 + Baikal | 0.4 + Baikal Airfield | 0.4 + Baikal Business Centre | 0.4 + Baikal Hotel Moscow | 0.4 + Baikal Listvyanka Hotel | 0.4 + Baikal Mountains | 0.4 + Baikal Plaza | 0.4 + Bajkal | 0.4 + Bankal | 0.4 + 
Bankal School | 0.4 + Barkal | 0.4 + Jabal Barkal | 0.4 + Lake Baikal | 0.4 + Oulad el Bakkal | 0.4 + Sidi Mohammed Bakkal | 0.4 + Bay of Backaland | 0.375 + Boikalakalawa Bay | 0.375 + Waikalabubu Bay | 0.375 + Bairkal | 0.363636 + Bairkal Dhora | 0.363636 + Bairkal Jabal | 0.363636 + Batikal | 0.363636 + Bakaleyka | 0.307692 + Bakkalmal | 0.307692 + Bikal | 0.3 + Al Barkali | 0.285714 + Zabaykalka | 0.285714 + Baidal | 0.272727 + Baihal | 0.272727 + Baipal | 0.272727 + Bakala | 0.272727 + Bakala Koupi | 0.272727 + Bakale | 0.272727 + Bakali | 0.272727 + Bakall | 0.272727 + Bakaly | 0.272727 + Bakaly TV Mast | 0.272727 + Buur Bakale | 0.272727 + Gory Bakaly | 0.272727 + Kusu-Bakali | 0.272727 + Kwala Bakala | 0.272727 + Mbay Bakala | 0.272727 + Ngao Bakala | 0.272727 + Sidi Mohammed el Bakali | 0.272727 + Sopka Bakaly | 0.272727 + Sungai Bakala | 0.272727 + Urochishche Bakaly | 0.272727 + Alue Bakkala | 0.25 + Azib el Bakkali | 0.25 + Ba Kaliin | 0.25 + Baikaluobbal | 0.25 + Bakalam | 0.25 + Bakalan | 0.25 + Bakalan Barat | 0.25 + Bakalan Dua | 0.25 + Bakalan Kidul | 0.25 + Bakalan Kulon | 0.25 + Bakalan Lor | 0.25 + Bakalan River | 0.25 + Bakalan Tengah | 0.25 + Bakalan Wetan | 0.25 + Bakalao Asibi Point | 0.25 + Bakalao Point | 0.25 + Bakalar Air Force Base (historical) | 0.25 + Bakalar Lake | 0.25 + Bakalar Library | 0.25 + Bakalda | 0.25 + Bakaldy | 0.25 + Bakaley | 0.25 + Bakalha | 0.25 + Bakalia Char | 0.25 + Bakalka | 0.25 + Bakalod Island | 0.25 + Bakalou | 0.25 + Bakalua | 0.25 + Bakalum | 0.25 + Bakkala Cemetery | 0.25 + Bankali | 0.25 + Barkala | 0.25 + Barkala Park | 0.25 + Barkala Rao | 0.25 + Barkala Reserved Forest | 0.25 + Barkald | 0.25 + Barkald stasjon | 0.25 + Barkale | 0.25 + Barkali | 0.25 + Baukala | 0.25 + Buur Bakaley | 0.25 + Columbus Bakalar Municipal Airport | 0.25 + Dakshin Bakalia | 0.25 + Danau Bakalan | 0.25 + Desa Bakalan | 0.25 + Gunung Bakalan | 0.25 + Kali Bakalan | 0.25 + Khrebet Batkali | 0.25 + Kordon Barkalo | 0.25 + Krajan 
Bakalan | 0.25 + Ovrag Bakalda | 0.25 + Pulau Bakalan | 0.25 + Selat Bakalan | 0.25 + Teluk Bakalan | 0.25 + Tukad Bakalan | 0.25 + Urochishche Batkali | 0.25 + Babakale | 0.230769 + Babakalo | 0.230769 + Bagkalen | 0.230769 + Bakalalan Airport | 0.230769 + Bakalang | 0.230769 + Bakalarr | 0.230769 + Bakalawa | 0.230769 + Bakaldum | 0.230769 + Bakaleko | 0.230769 + Bakalica | 0.230769 + Bakalino | 0.230769 + Bakalite | 0.230769 + Bakalovo | 0.230769 + Bakalsen | 0.230769 + Bakaltua Bank | 0.230769 + Bakalukalu | 0.230769 + Bakalukalu Shan | 0.230769 + Bakkalia | 0.230769 + Bankalol | 0.230769 + Barkaleh | 0.230769 + Barkalne | 0.230769 + Barkalow Hollow | 0.230769 + Bawkalut | 0.230769 + Bawkalut Chaung | 0.230769 + Clifton T Barkalow Elementary School | 0.230769 + Efrejtor Bakalovo | 0.230769 + Efreytor-Bakalovo | 0.230769 + Gora Barkalyu | 0.230769 + Ile Bakalibu | 0.230769 + Khor Bakallii | 0.230769 + Nehalla Bankalah Reserved Forest | 0.230769 + Ragha Bakalzai | 0.230769 + Tanjung Batikala | 0.230769 + Teluk Bakalang | 0.230769 + Urochishche Bakalovo | 0.230769 + Banjar Kubakal | 0.222222 + Darreh Pumba Kal | 0.222222 + Zabaykalovskiy | 0.222222 + Aparthotel Adagio Premium Dubai Al Barsha | 0.214286 + Babakalia | 0.214286 + Bahkalleh | 0.214286 + Baikalovo | 0.214286 + Bakalaale | 0.214286 + Bakalabwa Pans | 0.214286 + Bakalaeng | 0.214286 + Bakalauri | 0.214286 + Bakalbhar | 0.214286 + Bakalbuah | 0.214286 + Bakalerek | 0.214286 + Bakalinga | 0.214286 + Bakalipur | 0.214286 + Bakaljaya | 0.214286 + Bakalnica | 0.214286 + Bakalongo | 0.214286 + Bakalovka | 0.214286 + Bakalrejo | 0.214286 + Bakkalale | 0.214286 + Bambakala | 0.214286 + Bambakalo | 0.214286 + Barkalare | 0.214286 + Barkalden | 0.214286 + Barkallou | 0.214286 + Barkalova | 0.214286 + Baskalino | 0.214286 + Baskaltsi | 0.214286 + Desa Bakalrejo | 0.214286 + Doubletree By Hilton Dubai Al Barsha Hotel and Res | 0.214286 + Doubletree By Hilton Hotel and Apartments Dubai Al Barsha | 0.214286 + 
Doubletree Res.Dubai-Al Barsha | 0.214286 + Gora Barkalova | 0.214286 + Holiday Inn Dubai Al Barsha | 0.214286 + Novotel Dubai Al Barsha | 0.214286 + Park Inn By Radisson Dubai Al Barsha | 0.214286 + Ramee Rose Hotel Dubai Al Barsha | 0.214286 + Ras Barkallah | 0.214286 + Salu Bakalaeng | 0.214286 + Tanjung Bakalinga | 0.214286 + Tubu Bakalekuk | 0.214286 + Baikalakko | 0.2 + Bakalauri1 | 0.2 + Bakalauri2 | 0.2 + Bakalauri3 | 0.2 + Bakalauri4 | 0.2 + Bakalauri5 | 0.2 + Bakalauri6 | 0.2 + Bakalauri7 | 0.2 + Bakalauri8 | 0.2 + Bakalauri9 | 0.2 + Bakaldalam | 0.2 + Bakaldukuh | 0.2 + Bakaloolay | 0.2 + Bakalovina | 0.2 + Bakalpokok | 0.2 + Bakalshile | 0.2 + Bakalukudu | 0.2 + Bambakalia | 0.2 + Barkaladja Pool | 0.2 + Barkalovka | 0.2 + Bavkalasis | 0.2 + Gora Bakalyadyr | 0.2 + Kampong Bakaladong | 0.2 + Urochishche Bakalarnyn-Ayasy | 0.2 + Urochishche Bakaldikha | 0.2 +(245 rows) + +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; + t | sml +----------------------------------+---------- + Kabankala | 1 + Kabankalan City Public Plaza | 0.75 + Abankala | 0.583333 + Kabakala | 0.583333 + Kabikala | 0.461538 + Ntombankala School | 0.375 + Nehalla Bankalah Reserved Forest | 0.357143 + Jabba Kalai | 0.333333 + Kambakala | 0.333333 + Ker Samba Kalla | 0.333333 + Bankal | 0.307692 + Bankal School | 0.307692 + Kanampumba-Kalawa | 0.307692 + Bankali | 0.285714 + Mwalaba-Kalamba | 0.285714 + Tumba-Kalamba | 0.285714 + Darreh Pumba Kal | 0.272727 + Bankalol | 0.266667 + Dabakala | 0.266667 + Purba Kalaujan | 0.266667 + Kali Purbakala | 0.263158 + Dalabakala | 0.25 + Demba Kali | 0.25 + Gagaba Kalo | 0.25 + Golba Kalo | 0.25 + Habakkala | 0.25 + Kali Bakalan | 0.25 + Kimbakala | 0.25 + Kombakala | 0.25 + Jaba Kalle | 0.235294 + Kaikalahun Indian Reserve 25 | 0.235294 + Kwala Bakala | 0.235294 + Gereba Kaler | 0.230769 + Goth Soba Kaloi | 0.230769 + Guba Kaldo | 0.230769 + Gulba Kalle | 0.230769 + Guba 
Kalgalaksha | 0.222222 + Kalibakalako | 0.222222 + Ba Kaliin | 0.214286 + Bakala | 0.214286 + Bakala Koupi | 0.214286 + Bikala | 0.214286 + Bikala Madila | 0.214286 + Bugor Arba-Kalgan | 0.214286 + Bumba-Kaloki | 0.214286 + Guba Kalita | 0.214286 + Kamba-Kalele | 0.214286 + Mbay Bakala | 0.214286 + Ngao Bakala | 0.214286 + Sungai Bakala | 0.214286 + Fayzabadkala | 0.210526 + Gora Fayzabadkala | 0.210526 + Alue Bakkala | 0.2 + Bakkala Cemetery | 0.2 + Barkala | 0.2 + Barkala Park | 0.2 + Barkala Rao | 0.2 + Barkala Reserved Forest | 0.2 + Baukala | 0.2 + Beikala | 0.2 + Bomba-Kalende | 0.2 + Bumba-Kalumba | 0.2 + Haikala | 0.2 + Kahambikalela | 0.2 + Kaikalapettai | 0.2 + Kaikale | 0.2 + Laikala | 0.2 + Maikala Range | 0.2 + Matamba-Kalenga | 0.2 + Matamba-Kalenge | 0.2 + Naikala | 0.2 + Tumba-Kalumba | 0.2 + Tumba-Kalunga | 0.2 + Waikala | 0.2 +(74 rows) + diff --git a/contrib/pg_trgm/pg_trgm--1.3--1.4.sql b/contrib/pg_trgm/pg_trgm--1.3--1.4.sql new file mode 100644 index 0000000000..64a0c219b5 --- /dev/null +++ b/contrib/pg_trgm/pg_trgm--1.3--1.4.sql @@ -0,0 +1,68 @@ +/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. 
\quit + +CREATE FUNCTION strict_word_similarity(text,text) +RETURNS float4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; + +CREATE FUNCTION strict_word_similarity_op(text,text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.strict_word_similarity_threshold + +CREATE FUNCTION strict_word_similarity_commutator_op(text,text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.strict_word_similarity_threshold + +CREATE OPERATOR <<% ( + LEFTARG = text, + RIGHTARG = text, + PROCEDURE = strict_word_similarity_op, + COMMUTATOR = '%>>', + RESTRICT = contsel, + JOIN = contjoinsel +); + +CREATE OPERATOR %>> ( + LEFTARG = text, + RIGHTARG = text, + PROCEDURE = strict_word_similarity_commutator_op, + COMMUTATOR = '<<%', + RESTRICT = contsel, + JOIN = contjoinsel +); + +CREATE FUNCTION strict_word_similarity_dist_op(text,text) +RETURNS float4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; + +CREATE FUNCTION strict_word_similarity_dist_commutator_op(text,text) +RETURNS float4 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; + +CREATE OPERATOR <<<-> ( + LEFTARG = text, + RIGHTARG = text, + PROCEDURE = strict_word_similarity_dist_op, + COMMUTATOR = '<->>>' +); + +CREATE OPERATOR <->>> ( + LEFTARG = text, + RIGHTARG = text, + PROCEDURE = strict_word_similarity_dist_commutator_op, + COMMUTATOR = '<<<->' +); + +ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD + OPERATOR 9 %>> (text, text), + OPERATOR 10 <->>> (text, text) FOR ORDER BY pg_catalog.float_ops; + +ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD + OPERATOR 9 %>> (text, text); diff --git a/contrib/pg_trgm/pg_trgm.control b/contrib/pg_trgm/pg_trgm.control index 06f274f01a..3e325dde00 100644 --- a/contrib/pg_trgm/pg_trgm.control +++ b/contrib/pg_trgm/pg_trgm.control @@ -1,5 +1,5 @@ # pg_trgm extension comment = 'text similarity measurement and index 
searching based on trigrams' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/pg_trgm' relocatable = true diff --git a/contrib/pg_trgm/sql/pg_strict_word_trgm.sql b/contrib/pg_trgm/sql/pg_strict_word_trgm.sql new file mode 100644 index 0000000000..98e0d379f8 --- /dev/null +++ b/contrib/pg_trgm/sql/pg_strict_word_trgm.sql @@ -0,0 +1,42 @@ +DROP INDEX trgm_idx2; + +\copy test_trgm3 from 'data/trgm2.data' + +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; +select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7; + +create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops); +set enable_seqscan=off; + +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; + +explain (costs off) +select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7; +select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7; + +drop index trgm_idx2; +create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops); +set enable_seqscan=off; + +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by 
sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; + +set "pg_trgm.strict_word_similarity_threshold" to 0.4; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; + +set "pg_trgm.strict_word_similarity_threshold" to 0.2; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t; +select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t; +select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t; diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index 45df91875a..f0ab50dd05 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -6,6 +6,7 @@ #include "access/gist.h" #include "access/itup.h" +#include "access/stratnum.h" #include "storage/bufpage.h" /* @@ -26,14 +27,16 @@ #define DIVUNION /* operator strategy numbers */ -#define SimilarityStrategyNumber 1 -#define DistanceStrategyNumber 2 -#define LikeStrategyNumber 3 -#define ILikeStrategyNumber 4 -#define RegExpStrategyNumber 5 -#define RegExpICaseStrategyNumber 6 -#define WordSimilarityStrategyNumber 7 -#define 
WordDistanceStrategyNumber 8 +#define SimilarityStrategyNumber 1 +#define DistanceStrategyNumber 2 +#define LikeStrategyNumber 3 +#define ILikeStrategyNumber 4 +#define RegExpStrategyNumber 5 +#define RegExpICaseStrategyNumber 6 +#define WordSimilarityStrategyNumber 7 +#define WordDistanceStrategyNumber 8 +#define StrictWordSimilarityStrategyNumber 9 +#define StrictWordDistanceStrategyNumber 10 typedef char trgm[3]; @@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph; extern double similarity_threshold; extern double word_similarity_threshold; +extern double strict_word_similarity_threshold; +extern double index_strategy_get_limit(StrategyNumber strategy); extern uint32 trgm2int(trgm *ptr); extern void compact_trigram(trgm *tptr, char *str, int bytelen); extern TRGM *generate_trgm(char *str, int slen); diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c index e4b3daea44..1b9809b565 100644 --- a/contrib/pg_trgm/trgm_gin.c +++ b/contrib/pg_trgm/trgm_gin.c @@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) { case SimilarityStrategyNumber: case WordSimilarityStrategyNumber: + case StrictWordSimilarityStrategyNumber: trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val)); break; case ILikeStrategyNumber: @@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) { case SimilarityStrategyNumber: case WordSimilarityStrategyNumber: - nlimit = (strategy == SimilarityStrategyNumber) ? - similarity_threshold : word_similarity_threshold; + case StrictWordSimilarityStrategyNumber: + nlimit = index_strategy_get_limit(strategy); /* Count the matches */ ntrue = 0; @@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS) { case SimilarityStrategyNumber: case WordSimilarityStrategyNumber: - nlimit = (strategy == SimilarityStrategyNumber) ? 
- similarity_threshold : word_similarity_threshold; + case StrictWordSimilarityStrategyNumber: + nlimit = index_strategy_get_limit(strategy); /* Count the matches */ ntrue = 0; diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index e55dc19a65..53e6830ab1 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) { case SimilarityStrategyNumber: case WordSimilarityStrategyNumber: + case StrictWordSimilarityStrategyNumber: qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ); break; @@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS) { case SimilarityStrategyNumber: case WordSimilarityStrategyNumber: - /* Similarity search is exact. Word similarity search is inexact */ - *recheck = (strategy == WordSimilarityStrategyNumber); - nlimit = (strategy == SimilarityStrategyNumber) ? - similarity_threshold : word_similarity_threshold; + case StrictWordSimilarityStrategyNumber: + /* Similarity search is exact. 
(Strict) word similarity search is inexact */ + *recheck = (strategy != SimilarityStrategyNumber); + + nlimit = index_strategy_get_limit(strategy); if (GIST_LEAF(entry)) { /* all leafs contains orig trgm */ @@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS) { case DistanceStrategyNumber: case WordDistanceStrategyNumber: - *recheck = strategy == WordDistanceStrategyNumber; + case StrictWordDistanceStrategyNumber: + /* Only plain trigram distance is exact */ + *recheck = (strategy != DistanceStrategyNumber); if (GIST_LEAF(entry)) { /* all leafs contains orig trgm */ diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 306d60bd3b..b572d087d8 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -18,6 +18,7 @@ PG_MODULE_MAGIC; /* GUC variables */ double similarity_threshold = 0.3f; double word_similarity_threshold = 0.6f; +double strict_word_similarity_threshold = 0.5f; void _PG_init(void); @@ -26,12 +27,17 @@ PG_FUNCTION_INFO_V1(show_limit); PG_FUNCTION_INFO_V1(show_trgm); PG_FUNCTION_INFO_V1(similarity); PG_FUNCTION_INFO_V1(word_similarity); +PG_FUNCTION_INFO_V1(strict_word_similarity); PG_FUNCTION_INFO_V1(similarity_dist); PG_FUNCTION_INFO_V1(similarity_op); PG_FUNCTION_INFO_V1(word_similarity_op); PG_FUNCTION_INFO_V1(word_similarity_commutator_op); PG_FUNCTION_INFO_V1(word_similarity_dist_op); PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op); +PG_FUNCTION_INFO_V1(strict_word_similarity_op); +PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op); +PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op); +PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op); /* Trigram with position */ typedef struct @@ -40,6 +46,17 @@ typedef struct int index; } pos_trgm; +/* Trigram bound type */ +typedef uint8 TrgmBound; +#define TRGM_BOUND_LEFT (0x01) /* trigram is left bound of word */ +#define TRGM_BOUND_RIGHT (0x02) /* trigram is right bound of word */ + +/* Word similarity flags */ +#define WORD_SIMILARITY_CHECK_ONLY 
(0x01) /* if set then only check existence + * of similar search pattern in text */ +#define WORD_SIMILARITY_STRICT (0x02) /* force bounds of extent to match + * word bounds */ + /* * Module load callback */ @@ -71,6 +88,18 @@ _PG_init(void) NULL, NULL, NULL); + DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold", + "Sets the threshold used by the <<%% operator.", + "Valid range is 0.0 .. 1.0.", + &strict_word_similarity_threshold, + 0.5, + 0.0, + 1.0, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); } /* @@ -95,6 +124,29 @@ set_limit(PG_FUNCTION_ARGS) PG_RETURN_FLOAT4(similarity_threshold); } + +/* + * Get similarity threshold for given index scan strategy number. + */ +double +index_strategy_get_limit(StrategyNumber strategy) +{ + switch (strategy) + { + case SimilarityStrategyNumber: + return similarity_threshold; + case WordSimilarityStrategyNumber: + return word_similarity_threshold; + case StrictWordSimilarityStrategyNumber: + return strict_word_similarity_threshold; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + break; + } + + return 0.0; /* keep compiler quiet */ +} + /* * Deprecated function. * Use "pg_trgm.similarity_threshold" GUC variable instead of this function. @@ -235,11 +287,12 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) * * trg: where to return the array of trigrams. * str: source string, of length slen bytes. + * bounds: where to return bounds of trigrams (if needed). * * Returns length of the generated array. 
*/ static int -generate_trgm_only(trgm *trg, char *str, int slen) +generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds) { trgm *tptr; char *buf; @@ -282,11 +335,13 @@ generate_trgm_only(trgm *trg, char *str, int slen) buf[LPADDING + bytelen] = ' '; buf[LPADDING + bytelen + 1] = ' '; - /* - * count trigrams - */ + /* Calculate trigrams marking their bounds if needed */ + if (bounds) + bounds[tptr - trg] |= TRGM_BOUND_LEFT; tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING, charlen + LPADDING + RPADDING); + if (bounds) + bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT; } pfree(buf); @@ -328,7 +383,7 @@ generate_trgm(char *str, int slen) trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); trg->flag = ARRKEY; - len = generate_trgm_only(GETARR(trg), str, slen); + len = generate_trgm_only(GETARR(trg), str, slen, NULL); SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); if (len == 0) @@ -413,8 +468,8 @@ comp_ptrgm(const void *v1, const void *v2) * ulen1: count of unique trigrams of array "trg1". * len2: length of array "trg2" and array "trg2indexes". * len: length of the array "found". - * check_only: if true then only check existence of similar search pattern in - * text. + * flags: set of boolean flags parametrizing similarity calculation. + * bounds: whether each trigram is left/right bound of word. * * Returns word similarity. */ @@ -424,16 +479,32 @@ iterate_word_similarity(int *trg2indexes, int ulen1, int len2, int len, - bool check_only) + uint8 flags, + TrgmBound *bounds) { int *lastpos, i, ulen2 = 0, count = 0, upper = -1, - lower = -1; + lower; float4 smlr_cur, smlr_max = 0.0f; + double threshold; + + Assert(bounds || !(flags & WORD_SIMILARITY_STRICT)); + + /* Select appropriate threshold */ + threshold = (flags & WORD_SIMILARITY_STRICT) ? 
+ strict_word_similarity_threshold : + word_similarity_threshold; + + /* + * Consider first trigram as initial lower bound for strict word similarity, + * or initialize it later with first trigram present for plain word + * similarity. + */ + lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1; /* Memorise last position of each trigram */ lastpos = (int *) palloc(sizeof(int) * len); @@ -456,8 +527,13 @@ iterate_word_similarity(int *trg2indexes, lastpos[trgindex] = i; } - /* Adjust upper bound if this trigram is present in required substring */ - if (found[trgindex]) + /* + * Adjust upper bound if trigram is upper bound of word for strict + * word similarity, or if trigram is present in required substring for + * plain word similarity + */ + if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT) + : found[trgindex]) { int prev_lower, tmp_ulen2, @@ -479,24 +555,35 @@ iterate_word_similarity(int *trg2indexes, prev_lower = lower; for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++) { - float smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2); + float smlr_tmp; int tmp_trgindex; - if (smlr_tmp > smlr_cur) - { - smlr_cur = smlr_tmp; - ulen2 = tmp_ulen2; - lower = tmp_lower; - count = tmp_count; - } - /* - * if we only check that word similarity is greater than - * pg_trgm.word_similarity_threshold we do not need to - * calculate a maximum similarity. + * Adjust lower bound only if trigram is lower bound of word + * for strict word similarity, or consider every trigram as + * lower bound for plain word similarity. 
*/ - if (check_only && smlr_cur >= word_similarity_threshold) - break; + if (!(flags & WORD_SIMILARITY_STRICT) + || (bounds[tmp_lower] & TRGM_BOUND_LEFT)) + { + smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2); + if (smlr_tmp > smlr_cur) + { + smlr_cur = smlr_tmp; + ulen2 = tmp_ulen2; + lower = tmp_lower; + count = tmp_count; + } + + /* + * If we only check that word similarity is greater than + * threshold we do not need to calculate a maximum + * similarity. + */ + if ((flags & WORD_SIMILARITY_CHECK_ONLY) + && smlr_cur >= threshold) + break; + } tmp_trgindex = trg2indexes[tmp_lower]; if (lastpos[tmp_trgindex] == tmp_lower) @@ -511,10 +598,9 @@ iterate_word_similarity(int *trg2indexes, /* * if we only check that word similarity is greater than - * pg_trgm.word_similarity_threshold we do not need to calculate a - * maximum similarity + * threshold we do not need to calculate a maximum similarity. */ - if (check_only && smlr_max >= word_similarity_threshold) + if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold) break; for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++) @@ -547,14 +633,13 @@ iterate_word_similarity(int *trg2indexes, * * str1: search pattern string, of length slen1 bytes. * str2: text in which we are looking for a word, of length slen2 bytes. - * check_only: if true then only check existence of similar search pattern in - * text. + * flags: set of boolean flags parametrizing similarity calculation. * * Returns word similarity. 
*/ static float4 calc_word_similarity(char *str1, int slen1, char *str2, int slen2, - bool check_only) + uint8 flags) { bool *found; pos_trgm *ptrg; @@ -568,15 +653,20 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2, ulen1; int *trg2indexes; float4 result; + TrgmBound *bounds; protect_out_of_mem(slen1 + slen2); /* Make positional trigrams */ trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3); trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3); + if (flags & WORD_SIMILARITY_STRICT) + bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3); + else + bounds = NULL; - len1 = generate_trgm_only(trg1, str1, slen1); - len2 = generate_trgm_only(trg2, str2, slen2); + len1 = generate_trgm_only(trg1, str1, slen1, NULL); + len2 = generate_trgm_only(trg2, str2, slen2, bounds); ptrg = make_positional_trgm(trg1, len1, trg2, len2); len = len1 + len2; @@ -622,7 +712,7 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2, /* Run iterative procedure to find maximum similarity with word */ result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len, - check_only); + flags, bounds); pfree(trg2indexes); pfree(found); @@ -1081,7 +1171,23 @@ word_similarity(PG_FUNCTION_ARGS) res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), - false); + 0); + + PG_FREE_IF_COPY(in1, 0); + PG_FREE_IF_COPY(in2, 1); + PG_RETURN_FLOAT4(res); +} + +Datum +strict_word_similarity(PG_FUNCTION_ARGS) +{ + text *in1 = PG_GETARG_TEXT_PP(0); + text *in2 = PG_GETARG_TEXT_PP(1); + float4 res; + + res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), + VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), + WORD_SIMILARITY_STRICT); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); @@ -1117,7 +1223,7 @@ word_similarity_op(PG_FUNCTION_ARGS) res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), - true); + 
WORD_SIMILARITY_CHECK_ONLY); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); @@ -1133,7 +1239,7 @@ word_similarity_commutator_op(PG_FUNCTION_ARGS) res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), - true); + WORD_SIMILARITY_CHECK_ONLY); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); @@ -1149,7 +1255,7 @@ word_similarity_dist_op(PG_FUNCTION_ARGS) res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), - false); + 0); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); @@ -1165,7 +1271,71 @@ word_similarity_dist_commutator_op(PG_FUNCTION_ARGS) res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), - false); + 0); + + PG_FREE_IF_COPY(in1, 0); + PG_FREE_IF_COPY(in2, 1); + PG_RETURN_FLOAT4(1.0 - res); +} + +Datum +strict_word_similarity_op(PG_FUNCTION_ARGS) +{ + text *in1 = PG_GETARG_TEXT_PP(0); + text *in2 = PG_GETARG_TEXT_PP(1); + float4 res; + + res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), + VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), + WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT); + + PG_FREE_IF_COPY(in1, 0); + PG_FREE_IF_COPY(in2, 1); + PG_RETURN_BOOL(res >= strict_word_similarity_threshold); +} + +Datum +strict_word_similarity_commutator_op(PG_FUNCTION_ARGS) +{ + text *in1 = PG_GETARG_TEXT_PP(0); + text *in2 = PG_GETARG_TEXT_PP(1); + float4 res; + + res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), + VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), + WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT); + + PG_FREE_IF_COPY(in1, 0); + PG_FREE_IF_COPY(in2, 1); + PG_RETURN_BOOL(res >= strict_word_similarity_threshold); +} + +Datum +strict_word_similarity_dist_op(PG_FUNCTION_ARGS) +{ + text *in1 = PG_GETARG_TEXT_PP(0); + text *in2 = PG_GETARG_TEXT_PP(1); + float4 res; + + res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), + 
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), + WORD_SIMILARITY_STRICT); + + PG_FREE_IF_COPY(in1, 0); + PG_FREE_IF_COPY(in2, 1); + PG_RETURN_FLOAT4(1.0 - res); +} + +Datum +strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS) +{ + text *in1 = PG_GETARG_TEXT_PP(0); + text *in2 = PG_GETARG_TEXT_PP(1); + float4 res; + + res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), + VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), + WORD_SIMILARITY_STRICT); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml index b5d893c9fb..8f395296d8 100644 --- a/doc/src/sgml/pgtrgm.sgml +++ b/doc/src/sgml/pgtrgm.sgml @@ -105,6 +105,17 @@ the explanation below. + + + strict_word_similarity(text, text) + strict_word_similarity + + real + + Same as word_similarity(text, text), but forces + extent boundaries to match word boundaries. + + show_limit()show_limit real @@ -157,6 +168,29 @@ a part of the word. + + At the same time, strict_word_similarity(text, text) + has to select an extent that matches word boundaries. In the example above, + strict_word_similarity(text, text) would select the + extent {" w"," wo","wor","ord","rds","ds "}, which + corresponds to the whole word 'words'. + + +# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words'); + strict_word_similarity | similarity +------------------------+------------ + 0.571429 | 0.571429 +(1 row) + + + + + Thus, the strict_word_similarity(text, text) function + is useful for finding similar subsets of whole words, while + word_similarity(text, text) is more suitable for + searching similar parts of words. + + <filename>pg_trgm</filename> Operators @@ -196,6 +230,24 @@ Commutator of the <% operator. 
+ + text <<% text + boolean + + Returns true if its second argument has a continuous + extent of an ordered trigram set that matches word boundaries, + and its similarity to the trigram set of the first argument is greater + than the current strict word similarity threshold set by the + pg_trgm.strict_word_similarity_threshold parameter. + + + + text %>> text + boolean + + Commutator of the <<% operator. + + text <-> text real @@ -223,6 +275,25 @@ Commutator of the <<-> operator. + + + text <<<-> text + + real + + Returns the distance between the arguments, that is + one minus the strict_word_similarity() value. + + + + + text <->>> text + + real + + Commutator of the <<<-> operator. + +
@@ -322,12 +393,19 @@ SELECT t, t <-> 'word' AS dist Also you can use an index on the t column for word - similarity. For example: + similarity or strict word similarity. Typical queries are: SELECT t, word_similarity('word', t) AS sml FROM test_trgm WHERE 'word' <% t ORDER BY sml DESC, t; + + and + +SELECT t, strict_word_similarity('word', t) AS sml + FROM test_trgm + WHERE 'word' <<% t + ORDER BY sml DESC, t; This will return all values in the text column for which there is a continuous extent in the corresponding ordered trigram set that is @@ -337,11 +415,17 @@ SELECT t, word_similarity('word', t) AS sml - A variant of the above query is + Possible variants of the above queries are: SELECT t, 'word' <<-> t AS dist FROM test_trgm ORDER BY dist LIMIT 10; + + and + +SELECT t, 'word' <<<-> t AS dist + FROM test_trgm + ORDER BY dist LIMIT 10; This can be implemented quite efficiently by GiST indexes, but not by GIN indexes.