Commit be8a7a68 authored by Teodor Sigaev

Add strict_word_similarity to pg_trgm module

strict_word_similarity is similar to the existing word_similarity function, but
it takes word boundaries into account when computing similarity.

Author: Alexander Korotkov
Review by: David Steele, Liudmila Mantrova, me
Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
parent f20b3285
......@@ -4,11 +4,12 @@ MODULE_big = pg_trgm
OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)
EXTENSION = pg_trgm
DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
DATA = pg_trgm--1.3--1.4.sql \
pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
PGFILEDESC = "pg_trgm - trigram matching"
REGRESS = pg_trgm pg_word_trgm
REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
ifdef USE_PGXS
PG_CONFIG = pg_config
......
DROP INDEX trgm_idx2;
\copy test_trgm3 from 'data/trgm2.data'
ERROR: relation "test_trgm3" does not exist
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
t | sml
-------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
(17 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
t | sml
------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
(4 rows)
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
t | sml
-------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
(17 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
t | sml
------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
(4 rows)
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
?column? | t
----------+--------------------------
0 | Alaikallupoddakulam
0.25 | Alaikallupodda Alankulam
0.32 | Alaikalluppodda Kulam
0.615385 | Mulaikallu Kulam
0.724138 | Koraikalapu Kulam
0.75 | Vaikaliththevakulam
0.766667 | Karaivaikal Kulam
(7 rows)
create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
t | sml
-------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
(17 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
t | sml
------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
(4 rows)
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
t | sml
-------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
(17 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
t | sml
------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
(4 rows)
explain (costs off)
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
QUERY PLAN
---------------------------------------------------------
Limit
-> Index Scan using trgm_idx2 on test_trgm2
Order By: (t <->>> 'Alaikallupoddakulam'::text)
(3 rows)
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
?column? | t
----------+--------------------------
0 | Alaikallupoddakulam
0.25 | Alaikallupodda Alankulam
0.32 | Alaikalluppodda Kulam
0.615385 | Mulaikallu Kulam
0.724138 | Koraikalapu Kulam
0.75 | Vaikaliththevakulam
0.766667 | Karaivaikal Kulam
(7 rows)
drop index trgm_idx2;
create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
t | sml
-------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
(17 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
t | sml
------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
(4 rows)
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
t | sml
-------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
(17 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
t | sml
------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
(4 rows)
set "pg_trgm.strict_word_similarity_threshold" to 0.4;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
t | sml
-------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
Zabaykal | 0.454545
Air Bakal-kecil | 0.444444
Bakal | 0.444444
Bakal Batu | 0.444444
Bakal Dos | 0.444444
Bakal Julu | 0.444444
Bakal Khel | 0.444444
Bakal Lama | 0.444444
Bakal Tres | 0.444444
Bakal Uno | 0.444444
Daang Bakal | 0.444444
Desa Bakal | 0.444444
Eat Bakal | 0.444444
Gunung Bakal | 0.444444
Sidi Bakal | 0.444444
Stantsiya Bakal | 0.444444
Sungai Bakal | 0.444444
Talang Bakal | 0.444444
Uruk Bakal | 0.444444
Zaouia Oulad Bakal | 0.444444
Baykalovskiy | 0.428571
Baykalovskiy Rayon | 0.428571
Baikal | 0.4
Baikal Airfield | 0.4
Baikal Business Centre | 0.4
Baikal Hotel Moscow | 0.4
Baikal Listvyanka Hotel | 0.4
Baikal Mountains | 0.4
Baikal Plaza | 0.4
Bajkal | 0.4
Bankal | 0.4
Bankal School | 0.4
Barkal | 0.4
Jabal Barkal | 0.4
Lake Baikal | 0.4
Oulad el Bakkal | 0.4
Sidi Mohammed Bakkal | 0.4
(54 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
t | sml
------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
Kabikala | 0.461538
(5 rows)
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
t | sml
-------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
Zabaykal | 0.454545
Air Bakal-kecil | 0.444444
Bakal | 0.444444
Bakal Batu | 0.444444
Bakal Dos | 0.444444
Bakal Julu | 0.444444
Bakal Khel | 0.444444
Bakal Lama | 0.444444
Bakal Tres | 0.444444
Bakal Uno | 0.444444
Daang Bakal | 0.444444
Desa Bakal | 0.444444
Eat Bakal | 0.444444
Gunung Bakal | 0.444444
Sidi Bakal | 0.444444
Stantsiya Bakal | 0.444444
Sungai Bakal | 0.444444
Talang Bakal | 0.444444
Uruk Bakal | 0.444444
Zaouia Oulad Bakal | 0.444444
Baykalovskiy | 0.428571
Baykalovskiy Rayon | 0.428571
Baikal | 0.4
Baikal Airfield | 0.4
Baikal Business Centre | 0.4
Baikal Hotel Moscow | 0.4
Baikal Listvyanka Hotel | 0.4
Baikal Mountains | 0.4
Baikal Plaza | 0.4
Bajkal | 0.4
Bankal | 0.4
Bankal School | 0.4
Barkal | 0.4
Jabal Barkal | 0.4
Lake Baikal | 0.4
Oulad el Bakkal | 0.4
Sidi Mohammed Bakkal | 0.4
(54 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
t | sml
------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
Kabikala | 0.461538
(5 rows)
set "pg_trgm.strict_word_similarity_threshold" to 0.2;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
t | sml
-----------------------------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
Zabaykal | 0.454545
Air Bakal-kecil | 0.444444
Bakal | 0.444444
Bakal Batu | 0.444444
Bakal Dos | 0.444444
Bakal Julu | 0.444444
Bakal Khel | 0.444444
Bakal Lama | 0.444444
Bakal Tres | 0.444444
Bakal Uno | 0.444444
Daang Bakal | 0.444444
Desa Bakal | 0.444444
Eat Bakal | 0.444444
Gunung Bakal | 0.444444
Sidi Bakal | 0.444444
Stantsiya Bakal | 0.444444
Sungai Bakal | 0.444444
Talang Bakal | 0.444444
Uruk Bakal | 0.444444
Zaouia Oulad Bakal | 0.444444
Baykalovskiy | 0.428571
Baykalovskiy Rayon | 0.428571
Baikal | 0.4
Baikal Airfield | 0.4
Baikal Business Centre | 0.4
Baikal Hotel Moscow | 0.4
Baikal Listvyanka Hotel | 0.4
Baikal Mountains | 0.4
Baikal Plaza | 0.4
Bajkal | 0.4
Bankal | 0.4
Bankal School | 0.4
Barkal | 0.4
Jabal Barkal | 0.4
Lake Baikal | 0.4
Oulad el Bakkal | 0.4
Sidi Mohammed Bakkal | 0.4
Bay of Backaland | 0.375
Boikalakalawa Bay | 0.375
Waikalabubu Bay | 0.375
Bairkal | 0.363636
Bairkal Dhora | 0.363636
Bairkal Jabal | 0.363636
Batikal | 0.363636
Bakaleyka | 0.307692
Bakkalmal | 0.307692
Bikal | 0.3
Al Barkali | 0.285714
Zabaykalka | 0.285714
Baidal | 0.272727
Baihal | 0.272727
Baipal | 0.272727
Bakala | 0.272727
Bakala Koupi | 0.272727
Bakale | 0.272727
Bakali | 0.272727
Bakall | 0.272727
Bakaly | 0.272727
Bakaly TV Mast | 0.272727
Buur Bakale | 0.272727
Gory Bakaly | 0.272727
Kusu-Bakali | 0.272727
Kwala Bakala | 0.272727
Mbay Bakala | 0.272727
Ngao Bakala | 0.272727
Sidi Mohammed el Bakali | 0.272727
Sopka Bakaly | 0.272727
Sungai Bakala | 0.272727
Urochishche Bakaly | 0.272727
Alue Bakkala | 0.25
Azib el Bakkali | 0.25
Ba Kaliin | 0.25
Baikaluobbal | 0.25
Bakalam | 0.25
Bakalan | 0.25
Bakalan Barat | 0.25
Bakalan Dua | 0.25
Bakalan Kidul | 0.25
Bakalan Kulon | 0.25
Bakalan Lor | 0.25
Bakalan River | 0.25
Bakalan Tengah | 0.25
Bakalan Wetan | 0.25
Bakalao Asibi Point | 0.25
Bakalao Point | 0.25
Bakalar Air Force Base (historical) | 0.25
Bakalar Lake | 0.25
Bakalar Library | 0.25
Bakalda | 0.25
Bakaldy | 0.25
Bakaley | 0.25
Bakalha | 0.25
Bakalia Char | 0.25
Bakalka | 0.25
Bakalod Island | 0.25
Bakalou | 0.25
Bakalua | 0.25
Bakalum | 0.25
Bakkala Cemetery | 0.25
Bankali | 0.25
Barkala | 0.25
Barkala Park | 0.25
Barkala Rao | 0.25
Barkala Reserved Forest | 0.25
Barkald | 0.25
Barkald stasjon | 0.25
Barkale | 0.25
Barkali | 0.25
Baukala | 0.25
Buur Bakaley | 0.25
Columbus Bakalar Municipal Airport | 0.25
Dakshin Bakalia | 0.25
Danau Bakalan | 0.25
Desa Bakalan | 0.25
Gunung Bakalan | 0.25
Kali Bakalan | 0.25
Khrebet Batkali | 0.25
Kordon Barkalo | 0.25
Krajan Bakalan | 0.25
Ovrag Bakalda | 0.25
Pulau Bakalan | 0.25
Selat Bakalan | 0.25
Teluk Bakalan | 0.25
Tukad Bakalan | 0.25
Urochishche Batkali | 0.25
Babakale | 0.230769
Babakalo | 0.230769
Bagkalen | 0.230769
Bakalalan Airport | 0.230769
Bakalang | 0.230769
Bakalarr | 0.230769
Bakalawa | 0.230769
Bakaldum | 0.230769
Bakaleko | 0.230769
Bakalica | 0.230769
Bakalino | 0.230769
Bakalite | 0.230769
Bakalovo | 0.230769
Bakalsen | 0.230769
Bakaltua Bank | 0.230769
Bakalukalu | 0.230769
Bakalukalu Shan | 0.230769
Bakkalia | 0.230769
Bankalol | 0.230769
Barkaleh | 0.230769
Barkalne | 0.230769
Barkalow Hollow | 0.230769
Bawkalut | 0.230769
Bawkalut Chaung | 0.230769
Clifton T Barkalow Elementary School | 0.230769
Efrejtor Bakalovo | 0.230769
Efreytor-Bakalovo | 0.230769
Gora Barkalyu | 0.230769
Ile Bakalibu | 0.230769
Khor Bakallii | 0.230769
Nehalla Bankalah Reserved Forest | 0.230769
Ragha Bakalzai | 0.230769
Tanjung Batikala | 0.230769
Teluk Bakalang | 0.230769
Urochishche Bakalovo | 0.230769
Banjar Kubakal | 0.222222
Darreh Pumba Kal | 0.222222
Zabaykalovskiy | 0.222222
Aparthotel Adagio Premium Dubai Al Barsha | 0.214286
Babakalia | 0.214286
Bahkalleh | 0.214286
Baikalovo | 0.214286
Bakalaale | 0.214286
Bakalabwa Pans | 0.214286
Bakalaeng | 0.214286
Bakalauri | 0.214286
Bakalbhar | 0.214286
Bakalbuah | 0.214286
Bakalerek | 0.214286
Bakalinga | 0.214286
Bakalipur | 0.214286
Bakaljaya | 0.214286
Bakalnica | 0.214286
Bakalongo | 0.214286
Bakalovka | 0.214286
Bakalrejo | 0.214286
Bakkalale | 0.214286
Bambakala | 0.214286
Bambakalo | 0.214286
Barkalare | 0.214286
Barkalden | 0.214286
Barkallou | 0.214286
Barkalova | 0.214286
Baskalino | 0.214286
Baskaltsi | 0.214286
Desa Bakalrejo | 0.214286
Doubletree By Hilton Dubai Al Barsha Hotel and Res | 0.214286
Doubletree By Hilton Hotel and Apartments Dubai Al Barsha | 0.214286
Doubletree Res.Dubai-Al Barsha | 0.214286
Gora Barkalova | 0.214286
Holiday Inn Dubai Al Barsha | 0.214286
Novotel Dubai Al Barsha | 0.214286
Park Inn By Radisson Dubai Al Barsha | 0.214286
Ramee Rose Hotel Dubai Al Barsha | 0.214286
Ras Barkallah | 0.214286
Salu Bakalaeng | 0.214286
Tanjung Bakalinga | 0.214286
Tubu Bakalekuk | 0.214286
Baikalakko | 0.2
Bakalauri1 | 0.2
Bakalauri2 | 0.2
Bakalauri3 | 0.2
Bakalauri4 | 0.2
Bakalauri5 | 0.2
Bakalauri6 | 0.2
Bakalauri7 | 0.2
Bakalauri8 | 0.2
Bakalauri9 | 0.2
Bakaldalam | 0.2
Bakaldukuh | 0.2
Bakaloolay | 0.2
Bakalovina | 0.2
Bakalpokok | 0.2
Bakalshile | 0.2
Bakalukudu | 0.2
Bambakalia | 0.2
Barkaladja Pool | 0.2
Barkalovka | 0.2
Bavkalasis | 0.2
Gora Bakalyadyr | 0.2
Kampong Bakaladong | 0.2
Urochishche Bakalarnyn-Ayasy | 0.2
Urochishche Bakaldikha | 0.2
(245 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
t | sml
----------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
Kabikala | 0.461538
Ntombankala School | 0.375
Nehalla Bankalah Reserved Forest | 0.357143
Jabba Kalai | 0.333333
Kambakala | 0.333333
Ker Samba Kalla | 0.333333
Bankal | 0.307692
Bankal School | 0.307692
Kanampumba-Kalawa | 0.307692
Bankali | 0.285714
Mwalaba-Kalamba | 0.285714
Tumba-Kalamba | 0.285714
Darreh Pumba Kal | 0.272727
Bankalol | 0.266667
Dabakala | 0.266667
Purba Kalaujan | 0.266667
Kali Purbakala | 0.263158
Dalabakala | 0.25
Demba Kali | 0.25
Gagaba Kalo | 0.25
Golba Kalo | 0.25
Habakkala | 0.25
Kali Bakalan | 0.25
Kimbakala | 0.25
Kombakala | 0.25
Jaba Kalle | 0.235294
Kaikalahun Indian Reserve 25 | 0.235294
Kwala Bakala | 0.235294
Gereba Kaler | 0.230769
Goth Soba Kaloi | 0.230769
Guba Kaldo | 0.230769
Gulba Kalle | 0.230769
Guba Kalgalaksha | 0.222222
Kalibakalako | 0.222222
Ba Kaliin | 0.214286
Bakala | 0.214286
Bakala Koupi | 0.214286
Bikala | 0.214286
Bikala Madila | 0.214286
Bugor Arba-Kalgan | 0.214286
Bumba-Kaloki | 0.214286
Guba Kalita | 0.214286
Kamba-Kalele | 0.214286
Mbay Bakala | 0.214286
Ngao Bakala | 0.214286
Sungai Bakala | 0.214286
Fayzabadkala | 0.210526
Gora Fayzabadkala | 0.210526
Alue Bakkala | 0.2
Bakkala Cemetery | 0.2
Barkala | 0.2
Barkala Park | 0.2
Barkala Rao | 0.2
Barkala Reserved Forest | 0.2
Baukala | 0.2
Beikala | 0.2
Bomba-Kalende | 0.2
Bumba-Kalumba | 0.2
Haikala | 0.2
Kahambikalela | 0.2
Kaikalapettai | 0.2
Kaikale | 0.2
Laikala | 0.2
Maikala Range | 0.2
Matamba-Kalenga | 0.2
Matamba-Kalenge | 0.2
Naikala | 0.2
Tumba-Kalumba | 0.2
Tumba-Kalunga | 0.2
Waikala | 0.2
(74 rows)
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
t | sml
-----------------------------------------------------------+----------
Baykal | 1
Boloto Baykal | 1
Boloto Malyy Baykal | 1
Kolkhoz Krasnyy Baykal | 1
Ozero Baykal | 1
Polevoy Stan Baykal | 1
Port Baykal | 1
Prud Novyy Baykal | 1
Sanatoriy Baykal | 1
Stantsiya Baykal | 1
Zaliv Baykal | 1
Baykalo-Amurskaya Zheleznaya Doroga | 0.666667
Baykalovo | 0.545455
Baykalsko | 0.545455
Maloye Baykalovo | 0.545455
Baykalikha | 0.5
Baykalovsk | 0.5
Zabaykal | 0.454545
Air Bakal-kecil | 0.444444
Bakal | 0.444444
Bakal Batu | 0.444444
Bakal Dos | 0.444444
Bakal Julu | 0.444444
Bakal Khel | 0.444444
Bakal Lama | 0.444444
Bakal Tres | 0.444444
Bakal Uno | 0.444444
Daang Bakal | 0.444444
Desa Bakal | 0.444444
Eat Bakal | 0.444444
Gunung Bakal | 0.444444
Sidi Bakal | 0.444444
Stantsiya Bakal | 0.444444
Sungai Bakal | 0.444444
Talang Bakal | 0.444444
Uruk Bakal | 0.444444
Zaouia Oulad Bakal | 0.444444
Baykalovskiy | 0.428571
Baykalovskiy Rayon | 0.428571
Baikal | 0.4
Baikal Airfield | 0.4
Baikal Business Centre | 0.4
Baikal Hotel Moscow | 0.4
Baikal Listvyanka Hotel | 0.4
Baikal Mountains | 0.4
Baikal Plaza | 0.4
Bajkal | 0.4
Bankal | 0.4
Bankal School | 0.4
Barkal | 0.4
Jabal Barkal | 0.4
Lake Baikal | 0.4
Oulad el Bakkal | 0.4
Sidi Mohammed Bakkal | 0.4
Bay of Backaland | 0.375
Boikalakalawa Bay | 0.375
Waikalabubu Bay | 0.375
Bairkal | 0.363636
Bairkal Dhora | 0.363636
Bairkal Jabal | 0.363636
Batikal | 0.363636
Bakaleyka | 0.307692
Bakkalmal | 0.307692
Bikal | 0.3
Al Barkali | 0.285714
Zabaykalka | 0.285714
Baidal | 0.272727
Baihal | 0.272727
Baipal | 0.272727
Bakala | 0.272727
Bakala Koupi | 0.272727
Bakale | 0.272727
Bakali | 0.272727
Bakall | 0.272727
Bakaly | 0.272727
Bakaly TV Mast | 0.272727
Buur Bakale | 0.272727
Gory Bakaly | 0.272727
Kusu-Bakali | 0.272727
Kwala Bakala | 0.272727
Mbay Bakala | 0.272727
Ngao Bakala | 0.272727
Sidi Mohammed el Bakali | 0.272727
Sopka Bakaly | 0.272727
Sungai Bakala | 0.272727
Urochishche Bakaly | 0.272727
Alue Bakkala | 0.25
Azib el Bakkali | 0.25
Ba Kaliin | 0.25
Baikaluobbal | 0.25
Bakalam | 0.25
Bakalan | 0.25
Bakalan Barat | 0.25
Bakalan Dua | 0.25
Bakalan Kidul | 0.25
Bakalan Kulon | 0.25
Bakalan Lor | 0.25
Bakalan River | 0.25
Bakalan Tengah | 0.25
Bakalan Wetan | 0.25
Bakalao Asibi Point | 0.25
Bakalao Point | 0.25
Bakalar Air Force Base (historical) | 0.25
Bakalar Lake | 0.25
Bakalar Library | 0.25
Bakalda | 0.25
Bakaldy | 0.25
Bakaley | 0.25
Bakalha | 0.25
Bakalia Char | 0.25
Bakalka | 0.25
Bakalod Island | 0.25
Bakalou | 0.25
Bakalua | 0.25
Bakalum | 0.25
Bakkala Cemetery | 0.25
Bankali | 0.25
Barkala | 0.25
Barkala Park | 0.25
Barkala Rao | 0.25
Barkala Reserved Forest | 0.25
Barkald | 0.25
Barkald stasjon | 0.25
Barkale | 0.25
Barkali | 0.25
Baukala | 0.25
Buur Bakaley | 0.25
Columbus Bakalar Municipal Airport | 0.25
Dakshin Bakalia | 0.25
Danau Bakalan | 0.25
Desa Bakalan | 0.25
Gunung Bakalan | 0.25
Kali Bakalan | 0.25
Khrebet Batkali | 0.25
Kordon Barkalo | 0.25
Krajan Bakalan | 0.25
Ovrag Bakalda | 0.25
Pulau Bakalan | 0.25
Selat Bakalan | 0.25
Teluk Bakalan | 0.25
Tukad Bakalan | 0.25
Urochishche Batkali | 0.25
Babakale | 0.230769
Babakalo | 0.230769
Bagkalen | 0.230769
Bakalalan Airport | 0.230769
Bakalang | 0.230769
Bakalarr | 0.230769
Bakalawa | 0.230769
Bakaldum | 0.230769
Bakaleko | 0.230769
Bakalica | 0.230769
Bakalino | 0.230769
Bakalite | 0.230769
Bakalovo | 0.230769
Bakalsen | 0.230769
Bakaltua Bank | 0.230769
Bakalukalu | 0.230769
Bakalukalu Shan | 0.230769
Bakkalia | 0.230769
Bankalol | 0.230769
Barkaleh | 0.230769
Barkalne | 0.230769
Barkalow Hollow | 0.230769
Bawkalut | 0.230769
Bawkalut Chaung | 0.230769
Clifton T Barkalow Elementary School | 0.230769
Efrejtor Bakalovo | 0.230769
Efreytor-Bakalovo | 0.230769
Gora Barkalyu | 0.230769
Ile Bakalibu | 0.230769
Khor Bakallii | 0.230769
Nehalla Bankalah Reserved Forest | 0.230769
Ragha Bakalzai | 0.230769
Tanjung Batikala | 0.230769
Teluk Bakalang | 0.230769
Urochishche Bakalovo | 0.230769
Banjar Kubakal | 0.222222
Darreh Pumba Kal | 0.222222
Zabaykalovskiy | 0.222222
Aparthotel Adagio Premium Dubai Al Barsha | 0.214286
Babakalia | 0.214286
Bahkalleh | 0.214286
Baikalovo | 0.214286
Bakalaale | 0.214286
Bakalabwa Pans | 0.214286
Bakalaeng | 0.214286
Bakalauri | 0.214286
Bakalbhar | 0.214286
Bakalbuah | 0.214286
Bakalerek | 0.214286
Bakalinga | 0.214286
Bakalipur | 0.214286
Bakaljaya | 0.214286
Bakalnica | 0.214286
Bakalongo | 0.214286
Bakalovka | 0.214286
Bakalrejo | 0.214286
Bakkalale | 0.214286
Bambakala | 0.214286
Bambakalo | 0.214286
Barkalare | 0.214286
Barkalden | 0.214286
Barkallou | 0.214286
Barkalova | 0.214286
Baskalino | 0.214286
Baskaltsi | 0.214286
Desa Bakalrejo | 0.214286
Doubletree By Hilton Dubai Al Barsha Hotel and Res | 0.214286
Doubletree By Hilton Hotel and Apartments Dubai Al Barsha | 0.214286
Doubletree Res.Dubai-Al Barsha | 0.214286
Gora Barkalova | 0.214286
Holiday Inn Dubai Al Barsha | 0.214286
Novotel Dubai Al Barsha | 0.214286
Park Inn By Radisson Dubai Al Barsha | 0.214286
Ramee Rose Hotel Dubai Al Barsha | 0.214286
Ras Barkallah | 0.214286
Salu Bakalaeng | 0.214286
Tanjung Bakalinga | 0.214286
Tubu Bakalekuk | 0.214286
Baikalakko | 0.2
Bakalauri1 | 0.2
Bakalauri2 | 0.2
Bakalauri3 | 0.2
Bakalauri4 | 0.2
Bakalauri5 | 0.2
Bakalauri6 | 0.2
Bakalauri7 | 0.2
Bakalauri8 | 0.2
Bakalauri9 | 0.2
Bakaldalam | 0.2
Bakaldukuh | 0.2
Bakaloolay | 0.2
Bakalovina | 0.2
Bakalpokok | 0.2
Bakalshile | 0.2
Bakalukudu | 0.2
Bambakalia | 0.2
Barkaladja Pool | 0.2
Barkalovka | 0.2
Bavkalasis | 0.2
Gora Bakalyadyr | 0.2
Kampong Bakaladong | 0.2
Urochishche Bakalarnyn-Ayasy | 0.2
Urochishche Bakaldikha | 0.2
(245 rows)
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
t | sml
----------------------------------+----------
Kabankala | 1
Kabankalan City Public Plaza | 0.75
Abankala | 0.583333
Kabakala | 0.583333
Kabikala | 0.461538
Ntombankala School | 0.375
Nehalla Bankalah Reserved Forest | 0.357143
Jabba Kalai | 0.333333
Kambakala | 0.333333
Ker Samba Kalla | 0.333333
Bankal | 0.307692
Bankal School | 0.307692
Kanampumba-Kalawa | 0.307692
Bankali | 0.285714
Mwalaba-Kalamba | 0.285714
Tumba-Kalamba | 0.285714
Darreh Pumba Kal | 0.272727
Bankalol | 0.266667
Dabakala | 0.266667
Purba Kalaujan | 0.266667
Kali Purbakala | 0.263158
Dalabakala | 0.25
Demba Kali | 0.25
Gagaba Kalo | 0.25
Golba Kalo | 0.25
Habakkala | 0.25
Kali Bakalan | 0.25
Kimbakala | 0.25
Kombakala | 0.25
Jaba Kalle | 0.235294
Kaikalahun Indian Reserve 25 | 0.235294
Kwala Bakala | 0.235294
Gereba Kaler | 0.230769
Goth Soba Kaloi | 0.230769
Guba Kaldo | 0.230769
Gulba Kalle | 0.230769
Guba Kalgalaksha | 0.222222
Kalibakalako | 0.222222
Ba Kaliin | 0.214286
Bakala | 0.214286
Bakala Koupi | 0.214286
Bikala | 0.214286
Bikala Madila | 0.214286
Bugor Arba-Kalgan | 0.214286
Bumba-Kaloki | 0.214286
Guba Kalita | 0.214286
Kamba-Kalele | 0.214286
Mbay Bakala | 0.214286
Ngao Bakala | 0.214286
Sungai Bakala | 0.214286
Fayzabadkala | 0.210526
Gora Fayzabadkala | 0.210526
Alue Bakkala | 0.2
Bakkala Cemetery | 0.2
Barkala | 0.2
Barkala Park | 0.2
Barkala Rao | 0.2
Barkala Reserved Forest | 0.2
Baukala | 0.2
Beikala | 0.2
Bomba-Kalende | 0.2
Bumba-Kalumba | 0.2
Haikala | 0.2
Kahambikalela | 0.2
Kaikalapettai | 0.2
Kaikale | 0.2
Laikala | 0.2
Maikala Range | 0.2
Matamba-Kalenga | 0.2
Matamba-Kalenge | 0.2
Naikala | 0.2
Tumba-Kalumba | 0.2
Tumba-Kalunga | 0.2
Waikala | 0.2
(74 rows)
/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
-- strict_word_similarity(text, text) -> float4
-- Like word_similarity, but takes word boundaries into account.
-- Implemented in C; MODULE_PATHNAME is replaced with the extension's
-- shared library when the script is run.
CREATE FUNCTION strict_word_similarity(text, text)
    RETURNS float4
    LANGUAGE C
    IMMUTABLE STRICT PARALLEL SAFE
    AS 'MODULE_PATHNAME';
-- Boolean support function for the <<% operator.
CREATE FUNCTION strict_word_similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.strict_word_similarity_threshold
-- Boolean support function for the %>> operator (the commutator of <<%).
CREATE FUNCTION strict_word_similarity_commutator_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.strict_word_similarity_threshold
-- text <<% text -> bool, evaluated by strict_word_similarity_op.
-- %>> is declared as the commutator (same test, operands swapped).
CREATE OPERATOR <<% (
    PROCEDURE  = strict_word_similarity_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '%>>',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);
-- text %>> text -> bool, evaluated by strict_word_similarity_commutator_op.
-- <<% is declared as the commutator (same test, operands swapped).
CREATE OPERATOR %>> (
    PROCEDURE  = strict_word_similarity_commutator_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<<%',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);
-- strict_word_similarity_dist_op(text, text) -> float4
-- Support function for the <<<-> distance operator.
CREATE FUNCTION strict_word_similarity_dist_op(text, text)
    RETURNS float4
    LANGUAGE C
    IMMUTABLE STRICT PARALLEL SAFE
    AS 'MODULE_PATHNAME';
-- strict_word_similarity_dist_commutator_op(text, text) -> float4
-- Support function for the <->>> distance operator (commutator of <<<->).
CREATE FUNCTION strict_word_similarity_dist_commutator_op(text, text)
    RETURNS float4
    LANGUAGE C
    IMMUTABLE STRICT PARALLEL SAFE
    AS 'MODULE_PATHNAME';
-- text <<<-> text -> float4, evaluated by strict_word_similarity_dist_op.
-- <->>> is declared as the commutator.
CREATE OPERATOR <<<-> (
    PROCEDURE  = strict_word_similarity_dist_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<->>>'
);
-- text <->>> text -> float4, evaluated by
-- strict_word_similarity_dist_commutator_op.
-- <<<-> is declared as the commutator.
CREATE OPERATOR <->>> (
    PROCEDURE  = strict_word_similarity_dist_commutator_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<<<->'
);
-- Add the strict word similarity operators to the GiST trigram operator
-- family: strategy 9 is the boolean search operator, strategy 10 is the
-- distance operator usable in ORDER BY.
ALTER OPERATOR FAMILY gist_trgm_ops USING gist
    ADD OPERATOR 9  %>>   (text, text) FOR SEARCH,
        OPERATOR 10 <->>> (text, text) FOR ORDER BY pg_catalog.float_ops;
-- Add the boolean strict word similarity operator to the GIN trigram
-- operator family as strategy 9. No distance operator is added for GIN.
ALTER OPERATOR FAMILY gin_trgm_ops USING gin
    ADD OPERATOR 9 %>> (text, text) FOR SEARCH;
# pg_trgm extension
comment = 'text similarity measurement and index searching based on trigrams'
default_version = '1.3'
default_version = '1.4'
module_pathname = '$libdir/pg_trgm'
relocatable = true
DROP INDEX trgm_idx2;
\copy test_trgm3 from 'data/trgm2.data'
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
explain (costs off)
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
drop index trgm_idx2;
create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
set "pg_trgm.strict_word_similarity_threshold" to 0.4;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
set "pg_trgm.strict_word_similarity_threshold" to 0.2;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
......@@ -6,6 +6,7 @@
#include "access/gist.h"
#include "access/itup.h"
#include "access/stratnum.h"
#include "storage/bufpage.h"
/*
......@@ -34,6 +35,8 @@
#define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8
#define StrictWordSimilarityStrategyNumber 9
#define StrictWordDistanceStrategyNumber 10
typedef char trgm[3];
......@@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;
extern double similarity_threshold;
extern double word_similarity_threshold;
extern double strict_word_similarity_threshold;
extern double index_strategy_get_limit(StrategyNumber strategy);
extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
extern TRGM *generate_trgm(char *str, int slen);
......
......@@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
break;
case ILikeStrategyNumber:
......@@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
case StrictWordSimilarityStrategyNumber:
nlimit = index_strategy_get_limit(strategy);
/* Count the matches */
ntrue = 0;
......@@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
case StrictWordSimilarityStrategyNumber:
nlimit = index_strategy_get_limit(strategy);
/* Count the matches */
ntrue = 0;
......
......@@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query),
querysize - VARHDRSZ);
break;
......@@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
/* Similarity search is exact. Word similarity search is inexact */
*recheck = (strategy == WordSimilarityStrategyNumber);
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
case StrictWordSimilarityStrategyNumber:
/* Similarity search is exact. (Strict) word similarity search is inexact */
*recheck = (strategy != SimilarityStrategyNumber);
nlimit = index_strategy_get_limit(strategy);
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
......@@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
{
case DistanceStrategyNumber:
case WordDistanceStrategyNumber:
*recheck = strategy == WordDistanceStrategyNumber;
case StrictWordDistanceStrategyNumber:
/* Only plain trigram distance is exact */
*recheck = (strategy != DistanceStrategyNumber);
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
......
......@@ -18,6 +18,7 @@ PG_MODULE_MAGIC;
/* GUC variables */
double similarity_threshold = 0.3f;
double word_similarity_threshold = 0.6f;
double strict_word_similarity_threshold = 0.5f;
void _PG_init(void);
......@@ -26,12 +27,17 @@ PG_FUNCTION_INFO_V1(show_limit);
PG_FUNCTION_INFO_V1(show_trgm);
PG_FUNCTION_INFO_V1(similarity);
PG_FUNCTION_INFO_V1(word_similarity);
PG_FUNCTION_INFO_V1(strict_word_similarity);
PG_FUNCTION_INFO_V1(similarity_dist);
PG_FUNCTION_INFO_V1(similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
/* Trigram with position */
typedef struct
......@@ -40,6 +46,17 @@ typedef struct
int index;
} pos_trgm;
/* Trigram bound type */
typedef uint8 TrgmBound;
#define TRGM_BOUND_LEFT (0x01) /* trigram is left bound of word */
#define TRGM_BOUND_RIGHT (0x02) /* trigram is right bound of word */
/* Word similarity flags */
#define WORD_SIMILARITY_CHECK_ONLY (0x01) /* if set then only check existence
* of similar search pattern in text */
#define WORD_SIMILARITY_STRICT (0x02) /* force bounds of extent to match
* word bounds */
/*
* Module load callback
*/
......@@ -71,6 +88,18 @@ _PG_init(void)
NULL,
NULL,
NULL);
/*
 * Register the threshold GUC for strict word similarity.
 *
 * The short description is displayed as-is (it is not a printf-style
 * format string), so the operator name must be spelled with a single '%';
 * writing "%%" would show up literally as "<<%%" to users.
 */
DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
    "Sets the threshold used by the <<% operator.",
    "Valid range is 0.0 .. 1.0.",
    &strict_word_similarity_threshold,
    0.5,            /* boot value */
    0.0,            /* minimum */
    1.0,            /* maximum */
    PGC_USERSET,
    0,
    NULL,
    NULL,
    NULL);
}
/*
......@@ -95,6 +124,29 @@ set_limit(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(similarity_threshold);
}
/*
 * Map an index scan strategy number to the similarity threshold that
 * applies to it.
 *
 * strategy: one of SimilarityStrategyNumber, WordSimilarityStrategyNumber
 *           or StrictWordSimilarityStrategyNumber.
 *
 * Returns the current value of the matching threshold variable.  Any other
 * strategy number is reported through elog(ERROR).
 */
double
index_strategy_get_limit(StrategyNumber strategy)
{
    double      limit = 0.0;    /* only survives if elog() were to return */

    if (strategy == SimilarityStrategyNumber)
        limit = similarity_threshold;
    else if (strategy == WordSimilarityStrategyNumber)
        limit = word_similarity_threshold;
    else if (strategy == StrictWordSimilarityStrategyNumber)
        limit = strict_word_similarity_threshold;
    else
        elog(ERROR, "unrecognized strategy number: %d", strategy);

    return limit;
}
/*
* Deprecated function.
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
......@@ -235,11 +287,12 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
*
* trg: where to return the array of trigrams.
* str: source string, of length slen bytes.
* bounds: where to return bounds of trigrams (if needed).
*
* Returns length of the generated array.
*/
static int
generate_trgm_only(trgm *trg, char *str, int slen)
generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
{
trgm *tptr;
char *buf;
......@@ -282,11 +335,13 @@ generate_trgm_only(trgm *trg, char *str, int slen)
buf[LPADDING + bytelen] = ' ';
buf[LPADDING + bytelen + 1] = ' ';
/*
* count trigrams
*/
/* Calculate trigrams marking their bounds if needed */
if (bounds)
bounds[tptr - trg] |= TRGM_BOUND_LEFT;
tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
charlen + LPADDING + RPADDING);
if (bounds)
bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
}
pfree(buf);
......@@ -328,7 +383,7 @@ generate_trgm(char *str, int slen)
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
trg->flag = ARRKEY;
len = generate_trgm_only(GETARR(trg), str, slen);
len = generate_trgm_only(GETARR(trg), str, slen, NULL);
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
if (len == 0)
......@@ -413,8 +468,8 @@ comp_ptrgm(const void *v1, const void *v2)
* ulen1: count of unique trigrams of array "trg1".
* len2: length of array "trg2" and array "trg2indexes".
* len: length of the array "found".
* check_only: if true then only check existence of similar search pattern in
* text.
* flags: set of boolean flags parametrizing similarity calculation.
* bounds: whether each trigram is left/right bound of word.
*
* Returns word similarity.
*/
......@@ -424,16 +479,32 @@ iterate_word_similarity(int *trg2indexes,
int ulen1,
int len2,
int len,
bool check_only)
uint8 flags,
TrgmBound *bounds)
{
int *lastpos,
i,
ulen2 = 0,
count = 0,
upper = -1,
lower = -1;
lower;
float4 smlr_cur,
smlr_max = 0.0f;
double threshold;
Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
/* Select appropriate threshold */
threshold = (flags & WORD_SIMILARITY_STRICT) ?
strict_word_similarity_threshold :
word_similarity_threshold;
/*
* Consider first trigram as initial lower bound for strict word similarity,
* or initialize it later with first trigram present for plain word
* similarity.
*/
lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
/* Memorise last position of each trigram */
lastpos = (int *) palloc(sizeof(int) * len);
......@@ -456,8 +527,13 @@ iterate_word_similarity(int *trg2indexes,
lastpos[trgindex] = i;
}
/* Adjust upper bound if this trigram is present in required substring */
if (found[trgindex])
/*
* Adjust upper bound if trigram is upper bound of word for strict
* word similarity, or if trigram is present in required substring for
* plain word similarity
*/
if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
: found[trgindex])
{
int prev_lower,
tmp_ulen2,
......@@ -479,9 +555,18 @@ iterate_word_similarity(int *trg2indexes,
prev_lower = lower;
for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
{
float smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
float smlr_tmp;
int tmp_trgindex;
/*
* Adjust lower bound only if trigram is lower bound of word
* for strict word similarity, or consider every trigram as
* lower bound for plain word similarity.
*/
if (!(flags & WORD_SIMILARITY_STRICT)
|| (bounds[tmp_lower] & TRGM_BOUND_LEFT))
{
smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
if (smlr_tmp > smlr_cur)
{
smlr_cur = smlr_tmp;
......@@ -491,12 +576,14 @@ iterate_word_similarity(int *trg2indexes,
}
/*
* if we only check that word similarity is greater than
* pg_trgm.word_similarity_threshold we do not need to
* calculate a maximum similarity.
* If we only check that word similarity is greater than
* threshold we do not need to calculate a maximum
* similarity.
*/
if (check_only && smlr_cur >= word_similarity_threshold)
if ((flags & WORD_SIMILARITY_CHECK_ONLY)
&& smlr_cur >= threshold)
break;
}
tmp_trgindex = trg2indexes[tmp_lower];
if (lastpos[tmp_trgindex] == tmp_lower)
......@@ -511,10 +598,9 @@ iterate_word_similarity(int *trg2indexes,
/*
* if we only check that word similarity is greater than
* pg_trgm.word_similarity_threshold we do not need to calculate a
* maximum similarity
* threshold we do not need to calculate a maximum similarity.
*/
if (check_only && smlr_max >= word_similarity_threshold)
if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
break;
for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
......@@ -547,14 +633,13 @@ iterate_word_similarity(int *trg2indexes,
*
* str1: search pattern string, of length slen1 bytes.
* str2: text in which we are looking for a word, of length slen2 bytes.
* check_only: if true then only check existence of similar search pattern in
* text.
* flags: set of boolean flags parametrizing similarity calculation.
*
* Returns word similarity.
*/
static float4
calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
bool check_only)
uint8 flags)
{
bool *found;
pos_trgm *ptrg;
......@@ -568,15 +653,20 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
ulen1;
int *trg2indexes;
float4 result;
TrgmBound *bounds;
protect_out_of_mem(slen1 + slen2);
/* Make positional trigrams */
trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
if (flags & WORD_SIMILARITY_STRICT)
bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
else
bounds = NULL;
len1 = generate_trgm_only(trg1, str1, slen1);
len2 = generate_trgm_only(trg2, str2, slen2);
len1 = generate_trgm_only(trg1, str1, slen1, NULL);
len2 = generate_trgm_only(trg2, str2, slen2, bounds);
ptrg = make_positional_trgm(trg1, len1, trg2, len2);
len = len1 + len2;
......@@ -622,7 +712,7 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
/* Run iterative procedure to find maximum similarity with word */
result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
check_only);
flags, bounds);
pfree(trg2indexes);
pfree(found);
......@@ -1081,7 +1171,23 @@ word_similarity(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
false);
0);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_FLOAT4(res);
}
Datum
strict_word_similarity(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
......@@ -1117,7 +1223,7 @@ word_similarity_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
true);
WORD_SIMILARITY_CHECK_ONLY);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
......@@ -1133,7 +1239,7 @@ word_similarity_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
true);
WORD_SIMILARITY_CHECK_ONLY);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
......@@ -1149,7 +1255,7 @@ word_similarity_dist_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
false);
0);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
......@@ -1165,7 +1271,71 @@ word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
false);
0);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_FLOAT4(1.0 - res);
}
Datum
strict_word_similarity_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
}
Datum
strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
}
/*
 * Strict word similarity distance.
 *
 * Argument 0 is handed to calc_word_similarity() as the search pattern and
 * argument 1 as the text to search in.  Only WORD_SIMILARITY_STRICT is set,
 * so the check-only shortcut is not requested.
 *
 * Returns one minus the computed similarity.
 */
Datum
strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
{
    text       *pattern = PG_GETARG_TEXT_PP(0);
    text       *target = PG_GETARG_TEXT_PP(1);
    float4      sml;

    sml = calc_word_similarity(VARDATA_ANY(pattern),
                               VARSIZE_ANY_EXHDR(pattern),
                               VARDATA_ANY(target),
                               VARSIZE_ANY_EXHDR(target),
                               WORD_SIMILARITY_STRICT);

    /* Release detoasted copies, if any were made */
    PG_FREE_IF_COPY(pattern, 0);
    PG_FREE_IF_COPY(target, 1);

    PG_RETURN_FLOAT4(1.0 - sml);
}
Datum
strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
{
text *in1 = PG_GETARG_TEXT_PP(0);
text *in2 = PG_GETARG_TEXT_PP(1);
float4 res;
res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
WORD_SIMILARITY_STRICT);
PG_FREE_IF_COPY(in1, 0);
PG_FREE_IF_COPY(in2, 1);
......
......@@ -105,6 +105,17 @@
the explanation below.
</entry>
</row>
<row>
<entry>
<function>strict_word_similarity(text, text)</function>
<indexterm><primary>strict_word_similarity</primary></indexterm>
</entry>
<entry><type>real</type></entry>
<entry>
Same as <function>word_similarity(text, text)</function>, but forces
extent boundaries to match word boundaries.
</entry>
</row>
<row>
<entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
<entry><type>real</type></entry>
......@@ -157,6 +168,29 @@
a part of the word.
</para>
<para>
At the same time, <function>strict_word_similarity(text, text)</function>
has to select an extent that matches word boundaries. In the example above,
<function>strict_word_similarity(text, text)</function> would select the
extent <literal>{" w"," wo","wor","ord","rds","ds "}</literal>, which
corresponds to the whole word <literal>'words'</literal>.
<programlisting>
# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
strict_word_similarity | similarity
------------------------+------------
0.571429 | 0.571429
(1 row)
</programlisting>
</para>
<para>
Thus, the <function>strict_word_similarity(text, text)</function> function
is useful for finding similar subsets of whole words, while
<function>word_similarity(text, text)</function> is more suitable for
searching similar parts of words.
</para>
<table id="pgtrgm-op-table">
<title><filename>pg_trgm</filename> Operators</title>
<tgroup cols="3">
......@@ -196,6 +230,24 @@
Commutator of the <literal>&lt;%</literal> operator.
</entry>
</row>
<row>
<entry><type>text</type> <literal>&lt;&lt;%</literal> <type>text</type></entry>
<entry><type>boolean</type></entry>
<entry>
Returns <literal>true</literal> if its second argument has a continuous
extent of an ordered trigram set that matches word boundaries,
and its similarity to the trigram set of the first argument is greater
than or equal to the current strict word similarity threshold set by the
<varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
</entry>
</row>
<row>
<entry><type>text</type> <literal>%&gt;&gt;</literal> <type>text</type></entry>
<entry><type>boolean</type></entry>
<entry>
Commutator of the <literal>&lt;&lt;%</literal> operator.
</entry>
</row>
<row>
<entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry>
<entry><type>real</type></entry>
......@@ -223,6 +275,25 @@
Commutator of the <literal>&lt;&lt;-&gt;</literal> operator.
</entry>
</row>
<row>
<entry>
<type>text</type> <literal>&lt;&lt;&lt;-&gt;</literal> <type>text</type>
</entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</quote> between the arguments, that is
one minus the <function>strict_word_similarity()</function> value.
</entry>
</row>
<row>
<entry>
<type>text</type> <literal>&lt;-&gt;&gt;&gt;</literal> <type>text</type>
</entry>
<entry><type>real</type></entry>
<entry>
Commutator of the <literal>&lt;&lt;&lt;-&gt;</literal> operator.
</entry>
</row>
</tbody>
</tgroup>
</table>
......@@ -322,12 +393,19 @@ SELECT t, t &lt;-&gt; '<replaceable>word</replaceable>' AS dist
<para>
Also you can use an index on the <structfield>t</structfield> column for word
similarity. For example:
similarity or strict word similarity. Typical queries are:
<programlisting>
SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
FROM test_trgm
WHERE '<replaceable>word</replaceable>' &lt;% t
ORDER BY sml DESC, t;
</programlisting>
and
<programlisting>
SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
FROM test_trgm
WHERE '<replaceable>word</replaceable>' &lt;&lt;% t
ORDER BY sml DESC, t;
</programlisting>
This will return all values in the text column for which there is a
continuous extent in the corresponding ordered trigram set that is
......@@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
</para>
<para>
A variant of the above query is
Possible variants of the above queries are:
<programlisting>
SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
and
<programlisting>
SELECT t, '<replaceable>word</replaceable>' &lt;&lt;&lt;-&gt; t AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment