Commit 456e3718 authored by Thomas Munro

Add combining characters to unaccent.rules.

Strip certain classes of combining characters, so that accents encoded
this way are removed.

Author: Hugh Ranalli
Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org
parent 80579f9b
...@@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜'); ...@@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜');
>+-~ >+-~
(1 row) (1 row)
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
unaccent
----------
A
(1 row)
SELECT unaccent('unaccent', 'foobar'); SELECT unaccent('unaccent', 'foobar');
unaccent unaccent
---------- ----------
...@@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜'); ...@@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜');
>+-~ >+-~
(1 row) (1 row)
SELECT unaccent('unaccent', 'À');
unaccent
----------
A
(1 row)
SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', 'foobar');
ts_lexize ts_lexize
----------- -----------
...@@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜'); ...@@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
{>+-~} {>+-~}
(1 row) (1 row)
SELECT ts_lexize('unaccent', 'À');
ts_lexize
-----------
{A}
(1 row)
...@@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case ...@@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
# Combining marks follow a "base" character, and result in a composite
# character. Example: "U&'A\0300'" produces "À". There are three types of
# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
# combining (Mc). We identify the ranges of marks we feel safe removing.
# References:
# https://en.wikipedia.org/wiki/Combining_character
# https://www.unicode.org/charts/PDF/U0300.pdf
# https://www.unicode.org/charts/PDF/U20D0.pdf
# Each entry is an inclusive (first, last) pair of codepoint values.
COMBINING_MARK_RANGES = (
    (0x0300, 0x0362),  # Mn: Accents, IPA
    (0x20dd, 0x20e0),  # Me: Symbols
    (0x20e2, 0x20e4),  # Me: Screen, keycap, triangle
)
def print_record(codepoint, letter):
    """Print one rules line for the given codepoint.

    The line consists of the character for ``codepoint`` followed by a tab
    and ``letter`` when ``letter`` is truthy; otherwise the character is
    printed on its own.
    """
    if letter:
        output = chr(codepoint) + "\t" + letter
    else:
        # No replacement text (e.g. None): emit only the character itself.
        output = chr(codepoint)
    print(output)
class Codepoint: class Codepoint:
def __init__(self, id, general_category, combining_ids): def __init__(self, id, general_category, combining_ids):
...@@ -70,6 +87,16 @@ class Codepoint: ...@@ -70,6 +87,16 @@ class Codepoint:
self.general_category = general_category self.general_category = general_category
self.combining_ids = combining_ids self.combining_ids = combining_ids
def is_mark_to_remove(codepoint):
    """Tell whether codepoint is a combining mark that should be removed.

    The codepoint qualifies only if is_mark() accepts it and its id falls
    inside one of the inclusive (begin, end) pairs of COMBINING_MARK_RANGES.
    """
    if not is_mark(codepoint):
        return False
    return any(low <= codepoint.id <= high
               for low, high in COMBINING_MARK_RANGES)
def is_plain_letter(codepoint): def is_plain_letter(codepoint):
"""Return true if codepoint represents a "plain letter".""" """Return true if codepoint represents a "plain letter"."""
for begin, end in PLAIN_LETTER_RANGES: for begin, end in PLAIN_LETTER_RANGES:
...@@ -234,6 +261,8 @@ def main(args): ...@@ -234,6 +261,8 @@ def main(args):
"".join(chr(combining_codepoint.id) "".join(chr(combining_codepoint.id)
for combining_codepoint \ for combining_codepoint \
in get_plain_letters(codepoint, table)))) in get_plain_letters(codepoint, table))))
elif is_mark_to_remove(codepoint):
charactersSet.add((codepoint.id, None))
# add CLDR Latin-ASCII characters # add CLDR Latin-ASCII characters
if not args.noLigaturesExpansion: if not args.noLigaturesExpansion:
......
...@@ -9,13 +9,16 @@ SELECT unaccent('foobar'); ...@@ -9,13 +9,16 @@ SELECT unaccent('foobar');
SELECT unaccent('ёлка'); SELECT unaccent('ёлка');
SELECT unaccent('ЁЖИК'); SELECT unaccent('ЁЖИК');
SELECT unaccent('˃˖˗˜'); SELECT unaccent('˃˖˗˜');
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
SELECT unaccent('unaccent', 'foobar'); SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'ёлка'); SELECT unaccent('unaccent', 'ёлка');
SELECT unaccent('unaccent', 'ЁЖИК'); SELECT unaccent('unaccent', 'ЁЖИК');
SELECT unaccent('unaccent', '˃˖˗˜'); SELECT unaccent('unaccent', '˃˖˗˜');
SELECT unaccent('unaccent', 'À');
SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'ёлка'); SELECT ts_lexize('unaccent', 'ёлка');
SELECT ts_lexize('unaccent', 'ЁЖИК'); SELECT ts_lexize('unaccent', 'ЁЖИК');
SELECT ts_lexize('unaccent', '˃˖˗˜'); SELECT ts_lexize('unaccent', '˃˖˗˜');
SELECT ts_lexize('unaccent', 'À');
...@@ -414,6 +414,105 @@ ...@@ -414,6 +414,105 @@
˖ + ˖ +
˗ - ˗ -
˜ ~ ˜ ~
̀
́
̂
̃
̄
̅
̆
̇
̈
̉
̊
̋
̌
̍
̎
̏
̐
̑
̒
̓
̔
̕
̖
̗
̘
̙
̚
̛
̜
̝
̞
̟
̠
̡
̢
̣
̤
̥
̦
̧
̨
̩
̪
̫
̬
̭
̮
̯
̰
̱
̲
̳
̴
̵
̶
̷
̸
̹
̺
̻
̼
̽
̾
̿
̀
́
͂
̓
̈́
ͅ
͆
͇
͈
͉
͊
͋
͌
͍
͎
͏
͐
͑
͒
͓
͔
͕
͖
͗
͘
͙
͚
͛
͜
͝
͞
͟
͠
͡
͢
Ά Α Ά Α
Έ Ε Έ Ε
Ή Η Ή Η
...@@ -982,6 +1081,13 @@ ...@@ -982,6 +1081,13 @@
₧ Pts ₧ Pts
₹ Rs ₹ Rs
₺ TL ₺ TL
℀ a/c ℀ a/c
℁ a/s ℁ a/s
ℂ C ℂ C
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment