Commit 5e8d670c authored by Thomas Munro

Add Greek characters to unaccent.rules.

Author: Tasos Maschalidis
Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/153495048900.1368.11566580687623014380%40wrigleys.postgresql.org
Discussion: https://postgr.es/m/VI1PR01MB38537EBD529FE5EE3FE9A5FEB5370%40VI1PR01MB3853.eurprd01.prod.exchangelabs.com
parent ec743699
...@@ -29,6 +29,15 @@ import argparse ...@@ -29,6 +29,15 @@ import argparse
import sys import sys
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
# The ranges of Unicode characters that we consider to be "plain letters".
# For now we are being conservative by including only Latin and Greek. This
# could be extended in future based on feedback from people with relevant
# language knowledge.
PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
(ord('A'), ord('Z')), # Latin upper case
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
def print_record(codepoint, letter): def print_record(codepoint, letter):
print (unichr(codepoint) + "\t" + letter).encode("UTF-8") print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
...@@ -39,9 +48,11 @@ class Codepoint: ...@@ -39,9 +48,11 @@ class Codepoint:
self.combining_ids = combining_ids self.combining_ids = combining_ids
def is_plain_letter(codepoint): def is_plain_letter(codepoint):
"""Return true if codepoint represents a plain ASCII letter.""" """Return true if codepoint represents a "plain letter"."""
return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \ for begin, end in PLAIN_LETTER_RANGES:
(codepoint.id >= ord('A') and codepoint.id <= ord('Z')) if codepoint.id >= begin and codepoint.id <= end:
return True
return False
def is_mark(codepoint): def is_mark(codepoint):
"""Returns true for diacritical marks (combining codepoints).""" """Returns true for diacritical marks (combining codepoints)."""
...@@ -184,7 +195,7 @@ def main(args): ...@@ -184,7 +195,7 @@ def main(args):
len(codepoint.combining_ids) > 1: len(codepoint.combining_ids) > 1:
if is_letter_with_marks(codepoint, table): if is_letter_with_marks(codepoint, table):
charactersSet.add((codepoint.id, charactersSet.add((codepoint.id,
chr(get_plain_letter(codepoint, table).id))) unichr(get_plain_letter(codepoint, table).id)))
elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
charactersSet.add((codepoint.id, charactersSet.add((codepoint.id,
"".join(unichr(combining_codepoint.id) "".join(unichr(combining_codepoint.id)
......
...@@ -399,6 +399,26 @@ ...@@ -399,6 +399,26 @@
ʦ ts ʦ ts
ʪ ls ʪ ls
ʫ lz ʫ lz
Ά Α
Έ Ε
Ή Η
Ί Ι
Ό Ο
Ύ Υ
Ώ Ω
ΐ ι
Ϊ Ι
Ϋ Υ
ά α
έ ε
ή η
ί ι
ΰ υ
ϊ ι
ϋ υ
ό ο
ύ υ
ώ ω
Ё Е Ё Е
ё е ё е
ᴀ A ᴀ A
...@@ -709,6 +729,207 @@ ...@@ -709,6 +729,207 @@
ỽ v ỽ v
Ỿ Y Ỿ Y
ỿ y ỿ y
ἀ α
ἁ α
ἂ α
ἃ α
ἄ α
ἅ α
ἆ α
ἇ α
Ἀ Α
Ἁ Α
Ἂ Α
Ἃ Α
Ἄ Α
Ἅ Α
Ἆ Α
Ἇ Α
ἐ ε
ἑ ε
ἒ ε
ἓ ε
ἔ ε
ἕ ε
Ἐ Ε
Ἑ Ε
Ἒ Ε
Ἓ Ε
Ἔ Ε
Ἕ Ε
ἠ η
ἡ η
ἢ η
ἣ η
ἤ η
ἥ η
ἦ η
ἧ η
Ἠ Η
Ἡ Η
Ἢ Η
Ἣ Η
Ἤ Η
Ἥ Η
Ἦ Η
Ἧ Η
ἰ ι
ἱ ι
ἲ ι
ἳ ι
ἴ ι
ἵ ι
ἶ ι
ἷ ι
Ἰ Ι
Ἱ Ι
Ἲ Ι
Ἳ Ι
Ἴ Ι
Ἵ Ι
Ἶ Ι
Ἷ Ι
ὀ ο
ὁ ο
ὂ ο
ὃ ο
ὄ ο
ὅ ο
Ὀ Ο
Ὁ Ο
Ὂ Ο
Ὃ Ο
Ὄ Ο
Ὅ Ο
ὐ υ
ὑ υ
ὒ υ
ὓ υ
ὔ υ
ὕ υ
ὖ υ
ὗ υ
Ὑ Υ
Ὓ Υ
Ὕ Υ
Ὗ Υ
ὠ ω
ὡ ω
ὢ ω
ὣ ω
ὤ ω
ὥ ω
ὦ ω
ὧ ω
Ὠ Ω
Ὡ Ω
Ὢ Ω
Ὣ Ω
Ὤ Ω
Ὥ Ω
Ὦ Ω
Ὧ Ω
ὰ α
ὲ ε
ὴ η
ὶ ι
ὸ ο
ὺ υ
ὼ ω
ᾀ α
ᾁ α
ᾂ α
ᾃ α
ᾄ α
ᾅ α
ᾆ α
ᾇ α
ᾈ Α
ᾉ Α
ᾊ Α
ᾋ Α
ᾌ Α
ᾍ Α
ᾎ Α
ᾏ Α
ᾐ η
ᾑ η
ᾒ η
ᾓ η
ᾔ η
ᾕ η
ᾖ η
ᾗ η
ᾘ Η
ᾙ Η
ᾚ Η
ᾛ Η
ᾜ Η
ᾝ Η
ᾞ Η
ᾟ Η
ᾠ ω
ᾡ ω
ᾢ ω
ᾣ ω
ᾤ ω
ᾥ ω
ᾦ ω
ᾧ ω
ᾨ Ω
ᾩ Ω
ᾪ Ω
ᾫ Ω
ᾬ Ω
ᾭ Ω
ᾮ Ω
ᾯ Ω
ᾰ α
ᾱ α
ᾲ α
ᾳ α
ᾴ α
ᾶ α
ᾷ α
Ᾰ Α
Ᾱ Α
Ὰ Α
ᾼ Α
ῂ η
ῃ η
ῄ η
ῆ η
ῇ η
Ὲ Ε
Ὴ Η
ῌ Η
ῐ ι
ῑ ι
ῒ ι
ῖ ι
ῗ ι
Ῐ Ι
Ῑ Ι
Ὶ Ι
ῠ υ
ῡ υ
ῢ υ
ῤ ρ
ῥ ρ
ῦ υ
ῧ υ
Ῠ Υ
Ῡ Υ
Ὺ Υ
Ῥ Ρ
ῲ ω
ῳ ω
ῴ ω
ῶ ω
ῷ ω
Ὸ Ο
Ὼ Ω
ῼ Ω
‐ - ‐ -
‑ - ‑ -
‒ - ‒ -
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment