Commit 0afc0a78 authored by Alvaro Herrera

Fix unaccent generation script in Windows

As originally coded, the script would fail on Windows 10 with Python 3
because stdout was switched to UTF-8 only for Python 2.  This
patch makes that apply to both versions.

Also add Python 2 compatibility markers so that we know what to remove
once we drop support for that.  Also use a "with" clause to ensure the file
descriptor is closed promptly.

Author: Hugh Ranalli, Ramanarayana
Reviewed-by: Kyotaro Horiguchi
Discussion: https://postgr.es/m/CAKm4Xs7_61XMyOWmHs3n0mmkS0O4S0pvfWk=7cQ5P0gs177f7A@mail.gmail.com
Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f@postgresql.org
parent b438e7e7
...@@ -32,9 +32,15 @@ ...@@ -32,9 +32,15 @@
# The approach is to be Python3 compatible with Python2 "backports". # The approach is to be Python3 compatible with Python2 "backports".
from __future__ import print_function from __future__ import print_function
from __future__ import unicode_literals from __future__ import unicode_literals
# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
import argparse
import codecs import codecs
import re
import sys import sys
import xml.etree.ElementTree as ET
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
if sys.version_info[0] <= 2: if sys.version_info[0] <= 2:
# Encode stdout as UTF-8, so we can just print to it # Encode stdout as UTF-8, so we can just print to it
sys.stdout = codecs.getwriter('utf8')(sys.stdout) sys.stdout = codecs.getwriter('utf8')(sys.stdout)
...@@ -45,12 +51,9 @@ if sys.version_info[0] <= 2: ...@@ -45,12 +51,9 @@ if sys.version_info[0] <= 2:
# Python 2 and 3 compatible bytes call # Python 2 and 3 compatible bytes call
def bytes(source, encoding='ascii', errors='strict'): def bytes(source, encoding='ascii', errors='strict'):
return source.encode(encoding=encoding, errors=errors) return source.encode(encoding=encoding, errors=errors)
else:
# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
import re
import argparse
import sys
import xml.etree.ElementTree as ET
# The ranges of Unicode characters that we consider to be "plain letters". # The ranges of Unicode characters that we consider to be "plain letters".
# For now we are being conservative by including only Latin and Greek. This # For now we are being conservative by including only Latin and Greek. This
...@@ -233,21 +236,22 @@ def main(args): ...@@ -233,21 +236,22 @@ def main(args):
charactersSet = set() charactersSet = set()
# read file UnicodeData.txt # read file UnicodeData.txt
unicodeDataFile = open(args.unicodeDataFilePath, 'r') with codecs.open(
args.unicodeDataFilePath, mode='r', encoding='UTF-8',
# read everything we need into memory ) as unicodeDataFile:
for line in unicodeDataFile: # read everything we need into memory
fields = line.split(";") for line in unicodeDataFile:
if len(fields) > 5: fields = line.split(";")
# http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt if len(fields) > 5:
general_category = fields[2] # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
decomposition = fields[5] general_category = fields[2]
decomposition = re.sub(decomposition_type_pattern, ' ', decomposition) decomposition = fields[5]
id = int(fields[0], 16) decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""] id = int(fields[0], 16)
codepoint = Codepoint(id, general_category, combining_ids) combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
table[id] = codepoint codepoint = Codepoint(id, general_category, combining_ids)
all.append(codepoint) table[id] = codepoint
all.append(codepoint)
# walk through all the codepoints looking for interesting mappings # walk through all the codepoints looking for interesting mappings
for codepoint in all: for codepoint in all:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment