Fix unaccent generation script in Windows

As originally coded, the script would fail on Windows 10 with Python 3 because stdout was switched to UTF-8 only for Python 2. This patch makes that apply to both versions. Also add Python 2 compatibility markers so that we know what to remove once we drop support for that. Also use a "with" statement to ensure the file descriptor is closed promptly.
Author: Hugh Ranalli, Ramanarayana
Reviewed-by: Kyotaro Horiguchi
Discussion: https://postgr.es/m/CAKm4Xs7_61XMyOWmHs3n0mmkS0O4S0pvfWk=7cQ5P0gs177f7A@mail.gmail.com
Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f@postgresql.org

Fix unaccent generation script in Windows
As originally coded, the script would fail on Windows 10 with Python 3 because stdout was switched to UTF-8 only for Python 2. This patch makes that apply to both versions. Also add Python 2 compatibility markers so that we know what to remove once we drop support for that. Also use a "with" statement to ensure the file descriptor is closed promptly.
Author: Hugh Ranalli, Ramanarayana
Reviewed-by: Kyotaro Horiguchi
Discussion: https://postgr.es/m/CAKm4Xs7_61XMyOWmHs3n0mmkS0O4S0pvfWk=7cQ5P0gs177f7A@mail.gmail.com
Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f@postgresql.org
0afc0a78 · Alvaro Herrera · b438e7e7 · 0afc0a78
Commit 0afc0a78 authored Sep 10, 2019 by Alvaro Herrera
Hide whitespace changes
Inline Side-by-side

Showing with 24 additions and 20 deletions

contrib/unaccent/generate_unaccent_rules.py contrib/unaccent/generate_unaccent_rules.py +24 -20

No files found.
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -32,9 +32,15 @@
 # The approach is to be Python3 compatible with Python2 "backports".
 from __future__ import print_function
 from __future__ import unicode_literals
+# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+import argparse
 import codecs
+import re
 import sys
+import xml.etree.ElementTree as ET
+# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
 if sys.version_info[0] <= 2:
    # Encode stdout as UTF-8, so we can just print to it
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
@@ -45,12 +51,9 @@ if sys.version_info[0] <= 2:
    # Python 2 and 3 compatible bytes call
    def bytes(source, encoding='ascii', errors='strict'):
        return source.encode(encoding=encoding, errors=errors)
+else:
 # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
-import re
-import argparse
-import sys
-import xml.etree.ElementTree as ET
 # The ranges of Unicode characters that we consider to be "plain letters".
 # For now we are being conservative by including only Latin and Greek.  This
@@ -233,21 +236,22 @@ def main(args):
    charactersSet = set()
    # read file UnicodeData.txt
-    unicodeDataFile = open(args.unicodeDataFilePath, 'r')
+    with codecs.open(
+      args.unicodeDataFilePath, mode='r', encoding='UTF-8',
-    # read everything we need into memory
+      ) as unicodeDataFile:
-    for line in unicodeDataFile:
+        # read everything we need into memory
-        fields = line.split(";")
+        for line in unicodeDataFile:
-        if len(fields) > 5:
+            fields = line.split(";")
-            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+            if len(fields) > 5:
-            general_category = fields[2]
+                # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
-            decomposition = fields[5]
+                general_category = fields[2]
-            decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
+                decomposition = fields[5]
-            id = int(fields[0], 16)
+                decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
-            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+                id = int(fields[0], 16)
-            codepoint = Codepoint(id, general_category, combining_ids)
+                combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
-            table[id] = codepoint
+                codepoint = Codepoint(id, general_category, combining_ids)
-            all.append(codepoint)
+                table[id] = codepoint
+                all.append(codepoint)
    # walk through all the codepoints looking for interesting mappings
    for codepoint in all: