Commit bdb839cb authored by Peter Eisentraut

Update unicode.org URLs

Use https, consistent host name, remove references to ftp.  Also
update the URLs for CLDR, which has moved from Trac to GitHub.
parent 9abb2bfc
@@ -24,9 +24,9 @@
 # Latin-ASCII.xml, the latest data sets released can be browsed directly
 # via [3]. Note that this script is compatible with at least release 29.
 #
-# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
-# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
-# [3] https://unicode.org/cldr/trac/browser/tags
+# [1] https://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
+# [2] https://raw.githubusercontent.com/unicode-org/cldr/release-34/common/transforms/Latin-ASCII.xml
+# [3] https://github.com/unicode-org/cldr/tags
 
 # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
 # The approach is to be Python3 compatible with Python2 "backports".
@@ -113,7 +113,7 @@ def is_mark(codepoint):
 
 def is_letter_with_marks(codepoint, table):
     """Returns true for letters combined with one or more marks."""
-    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
 
     # Letter may have no combining characters, in which case it has
     # no marks.
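
For intuition about what the script treats as a "letter with marks": under NFD such a character decomposes into a base letter followed by combining marks. A minimal approximation using only Python's standard unicodedata module (not the script's own table-driven helper, which walks its precomputed table) looks like this:

    import unicodedata

    def is_letter_with_marks_approx(char):
        # NFD splits a precomposed character into base + combining marks.
        decomposed = unicodedata.normalize("NFD", char)
        if len(decomposed) < 2:
            return False
        base, marks = decomposed[0], decomposed[1:]
        # General categories: L* = letters, M* = marks (see the TR44 link above).
        return (unicodedata.category(base).startswith("L")
                and all(unicodedata.category(m).startswith("M") for m in marks))

    print(is_letter_with_marks_approx("\u00e9"))  # True: e + COMBINING ACUTE ACCENT
    print(is_letter_with_marks_approx("e"))       # False: no marks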
@@ -226,7 +226,7 @@ def special_cases():
     return charactersSet
 
 def main(args):
-    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    # https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
     decomposition_type_pattern = re.compile(" *<[^>]*> *")
 
     table = {}
@@ -243,7 +243,7 @@ def main(args):
         for line in unicodeDataFile:
             fields = line.split(";")
             if len(fields) > 5:
-                # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+                # https://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
                 general_category = fields[2]
                 decomposition = fields[5]
                 decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
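
To make the UnicodeData.txt layout concrete, here is a hedged, self-contained rendering of the parsing step above, using the real record for U+00C5; field 2 is the general category and field 5 the decomposition mapping, where a leading "<tag>" (stripped by the regex) marks a compatibility rather than canonical decomposition:

    import re

    decomposition_type_pattern = re.compile(" *<[^>]*> *")

    # Canonical decomposition: U+00C5 decomposes to U+0041 U+030A.
    line = "00C5;LATIN CAPITAL LETTER A WITH RING ABOVE;Lu;0;L;0041 030A;;;;N;LATIN CAPITAL LETTER A RING;;;00E5;"
    fields = line.split(";")
    print(fields[2])                                                   # Lu
    print(re.sub(decomposition_type_pattern, ' ', fields[5]).split())  # ['0041', '030A']

    # A compatibility mapping carries a "<tag>" that the regex strips:
    print(re.sub(decomposition_type_pattern, ' ', "<fraction> 0031 2044 0032").split())
    # ['0031', '2044', '0032']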
@@ -281,8 +281,8 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
-    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
-    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest='latinAsciiFilePath')
     parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
     args = parser.parse_args()
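
A hypothetical invocation consistent with the arguments declared above (file names stand for locally downloaded copies of the data sets):

    python generate_unaccent_rules.py --unicode-data-file UnicodeData.txt \
        --latin-ascii-file Latin-ASCII.xml > unaccent.rules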
@@ -728,7 +728,7 @@
    <term><acronym>UTF</acronym></term>
    <listitem>
     <para>
-     <ulink url="http://www.unicode.org/">Unicode Transformation
+     <ulink url="https://www.unicode.org/">Unicode Transformation
      Format</ulink>
     </para>
    </listitem>
@@ -832,12 +832,12 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
    </varlistentry>
   </variablelist>
 
-  See <ulink url="http://unicode.org/reports/tr35/tr35-collation.html">Unicode
+  See <ulink url="https://www.unicode.org/reports/tr35/tr35-collation.html">Unicode
   Technical Standard #35</ulink>
   and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for
   details. The list of possible collation types (<literal>co</literal>
   subtag) can be found in
-  the <ulink url="http://www.unicode.org/repos/cldr/trunk/common/bcp47/collation.xml">CLDR
+  the <ulink url="https://github.com/unicode-org/cldr/blob/master/common/bcp47/collation.xml">CLDR
   repository</ulink>.
   The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale
   Explorer</ulink> can be used to check the details of a particular locale
@@ -900,7 +900,7 @@ CREATE COLLATION french FROM "fr-x-icu";
    different Unicode normal forms. It is up to the collation provider to
    actually implement such insensitive comparisons; the deterministic flag
    only determines whether ties are to be broken using bytewise comparison.
-   See also <ulink url="https://unicode.org/reports/tr10">Unicode Technical
+   See also <ulink url="https://www.unicode.org/reports/tr10">Unicode Technical
    Standard 10</ulink> for more information on the terminology.
   </para>
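
As a plain-Python illustration (outside PostgreSQL) of the normal-form point above: canonically equivalent spellings of the same text differ bytewise, which is exactly the kind of tie a deterministic collation resolves by byte comparison:

    import unicodedata

    s = "re\u0301sume\u0301"                     # "résumé", decomposed (NFD)
    t = unicodedata.normalize("NFC", s)          # same text, precomposed (NFC)
    print(s == t)                                # False: bytewise different
    print(unicodedata.normalize("NFD", t) == s)  # True: canonically equivalent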
@@ -1926,7 +1926,7 @@ RESET client_encoding;
   </varlistentry>
 
   <varlistentry>
-   <term><ulink url="http://www.unicode.org/"></ulink></term>
+   <term><ulink url="https://www.unicode.org/"></ulink></term>
 
    <listitem>
     <para>
@@ -119,7 +119,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
 #DOWNLOAD = curl -o $@
 
 BIG5.TXT CNS11643.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
 
 euc-jis-2004-std.txt sjis-0213-2004-std.txt:
 	$(DOWNLOAD) http://x0213.org/codetable/$(@F)
@@ -131,19 +131,19 @@ GB2312.TXT:
 	$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
 
 JIS0212.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)
 
 JOHAB.TXT KSX1001.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)
 
 KOI8-R.TXT KOI8-U.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)
 
 $(ISO8859TEXTS):
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
 
 $(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
 
 $(filter CP8%,$(WINTEXTS)):
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 #
 # Our "big5" comes from BIG5.TXT, with the addition of the characters
 # in the range 0xf9d6-0xf9dc from CP950.TXT.
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 # We assume the file include three tab-separated columns:
 #  JOHAB code in hex
 #  UCS-2 code in hex
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 # We assume the file include three tab-separated columns:
 #  source character set code in hex
 #  UCS-2 code in hex
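
The map files these script headers describe share a simple shape: a hex source code point, a hex Unicode code point, and an optional trailing comment. The real conversion scripts are Perl; the following is only a hypothetical Python sketch of reading such a file, under the stated column assumptions:

    def parse_mapping(path):
        """Read a consortium map file into {source code point: Unicode code point}."""
        mapping = {}
        with open(path, encoding="ascii", errors="replace") as f:
            for line in f:
                line = line.split("#", 1)[0].strip()  # drop trailing comments
                parts = line.split()
                if len(parts) < 2:                    # skip blanks/unmapped entries
                    continue
                mapping[int(parts[0], 16)] = int(parts[1], 16)
        return mapping

    # Hypothetical usage mirroring the BIG5/CP950 merge described further up:
    # big5 = parse_mapping("BIG5.TXT")
    # big5.update({k: v for k, v in parse_mapping("CP950.TXT").items()
    #              if 0xf9d6 <= k <= 0xf9dc})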
@@ -23,7 +23,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
 # These files are part of the Unicode Character Database. Download
 # them on demand.
 UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
-	$(DOWNLOAD) http://unicode.org/Public/UNIDATA/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/UNIDATA/$(@F)
 
 # Generation of conversion tables used for string normalization with
 # UTF-8 strings.
@@ -3,7 +3,7 @@
  * Normalize a Unicode string to NFKC form
  *
  * This implements Unicode normalization, per the documentation at
- * http://www.unicode.org/reports/tr15/.
+ * https://www.unicode.org/reports/tr15/.
  *
  * Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
  *
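
As a quick cross-check of the operation this file implements, Python's standard library exposes the same UAX #15 normalization:

    import unicodedata

    print(unicodedata.normalize("NFKC", "\ufb01"))   # 'fi': ligature U+FB01 folded
    print(unicodedata.normalize("NFKC", "e\u0301"))  # 'é': recomposed to U+00E9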
@@ -109,7 +109,7 @@ get_decomposed_size(pg_wchar code)
 	/*
 	 * Fast path for Hangul characters not stored in tables to save memory as
 	 * decomposition is algorithmic. See
-	 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
+	 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
 	 * the matter.
 	 */
 	if (code >= SBASE && code < SBASE + SCOUNT)
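
The algorithmic Hangul decomposition this fast path relies on comes straight from the arithmetic in UAX #15 (annex 10 of the tr15-18 revision cited above). A hedged Python transcription of the standard's formula, not the C function itself:

    # Constants per the Unicode standard's Hangul syllable arithmetic.
    SBASE, LBASE, VBASE, TBASE = 0xAC00, 0x1100, 0x1161, 0x11A7
    LCOUNT, VCOUNT, TCOUNT = 19, 21, 28
    NCOUNT = VCOUNT * TCOUNT   # 588
    SCOUNT = LCOUNT * NCOUNT   # 11172

    def decompose_hangul(code):
        s_index = code - SBASE
        if not (0 <= s_index < SCOUNT):
            return [code]                       # not a precomposed Hangul syllable
        leading = LBASE + s_index // NCOUNT     # leading consonant (L)
        vowel = VBASE + (s_index % NCOUNT) // TCOUNT
        trailing = TBASE + s_index % TCOUNT     # trailing consonant, if any
        return [leading, vowel] + ([trailing] if trailing != TBASE else [])

    print([hex(c) for c in decompose_hangul(0xAC01)])  # ['0x1100', '0x1161', '0x11a8']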
@@ -234,7 +234,7 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 	/*
 	 * Fast path for Hangul characters not stored in tables to save memory as
 	 * decomposition is algorithmic. See
-	 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
+	 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
 	 * the matter.
 	 */
 	if (code >= SBASE && code < SBASE + SCOUNT)
@@ -362,7 +362,7 @@ unicode_normalize_kc(const pg_wchar *input)
 			continue;
 
 		/*
-		 * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
+		 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) annex 4,
 		 * a sequence of two adjacent characters in a string is an
 		 * exchangeable pair if the combining class (from the Unicode
 		 * Character Database) for the first character is greater than the
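
The comment above, truncated by the hunk boundary, is describing UAX #15's canonical-ordering step: two adjacent characters form an exchangeable pair when the first has a higher combining class than the second and the second is not a starter (combining class 0); such pairs are swapped until the combining marks are in canonical order. A hedged Python sketch of that reordering (not the C implementation):

    import unicodedata

    def canonical_order(text):
        chars = list(text)
        for i in range(len(chars) - 1, 0, -1):        # simple bubble passes
            for j in range(i):
                ccc1 = unicodedata.combining(chars[j])
                ccc2 = unicodedata.combining(chars[j + 1])
                if ccc2 != 0 and ccc1 > ccc2:         # exchangeable pair: swap
                    chars[j], chars[j + 1] = chars[j + 1], chars[j]
        return "".join(chars)

    # e + COMBINING ACUTE (ccc 230) + COMBINING CEDILLA (ccc 202):
    # the cedilla must sort before the acute accent.
    print([hex(ord(c)) for c in canonical_order("e\u0301\u0327")])
    # ['0x65', '0x327', '0x301']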