Commit ddb5fdc0 authored by Tom Lane

Further hacking on ICU collation creation and usage.

pg_import_system_collations() refused to create any ICU collations if
the current database's encoding didn't support ICU.  This is wrongheaded:
initdb must initialize pg_collation in an encoding-independent way
since it might be used in other databases with different encodings.
The reason for the restriction seems to be that get_icu_locale_comment()
used icu_from_uchar() to convert the UChar-format display name, and that
unsurprisingly doesn't know what to do in unsupported encodings.
But by the same token that the initial catalog contents must be
encoding-independent, we can't allow non-ASCII characters in the comment
strings.  So we don't really need icu_from_uchar() here: just check for
Unicode codes outside the ASCII range, and if there are none, the format
conversion is trivial.  If there are some, we can simply not install the
comment.  (In my testing, this affects only Norwegian Bokmål, which has
given us trouble before.)

For paranoia's sake, also check for non-ASCII characters in ICU locale
names, and skip such locales, as we do for libc locales.  I don't
currently have a reason to believe that this will ever reject anything,
but then again the libc maintainers should have known better too.

With just the import changes, ICU collations can be found in pg_collation
in databases with unsupported encodings.  This resulted in more or less
clean failures at runtime, but that's not how things act for unsupported
encodings with libc collations.  Make it work the same as our traditional
behavior for libc collations by having collation lookup take into account
whether is_encoding_supported_by_icu() is true for the database's encoding.

Adjust documentation to match.  Also, expand Table 23.1 to show which
encodings are supported by ICU.

catversion bump because of likely change in pg_collation/pg_description
initial contents in ICU-enabled builds.

Discussion: https://postgr.es/m/20c74bc3-d6ca-243d-1bbc-12f17fa4fe9a@gmail.com
parent a15b47df
This diff is collapsed.
......@@ -1914,10 +1914,61 @@ OpfamilyIsVisible(Oid opfid)
return visible;
}
/*
 * lookup_collation
 *		Resolve a collation by name within one namespace, for one encoding.
 *
 * An entry recorded for exactly the given encoding wins.  Failing that, an
 * any-encoding entry (collencoding = -1) is accepted, except that an ICU
 * entry is rejected when is_encoding_supported_by_icu() says the given
 * encoding is not usable with ICU.
 *
 * Returns the collation's OID, or InvalidOid if nothing usable is found.
 */
static Oid
lookup_collation(const char *collname, Oid collnamespace, int32 encoding)
{
	Oid			result;
	HeapTuple	tuple;
	Form_pg_collation form;

	/* An exact-encoding entry needs no further checking */
	result = GetSysCacheOid3(COLLNAMEENCNSP,
							 PointerGetDatum(collname),
							 Int32GetDatum(encoding),
							 ObjectIdGetDatum(collnamespace));
	if (OidIsValid(result))
		return result;

	/*
	 * Otherwise fall back to the any-encoding entry.  We need the whole
	 * tuple here, not just its OID, because the provider decides whether
	 * "any encoding" really includes the one we were asked about.
	 */
	tuple = SearchSysCache3(COLLNAMEENCNSP,
							PointerGetDatum(collname),
							Int32GetDatum(-1),
							ObjectIdGetDatum(collnamespace));
	if (!HeapTupleIsValid(tuple))
		return InvalidOid;

	form = (Form_pg_collation) GETSTRUCT(tuple);

	/* Only ICU entries are subject to the encoding restriction */
	if (form->collprovider == COLLPROVIDER_ICU &&
		!is_encoding_supported_by_icu(encoding))
		result = InvalidOid;
	else
		result = HeapTupleGetOid(tuple);

	ReleaseSysCache(tuple);

	return result;
}
/*
* CollationGetCollid
* Try to resolve an unqualified collation name.
* Returns OID if collation found in search path, else InvalidOid.
*
* Note that this will only find collations that work with the current
* database's encoding.
*/
Oid
CollationGetCollid(const char *collname)
......@@ -1935,19 +1986,7 @@ CollationGetCollid(const char *collname)
if (namespaceId == myTempNamespace)
continue; /* do not look in temp namespace */
/* Check for database-encoding-specific entry */
collid = GetSysCacheOid3(COLLNAMEENCNSP,
PointerGetDatum(collname),
Int32GetDatum(dbencoding),
ObjectIdGetDatum(namespaceId));
if (OidIsValid(collid))
return collid;
/* Check for any-encoding entry */
collid = GetSysCacheOid3(COLLNAMEENCNSP,
PointerGetDatum(collname),
Int32GetDatum(-1),
ObjectIdGetDatum(namespaceId));
collid = lookup_collation(collname, namespaceId, dbencoding);
if (OidIsValid(collid))
return collid;
}
......@@ -1961,6 +2000,9 @@ CollationGetCollid(const char *collname)
* Determine whether a collation (identified by OID) is visible in the
* current search path. Visible means "would be found by searching
* for the unqualified collation name".
*
* Note that only collations that work with the current database's encoding
* will be considered visible.
*/
bool
CollationIsVisible(Oid collid)
......@@ -1990,9 +2032,10 @@ CollationIsVisible(Oid collid)
{
/*
* If it is in the path, it might still not be visible; it could be
* hidden by another conversion of the same name earlier in the path.
* So we must do a slow check to see if this conversion would be found
* by CollationGetCollid.
* hidden by another collation of the same name earlier in the path,
* or it might not work with the current DB encoding. So we must do a
* slow check to see if this collation would be found by
* CollationGetCollid.
*/
char *collname = NameStr(collform->collname);
......@@ -3442,6 +3485,9 @@ PopOverrideSearchPath(void)
/*
* get_collation_oid - find a collation by possibly qualified name
*
* Note that this will only find collations that work with the current
* database's encoding.
*/
Oid
get_collation_oid(List *name, bool missing_ok)
......@@ -3463,17 +3509,7 @@ get_collation_oid(List *name, bool missing_ok)
if (missing_ok && !OidIsValid(namespaceId))
return InvalidOid;
/* first try for encoding-specific entry, then any-encoding */
colloid = GetSysCacheOid3(COLLNAMEENCNSP,
PointerGetDatum(collation_name),
Int32GetDatum(dbencoding),
ObjectIdGetDatum(namespaceId));
if (OidIsValid(colloid))
return colloid;
colloid = GetSysCacheOid3(COLLNAMEENCNSP,
PointerGetDatum(collation_name),
Int32GetDatum(-1),
ObjectIdGetDatum(namespaceId));
colloid = lookup_collation(collation_name, namespaceId, dbencoding);
if (OidIsValid(colloid))
return colloid;
}
......@@ -3489,16 +3525,7 @@ get_collation_oid(List *name, bool missing_ok)
if (namespaceId == myTempNamespace)
continue; /* do not look in temp namespace */
colloid = GetSysCacheOid3(COLLNAMEENCNSP,
PointerGetDatum(collation_name),
Int32GetDatum(dbencoding),
ObjectIdGetDatum(namespaceId));
if (OidIsValid(colloid))
return colloid;
colloid = GetSysCacheOid3(COLLNAMEENCNSP,
PointerGetDatum(collation_name),
Int32GetDatum(-1),
ObjectIdGetDatum(namespaceId));
colloid = lookup_collation(collation_name, namespaceId, dbencoding);
if (OidIsValid(colloid))
return colloid;
}
......
......@@ -353,6 +353,21 @@ pg_collation_actual_version(PG_FUNCTION_ARGS)
}
/*
 * is_all_ascii
 *		Report whether a NUL-terminated string contains no byte with its
 *		high bit set.  An empty string is considered all-ASCII.
 */
static bool
is_all_ascii(const char *str)
{
	const char *p;

	for (p = str; *p; p++)
	{
		/* the first high-bit byte settles the question */
		if (IS_HIGHBIT_SET(*p))
			return false;
	}

	return true;
}
/* will we use "locale -a" in pg_import_system_collations? */
#if defined(HAVE_LOCALE_T) && !defined(WIN32)
#define READ_LOCALE_A_OUTPUT
......@@ -431,7 +446,9 @@ get_icu_language_tag(const char *localename)
/*
* Get a comment (specifically, the display name) for an ICU locale.
* The result is a palloc'd string.
* The result is a palloc'd string, or NULL if we can't get a comment
* or find that it's not all ASCII. (We can *not* accept non-ASCII
* comments, because the contents of template0 must be encoding-agnostic.)
*/
static char *
get_icu_locale_comment(const char *localename)
......@@ -439,6 +456,7 @@ get_icu_locale_comment(const char *localename)
UErrorCode status;
UChar displayname[128];
int32 len_uchar;
int32 i;
char *result;
status = U_ZERO_ERROR;
......@@ -446,11 +464,20 @@ get_icu_locale_comment(const char *localename)
displayname, lengthof(displayname),
&status);
if (U_FAILURE(status))
ereport(ERROR,
(errmsg("could not get display name for locale \"%s\": %s",
localename, u_errorName(status))));
return NULL; /* no good reason to raise an error */
/* Check for non-ASCII comment (can't use is_all_ascii for this) */
for (i = 0; i < len_uchar; i++)
{
if (displayname[i] > 127)
return NULL;
}
icu_from_uchar(&result, displayname, len_uchar);
/* OK, transcribe */
result = palloc(len_uchar + 1);
for (i = 0; i < len_uchar; i++)
result[i] = displayname[i];
result[len_uchar] = '\0';
return result;
}
......@@ -502,7 +529,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
{
size_t len;
int enc;
bool skip;
char alias[NAMEDATALEN];
len = strlen(localebuf);
......@@ -521,16 +547,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
* interpret the non-ASCII characters. We can't do much with
* those, so we filter them out.
*/
skip = false;
for (i = 0; i < len; i++)
{
if (IS_HIGHBIT_SET(localebuf[i]))
{
skip = true;
break;
}
}
if (skip)
if (!is_all_ascii(localebuf))
{
elog(DEBUG1, "locale name has non-ASCII characters, skipped: \"%s\"", localebuf);
continue;
......@@ -642,14 +659,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
/* Load collations known to ICU */
#ifdef USE_ICU
if (!is_encoding_supported_by_icu(GetDatabaseEncoding()))
{
ereport(NOTICE,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("encoding \"%s\" not supported by ICU",
pg_encoding_to_char(GetDatabaseEncoding()))));
}
else
{
int i;
......@@ -661,6 +670,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
{
const char *name;
char *langtag;
char *icucomment;
const char *collcollate;
UEnumeration *en;
UErrorCode status;
......@@ -674,6 +684,14 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
langtag = get_icu_language_tag(name);
collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name;
/*
* Be paranoid about not allowing any non-ASCII strings into
* pg_collation
*/
if (!is_all_ascii(langtag) || !is_all_ascii(collcollate))
continue;
collid = CollationCreate(psprintf("%s-x-icu", langtag),
nspid, GetUserId(),
COLLPROVIDER_ICU, -1,
......@@ -686,8 +704,10 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
CommandCounterIncrement();
CreateComments(collid, CollationRelationId, 0,
get_icu_locale_comment(name));
icucomment = get_icu_locale_comment(name);
if (icucomment)
CreateComments(collid, CollationRelationId, 0,
icucomment);
}
/*
......@@ -708,6 +728,14 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
langtag = get_icu_language_tag(localeid);
collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : localeid;
/*
* Be paranoid about not allowing any non-ASCII strings into
* pg_collation
*/
if (!is_all_ascii(langtag) || !is_all_ascii(collcollate))
continue;
collid = CollationCreate(psprintf("%s-x-icu", langtag),
nspid, GetUserId(),
COLLPROVIDER_ICU, -1,
......@@ -720,8 +748,10 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
CommandCounterIncrement();
CreateComments(collid, CollationRelationId, 0,
get_icu_locale_comment(localeid));
icucomment = get_icu_locale_comment(name);
if (icucomment)
CreateComments(collid, CollationRelationId, 0,
icucomment);
}
}
if (U_FAILURE(status))
......
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201706231
#define CATALOG_VERSION_NO 201706241
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment