Commit d40d564c authored by Peter Eisentraut

Add support for other normal forms to Unicode normalization API

It previously only supported NFKC, for use by SASLprep.  This expands
the API to offer the choice of all four normalization forms.  Right
now, there are no internal users of the forms other than NFKC.
Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
parent cedffbdb
......@@ -1156,7 +1156,7 @@ pg_saslprep(const char *input, char **output)
* 2) Normalize -- Normalize the result of step 1 using Unicode
* normalization.
*/
output_chars = unicode_normalize_kc(input_chars);
output_chars = unicode_normalize(UNICODE_NFKC, input_chars);
if (!output_chars)
goto oom;
......
......@@ -48,7 +48,7 @@ typedef struct
{
int linenum;
pg_wchar input[50];
pg_wchar output[50];
pg_wchar output[4][50];
} pg_unicode_test;
/* test table */
......@@ -89,13 +89,16 @@ while (my $line = <$INPUT>)
my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
my $source_utf8 = codepoint_string_to_hex($source);
my $nfc_utf8 = codepoint_string_to_hex($nfc);
my $nfd_utf8 = codepoint_string_to_hex($nfd);
my $nfkc_utf8 = codepoint_string_to_hex($nfkc);
my $nfkd_utf8 = codepoint_string_to_hex($nfkd);
print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n";
print $OUTPUT "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
}
# Output terminator entry
print $OUTPUT "\t{ 0, { 0 }, { 0 } }";
print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
print $OUTPUT "\n};\n";
close $OUTPUT;
......
......@@ -99,10 +99,12 @@ typedef struct
#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */
#define DECOMP_INLINE 0x40 /* decomposition is stored inline in
* dec_index */
#define DECOMP_COMPAT 0x20 /* compatibility mapping */
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
/* Table of Unicode codepoints and their decompositions */
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
......@@ -136,22 +138,22 @@ foreach my $char (@characters)
# Decomposition size
# Print size of decomposition
my $decomp_size = scalar(@decomp_elts);
die if $decomp_size > 0x1F; # to not overrun bitmask
my $first_decomp = shift @decomp_elts;
my $flags = "";
my $comment = "";
if ($decomp_size == 2)
{
# Should this be used for recomposition?
if ($compat)
{
$flags .= " | DECOMP_NO_COMPOSE";
$comment = "compatibility mapping";
$flags .= " | DECOMP_COMPAT";
}
elsif ($character_hash{$first_decomp}
if ($decomp_size == 2)
{
# Should this be used for recomposition?
if ($character_hash{$first_decomp}
&& $character_hash{$first_decomp}->{class} != 0)
{
$flags .= " | DECOMP_NO_COMPOSE";
......
......@@ -62,21 +62,24 @@ main(int argc, char **argv)
const pg_unicode_test *test;
for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
{
for (int form = 0; form < 4; form++)
{
pg_wchar *result;
result = unicode_normalize_kc(test->input);
result = unicode_normalize(form, test->input);
if (pg_wcscmp(test->output, result) != 0)
if (pg_wcscmp(test->output[form], result) != 0)
{
printf("FAILURE (NormalizationTest.txt line %d):\n", test->linenum);
printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
printf("input: %s\n", print_wchar_str(test->input));
printf("expected: %s\n", print_wchar_str(test->output));
printf("expected: %s\n", print_wchar_str(test->output[form]));
printf("got: %s\n", print_wchar_str(result));
printf("\n");
exit(1);
}
}
}
printf("All tests successful!\n");
exit(0);
......
/*-------------------------------------------------------------------------
* unicode_norm.c
* Normalize a Unicode string to NFKC form
* Normalize a Unicode string
*
* This implements Unicode normalization, per the documentation at
* https://www.unicode.org/reports/tr15/.
......@@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
* are, in turn, decomposable.
*/
static int
get_decomposed_size(pg_wchar code)
get_decomposed_size(pg_wchar code, bool compat)
{
pg_unicode_decomposition *entry;
int size = 0;
......@@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code)
* Just count current code if no other decompositions. A NULL entry is
* equivalent to a character with class 0 and no decompositions.
*/
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
(!compat && DECOMPOSITION_IS_COMPAT(entry)))
return 1;
/*
......@@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code)
{
uint32 lcode = decomp[i];
size += get_decomposed_size(lcode);
size += get_decomposed_size(lcode, compat);
}
return size;
......@@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
* in the array result.
*/
static void
decompose_code(pg_wchar code, pg_wchar **result, int *current)
decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
{
pg_unicode_decomposition *entry;
int i;
......@@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
* character with class 0 and no decompositions, so just leave also in
* this case.
*/
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
(!compat && DECOMPOSITION_IS_COMPAT(entry)))
{
pg_wchar *res = *result;
......@@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
pg_wchar lcode = (pg_wchar) decomp[i];
/* Leave if no more decompositions */
decompose_code(lcode, result, current);
decompose_code(lcode, compat, result, current);
}
}
/*
* unicode_normalize_kc - Normalize a Unicode string to NFKC form.
* unicode_normalize - Normalize a Unicode string to the specified form.
*
* The input is a 0-terminated array of codepoints.
*
......@@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
* string is palloc'd instead, and OOM is reported with ereport().
*/
pg_wchar *
unicode_normalize_kc(const pg_wchar *input)
unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
{
bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
pg_wchar *decomp_chars;
pg_wchar *recomp_chars;
int decomp_size,
......@@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input)
*/
decomp_size = 0;
for (p = input; *p; p++)
decomp_size += get_decomposed_size(*p);
decomp_size += get_decomposed_size(*p, compat);
decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
if (decomp_chars == NULL)
......@@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input)
*/
current_size = 0;
for (p = input; *p; p++)
decompose_code(*p, &decomp_chars, &current_size);
decompose_code(*p, compat, &decomp_chars, &current_size);
decomp_chars[decomp_size] = '\0';
Assert(decomp_size == current_size);
......@@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input)
count -= 2;
}
if (!recompose)
return decomp_chars;
/*
* The last phase of NFKC is the recomposition of the reordered Unicode
* The last phase of NFC and NFKC is the recomposition of the reordered Unicode
* string using combining classes. The recomposed string cannot be longer
* than the decomposed one, so make the allocation of the output string
* based on that assumption.
......
......@@ -16,6 +16,14 @@
#include "mb/pg_wchar.h"
extern pg_wchar *unicode_normalize_kc(const pg_wchar *input);
/*
 * Selects which Unicode normalization form unicode_normalize() produces.
 *
 * The "K" forms (NFKC, NFKD) additionally apply compatibility
 * decompositions; the "C" forms (NFC, NFKC) run the recomposition phase
 * after decomposition, while the "D" forms return the decomposed string.
 *
 * NOTE: the normalization test program loops over the integer values 0..3
 * and uses each value as an index into its per-form expected-output array,
 * which the test-table generator emits in the order NFC, NFD, NFKC, NFKD.
 * Keep these numeric values in sync with that ordering.
 */
typedef enum
{
/* canonical decomposition, followed by recomposition */
UNICODE_NFC = 0,
/* canonical decomposition only */
UNICODE_NFD = 1,
/* compatibility decomposition, followed by recomposition */
UNICODE_NFKC = 2,
/* compatibility decomposition only */
UNICODE_NFKD = 3,
} UnicodeNormalizationForm;
extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
#endif /* UNICODE_NORM_H */
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment