Commit 976a1a48 authored by Tom Lane's avatar Tom Lane

Improve to_date/to_number/to_timestamp behavior with multibyte characters.

The documentation says that these functions skip one input character
per literal (non-pattern) format character.  Actually, though, they
skipped one input *byte* per literal *byte*, which could be hugely
confusing if either data or format contained multibyte characters.

To fix, adjust the FormatNode representation and parse_format() so
that multibyte format characters are stored as one FormatNode not
several, and adjust the data-skipping bits to advance by pg_mblen()
not necessarily one byte.  There's no user-visible behavior change
on the to_char() side, although the internal representation changes.

Commit e87d4965 had already fixed most places where we skip characters
on the basis of non-literal format patterns to advance by characters
not bytes, but this gets one more place, the SKIP_THth macro.  I think
everything in formatting.c gets that right now.

It'd be nice to have some regression test cases covering this behavior;
but of course there's no way to do so in an encoding-agnostic way, and
many of the interesting aspects would also require unportable locale
selections.  So I've not bothered here.

Discussion: https://postgr.es/m/28186.1510957703@sss.pgh.pa.us
parent 63ca8631
...@@ -151,8 +151,6 @@ typedef enum ...@@ -151,8 +151,6 @@ typedef enum
FROM_CHAR_DATE_ISOWEEK /* ISO 8601 week date */ FROM_CHAR_DATE_ISOWEEK /* ISO 8601 week date */
} FromCharDateMode; } FromCharDateMode;
typedef struct FormatNode FormatNode;
typedef struct typedef struct
{ {
const char *name; const char *name;
...@@ -162,13 +160,13 @@ typedef struct ...@@ -162,13 +160,13 @@ typedef struct
FromCharDateMode date_mode; FromCharDateMode date_mode;
} KeyWord; } KeyWord;
struct FormatNode typedef struct
{ {
int type; /* node type */ int type; /* NODE_TYPE_XXX, see below */
const KeyWord *key; /* if node type is KEYWORD */ const KeyWord *key; /* if type is ACTION */
char character; /* if node type is CHAR */ char character[MAX_MULTIBYTE_CHAR_LEN + 1]; /* if type is CHAR */
int suffix; /* keyword suffix */ int suffix; /* keyword prefix/suffix code, if any */
}; } FormatNode;
#define NODE_TYPE_END 1 #define NODE_TYPE_END 1
#define NODE_TYPE_ACTION 2 #define NODE_TYPE_ACTION 2
...@@ -1282,12 +1280,15 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, ...@@ -1282,12 +1280,15 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
} }
else if (*str) else if (*str)
{ {
int chlen;
/* /*
* Process double-quoted literal string, if any * Process double-quoted literal string, if any
*/ */
if (*str == '"') if (*str == '"')
{ {
while (*(++str)) str++;
while (*str)
{ {
if (*str == '"') if (*str == '"')
{ {
...@@ -1297,11 +1298,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, ...@@ -1297,11 +1298,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
/* backslash quotes the next character, if any */ /* backslash quotes the next character, if any */
if (*str == '\\' && *(str + 1)) if (*str == '\\' && *(str + 1))
str++; str++;
chlen = pg_mblen(str);
n->type = NODE_TYPE_CHAR; n->type = NODE_TYPE_CHAR;
n->character = *str; memcpy(n->character, str, chlen);
n->character[chlen] = '\0';
n->key = NULL; n->key = NULL;
n->suffix = 0; n->suffix = 0;
n++; n++;
str += chlen;
} }
} }
else else
...@@ -1312,12 +1316,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, ...@@ -1312,12 +1316,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
*/ */
if (*str == '\\' && *(str + 1) == '"') if (*str == '\\' && *(str + 1) == '"')
str++; str++;
chlen = pg_mblen(str);
n->type = NODE_TYPE_CHAR; n->type = NODE_TYPE_CHAR;
n->character = *str; memcpy(n->character, str, chlen);
n->character[chlen] = '\0';
n->key = NULL; n->key = NULL;
n->suffix = 0; n->suffix = 0;
n++; n++;
str++; str += chlen;
} }
} }
} }
...@@ -1349,7 +1355,8 @@ dump_node(FormatNode *node, int max) ...@@ -1349,7 +1355,8 @@ dump_node(FormatNode *node, int max)
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)", elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix)); a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
else if (n->type == NODE_TYPE_CHAR) else if (n->type == NODE_TYPE_CHAR)
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character); elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
a, n->character);
else if (n->type == NODE_TYPE_END) else if (n->type == NODE_TYPE_END)
{ {
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a); elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
...@@ -2008,8 +2015,8 @@ asc_toupper_z(const char *buff) ...@@ -2008,8 +2015,8 @@ asc_toupper_z(const char *buff)
do { \ do { \
if (S_THth(_suf)) \ if (S_THth(_suf)) \
{ \ { \
if (*(ptr)) (ptr)++; \ if (*(ptr)) (ptr) += pg_mblen(ptr); \
if (*(ptr)) (ptr)++; \ if (*(ptr)) (ptr) += pg_mblen(ptr); \
} \ } \
} while (0) } while (0)
...@@ -2076,7 +2083,8 @@ is_next_separator(FormatNode *n) ...@@ -2076,7 +2083,8 @@ is_next_separator(FormatNode *n)
return true; return true;
} }
else if (isdigit((unsigned char) n->character)) else if (n->character[1] == '\0' &&
isdigit((unsigned char) n->character[0]))
return false; return false;
return true; /* some non-digit input (separator) */ return true; /* some non-digit input (separator) */
...@@ -2405,8 +2413,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col ...@@ -2405,8 +2413,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
{ {
if (n->type != NODE_TYPE_ACTION) if (n->type != NODE_TYPE_ACTION)
{ {
*s = n->character; strcpy(s, n->character);
s++; s += strlen(s);
continue; continue;
} }
...@@ -2974,7 +2982,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out) ...@@ -2974,7 +2982,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
* we don't insist that the consumed character match the format's * we don't insist that the consumed character match the format's
* character. * character.
*/ */
s++; s += pg_mblen(s);
continue; continue;
} }
...@@ -4217,7 +4225,7 @@ get_last_relevant_decnum(char *num) ...@@ -4217,7 +4225,7 @@ get_last_relevant_decnum(char *num)
/* /*
* These macros are used in NUM_processor() and its subsidiary routines. * These macros are used in NUM_processor() and its subsidiary routines.
* OVERLOAD_TEST: true if we've reached end of input string * OVERLOAD_TEST: true if we've reached end of input string
* AMOUNT_TEST(s): true if at least s characters remain in string * AMOUNT_TEST(s): true if at least s bytes remain in string
*/ */
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len) #define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s))) #define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
...@@ -4821,9 +4829,9 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, ...@@ -4821,9 +4829,9 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
if (!Np->is_to_char) if (!Np->is_to_char)
{ {
/* /*
* Check at least one character remains to be scanned. (In * Check at least one byte remains to be scanned. (In actions
* actions below, must use AMOUNT_TEST if we want to read more * below, must use AMOUNT_TEST if we want to read more bytes than
* characters than that.) * that.)
*/ */
if (OVERLOAD_TEST) if (OVERLOAD_TEST)
break; break;
...@@ -5081,12 +5089,18 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, ...@@ -5081,12 +5089,18 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
* In TO_CHAR, non-pattern characters in the format are copied to * In TO_CHAR, non-pattern characters in the format are copied to
* the output. In TO_NUMBER, we skip one input character for each * the output. In TO_NUMBER, we skip one input character for each
* non-pattern format character, whether or not it matches the * non-pattern format character, whether or not it matches the
* format character. (Currently, that's actually implemented as * format character.
* skipping one input byte per non-pattern format byte, which is
* wrong...)
*/ */
if (Np->is_to_char) if (Np->is_to_char)
*Np->inout_p = n->character; {
strcpy(Np->inout_p, n->character);
Np->inout_p += strlen(Np->inout_p);
}
else
{
Np->inout_p += pg_mblen(Np->inout_p);
}
continue;
} }
Np->inout_p++; Np->inout_p++;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment