Commit d6775504 authored by Tom Lane's avatar Tom Lane

Allow to_date/to_timestamp to recognize non-English month/day names.

to_char() has long allowed the TM (translation mode) prefix to
specify output of translated month or day names; but that prefix
had no effect in input format strings.  Now it does.  to_date()
and to_timestamp() will now recognize the same month or day names
that to_char() would output for the same format code.  Matching is
case-insensitive (per the active collation's notion of what that
means), just as it has always been for English month/day names
without the TM prefix.

(As per the discussion thread, there are lots of cases that this
feature will not handle, such as alternate day names.  But being
able to accept what to_char() will output seems useful enough.)

In passing, fix some shaky English and violations of message
style guidelines in jsonpath errors for the .datetime() method,
which depends on this code.

Juan José Santamaría Flecha, reviewed and modified by me,
with other commentary from Alvaro Herrera, Tomas Vondra,
Arthur Zakirov, Peter Eisentraut, Mark Dilger.

Discussion: https://postgr.es/m/CAC+AXB3u1jTngJcoC1nAHBf=M3v-jrEfo86UFtCqCjzbWS9QhA@mail.gmail.com
parent 1810ca18
......@@ -5968,7 +5968,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
</row>
<row>
<entry><literal>TM</literal> prefix</entry>
<entry>translation mode (print localized day and month names based on
<entry>translation mode (use localized day and month names based on
<xref linkend="guc-lc-time"/>)</entry>
<entry><literal>TMMonth</literal></entry>
</row>
......@@ -5999,9 +5999,20 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
<listitem>
<para>
<literal>TM</literal> does not include trailing blanks.
<function>to_timestamp</function> and <function>to_date</function> ignore
the <literal>TM</literal> modifier.
<literal>TM</literal> suppresses trailing blanks whether or
not <literal>FM</literal> is specified.
</para>
</listitem>
<listitem>
<para>
<function>to_timestamp</function> and <function>to_date</function>
ignore letter case in the input; so for
example <literal>MON</literal>, <literal>Mon</literal>,
and <literal>mon</literal> all accept the same strings. When using
the <literal>TM</literal> modifier, case-folding is done according to
the rules of the function's input collation (see
<xref linkend="collation"/>).
</para>
</listitem>
......@@ -12824,12 +12835,12 @@ table2-mapping
<para>
The <literal>datetime()</literal> and
<literal>datetime(<replaceable>template</replaceable>)</literal> methods
use the same parsing rules as <literal>to_timestamp</literal> SQL
function does (see <xref linkend="functions-formatting"/>) with three
exceptions. At first, these methods doesn't allow unmatched template
patterns. At second, only following separators are allowed in the
template string: minus sign, period, solidus, comma, apostrophe,
semicolon, colon and space. At third, separators in the template string
use the same parsing rules as the <literal>to_timestamp</literal> SQL
function does (see <xref linkend="functions-formatting"/>), with three
exceptions. First, these methods don't allow unmatched template
patterns. Second, only the following separators are allowed in the
template string: minus sign, period, solidus (slash), comma, apostrophe,
semicolon, colon and space. Third, separators in the template string
must exactly match the input string.
</para>
</note>
......
......@@ -1038,7 +1038,7 @@ static void parse_format(FormatNode *node, const char *str, const KeyWord *kw,
static void DCH_to_char(FormatNode *node, bool is_interval,
TmToChar *in, char *out, Oid collid);
static void DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
bool std, bool *have_error);
Oid collid, bool std, bool *have_error);
#ifdef DEBUG_TO_FROM_CHAR
static void dump_index(const KeyWord *k, const int *index);
......@@ -1057,11 +1057,14 @@ static int from_char_parse_int_len(int *dest, const char **src, const int len,
FormatNode *node, bool *have_error);
static int from_char_parse_int(int *dest, const char **src, FormatNode *node,
bool *have_error);
static int seq_search(const char *name, const char *const *array, int *len);
static int seq_search_ascii(const char *name, const char *const *array, int *len);
static int seq_search_localized(const char *name, char **array, int *len,
Oid collid);
static int from_char_seq_search(int *dest, const char **src,
const char *const *array,
char **localized_array, Oid collid,
FormatNode *node, bool *have_error);
static void do_to_timestamp(text *date_txt, text *fmt, bool std,
static void do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
struct pg_tm *tm, fsec_t *fsec, int *fprec,
uint32 *flags, bool *have_error);
static char *fill_str(char *str, int c, int max);
......@@ -2457,7 +2460,7 @@ from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_er
* suitable for comparisons to ASCII strings.
*/
static int
seq_search(const char *name, const char *const *array, int *len)
seq_search_ascii(const char *name, const char *const *array, int *len)
{
unsigned char firstc;
const char *const *a;
......@@ -2503,8 +2506,89 @@ seq_search(const char *name, const char *const *array, int *len)
}
/*
* Perform a sequential search in 'array' for an entry matching the first
* character(s) of the 'src' string case-insensitively.
* Sequentially search an array of possibly non-English words for
* a case-insensitive match to the initial character(s) of "name".
*
* This has the same API as seq_search_ascii(), but we use a more general
* case-folding transformation to achieve case-insensitivity. Case folding
* is done per the rules of the collation identified by "collid".
*
* The array is treated as const, but we don't declare it that way because
* the arrays exported by pg_locale.c aren't const.
*/
static int
seq_search_localized(const char *name, char **array, int *len, Oid collid)
{
char **a;
char *upper_name;
char *lower_name;
*len = 0;
/* empty string can't match anything */
if (!*name)
return -1;
/*
* The case-folding processing done below is fairly expensive, so before
* doing that, make a quick pass to see if there is an exact match.
*/
for (a = array; *a != NULL; a++)
{
int element_len = strlen(*a);
if (strncmp(name, *a, element_len) == 0)
{
*len = element_len;
return a - array;
}
}
/*
* Fold to upper case, then to lower case, so that we can match reliably
* even in languages in which case conversions are not injective.
*/
upper_name = str_toupper(unconstify(char *, name), strlen(name), collid);
lower_name = str_tolower(upper_name, strlen(upper_name), collid);
pfree(upper_name);
for (a = array; *a != NULL; a++)
{
char *upper_element;
char *lower_element;
int element_len;
/* Likewise upper/lower-case array element */
upper_element = str_toupper(*a, strlen(*a), collid);
lower_element = str_tolower(upper_element, strlen(upper_element),
collid);
pfree(upper_element);
element_len = strlen(lower_element);
/* Match? */
if (strncmp(lower_name, lower_element, element_len) == 0)
{
*len = element_len;
pfree(lower_element);
pfree(lower_name);
return a - array;
}
pfree(lower_element);
}
pfree(lower_name);
return -1;
}
/*
* Perform a sequential search in 'array' (or 'localized_array', if that's
* not NULL) for an entry matching the first character(s) of the 'src'
* string case-insensitively.
*
* The 'array' is presumed to be English words (all-ASCII), but
* if 'localized_array' is supplied, that might be non-English
* so we need a more expensive case-folding transformation
* (which will follow the rules of the collation 'collid').
*
* If a match is found, copy the array index of the match into the integer
* pointed to by 'dest', advance 'src' to the end of the part of the string
......@@ -2518,11 +2602,15 @@ seq_search(const char *name, const char *const *array, int *len)
*/
static int
from_char_seq_search(int *dest, const char **src, const char *const *array,
char **localized_array, Oid collid,
FormatNode *node, bool *have_error)
{
int len;
*dest = seq_search(*src, array, &len);
if (localized_array == NULL)
*dest = seq_search_ascii(*src, array, &len);
else
*dest = seq_search_localized(*src, localized_array, &len, collid);
if (len <= 0)
{
......@@ -3147,19 +3235,20 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
*s = '\0';
}
/* ----------
* Process a string as denoted by a list of FormatNodes.
/*
* Process the string 'in' as denoted by the array of FormatNodes 'node[]'.
* The TmFromChar struct pointed to by 'out' is populated with the results.
*
* 'collid' identifies the collation to use, if needed.
* 'std' specifies standard parsing mode.
* If 'have_error' is NULL, then errors are thrown, else '*have_error' is set.
*
* Note: we currently don't have any to_interval() function, so there
* is no need here for INVALID_FOR_INTERVAL checks.
*
* If 'have_error' is NULL, then errors are thrown, else '*have_error' is set.
* ----------
*/
static void
DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
bool *have_error)
DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
Oid collid, bool std, bool *have_error)
{
FormatNode *n;
const char *s;
......@@ -3170,6 +3259,9 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
/* number of extra skipped characters (more than given in format string) */
int extra_skip = 0;
/* cache localized days and months */
cache_locale_time();
for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++)
{
/*
......@@ -3271,6 +3363,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_a_m:
case DCH_p_m:
from_char_seq_search(&value, &s, ampm_strings_long,
NULL, InvalidOid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->pm, value % 2, n, have_error);
......@@ -3282,6 +3375,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_am:
case DCH_pm:
from_char_seq_search(&value, &s, ampm_strings,
NULL, InvalidOid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->pm, value % 2, n, have_error);
......@@ -3395,6 +3489,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_a_d:
case DCH_b_c:
from_char_seq_search(&value, &s, adbc_strings_long,
NULL, InvalidOid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->bc, value % 2, n, have_error);
......@@ -3405,6 +3500,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_ad:
case DCH_bc:
from_char_seq_search(&value, &s, adbc_strings,
NULL, InvalidOid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->bc, value % 2, n, have_error);
......@@ -3414,6 +3510,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_Month:
case DCH_month:
from_char_seq_search(&value, &s, months_full,
S_TM(n->suffix) ? localized_full_months : NULL,
collid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->mm, value + 1, n, have_error);
......@@ -3423,6 +3521,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_Mon:
case DCH_mon:
from_char_seq_search(&value, &s, months,
S_TM(n->suffix) ? localized_abbrev_months : NULL,
collid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->mm, value + 1, n, have_error);
......@@ -3437,6 +3537,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_Day:
case DCH_day:
from_char_seq_search(&value, &s, days,
S_TM(n->suffix) ? localized_full_days : NULL,
collid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->d, value, n, have_error);
......@@ -3447,6 +3549,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_Dy:
case DCH_dy:
from_char_seq_search(&value, &s, days_short,
S_TM(n->suffix) ? localized_abbrev_days : NULL,
collid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->d, value, n, have_error);
......@@ -3565,6 +3669,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_RM:
case DCH_rm:
from_char_seq_search(&value, &s, rm_months_lower,
NULL, InvalidOid,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->mm, MONTHS_PER_YEAR - value,
......@@ -4031,13 +4136,15 @@ to_timestamp(PG_FUNCTION_ARGS)
{
text *date_txt = PG_GETARG_TEXT_PP(0);
text *fmt = PG_GETARG_TEXT_PP(1);
Oid collid = PG_GET_COLLATION();
Timestamp result;
int tz;
struct pg_tm tm;
fsec_t fsec;
int fprec;
do_to_timestamp(date_txt, fmt, false, &tm, &fsec, &fprec, NULL, NULL);
do_to_timestamp(date_txt, fmt, collid, false,
&tm, &fsec, &fprec, NULL, NULL);
/* Use the specified time zone, if any. */
if (tm.tm_zone)
......@@ -4072,11 +4179,13 @@ to_date(PG_FUNCTION_ARGS)
{
text *date_txt = PG_GETARG_TEXT_PP(0);
text *fmt = PG_GETARG_TEXT_PP(1);
Oid collid = PG_GET_COLLATION();
DateADT result;
struct pg_tm tm;
fsec_t fsec;
do_to_timestamp(date_txt, fmt, false, &tm, &fsec, NULL, NULL, NULL);
do_to_timestamp(date_txt, fmt, collid, false,
&tm, &fsec, NULL, NULL, NULL);
/* Prevent overflow in Julian-day routines */
if (!IS_VALID_JULIAN(tm.tm_year, tm.tm_mon, tm.tm_mday))
......@@ -4098,25 +4207,31 @@ to_date(PG_FUNCTION_ARGS)
}
/*
* Convert the 'date_txt' input to a datetime type using argument 'fmt' as a format string.
* Convert the 'date_txt' input to a datetime type using argument 'fmt'
* as a format string. The collation 'collid' may be used for case-folding
* rules in some cases. 'strict' specifies standard parsing mode.
*
* The actual data type (returned in 'typid', 'typmod') is determined by
* the presence of date/time/zone components in the format string.
*
* When timezone component is present, the corresponding offset is set to '*tz'.
* When timezone component is present, the corresponding offset is
* returned in '*tz'.
*
* If 'have_error' is NULL, then errors are thrown, else '*have_error' is set
* and zero value is returned.
*/
Datum
parse_datetime(text *date_txt, text *fmt, bool strict, Oid *typid,
int32 *typmod, int *tz, bool *have_error)
parse_datetime(text *date_txt, text *fmt, Oid collid, bool strict,
Oid *typid, int32 *typmod, int *tz,
bool *have_error)
{
struct pg_tm tm;
fsec_t fsec;
int fprec;
uint32 flags;
do_to_timestamp(date_txt, fmt, strict, &tm, &fsec, &fprec, &flags, have_error);
do_to_timestamp(date_txt, fmt, collid, strict,
&tm, &fsec, &fprec, &flags, have_error);
CHECK_ERROR;
*typmod = fprec ? fprec : -1; /* fractional part precision */
......@@ -4278,21 +4393,21 @@ on_error:
* Parse the 'date_txt' according to 'fmt', return results as a struct pg_tm,
* fractional seconds, and fractional precision.
*
* 'collid' identifies the collation to use, if needed.
* 'std' specifies standard parsing mode.
* Bit mask of date/time/zone components found in 'fmt' is returned in 'flags',
* if that is not NULL.
* If 'have_error' is NULL, then errors are thrown, else '*have_error' is set.
*
* We parse 'fmt' into a list of FormatNodes, which is then passed to
* DCH_from_char to populate a TmFromChar with the parsed contents of
* 'date_txt'.
*
* The TmFromChar is then analysed and converted into the final results in
* struct 'tm' and 'fsec'.
*
* Bit mask of date/time/zone components found in 'fmt' is returned in 'flags'.
*
* 'std' specifies standard parsing mode.
*
* If 'have_error' is NULL, then errors are thrown, else '*have_error' is set.
* struct 'tm', 'fsec', and 'fprec'.
*/
static void
do_to_timestamp(text *date_txt, text *fmt, bool std,
do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
struct pg_tm *tm, fsec_t *fsec, int *fprec,
uint32 *flags, bool *have_error)
{
......@@ -4352,7 +4467,7 @@ do_to_timestamp(text *date_txt, text *fmt, bool std,
/* dump_index(DCH_keywords, DCH_index); */
#endif
DCH_from_char(format, date_str, &tmfc, std, have_error);
DCH_from_char(format, date_str, &tmfc, collid, std, have_error);
CHECK_ERROR;
pfree(fmt_str);
......
......@@ -1781,6 +1781,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp,
JsonbValue jbvbuf;
Datum value;
text *datetime;
Oid collid;
Oid typid;
int32 typmod = -1;
int tz = 0;
......@@ -1797,6 +1798,13 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp,
datetime = cstring_to_text_with_len(jb->val.string.val,
jb->val.string.len);
/*
* At some point we might wish to have callers supply the collation to
* use, but right now it's unclear that they'd be able to do better than
* DEFAULT_COLLATION_OID anyway.
*/
collid = DEFAULT_COLLATION_OID;
if (jsp->content.arg)
{
text *template;
......@@ -1814,7 +1822,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp,
template = cstring_to_text_with_len(template_str,
template_len);
value = parse_datetime(datetime, template, true,
value = parse_datetime(datetime, template, collid, true,
&typid, &typmod, &tz,
jspThrowErrors(cxt) ? NULL : &have_error);
......@@ -1858,7 +1866,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp,
MemoryContextSwitchTo(oldcxt);
}
value = parse_datetime(datetime, fmt_txt[i], true,
value = parse_datetime(datetime, fmt_txt[i], collid, true,
&typid, &typmod, &tz,
&have_error);
......@@ -1872,8 +1880,9 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp,
if (res == jperNotFound)
RETURN_ERROR(ereport(ERROR,
(errcode(ERRCODE_INVALID_ARGUMENT_FOR_JSON_DATETIME_FUNCTION),
errmsg("datetime format is not unrecognized"),
errhint("use datetime template argument for explicit format specification"))));
errmsg("datetime format is not recognized: \"%s\"",
text_to_cstring(datetime)),
errhint("Use a datetime template argument to specify the input data format."))));
}
pfree(datetime);
......
......@@ -96,11 +96,17 @@ char *locale_monetary;
char *locale_numeric;
char *locale_time;
/* lc_time localization cache */
char *localized_abbrev_days[7];
char *localized_full_days[7];
char *localized_abbrev_months[12];
char *localized_full_months[12];
/*
* lc_time localization cache.
*
* We use only the first 7 or 12 entries of these arrays. The last array
* element is left as NULL for the convenience of outside code that wants
* to sequentially scan these arrays.
*/
char *localized_abbrev_days[7 + 1];
char *localized_full_days[7 + 1];
char *localized_abbrev_months[12 + 1];
char *localized_full_months[12 + 1];
/* indicates whether locale information cache is valid */
static bool CurrentLocaleConvValid = false;
......@@ -922,6 +928,8 @@ cache_locale_time(void)
cache_single_string(&localized_full_days[i], bufptr, encoding);
bufptr += MAX_L10N_DATA;
}
localized_abbrev_days[7] = NULL;
localized_full_days[7] = NULL;
/* localized months */
for (i = 0; i < 12; i++)
......@@ -931,6 +939,8 @@ cache_locale_time(void)
cache_single_string(&localized_full_months[i], bufptr, encoding);
bufptr += MAX_L10N_DATA;
}
localized_abbrev_months[12] = NULL;
localized_full_months[12] = NULL;
CurrentLCTimeValid = true;
}
......
......@@ -26,7 +26,7 @@ extern char *asc_tolower(const char *buff, size_t nbytes);
extern char *asc_toupper(const char *buff, size_t nbytes);
extern char *asc_initcap(const char *buff, size_t nbytes);
extern Datum parse_datetime(text *date_txt, text *fmt, bool std,
extern Datum parse_datetime(text *date_txt, text *fmt, Oid collid, bool strict,
Oid *typid, int32 *typmod, int *tz,
bool *have_error);
......
......@@ -461,6 +461,22 @@ SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
01 NİS 2010
(1 row)
-- to_date
SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY');
to_date
------------
02-01-2010
(1 row)
SELECT to_date('01 Şub 2010', 'DD TMMON YYYY');
to_date
------------
02-01-2010
(1 row)
SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
ERROR: invalid value "1234567890ab" for "MONTH"
DETAIL: The given value did not match any of the allowed values for this field.
-- backwards parsing
CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
......
......@@ -1674,9 +1674,9 @@ select jsonb_path_query('[]', 'strict $.datetime()');
ERROR: jsonpath item method .datetime() can only be applied to a string
select jsonb_path_query('{}', '$.datetime()');
ERROR: jsonpath item method .datetime() can only be applied to a string
select jsonb_path_query('""', '$.datetime()');
ERROR: datetime format is not unrecognized
HINT: use datetime template argument for explicit format specification
select jsonb_path_query('"bogus"', '$.datetime()');
ERROR: datetime format is not recognized: "bogus"
HINT: Use a datetime template argument to specify the input data format.
select jsonb_path_query('"12:34"', '$.datetime("aaa")');
ERROR: invalid datetime format separator: "a"
select jsonb_path_query('"aaaa"', '$.datetime("HH24")');
......
......@@ -182,6 +182,12 @@ SELECT to_char(date '2010-02-01', 'DD TMMON YYYY' COLLATE "tr_TR");
SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');
SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
-- to_date
SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY');
SELECT to_date('01 Şub 2010', 'DD TMMON YYYY');
SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
-- backwards parsing
......
......@@ -352,7 +352,7 @@ select jsonb_path_query('1', '$.datetime()');
select jsonb_path_query('[]', '$.datetime()');
select jsonb_path_query('[]', 'strict $.datetime()');
select jsonb_path_query('{}', '$.datetime()');
select jsonb_path_query('""', '$.datetime()');
select jsonb_path_query('"bogus"', '$.datetime()');
select jsonb_path_query('"12:34"', '$.datetime("aaa")');
select jsonb_path_query('"aaaa"', '$.datetime("HH24")');
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment