Commit 78ed8e03 authored by Andrew Dunstan

Fix unescaping of JSON Unicode escapes, especially for non-UTF8.

Per discussion on -hackers. When unescaping Unicode escapes, we treat
them similarly to the way we treat them in PostgreSQL string literals.
Escapes in the ASCII range are always accepted, no matter what the
database encoding. Escapes for higher code points are only processed in
UTF8 databases, and attempts to process them in other databases will
result in an error. \u0000 is never unescaped, since it would result in
an impermissible null byte.
parent c1d729b4
...@@ -10159,6 +10159,17 @@ table2-mapping ...@@ -10159,6 +10159,17 @@ table2-mapping
</para> </para>
</note> </note>
<note>
<para>
Many of these functions and operators will convert Unicode escapes
in the JSON text to the appropriate UTF8 character when the database encoding is UTF8. In
other encodings the escape sequence must be for an ASCII character, and any other code point
in a Unicode escape sequence will result in an error.
In general, it is best to avoid mixing Unicode escapes in JSON with a non-UTF8 database
encoding, if possible.
</para>
</note>
<note> <note>
<para> <para>
The <xref linkend="hstore"> extension has a cast from <type>hstore</type> to The <xref linkend="hstore"> extension has a cast from <type>hstore</type> to
......
...@@ -717,7 +717,6 @@ json_lex_string(JsonLexContext *lex) ...@@ -717,7 +717,6 @@ json_lex_string(JsonLexContext *lex)
{ {
char utf8str[5]; char utf8str[5];
int utf8len; int utf8len;
char *converted;
if (ch >= 0xd800 && ch <= 0xdbff) if (ch >= 0xd800 && ch <= 0xdbff)
{ {
...@@ -749,13 +748,40 @@ json_lex_string(JsonLexContext *lex) ...@@ -749,13 +748,40 @@ json_lex_string(JsonLexContext *lex)
errdetail("low order surrogate must follow a high order surrogate."), errdetail("low order surrogate must follow a high order surrogate."),
report_json_context(lex))); report_json_context(lex)));
/*
* For UTF8, replace the escape sequence by the actual utf8
* character in lex->strval. Do this also for other encodings
* if the escape designates an ASCII character, otherwise
* raise an error. We don't ever unescape a \u0000, since that
* would result in an impermissible nul byte.
*/
if (ch == 0)
{
appendStringInfoString(lex->strval, "\\u0000");
}
else if (GetDatabaseEncoding() == PG_UTF8)
{
unicode_to_utf8(ch, (unsigned char *) utf8str); unicode_to_utf8(ch, (unsigned char *) utf8str);
utf8len = pg_utf_mblen((unsigned char *) utf8str); utf8len = pg_utf_mblen((unsigned char *) utf8str);
utf8str[utf8len] = '\0'; appendBinaryStringInfo(lex->strval, utf8str, utf8len);
converted = pg_any_to_server(utf8str, utf8len, PG_UTF8); }
appendStringInfoString(lex->strval, converted); else if (ch <= 0x007f)
if (converted != utf8str) {
pfree(converted); /*
* This is the only way to designate things like a form feed
* character in JSON, so it's useful in all encodings.
*/
appendStringInfoChar(lex->strval, (char) ch);
}
else
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type json"),
errdetail("Unicode escape for code points higher than U+007F not permitted in non-UTF8 encoding"),
report_json_context(lex)));
}
} }
} }
......
...@@ -921,8 +921,8 @@ ERROR: cannot call json_populate_recordset on a nested object ...@@ -921,8 +921,8 @@ ERROR: cannot call json_populate_recordset on a nested object
select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q; select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
ERROR: cannot call json_populate_recordset on a nested object ERROR: cannot call json_populate_recordset on a nested object
-- handling of unicode surrogate pairs -- handling of unicode surrogate pairs
select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct; select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
correct correct_in_utf8
---------------------------- ----------------------------
"\ud83d\ude04\ud83d\udc36" "\ud83d\ude04\ud83d\udc36"
(1 row) (1 row)
...@@ -943,3 +943,22 @@ select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ...@@ -943,3 +943,22 @@ select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
ERROR: invalid input syntax for type json ERROR: invalid input syntax for type json
DETAIL: low order surrogate must follow a high order surrogate. DETAIL: low order surrogate must follow a high order surrogate.
CONTEXT: JSON data, line 1: { "a":... CONTEXT: JSON data, line 1: { "a":...
--handling of simple unicode escapes
select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
correct_in_utf8
----------------------
the Copyright © sign
(1 row)
select json '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
correct_everywhere
--------------------
dollar $ character
(1 row)
select json '{ "a": "null \u0000 escape" }' ->> 'a' as not_unescaped;
not_unescaped
--------------------
null \u0000 escape
(1 row)
This diff is collapsed.
...@@ -299,8 +299,14 @@ select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,3 ...@@ -299,8 +299,14 @@ select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,3
-- handling of unicode surrogate pairs -- handling of unicode surrogate pairs
select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct; select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
--handling of simple unicode escapes
select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
select json '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
select json '{ "a": "null \u0000 escape" }' ->> 'a' as not_unescaped;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment