Commit 94e3311b authored by Andrew Dunstan

Handle Unicode surrogate pairs correctly when processing JSON.

In 9.2, Unicode escape sequences are not analysed at all other than
to make sure that they are in the form \uXXXX. But in 9.3 many of the
new operators and functions try to turn JSON text values into text in
the server encoding, and this includes de-escaping Unicode escape
sequences. This processing had not taken into account the possibility
that the escaped text might contain a surrogate pair designating a
character outside the Basic Multilingual Plane (BMP). That is now
handled correctly.

This also enforces correct use of surrogate pairs, something that is not
done by the type's input routines. This fact is noted in the docs.
parent c99d5d1b
...@@ -10150,6 +10150,15 @@ table2-mapping ...@@ -10150,6 +10150,15 @@ table2-mapping
</tgroup> </tgroup>
</table> </table>
<note>
<para>
The <type>json</type> functions and operators can impose stricter validity requirements
than the type's input functions. In particular, they check much more closely that any use
of Unicode surrogate pairs to designate characters outside the Unicode Basic Multilingual
Plane is correct.
</para>
</note>
<note> <note>
<para> <para>
The <xref linkend="hstore"> extension has a cast from <type>hstore</type> to The <xref linkend="hstore"> extension has a cast from <type>hstore</type> to
......
...@@ -646,6 +646,7 @@ json_lex_string(JsonLexContext *lex) ...@@ -646,6 +646,7 @@ json_lex_string(JsonLexContext *lex)
{ {
char *s; char *s;
int len; int len;
int hi_surrogate = -1;
if (lex->strval != NULL) if (lex->strval != NULL)
resetStringInfo(lex->strval); resetStringInfo(lex->strval);
...@@ -718,6 +719,36 @@ json_lex_string(JsonLexContext *lex) ...@@ -718,6 +719,36 @@ json_lex_string(JsonLexContext *lex)
int utf8len; int utf8len;
char *converted; char *converted;
if (ch >= 0xd800 && ch <= 0xdbff)
{
if (hi_surrogate != -1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type json"),
errdetail("high order surrogate must not follow a high order surrogate."),
report_json_context(lex)));
hi_surrogate = (ch & 0x3ff) << 10;
continue;
}
else if (ch >= 0xdc00 && ch <= 0xdfff)
{
if (hi_surrogate == -1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type json"),
errdetail("low order surrogate must follow a high order surrogate."),
report_json_context(lex)));
ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
hi_surrogate = -1;
}
if (hi_surrogate != -1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type json"),
errdetail("low order surrogate must follow a high order surrogate."),
report_json_context(lex)));
unicode_to_utf8(ch, (unsigned char *) utf8str); unicode_to_utf8(ch, (unsigned char *) utf8str);
utf8len = pg_utf_mblen((unsigned char *) utf8str); utf8len = pg_utf_mblen((unsigned char *) utf8str);
utf8str[utf8len] = '\0'; utf8str[utf8len] = '\0';
...@@ -730,6 +761,13 @@ json_lex_string(JsonLexContext *lex) ...@@ -730,6 +761,13 @@ json_lex_string(JsonLexContext *lex)
} }
else if (lex->strval != NULL) else if (lex->strval != NULL)
{ {
if (hi_surrogate != -1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type json"),
errdetail("low order surrogate must follow a high order surrogate."),
report_json_context(lex)));
switch (*s) switch (*s)
{ {
case '"': case '"':
...@@ -784,11 +822,25 @@ json_lex_string(JsonLexContext *lex) ...@@ -784,11 +822,25 @@ json_lex_string(JsonLexContext *lex)
} }
else if (lex->strval != NULL) else if (lex->strval != NULL)
{ {
if (hi_surrogate != -1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type json"),
errdetail("low order surrogate must follow a high order surrogate."),
report_json_context(lex)));
appendStringInfoChar(lex->strval, *s); appendStringInfoChar(lex->strval, *s);
} }
} }
if (hi_surrogate != -1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type json"),
errdetail("low order surrogate must follow a high order surrogate."),
report_json_context(lex)));
/* Hooray, we found the end of the string! */ /* Hooray, we found the end of the string! */
lex->prev_token_terminator = lex->token_terminator; lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = s + 1; lex->token_terminator = s + 1;
......
...@@ -920,3 +920,26 @@ select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":[100,200,3 ...@@ -920,3 +920,26 @@ select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":[100,200,3
ERROR: cannot call json_populate_recordset on a nested object ERROR: cannot call json_populate_recordset on a nested object
select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q; select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
ERROR: cannot call json_populate_recordset on a nested object ERROR: cannot call json_populate_recordset on a nested object
-- handling of unicode surrogate pairs
select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct;
correct
----------------------------
"\ud83d\ude04\ud83d\udc36"
(1 row)
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
ERROR: invalid input syntax for type json
DETAIL: high order surrogate must not follow a high order surrogate.
CONTEXT: JSON data, line 1: { "a":...
select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
ERROR: invalid input syntax for type json
DETAIL: low order surrogate must follow a high order surrogate.
CONTEXT: JSON data, line 1: { "a":...
select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
ERROR: invalid input syntax for type json
DETAIL: low order surrogate must follow a high order surrogate.
CONTEXT: JSON data, line 1: { "a":...
select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
ERROR: invalid input syntax for type json
DETAIL: low order surrogate must follow a high order surrogate.
CONTEXT: JSON data, line 1: { "a":...
...@@ -296,3 +296,11 @@ select * from json_populate_recordset(null::jpop,'[{"a":"blurfl","x":43.2},{"b": ...@@ -296,3 +296,11 @@ select * from json_populate_recordset(null::jpop,'[{"a":"blurfl","x":43.2},{"b":
select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":"blurfl","x":43.2},{"b":3,"c":"2012-01-20 10:42:53"}]') q; select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":"blurfl","x":43.2},{"b":3,"c":"2012-01-20 10:42:53"}]') q;
select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q; select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q; select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
-- handling of unicode surrogate pairs
select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct;
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment