Commit 02faeb4a authored by Peter Eisentraut

Surrogate pair support for U& string and identifier syntax

This is mainly to make the functionality consistent with the proposed \u
escape syntax.
parent c6bc0feb
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.134 2009/08/27 20:08:02 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
<chapter id="sql-syntax"> <chapter id="sql-syntax">
<title>SQL Syntax</title> <title>SQL Syntax</title>
...@@ -238,6 +238,10 @@ U&amp;"d!0061t!+000061" UESCAPE '!' ...@@ -238,6 +238,10 @@ U&amp;"d!0061t!+000061" UESCAPE '!'
The Unicode escape syntax works only when the server encoding is The Unicode escape syntax works only when the server encoding is
UTF8. When other server encodings are used, only code points in UTF8. When other server encodings are used, only code points in
the ASCII range (up to <literal>\007F</literal>) can be specified. the ASCII range (up to <literal>\007F</literal>) can be specified.
Both the 4-digit and the 6-digit forms can be used to specify
UTF-16 surrogate pairs to compose characters with code points
larger than <literal>\FFFF</literal> (although the availability of
the 6-digit form technically makes this unnecessary).
</para> </para>
<para> <para>
...@@ -497,6 +501,10 @@ U&amp;'d!0061t!+000061' UESCAPE '!' ...@@ -497,6 +501,10 @@ U&amp;'d!0061t!+000061' UESCAPE '!'
UTF8. When other server encodings are used, only code points in UTF8. When other server encodings are used, only code points in
the ASCII range (up to <literal>\007F</literal>) can be the ASCII range (up to <literal>\007F</literal>) can be
specified. specified.
Both the 4-digit and the 6-digit forms can be used to specify
UTF-16 surrogate pairs to compose characters with code points
larger than <literal>\FFFF</literal> (although the availability
of the 6-digit form technically makes this unnecessary).
</para> </para>
<para> <para>
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner) ...@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
} }
} }
/*
 * Report whether c is usable as the first element of a UTF-16 surrogate
 * pair, i.e. whether it lies in the inclusive range 0xD800..0xDBFF.
 */
static bool
is_utf16_surrogate_first(pg_wchar c)
{
	/* true exactly when c is neither below nor above the range */
	return !(c < 0xD800 || c > 0xDBFF);
}
/*
 * Report whether c is usable as the second element of a UTF-16 surrogate
 * pair, i.e. whether it lies in the inclusive range 0xDC00..0xDFFF.
 */
static bool
is_utf16_surrogate_second(pg_wchar c)
{
	/* true exactly when c is neither below nor above the range */
	return !(c < 0xDC00 || c > 0xDFFF);
}
/*
 * Combine the two halves of a UTF-16 surrogate pair into one code point.
 *
 * The low 10 bits of "first" become bits 10..19 of the result and the low
 * 10 bits of "second" become bits 0..9; 0x10000 is then added.  No
 * validation is done here: the arguments are simply masked, so the caller
 * is responsible for having checked that they really are surrogates.
 */
static pg_wchar
surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
{
	pg_wchar	upper_part = (first & 0x3FF) << 10;
	pg_wchar	lower_part = second & 0x3FF;

	return upper_part + 0x10000 + lower_part;
}
static char * static char *
litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
{ {
char *new; char *new;
char *litbuf, *in, *out; char *litbuf, *in, *out;
pg_wchar pair_first = 0;
if (isxdigit(escape) if (isxdigit(escape)
|| escape == '+' || escape == '+'
...@@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) ...@@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
{ {
if (in[1] == escape) if (in[1] == escape)
{ {
if (pair_first)
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
*out++ = escape; *out++ = escape;
in += 2; in += 2;
} }
...@@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) ...@@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
{ {
pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]); pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
check_unicode_value(unicode, in, yyscanner); check_unicode_value(unicode, in, yyscanner);
unicode_to_utf8(unicode, (unsigned char *) out); if (pair_first)
{
if (is_utf16_surrogate_second(unicode))
{
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
pair_first = 0;
}
else
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
}
if (is_utf16_surrogate_first(unicode))
pair_first = unicode;
else
{
unicode_to_utf8(unicode, (unsigned char *) out);
out += pg_mblen(out);
}
in += 5; in += 5;
out += pg_mblen(out);
} }
else if (in[1] == '+' else if (in[1] == '+'
&& isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[2]) && isxdigit(in[3])
...@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) ...@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16 pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]); + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
check_unicode_value(unicode, in, yyscanner); check_unicode_value(unicode, in, yyscanner);
unicode_to_utf8(unicode, (unsigned char *) out); if (pair_first)
{
if (is_utf16_surrogate_second(unicode))
{
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
pair_first = 0;
}
else
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
}
if (is_utf16_surrogate_first(unicode))
pair_first = unicode;
else
{
unicode_to_utf8(unicode, (unsigned char *) out);
out += pg_mblen(out);
}
in += 8; in += 8;
out += pg_mblen(out);
} }
else else
{ {
...@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) ...@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
} }
} }
else else
{
if (pair_first)
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
*out++ = *in++; *out++ = *in++;
}
} }
*out = '\0'; *out = '\0';
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment