Commit c2bb0378 authored by Peter Eisentraut's avatar Peter Eisentraut

Unicode escapes in E'...' strings

Author: Marko Kreen <markokr@gmail.com>
parent 9048b731
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.136 2009/09/22 23:52:53 petere Exp $ -->
<chapter id="sql-syntax"> <chapter id="sql-syntax">
<title>SQL Syntax</title> <title>SQL Syntax</title>
...@@ -398,6 +398,14 @@ SELECT 'foo' 'bar'; ...@@ -398,6 +398,14 @@ SELECT 'foo' 'bar';
</entry> </entry>
<entry>hexadecimal byte value</entry> <entry>hexadecimal byte value</entry>
</row> </row>
<row>
<entry>
<literal>\u<replaceable>xxxx</replaceable></literal>,
<literal>\U<replaceable>xxxxxxxx</replaceable></literal>
(<replaceable>x</replaceable> = 0 - 9, A - F)
</entry>
<entry>16 or 32-bit hexadecimal Unicode character value</entry>
</row>
</tbody> </tbody>
</tgroup> </tgroup>
</table> </table>
...@@ -411,13 +419,25 @@ SELECT 'foo' 'bar'; ...@@ -411,13 +419,25 @@ SELECT 'foo' 'bar';
</para> </para>
<para> <para>
It is your responsibility that the byte sequences you create are It is your responsibility that the byte sequences you create,
especially when using the octal or hexadecimal escapes, compose
valid characters in the server character set encoding. When the valid characters in the server character set encoding. When the
server encoding is UTF-8, then the alternative Unicode escape server encoding is UTF-8, then the Unicode escapes or the
syntax, explained in <xref linkend="sql-syntax-strings-uescape">, alternative Unicode escape syntax, explained
should be used instead. (The alternative would be doing the in <xref linkend="sql-syntax-strings-uescape">, should be used
UTF-8 encoding by hand and writing out the bytes, which would be instead. (The alternative would be doing the UTF-8 encoding by
very cumbersome.) hand and writing out the bytes, which would be very cumbersome.)
</para>
<para>
The Unicode escape syntax works fully only when the server
encoding is UTF-8. When other server encodings are used, only
code points in the ASCII range (up to <literal>\u007F</>) can be
specified. Both the 4-digit and the 8-digit form can be used to
specify UTF-16 surrogate pairs to compose characters with code
points larger than <literal>\uFFFF</literal> (although the
availability of the 8-digit form technically makes this
unnecessary).
</para> </para>
<caution> <caution>
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner); ...@@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
static char *litbufdup(base_yyscan_t yyscanner); static char *litbufdup(base_yyscan_t yyscanner);
static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner); static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner);
static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner); static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner);
static bool is_utf16_surrogate_first(pg_wchar c);
static bool is_utf16_surrogate_second(pg_wchar c);
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
#define yyerror(msg) scanner_yyerror(msg, yyscanner) #define yyerror(msg) scanner_yyerror(msg, yyscanner)
...@@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner); ...@@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
extern int base_yyget_column(yyscan_t yyscanner); extern int base_yyget_column(yyscan_t yyscanner);
extern void base_yyset_column(int column_no, yyscan_t yyscanner); extern void base_yyset_column(int column_no, yyscan_t yyscanner);
static void addunicode(pg_wchar c, yyscan_t yyscanner);
%} %}
%option reentrant %option reentrant
...@@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner); ...@@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
* <xdolq> $foo$ quoted strings * <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes * <xui> quoted identifier with Unicode escapes
* <xus> quoted string with Unicode escapes * <xus> quoted string with Unicode escapes
* <xeu> Unicode surrogate pair in extended quoted string
*/ */
%x xb %x xb
...@@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner); ...@@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
%x xdolq %x xdolq
%x xui %x xui
%x xus %x xus
%x xeu
/* /*
* In order to make the world safe for Windows and Mac clients as well as * In order to make the world safe for Windows and Mac clients as well as
...@@ -223,6 +230,8 @@ xeinside [^\\']+ ...@@ -223,6 +230,8 @@ xeinside [^\\']+
xeescape [\\][^0-7] xeescape [\\][^0-7]
xeoctesc [\\][0-7]{1,3} xeoctesc [\\][0-7]{1,3}
xehexesc [\\]x[0-9A-Fa-f]{1,2} xehexesc [\\]x[0-9A-Fa-f]{1,2}
xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodebad [\\]([uU])
/* Extended quote /* Extended quote
* xqdouble implements embedded quote, '''' * xqdouble implements embedded quote, ''''
...@@ -535,6 +544,45 @@ other . ...@@ -535,6 +544,45 @@ other .
<xe>{xeinside} { <xe>{xeinside} {
addlit(yytext, yyleng, yyscanner); addlit(yytext, yyleng, yyscanner);
} }
<xe>{xeunicode} {
/*
 * A complete \uXXXX or \UXXXXXXXX escape inside an extended (xe) string.
 * Skip the two-character backslash-plus-letter prefix and parse the
 * remaining hex digits as the code point.
 */
pg_wchar c = strtoul(yytext+2, NULL, 16);
check_escape_warning(yyscanner);
if (is_utf16_surrogate_first(c))
{
/*
 * First half of a UTF-16 surrogate pair: remember it and switch to
 * the xeu state, whose rules handle whatever comes next.
 */
yyextra->utf16_first_part = c;
BEGIN(xeu);
}
else if (is_utf16_surrogate_second(c))
/* A second half with no first half before it is rejected. */
yyerror("invalid Unicode surrogate pair");
else
/* Not a surrogate: append the code point to the literal directly. */
addunicode(c, yyscanner);
}
<xeu>{xeunicode} {
/*
 * The xeu state is entered from the <xe>{xeunicode} rule after it has
 * stored a first surrogate in yyextra->utf16_first_part, so the escape
 * matched here has to be the second surrogate of the pair.
 */
pg_wchar c = strtoul(yytext+2, NULL, 16);
if (!is_utf16_surrogate_second(c))
yyerror("invalid Unicode surrogate pair");
/*
 * NOTE(review): the lines below run unconditionally, so this relies on
 * yyerror() not returning when the check above fails -- confirm.
 */
c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
addunicode(c, yyscanner);
/* Pair consumed; go back to scanning the extended string. */
BEGIN(xe);
}
<xeu>. |
<xeu>\n |
<xeu><<EOF>> { /* any other character, a newline, or end of input while a second surrogate is pending is an error */ yyerror("invalid Unicode surrogate pair"); }
<xe>{xeunicodebad} {
/*
 * xeunicodebad matches only a backslash followed by u or U.  A complete
 * \uXXXX or \UXXXXXXXX escape is a longer match for xeunicode and is
 * preferred by the scanner, so this action runs only when the required
 * hex digits are missing.  The rule is listed ahead of xeescape, which
 * matches the same two characters and would otherwise take them.
 */
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("invalid Unicode escape"),
errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
lexer_errposition()));
}
<xe>{xeescape} { <xe>{xeescape} {
if (yytext[1] == '\'') if (yytext[1] == '\'')
{ {
...@@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner) ...@@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
if (ptr) if (ptr)
pfree(ptr); pfree(ptr);
} }
/*
 * addunicode
 *		Append the Unicode code point "c" to the literal being scanned.
 *
 * A value of zero or anything above 0x10FFFF is reported through yyerror().
 * Code points above 0x7F are accepted only when GetDatabaseEncoding()
 * returns PG_UTF8; accepting one also sets yyextra->saw_non_ascii.  The code
 * point is converted with unicode_to_utf8() and the resulting bytes are
 * appended with addlit(), using the length that pg_mblen() reports for them.
 *
 * The statements following each yyerror() call run unconditionally, so this
 * presumably relies on yyerror() not returning -- TODO confirm.
 *
 * NOTE(review): the forward declaration earlier in this file spells the
 * second parameter's type as yyscan_t, while this definition uses
 * base_yyscan_t.  Presumably both name the same underlying type; confirm
 * and make the two agree.
 */
static void
addunicode(pg_wchar c, base_yyscan_t yyscanner)
{
	unsigned char utf8_seq[8];

	/* Range check first: zero and values past the last code point. */
	if (c > 0x10FFFF || c == 0)
		yyerror("invalid Unicode escape value");

	/* Anything outside ASCII needs a UTF8 server encoding. */
	if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8)
		yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");

	if (c > 0x7F)
		yyextra->saw_non_ascii = true;

	unicode_to_utf8(c, utf8_seq);
	addlit((char *) utf8_seq, pg_mblen((char *) utf8_seq), yyscanner);
}
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $ * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -71,6 +71,9 @@ typedef struct base_yy_extra_type ...@@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
int xcdepth; /* depth of nesting in slash-star comments */ int xcdepth; /* depth of nesting in slash-star comments */
char *dolqstart; /* current $foo$ quote start string */ char *dolqstart; /* current $foo$ quote start string */
/* first part of UTF16 surrogate pair for Unicode escapes */
int32 utf16_first_part;
/* state variables for literal-lexing warnings */ /* state variables for literal-lexing warnings */
bool warn_on_first_escape; bool warn_on_first_escape;
bool saw_non_ascii; bool saw_non_ascii;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment