Commit 22c92226 authored by Tom Lane

Fix de-escaping checks so that we will reject \000 as well as other invalidly
encoded sequences.  Per discussion of a couple of days ago.
parent c1c40e58
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.307 2009/03/31 22:12:46 tgl Exp $ * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.308 2009/04/19 21:08:54 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -2718,7 +2718,7 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals) ...@@ -2718,7 +2718,7 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
char *start_ptr; char *start_ptr;
char *end_ptr; char *end_ptr;
int input_len; int input_len;
bool saw_high_bit = false; bool saw_non_ascii = false;
/* Make sure space remains in fieldvals[] */ /* Make sure space remains in fieldvals[] */
if (fieldno >= maxfields) if (fieldno >= maxfields)
...@@ -2783,8 +2783,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals) ...@@ -2783,8 +2783,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
} }
} }
c = val & 0377; c = val & 0377;
if (IS_HIGHBIT_SET(c)) if (c == '\0' || IS_HIGHBIT_SET(c))
saw_high_bit = true; saw_non_ascii = true;
} }
break; break;
case 'x': case 'x':
...@@ -2808,8 +2808,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals) ...@@ -2808,8 +2808,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
} }
} }
c = val & 0xff; c = val & 0xff;
if (IS_HIGHBIT_SET(c)) if (c == '\0' || IS_HIGHBIT_SET(c))
saw_high_bit = true; saw_non_ascii = true;
} }
} }
break; break;
...@@ -2847,11 +2847,11 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals) ...@@ -2847,11 +2847,11 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
*output_ptr++ = '\0'; *output_ptr++ = '\0';
/* /*
* If we de-escaped a char with the high bit set, make sure we still * If we de-escaped a non-7-bit-ASCII char, make sure we still
* have valid data for the db encoding. Avoid calling strlen here for * have valid data for the db encoding. Avoid calling strlen here for
* the sake of efficiency. * the sake of efficiency.
*/ */
if (saw_high_bit) if (saw_non_ascii)
{ {
char *fld = fieldvals[fieldno]; char *fld = fieldvals[fieldno];
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.150 2009/04/14 22:18:47 tgl Exp $ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.151 2009/04/19 21:08:54 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -60,7 +60,7 @@ bool escape_string_warning = true; ...@@ -60,7 +60,7 @@ bool escape_string_warning = true;
bool standard_conforming_strings = false; bool standard_conforming_strings = false;
static bool warn_on_first_escape; static bool warn_on_first_escape;
static bool saw_high_bit = false; static bool saw_non_ascii = false;
/* /*
* literalbuf is used to accumulate literal values when multiple rules * literalbuf is used to accumulate literal values when multiple rules
...@@ -453,7 +453,7 @@ other . ...@@ -453,7 +453,7 @@ other .
{xqstart} { {xqstart} {
warn_on_first_escape = true; warn_on_first_escape = true;
saw_high_bit = false; saw_non_ascii = false;
SET_YYLLOC(); SET_YYLLOC();
if (standard_conforming_strings) if (standard_conforming_strings)
BEGIN(xq); BEGIN(xq);
...@@ -463,7 +463,7 @@ other . ...@@ -463,7 +463,7 @@ other .
} }
{xestart} { {xestart} {
warn_on_first_escape = false; warn_on_first_escape = false;
saw_high_bit = false; saw_non_ascii = false;
SET_YYLLOC(); SET_YYLLOC();
BEGIN(xe); BEGIN(xe);
startlit(); startlit();
...@@ -477,10 +477,11 @@ other . ...@@ -477,10 +477,11 @@ other .
<xq,xe>{quotefail} { <xq,xe>{quotefail} {
yyless(1); yyless(1);
BEGIN(INITIAL); BEGIN(INITIAL);
/* check that the data remains valid if it might have been /*
* check that the data remains valid if it might have been
* made invalid by unescaping any chars. * made invalid by unescaping any chars.
*/ */
if (saw_high_bit) if (saw_non_ascii)
pg_verifymbstr(literalbuf, literallen, false); pg_verifymbstr(literalbuf, literallen, false);
yylval.str = litbufdup(); yylval.str = litbufdup();
return SCONST; return SCONST;
...@@ -526,16 +527,16 @@ other . ...@@ -526,16 +527,16 @@ other .
check_escape_warning(); check_escape_warning();
addlitchar(c); addlitchar(c);
if (IS_HIGHBIT_SET(c)) if (c == '\0' || IS_HIGHBIT_SET(c))
saw_high_bit = true; saw_non_ascii = true;
} }
<xe>{xehexesc} { <xe>{xehexesc} {
unsigned char c = strtoul(yytext+2, NULL, 16); unsigned char c = strtoul(yytext+2, NULL, 16);
check_escape_warning(); check_escape_warning();
addlitchar(c); addlitchar(c);
if (IS_HIGHBIT_SET(c)) if (c == '\0' || IS_HIGHBIT_SET(c))
saw_high_bit = true; saw_non_ascii = true;
} }
<xq,xe,xus>{quotecontinue} { <xq,xe,xus>{quotecontinue} {
/* ignore */ /* ignore */
...@@ -1083,6 +1084,11 @@ litbuf_udeescape(unsigned char escape) ...@@ -1083,6 +1084,11 @@ litbuf_udeescape(unsigned char escape)
} }
*out = '\0'; *out = '\0';
/*
* We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
* codes; but it's probably not worth the trouble, since this isn't
* likely to be a performance-critical path.
*/
pg_verifymbstr(new, out - new, false); pg_verifymbstr(new, out - new, false);
return new; return new;
} }
...@@ -1090,14 +1096,6 @@ litbuf_udeescape(unsigned char escape) ...@@ -1090,14 +1096,6 @@ litbuf_udeescape(unsigned char escape)
static unsigned char static unsigned char
unescape_single_char(unsigned char c) unescape_single_char(unsigned char c)
{ {
/* Normally we wouldn't expect to see \n where n has its high bit set
* but we set the flag to check the string if we do get it, so
* that this doesn't become a way of getting around the coding validity
* checks.
*/
if (IS_HIGHBIT_SET(c))
saw_high_bit = true;
switch (c) switch (c)
{ {
case 'b': case 'b':
...@@ -1111,6 +1109,10 @@ unescape_single_char(unsigned char c) ...@@ -1111,6 +1109,10 @@ unescape_single_char(unsigned char c)
case 't': case 't':
return '\t'; return '\t';
default: default:
/* check for backslash followed by non-7-bit-ASCII */
if (c == '\0' || IS_HIGHBIT_SET(c))
saw_non_ascii = true;
return c; return c;
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment