Commit 5fdd9418 authored by Bruce Momjian

Handle carriage returns and line feeds in COPY CSV mode.

Andrew Dunstan
parent 06a61d66
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236 2004/12/31 21:59:41 pgsql Exp $ * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.237 2005/03/12 05:41:34 momjian Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -98,7 +98,6 @@ static bool fe_eof; /* true if detected end of copy data */ ...@@ -98,7 +98,6 @@ static bool fe_eof; /* true if detected end of copy data */
static EolType eol_type; /* EOL type of input */ static EolType eol_type; /* EOL type of input */
static int client_encoding; /* remote side's character encoding */ static int client_encoding; /* remote side's character encoding */
static int server_encoding; /* local encoding */ static int server_encoding; /* local encoding */
static bool embedded_line_warning;
/* these are just for error messages, see copy_in_error_callback */ /* these are just for error messages, see copy_in_error_callback */
static bool copy_binary; /* is it a binary copy? */ static bool copy_binary; /* is it a binary copy? */
...@@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids, ...@@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids, static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
char *delim, char *null_print, bool csv_mode, char *quote, char *escape, char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
List *force_notnull_atts); List *force_notnull_atts);
static bool CopyReadLine(void); static bool CopyReadLine(char * quote, char * escape);
static char *CopyReadAttribute(const char *delim, const char *null_print, static char *CopyReadAttribute(const char *delim, const char *null_print,
CopyReadResult *result, bool *isnull); CopyReadResult *result, bool *isnull);
static char *CopyReadAttributeCSV(const char *delim, const char *null_print, static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
...@@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids, ...@@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
attr = tupDesc->attrs; attr = tupDesc->attrs;
num_phys_attrs = tupDesc->natts; num_phys_attrs = tupDesc->natts;
attr_count = list_length(attnumlist); attr_count = list_length(attnumlist);
embedded_line_warning = false;
/* /*
* Get info about the columns we need to process. * Get info about the columns we need to process.
...@@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids, ...@@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
ListCell *cur; ListCell *cur;
/* Actually read the line into memory here */ /* Actually read the line into memory here */
done = CopyReadLine(); done = csv_mode ?
CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);
/* /*
* EOF at start of line means we're done. If we see EOF after * EOF at start of line means we're done. If we see EOF after
...@@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids, ...@@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
* by newline. * by newline.
*/ */
static bool static bool
CopyReadLine(void) CopyReadLine(char * quote, char * escape)
{ {
bool result; bool result;
bool change_encoding = (client_encoding != server_encoding); bool change_encoding = (client_encoding != server_encoding);
...@@ -2015,6 +2014,19 @@ CopyReadLine(void) ...@@ -2015,6 +2014,19 @@ CopyReadLine(void)
int j; int j;
unsigned char s[2]; unsigned char s[2];
char *cvt; char *cvt;
bool in_quote = false, last_was_esc = false, csv_mode = false;
char quotec = '\0', escapec = '\0';
if (quote)
{
csv_mode = true;
quotec = quote[0];
escapec = escape[0];
/* ignore special escape processing if it's the same as quotec */
if (quotec == escapec)
escapec = '\0';
}
s[1] = 0; s[1] = 0;
...@@ -2031,11 +2043,20 @@ CopyReadLine(void) ...@@ -2031,11 +2043,20 @@ CopyReadLine(void)
/* /*
* In this loop we only care for detecting newlines (\r and/or \n) and * In this loop we only care for detecting newlines (\r and/or \n) and
* the end-of-copy marker (\.). For backwards compatibility we allow * the end-of-copy marker (\.).
*
* In Text mode, for backwards compatibility we allow
* backslashes to escape newline characters. Backslashes other than * backslashes to escape newline characters. Backslashes other than
* the end marker get put into the line_buf, since CopyReadAttribute * the end marker get put into the line_buf, since CopyReadAttribute
* does its own escape processing. These four characters, and only * does its own escape processing.
* these four, are assumed the same in frontend and backend encodings. *
* In CSV mode, CR and NL inside a quoted field are just part of the
* data value and are put in line_buf. We keep just enough state
* to know if we are currently in a quoted field or not.
*
* These four characters, and only these four, are assumed the same in
* frontend and backend encodings.
*
* We do not assume that second and later bytes of a frontend * We do not assume that second and later bytes of a frontend
* multibyte character couldn't look like ASCII characters. * multibyte character couldn't look like ASCII characters.
*/ */
...@@ -2047,13 +2068,49 @@ CopyReadLine(void) ...@@ -2047,13 +2068,49 @@ CopyReadLine(void)
result = true; result = true;
break; break;
} }
if (c == '\r')
if (csv_mode)
{
/*
* Dealing with quotes and escapes here is mildly tricky. If the
* quote char is also the escape char, there's no problem - we
* just use the char as a toggle. If they are different, we need
* to ensure that we only take account of an escape inside a quoted
* field and immediately preceding a quote char, and not the
* second in an escape-escape sequence.
*/
if (in_quote && c == escapec)
last_was_esc = ! last_was_esc;
if (c == quotec && ! last_was_esc)
in_quote = ! in_quote;
if (c != escapec)
last_was_esc = false;
/*
* updating the line count for embedded CR and/or LF chars is
* necessarily a little fragile - this test is probably about
* the best we can do.
*/
if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
copy_lineno++;
}
if (!in_quote && c == '\r')
{ {
if (eol_type == EOL_NL) if (eol_type == EOL_NL)
ereport(ERROR, {
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT), if (! csv_mode)
errmsg("literal carriage return found in data"), ereport(ERROR,
errhint("Use \"\\r\" to represent carriage return."))); (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("literal carriage return found in data"),
errhint("Use \"\\r\" to represent carriage return.")));
else
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("unquoted carriage return found in CSV data"),
errhint("Use quoted CSV field to represent carriage return.")));
}
/* Check for \r\n on first line, _and_ handle \r\n. */ /* Check for \r\n on first line, _and_ handle \r\n. */
if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL) if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
{ {
...@@ -2068,10 +2125,19 @@ CopyReadLine(void) ...@@ -2068,10 +2125,19 @@ CopyReadLine(void)
{ {
/* found \r, but no \n */ /* found \r, but no \n */
if (eol_type == EOL_CRNL) if (eol_type == EOL_CRNL)
ereport(ERROR, {
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT), if (!csv_mode)
errmsg("literal carriage return found in data"), ereport(ERROR,
errhint("Use \"\\r\" to represent carriage return."))); (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("literal carriage return found in data"),
errhint("Use \"\\r\" to represent carriage return.")));
else
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("unquoted carriage return found in data"),
errhint("Use quoted CSV field to represent carriage return.")));
}
/* /*
* if we got here, it is the first line and we didn't * if we got here, it is the first line and we didn't
...@@ -2083,26 +2149,47 @@ CopyReadLine(void) ...@@ -2083,26 +2149,47 @@ CopyReadLine(void)
} }
break; break;
} }
if (c == '\n') if (!in_quote && c == '\n')
{ {
if (eol_type == EOL_CR || eol_type == EOL_CRNL) if (eol_type == EOL_CR || eol_type == EOL_CRNL)
ereport(ERROR, {
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT), if (!csv_mode)
errmsg("literal newline found in data"), ereport(ERROR,
errhint("Use \"\\n\" to represent newline."))); (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("literal newline found in data"),
errhint("Use \"\\n\" to represent newline.")));
else
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("unquoted newline found in data"),
errhint("Use quoted CSV field to represent newline.")));
}
eol_type = EOL_NL; eol_type = EOL_NL;
break; break;
} }
if (c == '\\')
if ((line_buf.len == 0 || !csv_mode) && c == '\\')
{ {
c = CopyGetChar(); int c2;
if (c == EOF)
if (csv_mode)
c2 = CopyPeekChar();
else
c2 = c = CopyGetChar();
if (c2 == EOF)
{ {
result = true; result = true;
if (csv_mode)
CopyDonePeek(c2, true);
break; break;
} }
if (c == '.') if (c2 == '.')
{ {
if (csv_mode)
CopyDonePeek(c2, true); /* allow keep calling GetChar() */
if (eol_type == EOL_CRNL) if (eol_type == EOL_CRNL)
{ {
c = CopyGetChar(); c = CopyGetChar();
...@@ -2140,8 +2227,12 @@ CopyReadLine(void) ...@@ -2140,8 +2227,12 @@ CopyReadLine(void)
result = true; /* report EOF */ result = true; /* report EOF */
break; break;
} }
/* not EOF mark, so emit \ and following char literally */
appendStringInfoCharMacro(&line_buf, '\\'); if (csv_mode)
CopyDonePeek(c2, false); /* not a dot, so put it back */
else
/* not EOF mark, so emit \ and following char literally */
appendStringInfoCharMacro(&line_buf, '\\');
} }
appendStringInfoCharMacro(&line_buf, c); appendStringInfoCharMacro(&line_buf, c);
...@@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote, ...@@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote,
for (;;) for (;;)
{ {
/* handle multiline quoted fields */
if (in_quote && line_buf.cursor >= line_buf.len)
{
bool done;
switch (eol_type)
{
case EOL_NL:
appendStringInfoString(&attribute_buf, "\n");
break;
case EOL_CR:
appendStringInfoString(&attribute_buf, "\r");
break;
case EOL_CRNL:
appendStringInfoString(&attribute_buf, "\r\n");
break;
case EOL_UNKNOWN:
/* shouldn't happen - just keep going */
break;
}
copy_lineno++;
done = CopyReadLine();
if (done && line_buf.len == 0)
break;
start_cursor = line_buf.cursor;
}
end_cursor = line_buf.cursor; end_cursor = line_buf.cursor;
if (line_buf.cursor >= line_buf.len) if (line_buf.cursor >= line_buf.len)
break; break;
...@@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote, ...@@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote,
!use_quote && (c = *test_string) != '\0'; !use_quote && (c = *test_string) != '\0';
test_string += mblen) test_string += mblen)
{ {
/*
* We don't know here what the surrounding line end characters
* might be. It might not even be under postgres' control. So
* we simply warn on ANY embedded line ending character.
*
* This warning will disappear when we make line parsing field-aware,
* so that we can reliably read in embedded line ending characters
* regardless of the file's line-end context.
*
*/
if (!embedded_line_warning && (c == '\n' || c == '\r') )
{
embedded_line_warning = true;
elog(WARNING,
"CSV fields with embedded linefeed or carriage return "
"characters might not be able to be reimported");
}
if (c == delimc || c == quotec || c == '\n' || c == '\r') if (c == delimc || c == quotec || c == '\n' || c == '\r')
use_quote = true; use_quote = true;
if (!same_encoding) if (!same_encoding)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment