Do COPY FROM encoding conversion/verification in larger chunks.

This gives a small performance gain, by reducing the number of calls to the conversion/verification function, and letting it work with larger inputs. Also, reorganizing the input pipeline makes it easier to parallelize the input parsing: after the input has been converted to the database encoding, the next stage of finding the newlines can be done in parallel, because there cannot be any newline chars "embedded" in multi-byte characters in the encodings that we support as server encodings.

This changes behavior in one corner case: if client and server encodings are the same single-byte encoding (e.g. latin1), previously the input would not be checked for zero bytes ('\0'). Any fields containing zero bytes would be truncated at the zero. But if encoding conversion was needed, the conversion routine would throw an error on the zero. After this commit, the input is always checked for zeros.

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01%40iki.fi

Do COPY FROM encoding conversion/verification in larger chunks.
This gives a small performance gain, by reducing the number of calls to the conversion/verification function, and letting it work with larger inputs. Also, reorganizing the input pipeline makes it easier to parallelize the input parsing: after the input has been converted to the database encoding, the next stage of finding the newlines can be done in parallel, because there cannot be any newline chars "embedded" in multi-byte characters in the encodings that we support as server encodings.

This changes behavior in one corner case: if client and server encodings are the same single-byte encoding (e.g. latin1), previously the input would not be checked for zero bytes ('\0'). Any fields containing zero bytes would be truncated at the zero. But if encoding conversion was needed, the conversion routine would throw an error on the zero. After this commit, the input is always checked for zeros.

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01%40iki.fi
f82de5c4 · Heikki Linnakangas · ea1b99a6 · f82de5c4 · f82de5c4 · f82de5c4
Commit f82de5c4 authored Apr 01, 2021 by Heikki Linnakangas
4 changed files
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -3,6 +3,12 @@
 * copyfrom.c
 *		COPY <table> FROM file/program/client
 *
+ * This file contains routines needed to efficiently load tuples into a
+ * table.  That includes looking up the correct partition, firing triggers,
+ * calling the table AM function to insert the data, and updating indexes.
+ * Reading data from the input file or client and parsing it into Datums
+ * is handled in copyfromparse.c.
+ *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
@@ -23,6 +29,7 @@
 #include "access/tableam.h"
 #include "access/xact.h"
 #include "access/xlog.h"
+#include "catalog/namespace.h"
 #include "commands/copy.h"
 #include "commands/copyfrom_internal.h"
 #include "commands/progress.h"
@@ -87,7 +94,7 @@ typedef struct CopyMultiInsertInfo
 	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
 	int			bufferedTuples; /* number of tuples buffered over all buffers */
 	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyFromState	cstate;			/* Copy state for this CopyMultiInsertInfo */
+	CopyFromState cstate;		/* Copy state for this CopyMultiInsertInfo */
 	EState	   *estate;			/* Executor state used for COPY */
 	CommandId	mycid;			/* Command Id used for COPY */
 	int			ti_options;		/* table insert options */
@@ -107,7 +114,7 @@ static void ClosePipeFromProgram(CopyFromState cstate);
 void
 CopyFromErrorCallback(void *arg)
 {
-	CopyFromState	cstate = (CopyFromState) arg;
+	CopyFromState cstate = (CopyFromState) arg;
 	char		curlineno_str[32];
 	snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
@@ -149,15 +156,9 @@ CopyFromErrorCallback(void *arg)
 			/*
 			 * Error is relevant to a particular line.
 			 *
-			 * If line_buf still contains the correct line, and it's already
+			 * If line_buf still contains the correct line, print it.
-			 * transcoded, print it. If it's still in a foreign encoding, it's
-			 * quite likely that the error is precisely a failure to do
-			 * encoding conversion (ie, bad data). We dare not try to convert
-			 * it, and at present there's no way to regurgitate it without
-			 * conversion. So we have to punt and just report the line number.
 			 */
-			if (cstate->line_buf_valid &&
+			if (cstate->line_buf_valid)
-				(cstate->line_buf_converted || !cstate->need_transcoding))
 			{
 				char	   *lineval;
@@ -300,7 +301,7 @@ CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
 	MemoryContext oldcontext;
 	int			i;
 	uint64		save_cur_lineno;
-	CopyFromState	cstate = miinfo->cstate;
+	CopyFromState cstate = miinfo->cstate;
 	EState	   *estate = miinfo->estate;
 	CommandId	mycid = miinfo->mycid;
 	int			ti_options = miinfo->ti_options;
@@ -1191,7 +1192,7 @@ BeginCopyFrom(ParseState *pstate,
 			  List *attnamelist,
 			  List *options)
 {
-	CopyFromState	cstate;
+	CopyFromState cstate;
 	bool		pipe = (filename == NULL);
 	TupleDesc	tupDesc;
 	AttrNumber	num_phys_attrs,
@@ -1229,7 +1230,7 @@ BeginCopyFrom(ParseState *pstate,
 	oldcontext = MemoryContextSwitchTo(cstate->copycontext);
 	/* Extract options from the statement node tree */
-	ProcessCopyOptions(pstate, &cstate->opts, true /* is_from */, options);
+	ProcessCopyOptions(pstate, &cstate->opts, true /* is_from */ , options);
 	/* Process the target relation */
 	cstate->rel = rel;
@@ -1320,15 +1321,20 @@ BeginCopyFrom(ParseState *pstate,
 		cstate->file_encoding = cstate->opts.file_encoding;
 	/*
-	 * Set up encoding conversion info.  Even if the file and server encodings
+	 * Look up encoding conversion function.
-	 * are the same, we must apply pg_any_to_server() to validate data in
-	 * multibyte encodings.
 	 */
-	cstate->need_transcoding =
+	if (cstate->file_encoding == GetDatabaseEncoding() ||
-		(cstate->file_encoding != GetDatabaseEncoding() ||
+		cstate->file_encoding == PG_SQL_ASCII ||
-		 pg_database_encoding_max_length() > 1);
+		GetDatabaseEncoding() == PG_SQL_ASCII)
-	/* See Multibyte encoding comment above */
+	{
-	cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->file_encoding);
+		cstate->need_transcoding = false;
+	}
+	else
+	{
+		cstate->need_transcoding = true;
+		cstate->conversion_proc = FindDefaultConversionProc(cstate->file_encoding,
+															GetDatabaseEncoding());
+	}
 	cstate->copy_src = COPY_FILE;	/* default */
@@ -1339,7 +1345,6 @@ BeginCopyFrom(ParseState *pstate,
 	oldcontext = MemoryContextSwitchTo(cstate->copycontext);
 	/* Initialize state variables */
-	cstate->reached_eof = false;
 	cstate->eol_type = EOL_UNKNOWN;
 	cstate->cur_relname = RelationGetRelationName(cstate->rel);
 	cstate->cur_lineno = 0;
@@ -1347,19 +1352,36 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->cur_attval = NULL;
 	/*
-	 * Set up variables to avoid per-attribute overhead.  attribute_buf and
+	 * Allocate buffers for the input pipeline.
-	 * raw_buf are used in both text and binary modes, but we use line_buf
+	 *
-	 * only in text mode.
+	 * attribute_buf and raw_buf are used in both text and binary modes, but
+	 * input_buf and line_buf only in text mode.
 	 */
-	initStringInfo(&cstate->attribute_buf);
+	cstate->raw_buf = palloc(RAW_BUF_SIZE + 1);
-	cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1);
 	cstate->raw_buf_index = cstate->raw_buf_len = 0;
+	cstate->raw_reached_eof = false;
 	if (!cstate->opts.binary)
 	{
+		/*
+		 * If encoding conversion is needed, we need another buffer to hold
+		 * the converted input data.  Otherwise, we can just point input_buf
+		 * to the same buffer as raw_buf.
+		 */
+		if (cstate->need_transcoding)
+		{
+			cstate->input_buf = (char *) palloc(INPUT_BUF_SIZE + 1);
+			cstate->input_buf_index = cstate->input_buf_len = 0;
+		}
+		else
+			cstate->input_buf = cstate->raw_buf;
+		cstate->input_reached_eof = false;
 		initStringInfo(&cstate->line_buf);
-		cstate->line_buf_converted = false;
 	}
+	initStringInfo(&cstate->attribute_buf);
 	/* Assign range table, we'll need it in CopyFrom. */
 	if (pstate)
 		cstate->range_table = pstate->p_rtable;
@@ -1584,7 +1606,7 @@ ClosePipeFromProgram(CopyFromState cstate)
 		 * should not report that as an error.  Otherwise, SIGPIPE indicates a
 		 * problem.
 		 */
-		if (!cstate->reached_eof &&
+		if (!cstate->raw_reached_eof &&
 			wait_result_is_signal(pclose_rc, SIGPIPE))
 			return;

--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -52,17 +52,6 @@ typedef enum CopyInsertMethod
 /*
 * This struct contains all the state variables used throughout a COPY FROM
 * operation.
- *
- * Multi-byte encodings: all supported client-side encodings encode multi-byte
- * characters by having the first byte's high bit set. Subsequent bytes of the
- * character can have the high bit not set. When scanning data in such an
- * encoding to look for a match to a single-byte (ie ASCII) character, we must
- * use the full pg_encoding_mblen() machinery to skip over multibyte
- * characters, else we might find a false match to a trailing byte. In
- * supported server encodings, there is no possibility of a false match, and
- * it's faster to make useless comparisons to trailing bytes than it is to
- * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true
- * when we have to do it the hard way.
 */
 typedef struct CopyFromStateData
 {
@@ -70,13 +59,11 @@ typedef struct CopyFromStateData
 	CopySource	copy_src;		/* type of copy source */
 	FILE	   *copy_file;		/* used if copy_src == COPY_FILE */
 	StringInfo	fe_msgbuf;		/* used if copy_src == COPY_NEW_FE */
-	bool		reached_eof;	/* true if we read to end of copy data (not
-								 * all copy_src types maintain this) */
 	EolType		eol_type;		/* EOL type of input */
 	int			file_encoding;	/* file or remote side's character encoding */
 	bool		need_transcoding;	/* file encoding diff from server? */
-	bool		encoding_embeds_ascii;	/* ASCII can be non-first byte? */
+	Oid			conversion_proc;	/* encoding conversion function */
 	/* parameters from the COPY command */
 	Relation	rel;			/* relation to copy from */
@@ -131,31 +118,52 @@ typedef struct CopyFromStateData
 	/*
 	 * Similarly, line_buf holds the whole input line being processed. The
-	 * input cycle is first to read the whole line into line_buf, convert it
+	 * input cycle is first to read the whole line into line_buf, and then
-	 * to server encoding there, and then extract the individual attribute
+	 * extract the individual attribute fields into attribute_buf.  line_buf
-	 * fields into attribute_buf.  line_buf is preserved unmodified so that we
+	 * is preserved unmodified so that we can display it in error messages if
-	 * can display it in error messages if appropriate.  (In binary mode,
+	 * appropriate.  (In binary mode, line_buf is not used.)
-	 * line_buf is not used.)
 	 */
 	StringInfoData line_buf;
-	bool		line_buf_converted; /* converted to server encoding? */
 	bool		line_buf_valid; /* contains the row being processed? */
 	/*
-	 * Finally, raw_buf holds raw data read from the data source (file or
+	 * input_buf holds input data, already converted to database encoding.
-	 * client connection).  In text mode, CopyReadLine parses this data
+	 *
-	 * sufficiently to locate line boundaries, then transfers the data to
+	 * In text mode, CopyReadLine parses this data sufficiently to locate
-	 * line_buf and converts it.  In binary mode, CopyReadBinaryData fetches
+	 * line boundaries, then transfers the data to line_buf. We guarantee
-	 * appropriate amounts of data from this buffer.  In both modes, we
+	 * that there is a \0 at input_buf[input_buf_len] at all times.  (In
-	 * guarantee that there is a \0 at raw_buf[raw_buf_len].
+	 * binary mode, input_buf is not used.)
+	 *
+	 * If encoding conversion is not required, input_buf is not a separate
+	 * buffer but points directly to raw_buf.  In that case, input_buf_len
+	 * tracks the number of bytes that have been verified as valid in the
+	 * database encoding, and raw_buf_len is the total number of bytes
+	 * stored in the buffer.
+	 */
+#define INPUT_BUF_SIZE 65536	/* we palloc INPUT_BUF_SIZE+1 bytes */
+	char	   *input_buf;
+	int			input_buf_index;	/* next byte to process */
+	int			input_buf_len;		/* total # of bytes stored */
+	bool		input_reached_eof;	/* true if we reached EOF */
+	bool		input_reached_error; /* true if a conversion error happened */
+	/* Shorthand for number of unconsumed bytes available in input_buf */
+#define INPUT_BUF_BYTES(cstate) ((cstate)->input_buf_len - (cstate)->input_buf_index)
+	/*
+	 * raw_buf holds raw input data read from the data source (file or client
+	 * connection), not yet converted to the database encoding.  Like with
+	 * 'input_buf', we guarantee that there is a \0 at raw_buf[raw_buf_len].
 	 */
 #define RAW_BUF_SIZE 65536		/* we palloc RAW_BUF_SIZE+1 bytes */
 	char	   *raw_buf;
 	int			raw_buf_index;	/* next byte to process */
 	int			raw_buf_len;	/* total # of bytes stored */
-	uint64		bytes_processed;/* number of bytes processed so far */
+	bool		raw_reached_eof;	/* true if we reached EOF */
 	/* Shorthand for number of unconsumed bytes available in raw_buf */
 #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
+	uint64		bytes_processed;	/* number of bytes processed so far */
 } CopyFromStateData;
 extern void ReceiveCopyBegin(CopyFromState cstate);

--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -306,15 +306,33 @@ typedef enum pg_enc
 /*
 * When converting strings between different encodings, we assume that space
- * for converted result is 4-to-1 growth in the worst case. The rate for
+ * for converted result is 4-to-1 growth in the worst case.  The rate for
 * currently supported encoding pairs are within 3 (SJIS JIS X0201 half width
- * kanna -> UTF8 is the worst case).  So "4" should be enough for the moment.
+ * kana -> UTF8 is the worst case).  So "4" should be enough for the moment.
 *
 * Note that this is not the same as the maximum character width in any
 * particular encoding.
 */
 #define MAX_CONVERSION_GROWTH  4
+/*
+ * Maximum byte length of a string that's required in any encoding to convert
+ * at least one character to any other encoding.  In other words, if you feed
+ * MAX_CONVERSION_INPUT_LENGTH bytes to any encoding conversion function, it
+ * is guaranteed to be able to convert something without needing more input
+ * (assuming the input is valid).
+ *
+ * Currently, the maximum case is the conversion UTF8 -> SJIS JIS X0201 half
+ * width kana, where a pair of UTF-8 characters is converted into a single
+ * SHIFT_JIS_2004 character (the reverse of the worst case for
+ * MAX_CONVERSION_GROWTH).  It needs 6 bytes of input.  In theory, a
+ * user-defined conversion function might have more complicated cases, although
+ * for the reverse mapping you would probably also need to bump up
+ * MAX_CONVERSION_GROWTH.  But there is no need to be stingy here, so make it
+ * generous.
+ */
+#define MAX_CONVERSION_INPUT_LENGTH	16
 /*
 * Maximum byte length of the string equivalent to any one Unicode code point,
 * in any backend encoding.  The current value assumes that a 4-byte UTF-8