postgresql/src/include/commands/copyfrom_internal.h

/*-------------------------------------------------------------------------
 *
 * copyfrom_internal.h
 *	  Internal definitions for COPY FROM command.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/commands/copyfrom_internal.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef COPYFROM_INTERNAL_H
#define COPYFROM_INTERNAL_H

#include "commands/copy.h"
#include "commands/trigger.h"

/*
 * CopySource
 *
 * Enumerates the kinds of input source that the lowest level of the
 * COPY FROM code has to distinguish between.
 */
typedef enum CopySource
{
	COPY_FILE = 0,				/* a file, or a piped program */
	COPY_OLD_FE = 1,			/* the frontend, using the 2.0 protocol */
	COPY_NEW_FE = 2,			/* the frontend, using the 3.0 protocol */
	COPY_CALLBACK = 3			/* a callback function */
} CopySource;

/*
 * EolType
 *
 * Enumerates the end-of-line terminator conventions that the input may
 * be classified as.
 */
typedef enum EolType
{
	EOL_UNKNOWN = 0,
	EOL_NL = 1,
	EOL_CR = 2,
	EOL_CRNL = 3
} EolType;

/*
 * CopyInsertMethod
 *
 * Selects which heap insert method COPY FROM uses to store tuples.
 */
typedef enum CopyInsertMethod
{
	CIM_SINGLE = 0,				/* table_tuple_insert, or the fdw routine */
	CIM_MULTI = 1,				/* table_multi_insert, always */
	CIM_MULTI_CONDITIONAL = 2	/* table_multi_insert, but only if valid */
} CopyInsertMethod;

/*
 * This struct contains all the state variables used throughout a COPY FROM
 * operation.
 *
 * The fields are grouped as: low-level input source state, parameters taken
 * from the COPY command, error-reporting context, working state, and three
 * buffers (attribute_buf, line_buf, raw_buf) described where they are
 * declared.
 *
 * Multi-byte encodings: all supported client-side encodings encode multi-byte
 * characters by having the first byte's high bit set. Subsequent bytes of the
 * character can have the high bit not set. When scanning data in such an
 * encoding to look for a match to a single-byte (ie ASCII) character, we must
 * use the full pg_encoding_mblen() machinery to skip over multibyte
 * characters, else we might find a false match to a trailing byte. In
 * supported server encodings, there is no possibility of a false match, and
 * it's faster to make useless comparisons to trailing bytes than it is to
 * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true
 * when we have to do it the hard way.
 *
 * Note that RAW_BUF_SIZE and RAW_BUF_BYTES are #defined inside the struct
 * body purely for proximity to the fields they describe; being preprocessor
 * macros, they are visible to everything that follows in the translation
 * unit, not just within the struct.
 */
typedef struct CopyFromStateData
{
	/* low-level state data */
	CopySource	copy_src;		/* type of copy source */
	FILE	   *copy_file;		/* used if copy_src == COPY_FILE */
	StringInfo	fe_msgbuf;		/* used if copy_src == COPY_NEW_FE */
	bool		reached_eof;	/* true if we read to end of copy data (not
								 * all copy_src types maintain this) */

	EolType		eol_type;		/* EOL type of input */
	int			file_encoding;	/* file or remote side's character encoding */
	bool		need_transcoding;	/* file encoding diff from server? */
	bool		encoding_embeds_ascii;	/* ASCII can be non-first byte? */

	/* parameters from the COPY command */
	Relation	rel;			/* relation the data is loaded into */
	List	   *attnumlist;		/* integer list of attnums to copy */
	char	   *filename;		/* filename, or NULL for STDIN */
	bool		is_program;		/* is 'filename' a program to popen? */
	copy_data_source_cb data_source_cb; /* function for reading data */

	CopyFormatOptions opts;		/* formatting options of the COPY command */
	bool	   *convert_select_flags;	/* per-column CSV/TEXT CS flags */
	Node	   *whereClause;	/* WHERE condition (or NULL) */

	/* these are just for error messages, see CopyFromErrorCallback */
	const char *cur_relname;	/* table name for error messages */
	uint64		cur_lineno;		/* line number for error messages */
	const char *cur_attname;	/* current att for error messages */
	const char *cur_attval;		/* current att value for error messages */

	/*
	 * Working state
	 */
	MemoryContext copycontext;	/* per-copy execution context */

	/* NOTE(review): presumably the length of defmap/defexprs -- confirm */
	AttrNumber	num_defaults;
	FmgrInfo   *in_functions;	/* array of input functions, one per attribute */
	Oid		   *typioparams;	/* array of element types for in_functions */
	int		   *defmap;			/* array of default att numbers */
	ExprState **defexprs;		/* array of default att expressions */
	bool		volatile_defexprs;	/* is any of defexprs volatile? */
	/* NOTE(review): purpose of range_table is not shown here -- confirm */
	List	   *range_table;
	/* NOTE(review): presumably the executable form of whereClause -- confirm */
	ExprState  *qualexpr;

	/* NOTE(review): trigger transition-capture state, judging by type -- confirm */
	TransitionCaptureState *transition_capture;

	/*
	 * These variables are used to reduce overhead in COPY FROM.
	 *
	 * attribute_buf holds the separated, de-escaped text for each field of
	 * the current line.  The CopyReadAttributes functions return arrays of
	 * pointers into this buffer.  We avoid palloc/pfree overhead by re-using
	 * the buffer on each cycle.
	 *
	 * In binary COPY FROM, attribute_buf holds the binary data for the
	 * current field, but the usage is otherwise similar.
	 */
	StringInfoData attribute_buf;

	/*
	 * field raw data pointers found by COPY FROM
	 *
	 * NOTE(review): max_fields is presumably the allocated length of the
	 * raw_fields array -- confirm against the code that fills it.
	 */

	int			max_fields;
	char	  **raw_fields;

	/*
	 * Similarly, line_buf holds the whole input line being processed. The
	 * input cycle is first to read the whole line into line_buf, convert it
	 * to server encoding there, and then extract the individual attribute
	 * fields into attribute_buf.  line_buf is preserved unmodified so that we
	 * can display it in error messages if appropriate.  (In binary mode,
	 * line_buf is not used.)
	 */
	StringInfoData line_buf;
	bool		line_buf_converted; /* converted to server encoding? */
	bool		line_buf_valid; /* contains the row being processed? */

	/*
	 * Finally, raw_buf holds raw data read from the data source (file or
	 * client connection).  In text mode, CopyReadLine parses this data
	 * sufficiently to locate line boundaries, then transfers the data to
	 * line_buf and converts it.  In binary mode, CopyReadBinaryData fetches
	 * appropriate amounts of data from this buffer.  In both modes, we
	 * guarantee that there is a \0 at raw_buf[raw_buf_len].
	 */
#define RAW_BUF_SIZE 65536		/* we palloc RAW_BUF_SIZE+1 bytes */
	char	   *raw_buf;
	int			raw_buf_index;	/* next byte to process */
	int			raw_buf_len;	/* total # of bytes stored */
	uint64		bytes_processed;/* number of bytes processed so far */

	/*
	 * Shorthand for number of unconsumed bytes available in raw_buf.  The
	 * argument is expanded twice, so it must not have side effects.
	 */
#define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
} CopyFromStateData;

/*
 * Functions operating on a COPY FROM state that are shared through this
 * internal header.  Their definitions are not part of this file.
 *
 * NOTE(review): judging by the names only, ReceiveCopyBegin starts the
 * reception of copy data and ReceiveCopyBinaryHeader consumes the header of
 * binary-format input -- confirm against the defining source file.
 */
extern void ReceiveCopyBegin(CopyFromState cstate);
extern void ReceiveCopyBinaryHeader(CopyFromState cstate);

#endif							/* COPYFROM_INTERNAL_H */
Split copy.c into four files. Copy.c has grown really large. Split it into more manageable parts: - copy.c now contains only a few functions that are common to COPY FROM and COPY TO. - copyto.c contains code for COPY TO. - copyfrom.c contains code for initializing COPY FROM, and inserting the tuples to the correct table. - copyfromparse.c contains code for reading from the client/file/program, and parsing the input text/CSV/binary format into tuples. All of these parts are fairly complicated, and fairly independent of each other. There is a patch being discussed to implement parallel COPY FROM, which will add a lot of new code to the COPY FROM path, and another patch which would allow INSERTs to use the same multi-insert machinery as COPY FROM, both of which will require refactoring that code. With those two patches, there's going to be a lot of code churn in copy.c anyway, so now seems like a good time to do this refactoring. The CopyStateData struct is also split. All the formatting options, like FORMAT, QUOTE, ESCAPE, are put in a new CopyFormatOption struct, which is used by both COPY FROM and TO. Other state data are kept in separate CopyFromStateData and CopyToStateData structs. Reviewed-by: Soumyadeep Chakraborty, Erik Rijkers, Vignesh C, Andres Freund Discussion: https://www.postgresql.org/message-id/8e15b560-f387-7acc-ac90-763986617bfb%40iki.fi 2020-11-23 09:50:50 +01:00			`/*-------------------------------------------------------------------------`
			`*`
			`* copyfrom_internal.h`
			`* Internal definitions for COPY FROM command.`
			`*`
			`*`
Update copyright for 2021 Backpatch-through: 9.5 2021-01-02 19:06:25 +01:00			`* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group`
Split copy.c into four files. Copy.c has grown really large. Split it into more manageable parts: - copy.c now contains only a few functions that are common to COPY FROM and COPY TO. - copyto.c contains code for COPY TO. - copyfrom.c contains code for initializing COPY FROM, and inserting the tuples to the correct table. - copyfromparse.c contains code for reading from the client/file/program, and parsing the input text/CSV/binary format into tuples. All of these parts are fairly complicated, and fairly independent of each other. There is a patch being discussed to implement parallel COPY FROM, which will add a lot of new code to the COPY FROM path, and another patch which would allow INSERTs to use the same multi-insert machinery as COPY FROM, both of which will require refactoring that code. With those two patches, there's going to be a lot of code churn in copy.c anyway, so now seems like a good time to do this refactoring. The CopyStateData struct is also split. All the formatting options, like FORMAT, QUOTE, ESCAPE, are put in a new CopyFormatOption struct, which is used by both COPY FROM and TO. Other state data are kept in separate CopyFromStateData and CopyToStateData structs. Reviewed-by: Soumyadeep Chakraborty, Erik Rijkers, Vignesh C, Andres Freund Discussion: https://www.postgresql.org/message-id/8e15b560-f387-7acc-ac90-763986617bfb%40iki.fi 2020-11-23 09:50:50 +01:00			`* Portions Copyright (c) 1994, Regents of the University of California`
			`*`
			`* src/include/commands/copyfrom_internal.h`
			`*`
			`*-------------------------------------------------------------------------`
			`*/`
			`#ifndef COPYFROM_INTERNAL_H`
			`#define COPYFROM_INTERNAL_H`

			`#include "commands/copy.h"`
			`#include "commands/trigger.h"`

			`/*`
			`* Represents the different source cases we need to worry about at`
			`* the bottom level`
			`*/`
			`typedef enum CopySource`
			`{`
			`COPY_FILE, /* from file (or a piped program) */`
			`COPY_OLD_FE, /* from frontend (2.0 protocol) */`
			`COPY_NEW_FE, /* from frontend (3.0 protocol) */`
			`COPY_CALLBACK /* from callback function */`
			`} CopySource;`

			`/*`
			`* Represents the end-of-line terminator type of the input`
			`*/`
			`typedef enum EolType`
			`{`
			`EOL_UNKNOWN,`
			`EOL_NL,`
			`EOL_CR,`
			`EOL_CRNL`
			`} EolType;`

			`/*`
			`* Represents the heap insert method to be used during COPY FROM.`
			`*/`
			`typedef enum CopyInsertMethod`
			`{`
			`CIM_SINGLE, /* use table_tuple_insert or fdw routine */`
			`CIM_MULTI, /* always use table_multi_insert */`
			`CIM_MULTI_CONDITIONAL /* use table_multi_insert only if valid */`
			`} CopyInsertMethod;`

			`/*`
			`* This struct contains all the state variables used throughout a COPY FROM`
			`* operation.`
			`*`
			`* Multi-byte encodings: all supported client-side encodings encode multi-byte`
			`* characters by having the first byte's high bit set. Subsequent bytes of the`
			`* character can have the high bit not set. When scanning data in such an`
			`* encoding to look for a match to a single-byte (ie ASCII) character, we must`
			`* use the full pg_encoding_mblen() machinery to skip over multibyte`
			`* characters, else we might find a false match to a trailing byte. In`
			`* supported server encodings, there is no possibility of a false match, and`
			`* it's faster to make useless comparisons to trailing bytes than it is to`
			`* invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true`
			`* when we have to do it the hard way.`
			`*/`
			`typedef struct CopyFromStateData`
			`{`
			`/* low-level state data */`
			`CopySource copy_src; /* type of copy source */`
			`FILE *copy_file; /* used if copy_src == COPY_FILE */`
			`StringInfo fe_msgbuf; /* used if copy_src == COPY_NEW_FE */`
			`bool reached_eof; /* true if we read to end of copy data (not`
			`* all copy_src types maintain this) */`

			`EolType eol_type; /* EOL type of input */`
			`int file_encoding; /* file or remote side's character encoding */`
			`bool need_transcoding; /* file encoding diff from server? */`
			`bool encoding_embeds_ascii; /* ASCII can be non-first byte? */`

			`/* parameters from the COPY command */`
			`Relation rel; /* relation to copy from */`
			`List *attnumlist; /* integer list of attnums to copy */`
			`char *filename; /* filename, or NULL for STDIN */`
			`bool is_program; /* is 'filename' a program to popen? */`
			`copy_data_source_cb data_source_cb; /* function for reading data */`

			`CopyFormatOptions opts;`
			`bool *convert_select_flags; /* per-column CSV/TEXT CS flags */`
			`Node *whereClause; /* WHERE condition (or NULL) */`

			`/* these are just for error messages, see CopyFromErrorCallback */`
			`const char *cur_relname; /* table name for error messages */`
			`uint64 cur_lineno; /* line number for error messages */`
			`const char *cur_attname; /* current att for error messages */`
			`const char *cur_attval; /* current att value for error messages */`

			`/*`
			`* Working state`
			`*/`
			`MemoryContext copycontext; /* per-copy execution context */`

			`AttrNumber num_defaults;`
			`FmgrInfo *in_functions; /* array of input functions for each attrs */`
			`Oid *typioparams; /* array of element types for in_functions */`
			`int *defmap; /* array of default att numbers */`
			`ExprState **defexprs; /* array of default att expressions */`
			`bool volatile_defexprs; /* is any of defexprs volatile? */`
			`List *range_table;`
			`ExprState *qualexpr;`

			`TransitionCaptureState *transition_capture;`

			`/*`
			`* These variables are used to reduce overhead in COPY FROM.`
			`*`
			`* attribute_buf holds the separated, de-escaped text for each field of`
			`* the current line. The CopyReadAttributes functions return arrays of`
			`* pointers into this buffer. We avoid palloc/pfree overhead by re-using`
			`* the buffer on each cycle.`
			`*`
			`* In binary COPY FROM, attribute_buf holds the binary data for the`
			`* current field, but the usage is otherwise similar.`
			`*/`
			`StringInfoData attribute_buf;`

			`/* field raw data pointers found by COPY FROM */`

			`int max_fields;`
			`char **raw_fields;`

			`/*`
			`* Similarly, line_buf holds the whole input line being processed. The`
			`* input cycle is first to read the whole line into line_buf, convert it`
			`* to server encoding there, and then extract the individual attribute`
			`* fields into attribute_buf. line_buf is preserved unmodified so that we`
			`* can display it in error messages if appropriate. (In binary mode,`
			`* line_buf is not used.)`
			`*/`
			`StringInfoData line_buf;`
			`bool line_buf_converted; /* converted to server encoding? */`
			`bool line_buf_valid; /* contains the row being processed? */`

			`/*`
			`* Finally, raw_buf holds raw data read from the data source (file or`
			`* client connection). In text mode, CopyReadLine parses this data`
			`* sufficiently to locate line boundaries, then transfers the data to`
			`* line_buf and converts it. In binary mode, CopyReadBinaryData fetches`
			`* appropriate amounts of data from this buffer. In both modes, we`
			`* guarantee that there is a \0 at raw_buf[raw_buf_len].`
			`*/`
			`#define RAW_BUF_SIZE 65536 /* we palloc RAW_BUF_SIZE+1 bytes */`
			`char *raw_buf;`
			`int raw_buf_index; /* next byte to process */`
			`int raw_buf_len; /* total # of bytes stored */`
Report progress of COPY commands This commit introduces a view pg_stat_progress_copy, reporting progress of COPY commands. This allows rough estimates how far a running COPY progressed, with the caveat that the total number of bytes may not be available in some cases (e.g. when the input comes from the client). Author: Josef Šimánek Reviewed-by: Fujii Masao, Bharath Rupireddy, Vignesh C, Matthias van de Meent Discussion: https://postgr.es/m/CAFp7QwqMGEi4OyyaLEK9DR0+E+oK3UtA4bEjDVCa4bNkwUY2PQ@mail.gmail.com Discussion: https://postgr.es/m/CAFp7Qwr6_FmRM6pCO0x_a0mymOfX_Gg+FEKet4XaTGSW=LitKQ@mail.gmail.com 2021-01-06 21:46:26 +01:00			`uint64 bytes_processed;/* number of bytes processed so far */`
Split copy.c into four files. Copy.c has grown really large. Split it into more manageable parts: - copy.c now contains only a few functions that are common to COPY FROM and COPY TO. - copyto.c contains code for COPY TO. - copyfrom.c contains code for initializing COPY FROM, and inserting the tuples to the correct table. - copyfromparse.c contains code for reading from the client/file/program, and parsing the input text/CSV/binary format into tuples. All of these parts are fairly complicated, and fairly independent of each other. There is a patch being discussed to implement parallel COPY FROM, which will add a lot of new code to the COPY FROM path, and another patch which would allow INSERTs to use the same multi-insert machinery as COPY FROM, both of which will require refactoring that code. With those two patches, there's going to be a lot of code churn in copy.c anyway, so now seems like a good time to do this refactoring. The CopyStateData struct is also split. All the formatting options, like FORMAT, QUOTE, ESCAPE, are put in a new CopyFormatOption struct, which is used by both COPY FROM and TO. Other state data are kept in separate CopyFromStateData and CopyToStateData structs. Reviewed-by: Soumyadeep Chakraborty, Erik Rijkers, Vignesh C, Andres Freund Discussion: https://www.postgresql.org/message-id/8e15b560-f387-7acc-ac90-763986617bfb%40iki.fi 2020-11-23 09:50:50 +01:00			`/* Shorthand for number of unconsumed bytes available in raw_buf */`
			`#define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)`
			`} CopyFromStateData;`

			`extern void ReceiveCopyBegin(CopyFromState cstate);`
			`extern void ReceiveCopyBinaryHeader(CopyFromState cstate);`

			`#endif /* COPYFROM_INTERNAL_H */`