diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index 74dbb709fe..be2e3d7354 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -3,6 +3,12 @@ * copyfrom.c * COPY FROM file/program/client * + * This file contains routines needed to efficiently load tuples into a + * table. That includes looking up the correct partition, firing triggers, + * calling the table AM function to insert the data, and updating indexes. + * Reading data from the input file or client and parsing it into Datums + * is handled in copyfromparse.c. + * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -23,6 +29,7 @@ #include "access/tableam.h" #include "access/xact.h" #include "access/xlog.h" +#include "catalog/namespace.h" #include "commands/copy.h" #include "commands/copyfrom_internal.h" #include "commands/progress.h" @@ -87,7 +94,7 @@ typedef struct CopyMultiInsertInfo List *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */ int bufferedTuples; /* number of tuples buffered over all buffers */ int bufferedBytes; /* number of bytes from all buffered tuples */ - CopyFromState cstate; /* Copy state for this CopyMultiInsertInfo */ + CopyFromState cstate; /* Copy state for this CopyMultiInsertInfo */ EState *estate; /* Executor state used for COPY */ CommandId mycid; /* Command Id used for COPY */ int ti_options; /* table insert options */ @@ -107,7 +114,7 @@ static void ClosePipeFromProgram(CopyFromState cstate); void CopyFromErrorCallback(void *arg) { - CopyFromState cstate = (CopyFromState) arg; + CopyFromState cstate = (CopyFromState) arg; char curlineno_str[32]; snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT, @@ -149,15 +156,9 @@ CopyFromErrorCallback(void *arg) /* * Error is relevant to a particular line. * - * If line_buf still contains the correct line, and it's already - * transcoded, print it. 
If it's still in a foreign encoding, it's - * quite likely that the error is precisely a failure to do - * encoding conversion (ie, bad data). We dare not try to convert - * it, and at present there's no way to regurgitate it without - * conversion. So we have to punt and just report the line number. + * If line_buf still contains the correct line, print it. */ - if (cstate->line_buf_valid && - (cstate->line_buf_converted || !cstate->need_transcoding)) + if (cstate->line_buf_valid) { char *lineval; @@ -300,7 +301,7 @@ CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo, MemoryContext oldcontext; int i; uint64 save_cur_lineno; - CopyFromState cstate = miinfo->cstate; + CopyFromState cstate = miinfo->cstate; EState *estate = miinfo->estate; CommandId mycid = miinfo->mycid; int ti_options = miinfo->ti_options; @@ -1191,7 +1192,7 @@ BeginCopyFrom(ParseState *pstate, List *attnamelist, List *options) { - CopyFromState cstate; + CopyFromState cstate; bool pipe = (filename == NULL); TupleDesc tupDesc; AttrNumber num_phys_attrs, @@ -1229,7 +1230,7 @@ BeginCopyFrom(ParseState *pstate, oldcontext = MemoryContextSwitchTo(cstate->copycontext); /* Extract options from the statement node tree */ - ProcessCopyOptions(pstate, &cstate->opts, true /* is_from */, options); + ProcessCopyOptions(pstate, &cstate->opts, true /* is_from */ , options); /* Process the target relation */ cstate->rel = rel; @@ -1320,15 +1321,20 @@ BeginCopyFrom(ParseState *pstate, cstate->file_encoding = cstate->opts.file_encoding; /* - * Set up encoding conversion info. Even if the file and server encodings - * are the same, we must apply pg_any_to_server() to validate data in - * multibyte encodings. + * Look up encoding conversion function. 
*/ - cstate->need_transcoding = - (cstate->file_encoding != GetDatabaseEncoding() || - pg_database_encoding_max_length() > 1); - /* See Multibyte encoding comment above */ - cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->file_encoding); + if (cstate->file_encoding == GetDatabaseEncoding() || + cstate->file_encoding == PG_SQL_ASCII || + GetDatabaseEncoding() == PG_SQL_ASCII) + { + cstate->need_transcoding = false; + } + else + { + cstate->need_transcoding = true; + cstate->conversion_proc = FindDefaultConversionProc(cstate->file_encoding, + GetDatabaseEncoding()); + } cstate->copy_src = COPY_FILE; /* default */ @@ -1339,7 +1345,6 @@ BeginCopyFrom(ParseState *pstate, oldcontext = MemoryContextSwitchTo(cstate->copycontext); /* Initialize state variables */ - cstate->reached_eof = false; cstate->eol_type = EOL_UNKNOWN; cstate->cur_relname = RelationGetRelationName(cstate->rel); cstate->cur_lineno = 0; @@ -1347,19 +1352,36 @@ BeginCopyFrom(ParseState *pstate, cstate->cur_attval = NULL; /* - * Set up variables to avoid per-attribute overhead. attribute_buf and - * raw_buf are used in both text and binary modes, but we use line_buf - * only in text mode. + * Allocate buffers for the input pipeline. + * + * attribute_buf and raw_buf are used in both text and binary modes, but + * input_buf and line_buf only in text mode. */ - initStringInfo(&cstate->attribute_buf); - cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1); + cstate->raw_buf = palloc(RAW_BUF_SIZE + 1); cstate->raw_buf_index = cstate->raw_buf_len = 0; + cstate->raw_reached_eof = false; + if (!cstate->opts.binary) { + /* + * If encoding conversion is needed, we need another buffer to hold + * the converted input data. Otherwise, we can just point input_buf + * to the same buffer as raw_buf. 
+ */ + if (cstate->need_transcoding) + { + cstate->input_buf = (char *) palloc(INPUT_BUF_SIZE + 1); + cstate->input_buf_index = cstate->input_buf_len = 0; + } + else + cstate->input_buf = cstate->raw_buf; + cstate->input_reached_eof = false; + initStringInfo(&cstate->line_buf); - cstate->line_buf_converted = false; } + initStringInfo(&cstate->attribute_buf); + /* Assign range table, we'll need it in CopyFrom. */ if (pstate) cstate->range_table = pstate->p_rtable; @@ -1584,7 +1606,7 @@ ClosePipeFromProgram(CopyFromState cstate) * should not report that as an error. Otherwise, SIGPIPE indicates a * problem. */ - if (!cstate->reached_eof && + if (!cstate->raw_reached_eof && wait_result_is_signal(pclose_rc, SIGPIPE)) return; diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index ce24a1528b..0813424768 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -3,6 +3,50 @@ * copyfromparse.c * Parse CSV/text/binary format for COPY FROM. * + * This file contains routines to parse the text, CSV and binary input + * formats. The main entry point is NextCopyFrom(), which parses the + * next input line and returns it as Datums. + * + * In text/CSV mode, the parsing happens in multiple stages: + * + * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf + * 1. 2. 3. 4. + * + * 1. CopyLoadRawBuf() reads raw data from the input file or client, and + * places it into 'raw_buf'. + * + * 2. CopyConvertBuf() calls the encoding conversion function to convert + * the data in 'raw_buf' from client to server encoding, placing the + * converted result in 'input_buf'. + * + * 3. CopyReadLine() parses the data in 'input_buf', one line at a time. + * It is responsible for finding the next newline marker, taking quote and + * escape characters into account according to the COPY options. The line + * is copied into 'line_buf', with quotes and escape characters still + * intact. + * + * 4. 
CopyReadAttributesText/CSV() function takes the input line from + * 'line_buf', and splits it into fields, unescaping the data as required. + * The fields are stored in 'attribute_buf', and 'raw_fields' array holds + * pointers to each field. + * + * If encoding conversion is not required, a shortcut is taken in step 2 to + * avoid copying the data unnecessarily. The 'input_buf' pointer is set to + * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data + * directly into 'input_buf'. CopyConvertBuf() then merely validates that + * the data is valid in the current encoding. + * + * In binary mode, the pipeline is much simpler. Input is loaded + * into 'raw_buf', and encoding conversion is done in the datatype-specific + * receive functions, if required. 'input_buf' and 'line_buf' are not used, + * but 'attribute_buf' is used as a temporary buffer to hold one attribute's + * data when it's passed to the receive function. + * + * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also + * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf' + * and 'attribute_buf' are expanded on demand, to hold the longest line + * encountered so far. + * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -35,7 +79,7 @@ #define OCTVALUE(c) ((c) - '0') /* - * These macros centralize code used to process line_buf and raw_buf buffers. + * These macros centralize code used to process line_buf and input_buf buffers. * They are macros because they often do continue/break control and to avoid * function call overhead in tight COPY loops.
* @@ -53,9 +97,9 @@ #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \ if (1) \ { \ - if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \ + if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \ { \ - raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \ + input_buf_ptr = prev_raw_ptr; /* undo fetch */ \ need_data = true; \ continue; \ } \ @@ -65,10 +109,10 @@ if (1) \ #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \ if (1) \ { \ - if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \ + if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \ { \ if (extralen) \ - raw_buf_ptr = copy_buf_len; /* consume the partial character */ \ + input_buf_ptr = copy_buf_len; /* consume the partial character */ \ /* backslash just before EOF, treat as data char */ \ result = true; \ break; \ @@ -77,17 +121,17 @@ if (1) \ /* * Transfer any approved data to line_buf; must do this to be sure - * there is some room in raw_buf. + * there is some room in input_buf. */ #define REFILL_LINEBUF \ if (1) \ { \ - if (raw_buf_ptr > cstate->raw_buf_index) \ + if (input_buf_ptr > cstate->input_buf_index) \ { \ appendBinaryStringInfo(&cstate->line_buf, \ - cstate->raw_buf + cstate->raw_buf_index, \ - raw_buf_ptr - cstate->raw_buf_index); \ - cstate->raw_buf_index = raw_buf_ptr; \ + cstate->input_buf + cstate->input_buf_index, \ + input_buf_ptr - cstate->input_buf_index); \ + cstate->input_buf_index = input_buf_ptr; \ } \ } else ((void) 0) @@ -95,7 +139,7 @@ if (1) \ #define NO_END_OF_COPY_GOTO \ if (1) \ { \ - raw_buf_ptr = prev_raw_ptr + 1; \ + input_buf_ptr = prev_raw_ptr + 1; \ goto not_end_of_copy; \ } else ((void) 0) @@ -118,7 +162,7 @@ static int CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread); static inline bool CopyGetInt32(CopyFromState cstate, int32 *val); static inline bool CopyGetInt16(CopyFromState cstate, int16 *val); -static bool CopyLoadRawBuf(CopyFromState cstate); +static void CopyLoadInputBuf(CopyFromState cstate); static int 
CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes); void @@ -210,10 +254,10 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) (errcode_for_file_access(), errmsg("could not read from COPY file: %m"))); if (bytesread == 0) - cstate->reached_eof = true; + cstate->raw_reached_eof = true; break; case COPY_FRONTEND: - while (maxread > 0 && bytesread < minread && !cstate->reached_eof) + while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof) { int avail; @@ -241,7 +285,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) break; case 'c': /* CopyDone */ /* COPY IN correctly terminated by frontend */ - cstate->reached_eof = true; + cstate->raw_reached_eof = true; return bytesread; case 'f': /* CopyFail */ ereport(ERROR, @@ -327,34 +371,303 @@ CopyGetInt16(CopyFromState cstate, int16 *val) /* - * CopyLoadRawBuf loads some more data into raw_buf + * Perform encoding conversion on data in 'raw_buf', writing the converted + * data into 'input_buf'. * - * Returns true if able to obtain at least one more byte, else false. - * - * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start - * of the buffer and then we load more data after that. This case occurs only - * when a multibyte character crosses a bufferload boundary. + * On entry, there must be some data to convert in 'raw_buf'. */ -static bool +static void +CopyConvertBuf(CopyFromState cstate) +{ + /* + * If the file and server encoding are the same, no encoding conversion is + * required. However, we still need to verify that the input is valid for + * the encoding. + */ + if (!cstate->need_transcoding) + { + /* + * When conversion is not required, input_buf and raw_buf are the + * same. raw_buf_len is the total number of bytes in the buffer, and + * input_buf_len tracks how many of those bytes have already been + * verified. 
+ */ + int preverifiedlen = cstate->input_buf_len; + int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len; + int nverified; + + if (unverifiedlen == 0) + { + /* + * If no more raw data is coming, report the EOF to the caller. + */ + if (cstate->raw_reached_eof) + cstate->input_reached_eof = true; + return; + } + + /* + * Verify the new data, including any residual unverified bytes from + * previous round. + */ + nverified = pg_encoding_verifymbstr(cstate->file_encoding, + cstate->raw_buf + preverifiedlen, + unverifiedlen); + if (nverified == 0) + { + /* + * Could not verify anything. + * + * If there is no more raw input data coming, it means that there + * was an incomplete multi-byte sequence at the end. Also, if + * there's "enough" input left, we should be able to verify at + * least one character, and a failure to do so means that we've + * hit an invalid byte sequence. + */ + if (cstate->raw_reached_eof || unverifiedlen >= pg_database_encoding_max_length()) + cstate->input_reached_error = true; + return; + } + cstate->input_buf_len += nverified; + } + else + { + /* + * Encoding conversion is needed. + */ + int nbytes; + unsigned char *src; + int srclen; + unsigned char *dst; + int dstlen; + int convertedlen; + + if (RAW_BUF_BYTES(cstate) == 0) + { + /* + * If no more raw data is coming, report the EOF to the caller. + */ + if (cstate->raw_reached_eof) + cstate->input_reached_eof = true; + return; + } + + /* + * First, copy down any unprocessed data. 
+ */ + nbytes = INPUT_BUF_BYTES(cstate); + if (nbytes > 0 && cstate->input_buf_index > 0) + memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index, + nbytes); + cstate->input_buf_index = 0; + cstate->input_buf_len = nbytes; + cstate->input_buf[nbytes] = '\0'; + + src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index; + srclen = cstate->raw_buf_len - cstate->raw_buf_index; + dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len; + dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1; + + /* + * Do the conversion. This might stop short, if there is an invalid + * byte sequence in the input. We'll convert as much as we can in + * that case. + * + * Note: Even if we hit an invalid byte sequence, we don't report the + * error until all the valid bytes have been consumed. The input + * might contain an end-of-input marker (\.), and we don't want to + * report an error if the invalid byte sequence is after the + * end-of-input marker. We might unnecessarily convert some data + * after the end-of-input marker as long as it's valid for the + * encoding, but that's harmless. + */ + convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc, + cstate->file_encoding, + GetDatabaseEncoding(), + src, srclen, + dst, dstlen, + true); + if (convertedlen == 0) + { + /* + * Could not convert anything. If there is no more raw input data + * coming, it means that there was an incomplete multi-byte + * sequence at the end. Also, if there is plenty of input left, + * we should be able to convert at least one character, so a + * failure to do so must mean that we've hit a byte sequence + * that's invalid. + */ + if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH) + cstate->input_reached_error = true; + return; + } + cstate->raw_buf_index += convertedlen; + cstate->input_buf_len += strlen((char *) dst); + } +} + +/* + * Report an encoding or conversion error. 
+ */ +static void +CopyConversionError(CopyFromState cstate) +{ + Assert(cstate->raw_buf_len > 0); + Assert(cstate->input_reached_error); + + if (!cstate->need_transcoding) + { + /* + * Everything up to input_buf_len was successfully verified, and + * input_buf_len points to the invalid or incomplete character. + */ + report_invalid_encoding(cstate->file_encoding, + cstate->raw_buf + cstate->input_buf_len, + cstate->raw_buf_len - cstate->input_buf_len); + } + else + { + /* + * raw_buf_index points to the invalid or untranslatable character. We + * let the conversion routine report the error, because it can provide + * a more specific error message than we could here. An earlier call + * to the conversion routine in CopyConvertBuf() detected that there + * is an error, now we call the conversion routine again with + * noError=false, to have it throw the error. + */ + unsigned char *src; + int srclen; + unsigned char *dst; + int dstlen; + + src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index; + srclen = cstate->raw_buf_len - cstate->raw_buf_index; + dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len; + dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1; + + (void) pg_do_encoding_conversion_buf(cstate->conversion_proc, + cstate->file_encoding, + GetDatabaseEncoding(), + src, srclen, + dst, dstlen, + false); + + /* + * The conversion routine should have reported an error, so this + * should not be reached. + */ + elog(ERROR, "encoding conversion failed without error"); + } +} + +/* + * Load more data from data source to raw_buf. + * + * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the + * beginning of the buffer, and we load new data after that. + */ +static void CopyLoadRawBuf(CopyFromState cstate) { - int nbytes = RAW_BUF_BYTES(cstate); + int nbytes; int inbytes; - /* Copy down the unprocessed data if any. 
*/ - if (nbytes > 0) + /* + * In text mode, if encoding conversion is not required, raw_buf and + * input_buf point to the same buffer. Their len/index better agree, too. + */ + if (cstate->raw_buf == cstate->input_buf) + { + Assert(!cstate->need_transcoding); + Assert(cstate->raw_buf_index == cstate->input_buf_index); + Assert(cstate->input_buf_len <= cstate->raw_buf_len); + } + + /* + * Copy down the unprocessed data if any. + */ + nbytes = RAW_BUF_BYTES(cstate); + if (nbytes > 0 && cstate->raw_buf_index > 0) memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index, nbytes); + cstate->raw_buf_len -= cstate->raw_buf_index; + cstate->raw_buf_index = 0; - inbytes = CopyGetData(cstate, cstate->raw_buf + nbytes, - 1, RAW_BUF_SIZE - nbytes); + /* + * If raw_buf and input_buf are in fact the same buffer, adjust the + * input_buf variables, too. + */ + if (cstate->raw_buf == cstate->input_buf) + { + cstate->input_buf_len -= cstate->input_buf_index; + cstate->input_buf_index = 0; + } + + /* Load more data */ + inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len, + 1, RAW_BUF_SIZE - cstate->raw_buf_len); nbytes += inbytes; cstate->raw_buf[nbytes] = '\0'; - cstate->raw_buf_index = 0; cstate->raw_buf_len = nbytes; + cstate->bytes_processed += inbytes; pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed); - return (inbytes > 0); + + if (inbytes == 0) + cstate->raw_reached_eof = true; +} + +/* + * CopyLoadInputBuf loads some more data into input_buf + * + * On return, at least one more input character is loaded into + * input_buf, or input_reached_eof is set. + * + * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start + * of the buffer and then we load more data after that. 
+ */ +static void +CopyLoadInputBuf(CopyFromState cstate) +{ + int nbytes = INPUT_BUF_BYTES(cstate); + + /* + * The caller has updated input_buf_index to indicate how much of the + * input has been consumed and isn't needed anymore. If input_buf is the + * same physical area as raw_buf, update raw_buf_index accordingly. + */ + if (cstate->raw_buf == cstate->input_buf) + { + Assert(!cstate->need_transcoding); + Assert(cstate->input_buf_index >= cstate->raw_buf_index); + cstate->raw_buf_index = cstate->input_buf_index; + } + + for (;;) + { + /* If we now have some unconverted data, try to convert it */ + CopyConvertBuf(cstate); + + /* If we now have some more input bytes ready, return them */ + if (INPUT_BUF_BYTES(cstate) > nbytes) + return; + + /* + * If we reached an invalid byte sequence, or we're at an incomplete + * multi-byte character but there is no more raw input data, report + * conversion error. + */ + if (cstate->input_reached_error) + CopyConversionError(cstate); + + /* no more input, and everything has been converted */ + if (cstate->input_reached_eof) + break; + + /* Try to load more raw data */ + Assert(!cstate->raw_reached_eof); + CopyLoadRawBuf(cstate); + } } /* @@ -389,7 +702,8 @@ CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes) /* Load more data if buffer is empty. */ if (RAW_BUF_BYTES(cstate) == 0) { - if (!CopyLoadRawBuf(cstate)) + CopyLoadRawBuf(cstate); + if (cstate->raw_reached_eof) break; /* EOF */ } @@ -645,8 +959,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext, } /* - * Read the next input line and stash it in line_buf, with conversion to - * server encoding. + * Read the next input line and stash it in line_buf. * * Result is true if read was terminated by EOF, false if terminated * by newline. 
The terminating newline or EOF marker is not included @@ -658,10 +971,7 @@ CopyReadLine(CopyFromState cstate) bool result; resetStringInfo(&cstate->line_buf); - cstate->line_buf_valid = true; - - /* Mark that encoding conversion hasn't occurred yet */ - cstate->line_buf_converted = false; + cstate->line_buf_valid = false; /* Parse data and transfer into line_buf */ result = CopyReadLineText(cstate); @@ -675,10 +985,17 @@ CopyReadLine(CopyFromState cstate) */ if (cstate->copy_src == COPY_FRONTEND) { + int inbytes; + do { - cstate->raw_buf_index = cstate->raw_buf_len; - } while (CopyLoadRawBuf(cstate)); + inbytes = CopyGetData(cstate, cstate->input_buf, + 1, INPUT_BUF_SIZE); + } while (inbytes > 0); + cstate->input_buf_index = 0; + cstate->input_buf_len = 0; + cstate->raw_buf_index = 0; + cstate->raw_buf_len = 0; } } else @@ -715,25 +1032,8 @@ CopyReadLine(CopyFromState cstate) } } - /* Done reading the line. Convert it to server encoding. */ - if (cstate->need_transcoding) - { - char *cvt; - - cvt = pg_any_to_server(cstate->line_buf.data, - cstate->line_buf.len, - cstate->file_encoding); - if (cvt != cstate->line_buf.data) - { - /* transfer converted data back to line_buf */ - resetStringInfo(&cstate->line_buf); - appendBinaryStringInfo(&cstate->line_buf, cvt, strlen(cvt)); - pfree(cvt); - } - } - /* Now it's safe to use the buffer in error messages */ - cstate->line_buf_converted = true; + cstate->line_buf_valid = true; return result; } @@ -744,13 +1044,12 @@ CopyReadLine(CopyFromState cstate) static bool CopyReadLineText(CopyFromState cstate) { - char *copy_raw_buf; - int raw_buf_ptr; + char *copy_input_buf; + int input_buf_ptr; int copy_buf_len; bool need_data = false; bool hit_eof = false; bool result = false; - char mblen_str[2]; /* CSV variables */ bool first_char_in_line = true; @@ -768,8 +1067,6 @@ CopyReadLineText(CopyFromState cstate) escapec = '\0'; } - mblen_str[1] = '\0'; - /* * The objective of this loop is to transfer the entire next input line * into 
line_buf. Hence, we only care for detecting newlines (\r and/or @@ -782,18 +1079,25 @@ CopyReadLineText(CopyFromState cstate) * These four characters, and the CSV escape and quote characters, are * assumed the same in frontend and backend encodings. * - * For speed, we try to move data from raw_buf to line_buf in chunks - * rather than one character at a time. raw_buf_ptr points to the next - * character to examine; any characters from raw_buf_index to raw_buf_ptr - * have been determined to be part of the line, but not yet transferred to - * line_buf. + * The input has already been converted to the database encoding. All + * supported server encodings have the property that all bytes in a + * multi-byte sequence have the high bit set, so a multibyte character + * cannot contain any newline or escape characters embedded in the + * multibyte sequence. Therefore, we can process the input byte-by-byte, + * regardless of the encoding. * - * For a little extra speed within the loop, we copy raw_buf and - * raw_buf_len into local variables. + * For speed, we try to move data from input_buf to line_buf in chunks + * rather than one character at a time. input_buf_ptr points to the next + * character to examine; any characters from input_buf_index to + * input_buf_ptr have been determined to be part of the line, but not yet + * transferred to line_buf. + * + * For a little extra speed within the loop, we copy input_buf and + * input_buf_len into local variables. */ - copy_raw_buf = cstate->raw_buf; - raw_buf_ptr = cstate->raw_buf_index; - copy_buf_len = cstate->raw_buf_len; + copy_input_buf = cstate->input_buf; + input_buf_ptr = cstate->input_buf_index; + copy_buf_len = cstate->input_buf_len; for (;;) { @@ -810,24 +1114,21 @@ CopyReadLineText(CopyFromState cstate) * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it, * considering the size of the buffer. 
*/ - if (raw_buf_ptr >= copy_buf_len || need_data) + if (input_buf_ptr >= copy_buf_len || need_data) { REFILL_LINEBUF; - /* - * Try to read some more data. This will certainly reset - * raw_buf_index to zero, and raw_buf_ptr must go with it. - */ - if (!CopyLoadRawBuf(cstate)) - hit_eof = true; - raw_buf_ptr = 0; - copy_buf_len = cstate->raw_buf_len; + CopyLoadInputBuf(cstate); + /* update our local variables */ + hit_eof = cstate->input_reached_eof; + input_buf_ptr = cstate->input_buf_index; + copy_buf_len = cstate->input_buf_len; /* * If we are completely out of data, break out of the loop, * reporting EOF. */ - if (copy_buf_len <= 0) + if (INPUT_BUF_BYTES(cstate) <= 0) { result = true; break; @@ -836,8 +1137,8 @@ CopyReadLineText(CopyFromState cstate) } /* OK to fetch a character */ - prev_raw_ptr = raw_buf_ptr; - c = copy_raw_buf[raw_buf_ptr++]; + prev_raw_ptr = input_buf_ptr; + c = copy_input_buf[input_buf_ptr++]; if (cstate->opts.csv_mode) { @@ -891,16 +1192,16 @@ CopyReadLineText(CopyFromState cstate) * If need more data, go back to loop top to load it. * * Note that if we are at EOF, c will wind up as '\0' because - * of the guaranteed pad of raw_buf. + * of the guaranteed pad of input_buf. */ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); /* get next char */ - c = copy_raw_buf[raw_buf_ptr]; + c = copy_input_buf[input_buf_ptr]; if (c == '\n') { - raw_buf_ptr++; /* eat newline */ + input_buf_ptr++; /* eat newline */ cstate->eol_type = EOL_CRNL; /* in case not set yet */ } else @@ -967,14 +1268,14 @@ CopyReadLineText(CopyFromState cstate) /* ----- * get next character * Note: we do not change c so if it isn't \., we can fall - * through and continue processing for file encoding. + * through and continue processing. * ----- */ - c2 = copy_raw_buf[raw_buf_ptr]; + c2 = copy_input_buf[input_buf_ptr]; if (c2 == '.') { - raw_buf_ptr++; /* consume the '.' */ + input_buf_ptr++; /* consume the '.' 
*/ /* * Note: if we loop back for more data here, it does not @@ -986,7 +1287,7 @@ CopyReadLineText(CopyFromState cstate) /* Get the next character */ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); /* if hit_eof, c2 will become '\0' */ - c2 = copy_raw_buf[raw_buf_ptr++]; + c2 = copy_input_buf[input_buf_ptr++]; if (c2 == '\n') { @@ -1011,7 +1312,7 @@ CopyReadLineText(CopyFromState cstate) /* Get the next character */ IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); /* if hit_eof, c2 will become '\0' */ - c2 = copy_raw_buf[raw_buf_ptr++]; + c2 = copy_input_buf[input_buf_ptr++]; if (c2 != '\r' && c2 != '\n') { @@ -1036,11 +1337,11 @@ CopyReadLineText(CopyFromState cstate) * Transfer only the data before the \. into line_buf, then * discard the data and the \. sequence. */ - if (prev_raw_ptr > cstate->raw_buf_index) + if (prev_raw_ptr > cstate->input_buf_index) appendBinaryStringInfo(&cstate->line_buf, - cstate->raw_buf + cstate->raw_buf_index, - prev_raw_ptr - cstate->raw_buf_index); - cstate->raw_buf_index = raw_buf_ptr; + cstate->input_buf + cstate->input_buf_index, + prev_raw_ptr - cstate->input_buf_index); + cstate->input_buf_index = input_buf_ptr; result = true; /* report EOF */ break; } @@ -1056,15 +1357,8 @@ CopyReadLineText(CopyFromState cstate) * backslashes are not special, so we want to process the * character after the backslash just like a normal character, * so we don't increment in those cases. - * - * Set 'c' to skip whole character correctly in multi-byte - * encodings. If we don't have the whole character in the - * buffer yet, we might loop back to process it, after all, - * but that's OK because multi-byte characters cannot have any - * special meaning. */ - raw_buf_ptr++; - c = c2; + input_buf_ptr++; } } @@ -1075,30 +1369,6 @@ CopyReadLineText(CopyFromState cstate) * value, while in non-CSV mode, \. cannot be a data value. */ not_end_of_copy: - - /* - * Process all bytes of a multi-byte character as a group. 
- * - * We only support multi-byte sequences where the first byte has the - * high-bit set, so as an optimization we can avoid this block - * entirely if it is not set. - */ - if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) - { - int mblen; - - /* - * It is enough to look at the first byte in all our encodings, to - * get the length. (GB18030 is a bit special, but still works for - * our purposes; see comment in pg_gb18030_mblen()) - */ - mblen_str[0] = c; - mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str); - - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1); - IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1); - raw_buf_ptr += mblen - 1; - } first_char_in_line = false; } /* end of outer loop */ diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h index 705f5b615b..858af7a717 100644 --- a/src/include/commands/copyfrom_internal.h +++ b/src/include/commands/copyfrom_internal.h @@ -52,17 +52,6 @@ typedef enum CopyInsertMethod /* * This struct contains all the state variables used throughout a COPY FROM * operation. - * - * Multi-byte encodings: all supported client-side encodings encode multi-byte - * characters by having the first byte's high bit set. Subsequent bytes of the - * character can have the high bit not set. When scanning data in such an - * encoding to look for a match to a single-byte (ie ASCII) character, we must - * use the full pg_encoding_mblen() machinery to skip over multibyte - * characters, else we might find a false match to a trailing byte. In - * supported server encodings, there is no possibility of a false match, and - * it's faster to make useless comparisons to trailing bytes than it is to - * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true - * when we have to do it the hard way. 
*/ typedef struct CopyFromStateData { @@ -70,13 +59,11 @@ typedef struct CopyFromStateData CopySource copy_src; /* type of copy source */ FILE *copy_file; /* used if copy_src == COPY_FILE */ StringInfo fe_msgbuf; /* used if copy_src == COPY_NEW_FE */ - bool reached_eof; /* true if we read to end of copy data (not - * all copy_src types maintain this) */ EolType eol_type; /* EOL type of input */ int file_encoding; /* file or remote side's character encoding */ bool need_transcoding; /* file encoding diff from server? */ - bool encoding_embeds_ascii; /* ASCII can be non-first byte? */ + Oid conversion_proc; /* encoding conversion function */ /* parameters from the COPY command */ Relation rel; /* relation to copy from */ @@ -131,31 +118,52 @@ typedef struct CopyFromStateData /* * Similarly, line_buf holds the whole input line being processed. The - * input cycle is first to read the whole line into line_buf, convert it - * to server encoding there, and then extract the individual attribute - * fields into attribute_buf. line_buf is preserved unmodified so that we - * can display it in error messages if appropriate. (In binary mode, - * line_buf is not used.) + * input cycle is first to read the whole line into line_buf, and then + * extract the individual attribute fields into attribute_buf. line_buf + * is preserved unmodified so that we can display it in error messages if + * appropriate. (In binary mode, line_buf is not used.) */ StringInfoData line_buf; - bool line_buf_converted; /* converted to server encoding? */ bool line_buf_valid; /* contains the row being processed? */ /* - * Finally, raw_buf holds raw data read from the data source (file or - * client connection). In text mode, CopyReadLine parses this data - * sufficiently to locate line boundaries, then transfers the data to - * line_buf and converts it. In binary mode, CopyReadBinaryData fetches - * appropriate amounts of data from this buffer. 
In both modes, we - * guarantee that there is a \0 at raw_buf[raw_buf_len]. + * input_buf holds input data, already converted to database encoding. + * + * In text mode, CopyReadLine parses this data sufficiently to locate + * line boundaries, then transfers the data to line_buf. We guarantee + * that there is a \0 at input_buf[input_buf_len] at all times. (In + * binary mode, input_buf is not used.) + * + * If encoding conversion is not required, input_buf is not a separate + * buffer but points directly to raw_buf. In that case, input_buf_len + * tracks the number of bytes that have been verified as valid in the + * database encoding, and raw_buf_len is the total number of bytes + * stored in the buffer. + */ +#define INPUT_BUF_SIZE 65536 /* we palloc INPUT_BUF_SIZE+1 bytes */ + char *input_buf; + int input_buf_index; /* next byte to process */ + int input_buf_len; /* total # of bytes stored */ + bool input_reached_eof; /* true if we reached EOF */ + bool input_reached_error; /* true if a conversion error happened */ + /* Shorthand for number of unconsumed bytes available in input_buf */ +#define INPUT_BUF_BYTES(cstate) ((cstate)->input_buf_len - (cstate)->input_buf_index) + + /* + * raw_buf holds raw input data read from the data source (file or client + * connection), not yet converted to the database encoding. Like with + * 'input_buf', we guarantee that there is a \0 at raw_buf[raw_buf_len]. 
*/ #define RAW_BUF_SIZE 65536 /* we palloc RAW_BUF_SIZE+1 bytes */ char *raw_buf; int raw_buf_index; /* next byte to process */ int raw_buf_len; /* total # of bytes stored */ - uint64 bytes_processed;/* number of bytes processed so far */ + bool raw_reached_eof; /* true if we reached EOF */ + /* Shorthand for number of unconsumed bytes available in raw_buf */ #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index) + + uint64 bytes_processed; /* number of bytes processed so far */ } CopyFromStateData; extern void ReceiveCopyBegin(CopyFromState cstate); diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index a9aaff9e6d..0f31e68318 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -306,15 +306,33 @@ typedef enum pg_enc /* * When converting strings between different encodings, we assume that space - * for converted result is 4-to-1 growth in the worst case. The rate for + * for converted result is 4-to-1 growth in the worst case. The rate for * currently supported encoding pairs are within 3 (SJIS JIS X0201 half width - * kanna -> UTF8 is the worst case). So "4" should be enough for the moment. + * kana -> UTF8 is the worst case). So "4" should be enough for the moment. * * Note that this is not the same as the maximum character width in any * particular encoding. */ #define MAX_CONVERSION_GROWTH 4 +/* + * Maximum byte length of a string that's required in any encoding to convert + * at least one character to any other encoding. In other words, if you feed + * MAX_CONVERSION_INPUT_LENGTH bytes to any encoding conversion function, it + * is guaranteed to be able to convert something without needing more input + * (assuming the input is valid). + * + * Currently, the maximum case is the conversion UTF8 -> SJIS JIS X0201 half + * width kana, where a pair of UTF-8 characters is converted into a single + * SHIFT_JIS_2004 character (the reverse of the worst case for + * MAX_CONVERSION_GROWTH). 
It needs 6 bytes of input. In theory, a + * user-defined conversion function might have more complicated cases, although + * for the reverse mapping you would probably also need to bump up + * MAX_CONVERSION_GROWTH. But there is no need to be stingy here, so make it + * generous. + */ +#define MAX_CONVERSION_INPUT_LENGTH 16 + /* * Maximum byte length of the string equivalent to any one Unicode code point, * in any backend encoding. The current value assumes that a 4-byte UTF-8