From 072132f04e55c1c3b0f1a582318da78de7334379 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 30 Mar 2022 08:56:58 +0200 Subject: [PATCH] Add header matching mode to COPY FROM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit COPY FROM supports the HEADER option to silently discard the header line from a CSV or text file. It is possible to load by mistake a file that matches the expected format, for example, if two text columns have been swapped, resulting in garbage in the database. This adds a new option value HEADER MATCH that checks the column names in the header line against the actual column names and errors out if they do not match. Author: RĂ©mi Lapeyre Reviewed-by: Daniel Verite Reviewed-by: Peter Eisentraut Discussion: https://www.postgresql.org/message-id/flat/CAF1-J-0PtCWMeLtswwGV2M70U26n4g33gpe1rcKQqe6wVQDrFA@mail.gmail.com --- contrib/file_fdw/expected/file_fdw.out | 19 +++++++- contrib/file_fdw/sql/file_fdw.sql | 9 ++++ doc/src/sgml/ref/copy.sgml | 8 +++- src/backend/commands/copy.c | 60 +++++++++++++++++++++++++- src/backend/commands/copyfromparse.c | 53 +++++++++++++++++++++-- src/include/commands/copy.h | 13 +++++- src/test/regress/expected/copy.out | 23 ++++++++++ src/test/regress/sql/copy.sql | 33 ++++++++++++++ 8 files changed, 210 insertions(+), 8 deletions(-) diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out index 14acdb27e5..0029f36b35 100644 --- a/contrib/file_fdw/expected/file_fdw.out +++ b/contrib/file_fdw/expected/file_fdw.out @@ -113,6 +113,21 @@ CREATE FOREIGN TABLE agg_bad ( ) SERVER file_server OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null ''); ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0); +-- test header matching +\set filename :abs_srcdir '/data/list1.csv' +CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server +OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match'); +SELECT * FROM header_match; + 1 | foo +---+----- + 1 | bar +(1 row) + +CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server +OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match'); +SELECT * FROM header_doesnt_match; -- ERROR +ERROR: column name mismatch in header line field 1: got "1", expected "a" +CONTEXT: COPY header_doesnt_match, line 1: "1,foo" -- per-column options tests \set filename :abs_srcdir '/data/text.csv' CREATE FOREIGN TABLE text_csv ( @@ -464,12 +479,14 @@ SET ROLE regress_file_fdw_superuser; -- cleanup RESET ROLE; DROP EXTENSION file_fdw CASCADE; -NOTICE: drop cascades to 7 other objects +NOTICE: drop cascades to 9 other objects DETAIL: drop cascades to server file_server drop cascades to user mapping for regress_file_fdw_superuser on server file_server drop cascades to user mapping for regress_no_priv_user on server file_server drop cascades to foreign table agg_text drop cascades to foreign table agg_csv drop cascades to foreign table agg_bad +drop cascades to foreign table header_match +drop cascades to foreign table header_doesnt_match drop cascades to foreign table text_csv DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql index 86f876d565..563d824ccc 100644 --- a/contrib/file_fdw/sql/file_fdw.sql +++ b/contrib/file_fdw/sql/file_fdw.sql @@ -103,6 +103,15 @@ CREATE FOREIGN TABLE agg_bad ( OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null ''); ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0); +-- test header matching +\set filename :abs_srcdir '/data/list1.csv' +CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server +OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match'); +SELECT * FROM header_match; +CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server +OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match'); +SELECT * FROM header_doesnt_match; -- ERROR + -- per-column options tests \set filename :abs_srcdir '/data/text.csv' CREATE FOREIGN TABLE text_csv ( diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 1b7d001963..e7a6676efd 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -36,7 +36,7 @@ COPY { table_name [ ( boolean ] DELIMITER 'delimiter_character' NULL 'null_string' - HEADER [ boolean ] + HEADER [ boolean | match ] QUOTE 'quote_character' ESCAPE 'escape_character' FORCE_QUOTE { ( column_name [, ...] ) | * } @@ -276,7 +276,11 @@ COPY { table_name [ ( Specifies that the file contains a header line with the names of each column in the file. On output, the first line contains the column - names from the table, and on input, the first line is ignored. + names from the table. On input, the first line is discarded when this + option is set to true (or equivalent Boolean value). + If this option is set to match, the number and names + of the columns in the header line must match the actual column names of + the table, otherwise an error is raised. This option is not allowed when using binary format. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 7a0c897cc9..689713ea58 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -313,6 +313,64 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, table_close(rel, NoLock); } +/* + * Extract a CopyHeaderChoice value from a DefElem. This is like + * defGetBoolean() but also accepts the special value "match". + */ +static CopyHeaderChoice +defGetCopyHeaderChoice(DefElem *def) +{ + /* + * If no parameter given, assume "true" is meant. + */ + if (def->arg == NULL) + return COPY_HEADER_TRUE; + + /* + * Allow 0, 1, "true", "false", "on", "off", or "match". + */ + switch (nodeTag(def->arg)) + { + case T_Integer: + switch (intVal(def->arg)) + { + case 0: + return COPY_HEADER_FALSE; + case 1: + return COPY_HEADER_TRUE; + default: + /* otherwise, error out below */ + break; + } + break; + default: + { + char *sval = defGetString(def); + + /* + * The set of strings accepted here should match up with the + * grammar's opt_boolean_or_string production. + */ + if (pg_strcasecmp(sval, "true") == 0) + return COPY_HEADER_TRUE; + if (pg_strcasecmp(sval, "false") == 0) + return COPY_HEADER_FALSE; + if (pg_strcasecmp(sval, "on") == 0) + return COPY_HEADER_TRUE; + if (pg_strcasecmp(sval, "off") == 0) + return COPY_HEADER_FALSE; + if (pg_strcasecmp(sval, "match") == 0) + return COPY_HEADER_MATCH; + } + break; + } + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("%s requires a Boolean value or \"match\"", + def->defname))); + return COPY_HEADER_FALSE; /* keep compiler quiet */ +} + /* * Process the statement option list for COPY. * @@ -394,7 +452,7 @@ ProcessCopyOptions(ParseState *pstate, if (header_specified) errorConflictingDefElem(defel, pstate); header_specified = true; - opts_out->header_line = defGetBoolean(defel); + opts_out->header_line = defGetCopyHeaderChoice(defel); } else if (strcmp(defel->defname, "quote") == 0) { diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index baf328b620..58017ec53b 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -72,6 +72,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "port/pg_bswap.h" +#include "utils/builtins.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -758,12 +759,58 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields) /* only available for text or csv input */ Assert(!cstate->opts.binary); - /* on input just throw the header line away */ + /* on input check that the header line is correct if needed */ if (cstate->cur_lineno == 0 && cstate->opts.header_line) { + ListCell *cur; + TupleDesc tupDesc; + + tupDesc = RelationGetDescr(cstate->rel); + cstate->cur_lineno++; - if (CopyReadLine(cstate)) - return false; /* done */ + done = CopyReadLine(cstate); + + if (cstate->opts.header_line == COPY_HEADER_MATCH) + { + int fldnum; + + if (cstate->opts.csv_mode) + fldct = CopyReadAttributesCSV(cstate); + else + fldct = CopyReadAttributesText(cstate); + + if (fldct != list_length(cstate->attnumlist)) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("wrong number of fields in header line: field count is %d, expected %d", + fldct, list_length(cstate->attnumlist)))); + + fldnum = 0; + foreach(cur, cstate->attnumlist) + { + int attnum = lfirst_int(cur); + char *colName = cstate->raw_fields[attnum - 1]; + Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1); + + fldnum++; + + if (colName == NULL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"", + fldnum, cstate->opts.null_print, NameStr(attr->attname)))); + + if (namestrcmp(&attr->attname, colName) != 0) { + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"", + fldnum, colName, NameStr(attr->attname)))); + } + } + } + + if (done) + return false; } cstate->cur_lineno++; diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h index 8694da5004..cb0096aeb6 100644 --- a/src/include/commands/copy.h +++ b/src/include/commands/copy.h @@ -19,6 +19,17 @@ #include "parser/parse_node.h" #include "tcop/dest.h" +/* + * Represents whether a header line should be present, and whether it must + * match the actual names (which implies "true"). + */ +typedef enum CopyHeaderChoice +{ + COPY_HEADER_FALSE = 0, + COPY_HEADER_TRUE, + COPY_HEADER_MATCH, +} CopyHeaderChoice; + /* * A struct to hold COPY options, in a parsed form. All of these are related * to formatting, except for 'freeze', which doesn't really belong here, but @@ -32,7 +43,7 @@ typedef struct CopyFormatOptions bool binary; /* binary format? */ bool freeze; /* freeze rows on loading? */ bool csv_mode; /* Comma Separated Value format? */ - bool header_line; /* header line? */ + CopyHeaderChoice header_line; /* header line? */ char *null_print; /* NULL marker string (server encoding!) */ int null_print_len; /* length of same */ char *null_print_client; /* same converted to file encoding */ diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out index bec19dc44d..e8d6b4fc13 100644 --- a/src/test/regress/expected/copy.out +++ b/src/test/regress/expected/copy.out @@ -176,3 +176,26 @@ INFO: progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progre drop trigger check_after_tab_progress_reporting on tab_progress_reporting; drop function notice_after_tab_progress_reporting(); drop table tab_progress_reporting; +-- Test header matching feature +create table header_copytest ( + a int, + b int, + c text +); +copy header_copytest from stdin with (header wrong_choice); +ERROR: header requires a Boolean value or "match" +copy header_copytest from stdin with (header match); +copy header_copytest from stdin with (header match); +ERROR: column name mismatch in header line field 3: got null value ("\N"), expected "c" +CONTEXT: COPY header_copytest, line 1: "a b \N" +copy header_copytest from stdin with (header match); +ERROR: wrong number of fields in header line: field count is 2, expected 3 +CONTEXT: COPY header_copytest, line 1: "a b" +copy header_copytest from stdin with (header match); +ERROR: wrong number of fields in header line: field count is 4, expected 3 +CONTEXT: COPY header_copytest, line 1: "a b c d" +copy header_copytest from stdin with (header match); +ERROR: column name mismatch in header line field 3: got "d", expected "c" +CONTEXT: COPY header_copytest, line 1: "a b d" +copy header_copytest from stdin with (header match, format csv); +drop table header_copytest; diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql index eb302773ab..d72d226f34 100644 --- a/src/test/regress/sql/copy.sql +++ b/src/test/regress/sql/copy.sql @@ -197,3 +197,36 @@ copy tab_progress_reporting from :'filename' drop trigger check_after_tab_progress_reporting on tab_progress_reporting; drop function notice_after_tab_progress_reporting(); drop table tab_progress_reporting; + +-- Test header matching feature +create table header_copytest ( + a int, + b int, + c text +); +copy header_copytest from stdin with (header wrong_choice); +copy header_copytest from stdin with (header match); +a b c +1 2 foo +\. +copy header_copytest from stdin with (header match); +a b \N +1 2 foo +\. +copy header_copytest from stdin with (header match); +a b +1 2 +\. +copy header_copytest from stdin with (header match); +a b c d +1 2 foo bar +\. +copy header_copytest from stdin with (header match); +a b d +1 2 foo +\. +copy header_copytest from stdin with (header match, format csv); +a,b,c +1,2,foo +\. +drop table header_copytest;