Add header matching mode to COPY FROM

COPY FROM supports the HEADER option to silently discard the header
line from a CSV or text file.  It is possible to load by mistake a
file that matches the expected format, for example, if two text
columns have been swapped, resulting in garbage in the database.

This adds a new option value HEADER MATCH that checks the column names
in the header line against the actual column names and errors out if
they do not match.

Author: Rémi Lapeyre <remi.lapeyre@lenstra.fr>
Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Eisentraut <peter.eisentraut@enterprisedb.com>
Discussion: https://www.postgresql.org/message-id/flat/CAF1-J-0PtCWMeLtswwGV2M70U26n4g33gpe1rcKQqe6wVQDrFA@mail.gmail.com
This commit is contained in:
Peter Eisentraut 2022-03-30 08:56:58 +02:00
parent edcedcc2c7
commit 072132f04e
8 changed files with 210 additions and 8 deletions

View File

@ -113,6 +113,21 @@ CREATE FOREIGN TABLE agg_bad (
) SERVER file_server
OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null '');
ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
-- test header matching
\set filename :abs_srcdir '/data/list1.csv'
CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
SELECT * FROM header_match;
1 | foo
---+-----
1 | bar
(1 row)
CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server
OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
SELECT * FROM header_doesnt_match; -- ERROR
ERROR: column name mismatch in header line field 1: got "1", expected "a"
CONTEXT: COPY header_doesnt_match, line 1: "1,foo"
-- per-column options tests
\set filename :abs_srcdir '/data/text.csv'
CREATE FOREIGN TABLE text_csv (
@ -464,12 +479,14 @@ SET ROLE regress_file_fdw_superuser;
-- cleanup
RESET ROLE;
DROP EXTENSION file_fdw CASCADE;
NOTICE: drop cascades to 7 other objects
NOTICE: drop cascades to 9 other objects
DETAIL: drop cascades to server file_server
drop cascades to user mapping for regress_file_fdw_superuser on server file_server
drop cascades to user mapping for regress_no_priv_user on server file_server
drop cascades to foreign table agg_text
drop cascades to foreign table agg_csv
drop cascades to foreign table agg_bad
drop cascades to foreign table header_match
drop cascades to foreign table header_doesnt_match
drop cascades to foreign table text_csv
DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;

View File

@ -103,6 +103,15 @@ CREATE FOREIGN TABLE agg_bad (
OPTIONS (format 'csv', filename :'filename', header 'true', delimiter ';', quote '@', escape '"', null '');
ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
-- test header matching
\set filename :abs_srcdir '/data/list1.csv'
CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
SELECT * FROM header_match;
CREATE FOREIGN TABLE header_doesnt_match (a int, foo text) SERVER file_server
OPTIONS (format 'csv', filename :'filename', delimiter ',', header 'match');
SELECT * FROM header_doesnt_match; -- ERROR
-- per-column options tests
\set filename :abs_srcdir '/data/text.csv'
CREATE FOREIGN TABLE text_csv (

View File

@ -36,7 +36,7 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
FREEZE [ <replaceable class="parameter">boolean</replaceable> ]
DELIMITER '<replaceable class="parameter">delimiter_character</replaceable>'
NULL '<replaceable class="parameter">null_string</replaceable>'
HEADER [ <replaceable class="parameter">boolean</replaceable> ]
HEADER [ <replaceable class="parameter">boolean</replaceable> | <literal>match</literal> ]
QUOTE '<replaceable class="parameter">quote_character</replaceable>'
ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
FORCE_QUOTE { ( <replaceable class="parameter">column_name</replaceable> [, ...] ) | * }
@ -276,7 +276,11 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
<para>
Specifies that the file contains a header line with the names of each
column in the file. On output, the first line contains the column
names from the table, and on input, the first line is ignored.
names from the table. On input, the first line is discarded when this
option is set to <literal>true</literal> (or equivalent Boolean value).
If this option is set to <literal>match</literal>, the number and names
of the columns in the header line must match the actual column names of
the table, otherwise an error is raised.
This option is not allowed when using <literal>binary</literal> format.
</para>
</listitem>

View File

@ -313,6 +313,64 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt,
table_close(rel, NoLock);
}
/*
* Extract a CopyHeaderChoice value from a DefElem. This is like
* defGetBoolean() but also accepts the special value "match".
*/
static CopyHeaderChoice
defGetCopyHeaderChoice(DefElem *def)
{
/*
* If no parameter given, assume "true" is meant.
*/
if (def->arg == NULL)
return COPY_HEADER_TRUE;
/*
* Allow 0, 1, "true", "false", "on", "off", or "match".
*/
switch (nodeTag(def->arg))
{
case T_Integer:
switch (intVal(def->arg))
{
case 0:
return COPY_HEADER_FALSE;
case 1:
return COPY_HEADER_TRUE;
default:
/* otherwise, error out below */
break;
}
break;
default:
{
char *sval = defGetString(def);
/*
* The set of strings accepted here should match up with the
* grammar's opt_boolean_or_string production.
*/
if (pg_strcasecmp(sval, "true") == 0)
return COPY_HEADER_TRUE;
if (pg_strcasecmp(sval, "false") == 0)
return COPY_HEADER_FALSE;
if (pg_strcasecmp(sval, "on") == 0)
return COPY_HEADER_TRUE;
if (pg_strcasecmp(sval, "off") == 0)
return COPY_HEADER_FALSE;
if (pg_strcasecmp(sval, "match") == 0)
return COPY_HEADER_MATCH;
}
break;
}
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("%s requires a Boolean value or \"match\"",
def->defname)));
return COPY_HEADER_FALSE; /* keep compiler quiet */
}
/*
* Process the statement option list for COPY.
*
@ -394,7 +452,7 @@ ProcessCopyOptions(ParseState *pstate,
if (header_specified)
errorConflictingDefElem(defel, pstate);
header_specified = true;
opts_out->header_line = defGetBoolean(defel);
opts_out->header_line = defGetCopyHeaderChoice(defel);
}
else if (strcmp(defel->defname, "quote") == 0)
{

View File

@ -72,6 +72,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_bswap.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@ -758,12 +759,58 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
/* only available for text or csv input */
Assert(!cstate->opts.binary);
/* on input just throw the header line away */
/* on input check that the header line is correct if needed */
if (cstate->cur_lineno == 0 && cstate->opts.header_line)
{
ListCell *cur;
TupleDesc tupDesc;
tupDesc = RelationGetDescr(cstate->rel);
cstate->cur_lineno++;
if (CopyReadLine(cstate))
return false; /* done */
done = CopyReadLine(cstate);
if (cstate->opts.header_line == COPY_HEADER_MATCH)
{
int fldnum;
if (cstate->opts.csv_mode)
fldct = CopyReadAttributesCSV(cstate);
else
fldct = CopyReadAttributesText(cstate);
if (fldct != list_length(cstate->attnumlist))
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("wrong number of fields in header line: field count is %d, expected %d",
fldct, list_length(cstate->attnumlist))));
fldnum = 0;
foreach(cur, cstate->attnumlist)
{
int attnum = lfirst_int(cur);
char *colName = cstate->raw_fields[attnum - 1];
Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
fldnum++;
if (colName == NULL)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
fldnum, cstate->opts.null_print, NameStr(attr->attname))));
if (namestrcmp(&attr->attname, colName) != 0) {
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
fldnum, colName, NameStr(attr->attname))));
}
}
}
if (done)
return false;
}
cstate->cur_lineno++;

View File

@ -19,6 +19,17 @@
#include "parser/parse_node.h"
#include "tcop/dest.h"
/*
* Represents whether a header line should be present, and whether it must
* match the actual names (which implies "true").
*/
typedef enum CopyHeaderChoice
{
COPY_HEADER_FALSE = 0,
COPY_HEADER_TRUE,
COPY_HEADER_MATCH,
} CopyHeaderChoice;
/*
* A struct to hold COPY options, in a parsed form. All of these are related
* to formatting, except for 'freeze', which doesn't really belong here, but
@ -32,7 +43,7 @@ typedef struct CopyFormatOptions
bool binary; /* binary format? */
bool freeze; /* freeze rows on loading? */
bool csv_mode; /* Comma Separated Value format? */
bool header_line; /* header line? */
CopyHeaderChoice header_line; /* header line? */
char *null_print; /* NULL marker string (server encoding!) */
int null_print_len; /* length of same */
char *null_print_client; /* same converted to file encoding */

View File

@ -176,3 +176,26 @@ INFO: progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progre
drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
drop function notice_after_tab_progress_reporting();
drop table tab_progress_reporting;
-- Test header matching feature
create table header_copytest (
a int,
b int,
c text
);
copy header_copytest from stdin with (header wrong_choice);
ERROR: header requires a Boolean value or "match"
copy header_copytest from stdin with (header match);
copy header_copytest from stdin with (header match);
ERROR: column name mismatch in header line field 3: got null value ("\N"), expected "c"
CONTEXT: COPY header_copytest, line 1: "a b \N"
copy header_copytest from stdin with (header match);
ERROR: wrong number of fields in header line: field count is 2, expected 3
CONTEXT: COPY header_copytest, line 1: "a b"
copy header_copytest from stdin with (header match);
ERROR: wrong number of fields in header line: field count is 4, expected 3
CONTEXT: COPY header_copytest, line 1: "a b c d"
copy header_copytest from stdin with (header match);
ERROR: column name mismatch in header line field 3: got "d", expected "c"
CONTEXT: COPY header_copytest, line 1: "a b d"
copy header_copytest from stdin with (header match, format csv);
drop table header_copytest;

View File

@ -197,3 +197,36 @@ copy tab_progress_reporting from :'filename'
drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
drop function notice_after_tab_progress_reporting();
drop table tab_progress_reporting;
-- Test header matching feature
create table header_copytest (
a int,
b int,
c text
);
copy header_copytest from stdin with (header wrong_choice);
copy header_copytest from stdin with (header match);
a b c
1 2 foo
\.
copy header_copytest from stdin with (header match);
a b \N
1 2 foo
\.
copy header_copytest from stdin with (header match);
a b
1 2
\.
copy header_copytest from stdin with (header match);
a b c d
1 2 foo bar
\.
copy header_copytest from stdin with (header match);
a b d
1 2 foo
\.
copy header_copytest from stdin with (header match, format csv);
a,b,c
1,2,foo
\.
drop table header_copytest;