From 0eceaaf9b7d03f88c82f27b1e51e9e75bb7ab4ff Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Mon, 6 Oct 2003 02:38:53 +0000
Subject: [PATCH] Modify COPY FROM to match the null-value string against the
 column value before it is de-backslashed, not after.  This allows the null
 string \N to be reliably distinguished from the data value \N (which must be
 represented as \\N).  Per bug report from Manfred Koizar (though it's amazing
 this hasn't been reported before).  Also, be consistent about encoding
 conversion for the null string: the form specified in the command is in the
 server encoding, but what is sent to/from the client must be in the client
 encoding.  This never worked quite right before either.

---
 doc/src/sgml/ref/copy.sgml  | 23 +++++++++----
 src/backend/commands/copy.c | 69 ++++++++++++++++++++++++-------------
 2 files changed, 62 insertions(+), 30 deletions(-)
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index 83a51362c5..c8e4debea7 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.52 2003/09/30 01:56:11 tgl Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.53 2003/10/06 02:38:53 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -257,7 +257,7 @@ COPY <replaceable class="parameter">tablename</replaceable> [ ( <replaceable cla
     Columns in a row are separated by the delimiter character.
     The column values themselves are strings generated by the
     output function, or acceptable to the input function, of each
-    attribute's data type.  The specified null-value string is used in
+    attribute's data type.  The specified null string is used in
     place of columns that are null.
     <command>COPY FROM</command> will raise an error if any line of the
     input file contains more or fewer columns than are expected.
@@ -282,6 +282,15 @@ COPY <replaceable class="parameter">tablename</replaceable> [ ( <replaceable cla
     newline, carriage return, and the current delimiter character.
    </para>
 
+   <para>
+    The specified null string is sent by <command>COPY TO</command> without
+    adding any backslashes; conversely, <command>COPY FROM</command> matches
+    the input against the null string before removing backslashes.  Therefore,
+    a null string such as <literal>\N</literal> cannot be confused with
+    the actual data value <literal>\N</literal> (which would be represented
+    as <literal>\\N</literal>).
+   </para>
+
    <para>
     The following special backslash sequences are recognized by
     <command>COPY FROM</command>:
@@ -335,10 +344,12 @@ COPY <replaceable class="parameter">tablename</replaceable> [ ( <replaceable cla
    </para>
 
    <para>
-    Never put a backslash before a data character <literal>N</> or period
-    (<literal>.</>). Such pairs will be mistaken for the default null string
-    or the end-of-data marker, respectively.  Any other backslashed character
-    that is not mentioned in the above table will be taken to represent itself.
+    Any other backslashed character that is not mentioned in the above table
+    will be taken to represent itself.  However, beware of adding backslashes
+    unnecessarily, since that might accidentally produce a string matching the
+    end-of-data marker (<literal>\.</>) or the null string (<literal>\N</> by
+    default).  These strings will be recognized before any other backslash
+    processing is done.
    </para>
 
    <para>
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index aac9f3c1d4..26de12ea9b 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.212 2003/09/29 22:06:40 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.213 2003/10/06 02:38:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -122,7 +122,7 @@ static StringInfoData attribute_buf;
  * to server encoding, and then extract individual attribute fields into
  * attribute_buf.  (We used to have CopyReadAttribute read the input source
  * directly, but that caused a lot of encoding issues and unnecessary logic
- * complexity).
+ * complexity.)
  */
 static StringInfoData line_buf;
 static bool line_buf_converted;
@@ -133,7 +133,8 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
 static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
 		 char *delim, char *null_print);
 static bool CopyReadLine(void);
-static char *CopyReadAttribute(const char *delim, CopyReadResult *result);
+static char *CopyReadAttribute(const char *delim, const char *null_print,
+							   CopyReadResult *result, bool *isnull);
 static Datum CopyReadBinaryAttribute(int column_no, FmgrInfo *flinfo,
 						Oid typelem, bool *isnull);
 static void CopyAttributeOut(char *string, char *delim);
@@ -1014,6 +1015,17 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
 		tmp = 0;
 		CopySendInt32(tmp);
 	}
+	else
+	{
+		/*
+		 * For non-binary copy, we need to convert null_print to client
+		 * encoding, because it will be sent directly with CopySendString.
+		 */
+		if (server_encoding != client_encoding)
+			null_print = (char *)
+				pg_server_to_client((unsigned char *) null_print,
+									strlen(null_print));
+	}
 
 	mySnapshot = CopyQuerySnapshot();
 
@@ -1441,9 +1453,10 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
 
 			if (file_has_oids)
 			{
-				string = CopyReadAttribute(delim, &result);
+				string = CopyReadAttribute(delim, null_print,
+										   &result, &isnull);
 
-				if (strcmp(string, null_print) == 0)
+				if (isnull)
 					ereport(ERROR,
 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 							 errmsg("null OID in COPY data")));
@@ -1478,9 +1491,10 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
 							 errmsg("missing data for column \"%s\"",
 									NameStr(attr[m]->attname))));
 
-				string = CopyReadAttribute(delim, &result);
+				string = CopyReadAttribute(delim, null_print,
+										   &result, &isnull);
 
-				if (strcmp(string, null_print) == 0)
+				if (isnull)
 				{
 					/* we read an SQL NULL, no need to do anything */
 				}
@@ -1880,25 +1894,33 @@ CopyReadLine(void)
 	return result;
 }
 
-/*
+/*----------
  * Read the value of a single attribute, performing de-escaping as needed.
  *
+ * delim is the column delimiter string (must be just one byte for now).
+ * null_print is the null marker string.  Note that this is compared to
+ * the pre-de-escaped input string.
+ *
  * *result is set to indicate what terminated the read:
  *		NORMAL_ATTR:	column delimiter
  *		END_OF_LINE:	end of line
  * In either case, the string read up to the terminator is returned.
  *
- * Note: This function does not care about SQL NULL values -- it
- * is the caller's responsibility to check if the returned string
- * matches what the user specified for the SQL NULL value.
- *
- * delim is the column delimiter string.
+ * *isnull is set true or false depending on whether the input matched
+ * the null marker.  Note that the caller cannot check this since the
+ * returned string will be the post-de-escaping equivalent, which may
+ * look the same as some valid data string.
+ *----------
  */
 static char *
-CopyReadAttribute(const char *delim, CopyReadResult *result)
+CopyReadAttribute(const char *delim, const char *null_print,
+				  CopyReadResult *result, bool *isnull)
 {
 	char		c;
 	char		delimc = delim[0];
+	int			start_cursor = line_buf.cursor;
+	int			end_cursor;
+	int			input_len;
 
 	/* reset attribute_buf to empty */
 	attribute_buf.len = 0;
@@ -1909,6 +1931,7 @@ CopyReadAttribute(const char *delim, CopyReadResult *result)
 
 	for (;;)
 	{
+		end_cursor = line_buf.cursor;
 		if (line_buf.cursor >= line_buf.len)
 			break;
 		c = line_buf.data[line_buf.cursor++];
@@ -1957,16 +1980,6 @@ CopyReadAttribute(const char *delim, CopyReadResult *result)
 						c = val & 0377;
 					}
 					break;
-
-					/*
-					 * This is a special hack to parse `\N' as
-					 * <backslash-N> rather then just 'N' to provide
-					 * compatibility with the default NULL output. -- pe
-					 */
-				case 'N':
-					appendStringInfoCharMacro(&attribute_buf, '\\');
-					c = 'N';
-					break;
 				case 'b':
 					c = '\b';
 					break;
@@ -1993,6 +2006,14 @@ CopyReadAttribute(const char *delim, CopyReadResult *result)
 		appendStringInfoCharMacro(&attribute_buf, c);
 	}
 
+	/* check whether raw input matched null marker */
+	input_len = end_cursor - start_cursor;
+	if (input_len == strlen(null_print) &&
+		strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0)
+		*isnull = true;
+	else
+		*isnull = false;
+
 	return attribute_buf.data;
 }