Fix pg_dump for hash partitioning on enum columns.

Hash partitioning on an enum is problematic because the hash codes are
derived from the OIDs assigned to the enum values, which will almost
certainly be different after a dump-and-reload than they were before.
This means that some rows probably end up in different partitions than
before, causing restore to fail because of partition constraint
violations.  (pg_upgrade dodges this problem by using hacks to force
the enum values to keep the same OIDs, but that's neither possible nor
desirable for pg_dump.)

Users can work around that by specifying --load-via-partition-root,
but since that's a dump-time decision rather than a restore-time one,
the need for it may not become apparent until far too late.  Instead,
teach pg_dump to
apply that option automatically when dealing with a partitioned
table that has hash-on-enum partitioning.

Also deal with a pre-existing issue for --load-via-partition-root
mode: in a parallel restore, we try to TRUNCATE target tables just
before loading them, in order to enable some backend optimizations.
This is bad when using --load-via-partition-root because (a) we're
likely to suffer deadlocks from restore jobs trying to restore rows
into other partitions than they came from, and (b) if we miss getting
a deadlock we might still lose data due to a TRUNCATE removing rows
from some already-completed restore job.

The fix for this is conceptually simple: just don't TRUNCATE if we're
dealing with a --load-via-partition-root case.  The tricky bit is for
pg_restore to identify those cases.  In dumps using COPY commands we
can inspect each COPY command to see if it targets the nominal target
table or some ancestor.  However, in dumps using INSERT commands it's
pretty impractical to examine the INSERTs in advance.  To provide a
solution for that going forward, modify pg_dump to mark TABLE DATA
items that are using --load-via-partition-root with a comment.
(This change also responds to a complaint from Robert Haas that
the dump output for --load-via-partition-root is pretty confusing.)
pg_restore checks for the special comment as well as checking the
COPY command if present.  This will fail to identify the combination
of --load-via-partition-root and --inserts in pre-existing dump files,
but that should be a pretty rare case in the field.  If it does
happen, the likely symptom is a deadlock failure, which can be
worked around by not using parallel restore; that is no worse than
the behavior before this bug fix.

Having done this, there seems no remaining reason for the alarmism
in the pg_dump man page about combining --load-via-partition-root
with parallel restore, so remove that warning.

Patch by me; thanks to Julien Rouhaud for review.  Back-patch to
v11 where hash partitioning was introduced.

Discussion: https://postgr.es/m/1376149.1675268279@sss.pgh.pa.us
Tom Lane 2023-03-17 13:31:40 -04:00
parent b520129bdc
commit 012ffb365a
7 changed files with 289 additions and 53 deletions


@@ -804,16 +804,6 @@ PostgreSQL documentation
and the two systems have different definitions of the collation used
to sort the partitioning column.
</para>
<para>
It is best not to use parallelism when restoring from an archive made
with this option, because <application>pg_restore</application> will
not know exactly which partition(s) a given archive data item will
load data into. This could result in inefficiency due to lock
conflicts between parallel jobs, or perhaps even restore failures due
to foreign key constraints being set up before all the relevant data
is loaded.
</para>
</listitem>
</varlistentry>


@@ -342,10 +342,6 @@ PostgreSQL documentation
and the two systems have different definitions of the collation used
to sort the partitioning column.
</para>
<!-- Currently, we don't need pg_dump's warning about parallelism here,
since parallel restore from a pg_dumpall script is impossible.
-->
</listitem>
</varlistentry>


@@ -259,6 +259,10 @@ getSchemaData(Archive *fout, int *numTablesPtr)
write_msg(NULL, "flagging inherited columns in subtables\n");
flagInhAttrs(fout->dopt, tblinfo, numTables);
if (g_verbose)
write_msg(NULL, "reading partitioning data\n");
getPartitioningInfo(fout);
if (g_verbose)
write_msg(NULL, "reading indexes\n");
getIndexes(fout, tblinfo, numTables);
@@ -319,7 +323,6 @@ static void
flagInhTables(Archive *fout, TableInfo *tblinfo, int numTables,
InhInfo *inhinfo, int numInherits)
{
DumpOptions *dopt = fout->dopt;
int i,
j;
@@ -335,18 +338,18 @@ flagInhTables(Archive *fout, TableInfo *tblinfo, int numTables,
continue;
/*
* Normally, we don't bother computing anything for non-target tables,
* but if load-via-partition-root is specified, we gather information
* on every partition in the system so that getRootTableInfo can trace
* from any given leaf partition all the way up to the root. (We
* don't need to mark them as interesting for getTableAttrs, though.)
* Normally, we don't bother computing anything for non-target tables.
* However, we must find the parents of non-root partitioned tables in
* any case, so that we can trace from leaf partitions up to the root
* (in case a leaf is to be dumped but its parents are not). We need
* not mark such parents interesting for getTableAttrs, though.
*/
if (!tblinfo[i].dobj.dump)
{
mark_parents = false;
if (!dopt->load_via_partition_root ||
!tblinfo[i].ispartition)
if (!(tblinfo[i].relkind == RELKIND_PARTITIONED_TABLE &&
tblinfo[i].ispartition))
find_parents = false;
}


@@ -76,6 +76,7 @@ static RestorePass _tocEntryRestorePass(TocEntry *te);
static bool _tocEntryIsACL(TocEntry *te);
static void _disableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te);
static void _enableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te);
static bool is_load_via_partition_root(TocEntry *te);
static void buildTocEntryArrays(ArchiveHandle *AH);
static void _moveBefore(ArchiveHandle *AH, TocEntry *pos, TocEntry *te);
static int _discoverArchiveFormat(ArchiveHandle *AH);
@@ -864,6 +865,8 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, bool is_parallel)
}
else
{
bool use_truncate;
_disableTriggersIfNecessary(AH, te);
/* Select owner and schema as necessary */
@@ -875,13 +878,24 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, bool is_parallel)
/*
* In parallel restore, if we created the table earlier in
* the run then we wrap the COPY in a transaction and
* precede it with a TRUNCATE. If archiving is not on
* this prevents WAL-logging the COPY. This obtains a
* speedup similar to that from using single_txn mode in
* non-parallel restores.
* this run (so that we know it is empty) and we are not
* restoring a load-via-partition-root data item then we
* wrap the COPY in a transaction and precede it with a
* TRUNCATE. If wal_level is set to minimal this prevents
* WAL-logging the COPY. This obtains a speedup similar
* to that from using single_txn mode in non-parallel
* restores.
*
* We mustn't do this for load-via-partition-root cases
* because some data might get moved across partition
* boundaries, risking deadlock and/or loss of previously
* loaded data. (We assume that all partitions of a
* partitioned table will be treated the same way.)
*/
if (is_parallel && te->created)
use_truncate = is_parallel && te->created &&
!is_load_via_partition_root(te);
if (use_truncate)
{
/*
* Parallel restore is always talking directly to a
@@ -922,7 +936,7 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, bool is_parallel)
AH->outputKind = OUTPUT_SQLCMDS;
/* close out the transaction started above */
if (is_parallel && te->created)
if (use_truncate)
CommitTransaction(&AH->public);
_enableTriggersIfNecessary(AH, te);
@@ -1014,6 +1028,43 @@ _enableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te)
fmtQualifiedId(te->namespace, te->tag));
}
/*
* Detect whether a TABLE DATA TOC item is performing "load via partition
* root", that is, the target table is an ancestor partition rather than the
* table the TOC item is nominally for.
*
* In newer archive files this can be detected by checking for a special
* comment placed in te->defn. In older files we have to fall back to seeing
* if the COPY statement targets the named table or some other one. This
* will not work for data dumped as INSERT commands, so we could give a false
* negative in that case; fortunately, that's a rarely-used option.
*/
static bool
is_load_via_partition_root(TocEntry *te)
{
if (te->defn &&
strncmp(te->defn, "-- load via partition root ", 27) == 0)
return true;
if (te->copyStmt && *te->copyStmt)
{
PQExpBuffer copyStmt = createPQExpBuffer();
bool result;
/*
* Build the initial part of the COPY as it would appear if the
* nominal target table is the actual target. If we see anything
* else, it must be a load-via-partition-root case.
*/
appendPQExpBuffer(copyStmt, "COPY %s ",
fmtQualifiedId(te->namespace, te->tag));
result = strncmp(te->copyStmt, copyStmt->data, copyStmt->len) != 0;
destroyPQExpBuffer(copyStmt);
return result;
}
/* Assume it's not load-via-partition-root */
return false;
}
/*
* This is a routine that is part of the dumper interface, hence the 'Archive*' parameter.
*/
@@ -2970,8 +3021,12 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH)
res = res & ~REQ_DATA;
}
/* If there's no definition command, there's no schema component */
if (!te->defn || !te->defn[0])
/*
* If there's no definition command, there's no schema component. Treat
* "load via partition root" comments as not schema.
*/
if (!te->defn || !te->defn[0] ||
strncmp(te->defn, "-- load via partition root ", 27) == 0)
res = res & ~REQ_SCHEMA;
/*


@@ -286,6 +286,7 @@ static void appendReloptionsArrayAH(PQExpBuffer buffer, const char *reloptions,
static char *get_synchronized_snapshot(Archive *fout);
static void setupDumpWorker(Archive *AHX);
static TableInfo *getRootTableInfo(TableInfo *tbinfo);
static bool forcePartitionRootLoad(const TableInfo *tbinfo);
int
@@ -1939,11 +1940,13 @@ dumpTableData_insert(Archive *fout, void *dcontext)
insertStmt = createPQExpBuffer();
/*
* When load-via-partition-root is set, get the root table
* name for the partition table, so that we can reload data
* through the root table.
* When load-via-partition-root is set or forced, get the root
* table name for the partition table, so that we can reload
* data through the root table.
*/
if (dopt->load_via_partition_root && tbinfo->ispartition)
if (tbinfo->ispartition &&
(dopt->load_via_partition_root ||
forcePartitionRootLoad(tbinfo)))
targettab = getRootTableInfo(tbinfo);
else
targettab = tbinfo;
@@ -2093,6 +2096,35 @@ getRootTableInfo(TableInfo *tbinfo)
return parentTbinfo;
}
/*
* forcePartitionRootLoad
* Check if we must force load_via_partition_root for this partition.
*
* This is required if any level of ancestral partitioned table has an
* unsafe partitioning scheme.
*/
static bool
forcePartitionRootLoad(const TableInfo *tbinfo)
{
TableInfo *parentTbinfo;
Assert(tbinfo->ispartition);
Assert(tbinfo->numParents == 1);
parentTbinfo = tbinfo->parents[0];
if (parentTbinfo->unsafe_partitions)
return true;
while (parentTbinfo->ispartition)
{
Assert(parentTbinfo->numParents == 1);
parentTbinfo = parentTbinfo->parents[0];
if (parentTbinfo->unsafe_partitions)
return true;
}
return false;
}
/*
* dumpTableData -
* dump the contents of a single table
@@ -2107,34 +2139,40 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo)
PQExpBuffer copyBuf = createPQExpBuffer();
PQExpBuffer clistBuf = createPQExpBuffer();
DataDumperPtr dumpFn;
char *tdDefn = "";
char *copyStmt;
const char *copyFrom;
/* We had better have loaded per-column details about this table */
Assert(tbinfo->interesting);
/*
* When load-via-partition-root is set or forced, get the root table name
* for the partition table, so that we can reload data through the root
* table. Then construct a comment to be inserted into the TOC entry's
* defn field, so that such cases can be identified reliably.
*/
if (tbinfo->ispartition &&
(dopt->load_via_partition_root ||
forcePartitionRootLoad(tbinfo)))
{
TableInfo *parentTbinfo;
parentTbinfo = getRootTableInfo(tbinfo);
copyFrom = fmtQualifiedDumpable(parentTbinfo);
printfPQExpBuffer(copyBuf, "-- load via partition root %s",
copyFrom);
tdDefn = pg_strdup(copyBuf->data);
}
else
copyFrom = fmtQualifiedDumpable(tbinfo);
if (!dopt->dump_inserts)
{
/* Dump/restore using COPY */
dumpFn = dumpTableData_copy;
/*
* When load-via-partition-root is set, get the root table name for
* the partition table, so that we can reload data through the root
* table.
*/
if (dopt->load_via_partition_root && tbinfo->ispartition)
{
TableInfo *parentTbinfo;
parentTbinfo = getRootTableInfo(tbinfo);
copyFrom = fmtQualifiedDumpable(parentTbinfo);
}
else
copyFrom = fmtQualifiedDumpable(tbinfo);
/* must use 2 steps here 'cause fmtId is nonreentrant */
appendPQExpBuffer(copyBuf, "COPY %s ",
printfPQExpBuffer(copyBuf, "COPY %s ",
copyFrom);
appendPQExpBuffer(copyBuf, "%s %sFROM stdin;\n",
fmtCopyColumnList(tbinfo, clistBuf),
@@ -2158,7 +2196,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo)
tbinfo->dobj.name, tbinfo->dobj.namespace->dobj.name,
NULL, tbinfo->rolname,
false, "TABLE DATA", SECTION_DATA,
"", "", copyStmt,
tdDefn, "", copyStmt,
&(tbinfo->dobj.dumpId), 1,
dumpFn, tdinfo);
@@ -6796,6 +6834,77 @@ getInherits(Archive *fout, int *numInherits)
return inhinfo;
}
/*
* getPartitioningInfo
* get information about partitioning
*
* For the most part, we only collect partitioning info about tables we
* intend to dump. However, this function has to consider all partitioned
* tables in the database, because we need to know about parents of partitions
* we are going to dump even if the parents themselves won't be dumped.
*
* Specifically, what we need to know is whether each partitioned table
* has an "unsafe" partitioning scheme that requires us to force
* load-via-partition-root mode for its children. Currently the only case
* for which we force that is hash partitioning on enum columns, since the
* hash codes depend on enum value OIDs which won't be replicated across
* dump-and-reload. There are other cases in which load-via-partition-root
* might be necessary, but we expect users to cope with them.
*/
void
getPartitioningInfo(Archive *fout)
{
PQExpBuffer query;
PGresult *res;
int ntups;
int i;
/* hash partitioning didn't exist before v11 */
if (fout->remoteVersion < 110000)
return;
/* needn't bother if schema-only dump */
if (fout->dopt->schemaOnly)
return;
query = createPQExpBuffer();
/*
* Unsafe partitioning schemes are exactly those for which hash enum_ops
* appears among the partition opclasses. We needn't check partstrat.
*
* Note that this query may well retrieve info about tables we aren't
* going to dump and hence have no lock on. That's okay since we need not
* invoke any unsafe server-side functions.
*/
appendPQExpBufferStr(query,
"SELECT partrelid FROM pg_partitioned_table WHERE\n"
"(SELECT c.oid FROM pg_opclass c JOIN pg_am a "
"ON c.opcmethod = a.oid\n"
"WHERE opcname = 'enum_ops' "
"AND opcnamespace = 'pg_catalog'::regnamespace "
"AND amname = 'hash') = ANY(partclass)");
res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK);
ntups = PQntuples(res);
for (i = 0; i < ntups; i++)
{
Oid tabrelid = atooid(PQgetvalue(res, i, 0));
TableInfo *tbinfo;
tbinfo = findTableByOid(tabrelid);
if (tbinfo == NULL)
exit_horribly(NULL, "failed sanity check, table OID %u appearing in pg_partitioned_table not found\n",
tabrelid);
tbinfo->unsafe_partitions = true;
}
PQclear(res);
destroyPQExpBuffer(query);
}
/*
* getIndexes
* get information about every index on a dumpable table


@@ -299,6 +299,7 @@ typedef struct _tableInfo
bool dummy_view; /* view's real definition must be postponed */
bool postponed_def; /* matview must be postponed into post-data */
bool ispartition; /* is table a partition? */
bool unsafe_partitions; /* is it an unsafe partitioned table? */
/*
* These fields are computed only if we decide the table is interesting
@@ -701,6 +702,7 @@ extern ConvInfo *getConversions(Archive *fout, int *numConversions);
extern TableInfo *getTables(Archive *fout, int *numTables);
extern void getOwnedSeqs(Archive *fout, TableInfo tblinfo[], int numTables);
extern InhInfo *getInherits(Archive *fout, int *numInherits);
extern void getPartitioningInfo(Archive *fout);
extern void getIndexes(Archive *fout, TableInfo tblinfo[], int numTables);
extern void getExtendedStatistics(Archive *fout);
extern void getConstraints(Archive *fout, TableInfo tblinfo[], int numTables);


@@ -0,0 +1,81 @@
# Copyright (c) 2021-2023, PostgreSQL Global Development Group
use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
my $dbname1 = 'regression_src';
my $dbname2 = 'regression_dest1';
my $dbname3 = 'regression_dest2';
my $node = PostgreSQL::Test::Cluster->new('main');
$node->init;
$node->start;
my $backupdir = $node->backup_dir;
$node->run_log([ 'createdb', $dbname1 ]);
$node->run_log([ 'createdb', $dbname2 ]);
$node->run_log([ 'createdb', $dbname3 ]);
$node->safe_psql(
$dbname1,
qq{
create type digit as enum ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
-- plain table with index
create table tplain (en digit, data int unique);
insert into tplain select (x%10)::text::digit, x from generate_series(1,1000) x;
-- non-troublesome hashed partitioning
create table ths (mod int, data int, unique(mod, data)) partition by hash(mod);
create table ths_p1 partition of ths for values with (modulus 3, remainder 0);
create table ths_p2 partition of ths for values with (modulus 3, remainder 1);
create table ths_p3 partition of ths for values with (modulus 3, remainder 2);
insert into ths select (x%10), x from generate_series(1,1000) x;
-- dangerous hashed partitioning
create table tht (en digit, data int, unique(en, data)) partition by hash(en);
create table tht_p1 partition of tht for values with (modulus 3, remainder 0);
create table tht_p2 partition of tht for values with (modulus 3, remainder 1);
create table tht_p3 partition of tht for values with (modulus 3, remainder 2);
insert into tht select (x%10)::text::digit, x from generate_series(1,1000) x;
});
$node->command_ok(
[
'pg_dump', '-Fd', '--no-sync', '-j2', '-f', "$backupdir/dump1",
$node->connstr($dbname1)
],
'parallel dump');
$node->command_ok(
[
'pg_restore', '-v',
'-d', $node->connstr($dbname2),
'-j3', "$backupdir/dump1"
],
'parallel restore');
$node->command_ok(
[
'pg_dump', '-Fd',
'--no-sync', '-j2',
'-f', "$backupdir/dump2",
'--inserts', $node->connstr($dbname1)
],
'parallel dump as inserts');
$node->command_ok(
[
'pg_restore', '-v',
'-d', $node->connstr($dbname3),
'-j3', "$backupdir/dump2"
],
'parallel restore as inserts');
done_testing();