pg_upgrade: check for clean server shutdowns

Previously pg_upgrade checked for the pid file and started/stopped the server to force a clean shutdown. However, "pg_ctl -m immediate" removes the pid file but doesn't do a clean shutdown, so check pg_controldata for a clean shutdown too. Diagnosed-by: Vimalraj A Discussion: https://postgr.es/m/CAFKBAK5e4Q-oTUuPPJ56EU_d2Rzodq6GWKS3ncAk3xo7hAsOZg@mail.gmail.com Backpatch-through: 9.3
2024-08-26 21:57:18 +02:00 · 2018-07-28 15:01:55 -04:00 · 2018-07-28 15:01:55 -04:00 · a326ca75b3
commit a326ca75b3
parent 5f2c5890e9
2 changed files with 61 additions and 1 deletions
--- a/src/bin/pg_upgrade/controldata.c
+++ b/src/bin/pg_upgrade/controldata.c
@@ -58,6 +58,7 @@ get_control_data(ClusterInfo *cluster, bool live_check)
 	bool		got_large_object = false;
 	bool		got_date_is_int = false;
 	bool		got_data_checksum_version = false;
+	bool		got_cluster_state = false;
 	char	   *lc_collate = NULL;
 	char	   *lc_ctype = NULL;
 	char	   *lc_monetary = NULL;
@@ -416,6 +417,64 @@ get_control_data(ClusterInfo *cluster, bool live_check)

 	pclose(output);

+	/*
+	 * Check for clean shutdown
+	 */
+
+	/* only pg_controldata outputs the cluster state */
+	snprintf(cmd, sizeof(cmd), "\"%s/pg_controldata\" \"%s\"",
+			 cluster->bindir, cluster->pgdata);
+	fflush(stdout);
+	fflush(stderr);
+
+	if ((output = popen(cmd, "r")) == NULL)
+		pg_fatal("could not get control data using %s: %s\n",
+				 cmd, strerror(errno));
+
+	/* we have the result of cmd in "output". so parse it line by line now */
+	while (fgets(bufin, sizeof(bufin), output))
+	{
+		if ((!live_check || cluster == &new_cluster) &&
+			(p = strstr(bufin, "Database cluster state:")) != NULL)
+		{
+			p = strchr(p, ':');
+
+			if (p == NULL || strlen(p) <= 1)
+				pg_fatal("%d: database cluster state problem\n", __LINE__);
+
+			p++;				/* remove ':' char */
+
+			/*
+			 * We checked earlier for a postmaster lock file, and if we found
+			 * one, we tried to start/stop the server to replay the WAL.  However,
+			 * pg_ctl -m immediate doesn't leave a lock file, but does require
+			 * WAL replay, so we check here that the server was shut down cleanly,
+			 * from the controldata perspective.
+			 */
+			/* remove leading spaces */
+			while (*p == ' ')
+				p++;
+			if (strcmp(p, "shut down\n") != 0)
+			{
+				if (cluster == &old_cluster)
+					pg_fatal("The source cluster was not shut down cleanly.\n");
+				else
+					pg_fatal("The target cluster was not shut down cleanly.\n");
+			}
+			got_cluster_state = true;
+		}
+	}
+
+	pclose(output);
+
+	/* With live_check, only the new cluster's state line is parsed above. */
+	if (got_cluster_state || (live_check && cluster != &new_cluster))
+		;						/* state was seen, or was deliberately skipped */
+	else if (cluster == &old_cluster)
+		pg_fatal("The source cluster lacks cluster state information:\n");
+	else
+		pg_fatal("The target cluster lacks cluster state information:\n");
+
 	/*
 	 * Restore environment variables
 	 */
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -202,7 +202,8 @@ setup(char *argv0, bool *live_check)
 		 * start, assume the server is running.  If the pid file is left over
 		 * from a server crash, this also allows any committed transactions
 		 * stored in the WAL to be replayed so they are not lost, because WAL
-		 * files are not transferred from old to new servers.
+		 * files are not transferred from old to new servers.  We later check
+		 * for a clean shutdown.
 		 */
 		if (start_postmaster(&old_cluster, false))
 			stop_postmaster(false);