From 47215c16f20631627ce0e1d78f5a592886c97b44 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 31 May 2016 15:54:46 -0400
Subject: [PATCH] Avoid useless closely-spaced writes of statistics files.

The original intent in the stats collector was that we should not write out
stats data oftener than every PGSTAT_STAT_INTERVAL msec.  Backends will not
make requests at all if they see the existing data is newer than that, and
the stats collector is supposed to disregard requests having a cutoff_time
older than its most recently written data, so that close-together requests
don't result in multiple writes.  But the latter part of that got broken
in commit 187492b6c2e8cafc, so that if two backends concurrently decide
the existing stats are too old, the collector would write the data twice.
(In principle the collector's logic would still merge requests as long as
the second one arrives before we've actually written data ... but since
the message collection loop would write data immediately after processing
a single inquiry message, that never happened in practice, and in any case
the window in which it might work would be much shorter than
PGSTAT_STAT_INTERVAL.)

To fix, improve pgstat_recv_inquiry so that it checks whether the cutoff
time is too old, and doesn't add a request to the queue if so.  This means
that we do not need DBWriteRequest.request_time, because the decision is
taken before making a queue entry.  And that means that we don't really
need the DBWriteRequest data structure at all; an OID list of database
OIDs will serve and allow removal of some rather verbose and crufty code.

In passing, improve the comments in this area, which have been rather
neglected.  Also change backend_read_statsfile so that it's not silently
relying on MyDatabaseId to have some particular value in the autovacuum
launcher process.  It accidentally worked as desired because MyDatabaseId
is zero in that process; but that does not seem like a dependency we want,
especially with no documentation about it.

Although this patch is mine, it turns out I'd rediscovered a known bug,
for which Tomas Vondra had already submitted a patch that's functionally
equivalent to the non-cosmetic aspects of this patch.  Thanks to Tomas
for reviewing this version.

Back-patch to 9.3 where the bug was introduced.

Prior-Discussion: <1718942738eb65c8407fcd864883f4c8@fuzzy.cz>
Patch: <4625.1464202586@sss.pgh.pa.us>
---
 src/backend/postmaster/pgstat.c | 214 ++++++++++++++++----------------
 src/include/pgstat.h            |  17 ++-
 2 files changed, 121 insertions(+), 110 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 7f64949ea0..0b0b0446b9 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -38,7 +38,6 @@
 #include "access/xact.h"
 #include "catalog/pg_database.h"
 #include "catalog/pg_proc.h"
-#include "lib/ilist.h"
 #include "libpq/ip.h"
 #include "libpq/libpq.h"
 #include "libpq/pqsignal.h"
@@ -221,17 +220,14 @@ static int	localNumBackends = 0;
 static PgStat_ArchiverStats archiverStats;
 static PgStat_GlobalStats globalStats;
 
-/* Write request info for each database */
-typedef struct DBWriteRequest
-{
-	Oid			databaseid;		/* OID of the database to write */
-	TimestampTz request_time;	/* timestamp of the last write request */
-	slist_node	next;
-} DBWriteRequest;
-
-/* Latest statistics request times from backends */
-static slist_head last_statrequests = SLIST_STATIC_INIT(last_statrequests);
+/*
+ * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
+ * it means to write only the shared-catalog stats ("DB 0"); otherwise, we
+ * will write both that DB's data and the shared stats.
+ */
+static List *pending_write_requests = NIL;
 
+/* Signal handler flags */
 static volatile bool need_exit = false;
 static volatile bool got_SIGHUP = false;
 
@@ -3344,8 +3340,7 @@ PgstatCollectorMain(int argc, char *argv[])
 	init_ps_display("stats collector process", "", "", "");
 
 	/*
-	 * Read in an existing statistics stats file or initialize the stats to
-	 * zero.
+	 * Read in existing stats files or initialize the stats to zero.
 	 */
 	pgStatRunningInCollector = true;
 	pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
@@ -3391,8 +3386,8 @@ PgstatCollectorMain(int argc, char *argv[])
 			}
 
 			/*
-			 * Write the stats file if a new request has arrived that is not
-			 * satisfied by existing file.
+			 * Write the stats file(s) if a new request has arrived that is
+			 * not satisfied by existing file(s).
 			 */
 			if (pgstat_write_statsfile_needed())
 				pgstat_write_statsfiles(false, false);
@@ -3722,14 +3717,14 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
  * pgstat_write_statsfiles() -
  *		Write the global statistics file, as well as requested DB files.
  *
- *	If writing to the permanent files (happens when the collector is
- *	shutting down only), remove the temporary files so that backends
- *	starting up under a new postmaster can't read the old data before
- *	the new collector is ready.
+ *	'permanent' specifies writing to the permanent files not temporary ones.
+ *	When true (happens only when the collector is shutting down), also remove
+ *	the temporary files so that backends starting up under a new postmaster
+ *	can't read old data before the new collector is ready.
  *
  *	When 'allDbs' is false, only the requested databases (listed in
- *	last_statrequests) will be written; otherwise, all databases will be
- *	written.
+ *	pending_write_requests) will be written; otherwise, all databases
+ *	will be written.
  * ----------
  */
 static void
@@ -3789,15 +3784,14 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
 	{
 		/*
-		 * Write out the tables and functions into the DB stat file, if
-		 * required.
-		 *
-		 * We need to do this before the dbentry write, to ensure the
-		 * timestamps written to both are consistent.
+		 * Write out the table and function stats for this DB into the
+		 * appropriate per-DB stat file, if required.
 		 */
 		if (allDbs || pgstat_db_requested(dbentry->databaseid))
 		{
+			/* Make DB's timestamp consistent with the global stats */
 			dbentry->stats_timestamp = globalStats.stats_timestamp;
+
 			pgstat_write_db_statsfile(dbentry, permanent);
 		}
 
@@ -3850,27 +3844,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	 * Now throw away the list of requests.  Note that requests sent after we
 	 * started the write are still waiting on the network socket.
 	 */
-	if (!slist_is_empty(&last_statrequests))
-	{
-		slist_mutable_iter iter;
-
-		/*
-		 * Strictly speaking we should do slist_delete_current() before
-		 * freeing each request struct.  We skip that and instead
-		 * re-initialize the list header at the end.  Nonetheless, we must use
-		 * slist_foreach_modify, not just slist_foreach, since we will free
-		 * the node's storage before advancing.
-		 */
-		slist_foreach_modify(iter, &last_statrequests)
-		{
-			DBWriteRequest *req;
-
-			req = slist_container(DBWriteRequest, next, iter.cur);
-			pfree(req);
-		}
-
-		slist_init(&last_statrequests);
-	}
+	list_free(pending_write_requests);
+	pending_write_requests = NIL;
 }
 
 /*
@@ -4009,13 +3984,20 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 /* ----------
  * pgstat_read_statsfiles() -
  *
- *	Reads in the existing statistics collector files and initializes the
- *	databases' hash table.  If the permanent file name is requested (which
- *	only happens in the stats collector itself), also remove the file after
- *	reading; the in-memory status is now authoritative, and the permanent file
- *	would be out of date in case somebody else reads it.
+ *	Reads in some existing statistics collector files and returns the
+ *	databases hash table that is the top level of the data.
  *
- *	If a deep read is requested, table/function stats are read also, otherwise
+ *	If 'onlydb' is not InvalidOid, it means we only want data for that DB
+ *	plus the shared catalogs ("DB 0").  We'll still populate the DB hash
+ *	table for all databases, but we don't bother even creating table/function
+ *	hash tables for other databases.
+ *
+ *	'permanent' specifies reading from the permanent files not temporary ones.
+ *	When true (happens only when the collector is starting up), remove the
+ *	files after reading; the in-memory status is now authoritative, and the
+ *	files would be out of date in case somebody else reads them.
+ *
+ *	If a 'deep' read is requested, table/function stats are read, otherwise
  *	the table/function hash tables remain empty.
  * ----------
  */
@@ -4152,8 +4134,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 				dbentry->functions = NULL;
 
 				/*
-				 * Don't collect tables if not the requested DB (or the
-				 * shared-table info)
+				 * Don't create tables/functions hashtables for uninteresting
+				 * databases.
 				 */
 				if (onlydb != InvalidOid)
 				{
@@ -4181,9 +4163,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 
 				/*
 				 * If requested, read the data from the database-specific
-				 * file. If there was onlydb specified (!= InvalidOid), we
-				 * would not get here because of a break above. So we don't
-				 * need to recheck.
+				 * file.  Otherwise we just leave the hashtables empty.
 				 */
 				if (deep)
 					pgstat_read_db_statsfile(dbentry->databaseid,
@@ -4222,10 +4202,14 @@ done:
  * pgstat_read_db_statsfile() -
  *
  *	Reads in the existing statistics collector file for the given database,
- *	and initializes the tables and functions hash tables.
+ *	filling the passed-in tables and functions hash tables.
  *
- *	As pgstat_read_statsfiles, if the permanent file is requested, it is
+ *	As in pgstat_read_statsfiles, if the permanent file is requested, it is
  *	removed after reading.
+ *
+ *	Note: this code has the ability to skip storing per-table or per-function
+ *	data, if NULL is passed for the corresponding hashtable.  That's not used
+ *	at the moment though.
  * ----------
  */
 static void
@@ -4295,7 +4279,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 				}
 
 				/*
-				 * Skip if table belongs to a not requested database.
+				 * Skip if table data not wanted.
 				 */
 				if (tabhash == NULL)
 					break;
@@ -4329,7 +4313,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 				}
 
 				/*
-				 * Skip if function belongs to a not requested database.
+				 * Skip if function data not wanted.
 				 */
 				if (funchash == NULL)
 					break;
@@ -4371,8 +4355,6 @@ done:
 		elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
 		unlink(statfile);
 	}
-
-	return;
 }
 
 /* ----------
@@ -4516,6 +4498,7 @@ backend_read_statsfile(void)
 {
 	TimestampTz min_ts = 0;
 	TimestampTz ref_ts = 0;
+	Oid			inquiry_db;
 	int			count;
 
 	/* already read it? */
@@ -4523,6 +4506,17 @@ backend_read_statsfile(void)
 		return;
 	Assert(!pgStatRunningInCollector);
 
+	/*
+	 * In a normal backend, we check staleness of the data for our own DB, and
+	 * so we send MyDatabaseId in inquiry messages.  In the autovac launcher,
+	 * check staleness of the shared-catalog data, and send InvalidOid in
+	 * inquiry messages so as not to force writing unnecessary data.
+	 */
+	if (IsAutoVacuumLauncherProcess())
+		inquiry_db = InvalidOid;
+	else
+		inquiry_db = MyDatabaseId;
+
 	/*
 	 * Loop until fresh enough stats file is available or we ran out of time.
 	 * The stats inquiry message is sent repeatedly in case collector drops
@@ -4536,7 +4530,7 @@ backend_read_statsfile(void)
 
 		CHECK_FOR_INTERRUPTS();
 
-		ok = pgstat_read_db_statsfile_timestamp(MyDatabaseId, false, &file_ts);
+		ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts);
 
 		cur_ts = GetCurrentTimestamp();
 		/* Calculate min acceptable timestamp, if we didn't already */
@@ -4595,7 +4589,7 @@ backend_read_statsfile(void)
 				pfree(mytime);
 			}
 
-			pgstat_send_inquiry(cur_ts, min_ts, MyDatabaseId);
+			pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
 			break;
 		}
 
@@ -4605,7 +4599,7 @@ backend_read_statsfile(void)
 
 		/* Not there or too old, so kick the collector and wait a bit */
 		if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
-			pgstat_send_inquiry(cur_ts, min_ts, MyDatabaseId);
+			pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
 
 		pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
 	}
@@ -4617,7 +4611,8 @@ backend_read_statsfile(void)
 
 	/*
 	 * Autovacuum launcher wants stats about all databases, but a shallow read
-	 * is sufficient.
+	 * is sufficient.  Regular backends want a deep read for just the tables
+	 * they can see (MyDatabaseId + shared catalogs).
 	 */
 	if (IsAutoVacuumLauncherProcess())
 		pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false);
@@ -4678,43 +4673,27 @@ pgstat_clear_snapshot(void)
 static void
 pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
 {
-	slist_iter	iter;
-	DBWriteRequest *newreq;
 	PgStat_StatDBEntry *dbentry;
 
 	elog(DEBUG2, "received inquiry for database %u", msg->databaseid);
 
 	/*
-	 * Find the last write request for this DB.  If it's older than the
-	 * request's cutoff time, update it; otherwise there's nothing to do.
+	 * If there's already a write request for this DB, there's nothing to do.
 	 *
 	 * Note that if a request is found, we return early and skip the below
 	 * check for clock skew.  This is okay, since the only way for a DB
 	 * request to be present in the list is that we have been here since the
-	 * last write round.
+	 * last write round.  It seems sufficient to check for clock skew once per
+	 * write round.
 	 */
-	slist_foreach(iter, &last_statrequests)
-	{
-		DBWriteRequest *req = slist_container(DBWriteRequest, next, iter.cur);
-
-		if (req->databaseid != msg->databaseid)
-			continue;
-
-		if (msg->cutoff_time > req->request_time)
-			req->request_time = msg->cutoff_time;
+	if (list_member_oid(pending_write_requests, msg->databaseid))
 		return;
-	}
-
-	/*
-	 * There's no request for this DB yet, so create one.
-	 */
-	newreq = palloc(sizeof(DBWriteRequest));
-
-	newreq->databaseid = msg->databaseid;
-	newreq->request_time = msg->clock_time;
-	slist_push_head(&last_statrequests, &newreq->next);
 
 	/*
+	 * Check to see if we last wrote this database at a time >= the requested
+	 * cutoff time.  If so, this is a stale request that was generated before
+	 * we updated the DB file, and we don't need to do so again.
+	 *
 	 * If the requestor's local clock time is older than stats_timestamp, we
 	 * should suspect a clock glitch, ie system time going backwards; though
 	 * the more likely explanation is just delayed message receipt.  It is
@@ -4723,7 +4702,17 @@ pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
 	 * to update the stats file for a long time.
 	 */
 	dbentry = pgstat_get_db_entry(msg->databaseid, false);
-	if ((dbentry != NULL) && (msg->clock_time < dbentry->stats_timestamp))
+	if (dbentry == NULL)
+	{
+		/*
+		 * We have no data for this DB.  Enter a write request anyway so that
+		 * the global stats will get updated.  This is needed to prevent
+		 * backend_read_statsfile from waiting for data that we cannot supply,
+		 * in the case of a new DB that nobody has yet reported any stats for.
+		 * See the behavior of pgstat_read_db_statsfile_timestamp.
+		 */
+	}
+	else if (msg->clock_time < dbentry->stats_timestamp)
 	{
 		TimestampTz cur_ts = GetCurrentTimestamp();
 
@@ -4744,11 +4733,27 @@ pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
 				 writetime, mytime, dbentry->databaseid);
 			pfree(writetime);
 			pfree(mytime);
-
-			newreq->request_time = cur_ts;
-			dbentry->stats_timestamp = cur_ts - 1;
+		}
+		else
+		{
+			/*
+			 * Nope, it's just an old request.  Assuming msg's clock_time is
+			 * >= its cutoff_time, it must be stale, so we can ignore it.
+			 */
+			return;
 		}
 	}
+	else if (msg->cutoff_time <= dbentry->stats_timestamp)
+	{
+		/* Stale request, ignore it */
+		return;
+	}
+
+	/*
+	 * We need to write this DB, so create a request.
+	 */
+	pending_write_requests = lappend_oid(pending_write_requests,
+										 msg->databaseid);
 }
 
 
@@ -5327,13 +5332,13 @@ pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
 /* ----------
  * pgstat_write_statsfile_needed() -
  *
- *	Do we need to write out the files?
+ *	Do we need to write out any stats files?
  * ----------
  */
 static bool
 pgstat_write_statsfile_needed(void)
 {
-	if (!slist_is_empty(&last_statrequests))
+	if (pending_write_requests != NIL)
 		return true;
 
 	/* Everything was written recently */
@@ -5349,25 +5354,18 @@ pgstat_write_statsfile_needed(void)
 static bool
 pgstat_db_requested(Oid databaseid)
 {
-	slist_iter	iter;
-
 	/*
 	 * If any requests are outstanding at all, we should write the stats for
 	 * shared catalogs (the "database" with OID 0).  This ensures that
 	 * backends will see up-to-date stats for shared catalogs, even though
 	 * they send inquiry messages mentioning only their own DB.
 	 */
-	if (databaseid == InvalidOid && !slist_is_empty(&last_statrequests))
+	if (databaseid == InvalidOid && pending_write_requests != NIL)
 		return true;
 
 	/* Search to see if there's an open request to write this database. */
-	slist_foreach(iter, &last_statrequests)
-	{
-		DBWriteRequest *req = slist_container(DBWriteRequest, next, iter.cur);
-
-		if (req->databaseid == databaseid)
-			return true;
-	}
+	if (list_member_oid(pending_write_requests, databaseid))
+		return true;
 
 	return false;
 }
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 9ecc16372d..5738336c2a 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -218,7 +218,20 @@ typedef struct PgStat_MsgDummy
 
 /* ----------
  * PgStat_MsgInquiry			Sent by a backend to ask the collector
- *								to write the stats file.
+ *								to write the stats file(s).
+ *
+ * Ordinarily, an inquiry message prompts writing of the global stats file,
+ * the stats file for shared catalogs, and the stats file for the specified
+ * database.  If databaseid is InvalidOid, only the first two are written.
+ *
+ * New file(s) will be written only if the existing file has a timestamp
+ * older than the specified cutoff_time; this prevents duplicated effort
+ * when multiple requests arrive at nearly the same time, assuming that
+ * backends send requests with cutoff_times a little bit in the past.
+ *
+ * clock_time should be the requestor's current local time; the collector
+ * uses this to check for the system clock going backward, but it has no
+ * effect unless that occurs.  We assume clock_time >= cutoff_time, though.
  * ----------
  */
 
@@ -227,7 +240,7 @@ typedef struct PgStat_MsgInquiry
 	PgStat_MsgHdr m_hdr;
 	TimestampTz clock_time;		/* observed local clock time */
 	TimestampTz cutoff_time;	/* minimum acceptable file timestamp */
-	Oid			databaseid;		/* requested DB (InvalidOid => all DBs) */
+	Oid			databaseid;		/* requested DB (InvalidOid => shared only) */
 } PgStat_MsgInquiry;