From 9e18eacbdff2ae2abd5ff38eee725e6399e39b41 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 17 Jun 2012 17:11:07 -0400 Subject: [PATCH] Fix stats collector to recover nicely when system clock goes backwards. Formerly, if the system clock went backwards, the stats collector would fail to update the stats file any more until the clock reading again exceeds whatever timestamp was last written into the stats file. Such glitches in the clock's behavior are not terribly unlikely on machines not using NTP. Such a scenario has been observed to cause regression test failures in the buildfarm, and it could have bad effects on the behavior of autovacuum, so it seems prudent to install some defenses. We could directly detect the clock going backwards by adding GetCurrentTimestamp calls in the stats collector's main loop, but that would hurt performance on platforms where GetCurrentTimestamp is expensive. To minimize the performance hit in normal cases, adopt a more complicated scheme wherein backends check for clock skew when reading the stats file, and if they see it, signal the stats collector by sending an extra stats inquiry message. The stats collector does an extra GetCurrentTimestamp only when it receives an inquiry with an apparently out-of-order timestamp. To avoid unnecessary GetCurrentTimestamp calls, expand the inquiry messages to carry the backend's current clock reading as well as its stats cutoff time. The latter, being intentionally slightly in-the-past, would trigger more clock rechecks than we need if it were used for this purpose. We might want to backpatch this change at some point, but let's let it shake out in the buildfarm for awhile first. --- src/backend/postmaster/pgstat.c | 145 +++++++++++++++++++++++++------- src/include/pgstat.h | 3 +- 2 files changed, 116 insertions(+), 32 deletions(-) diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 73d5b2e39c..243dc8bc53 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -1404,16 +1404,16 @@ pgstat_ping(void) * pgstat_send_inquiry() - * * Notify collector that we need fresh data. - * ts specifies the minimum acceptable timestamp for the stats file. * ---------- */ static void -pgstat_send_inquiry(TimestampTz ts) +pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time) { PgStat_MsgInquiry msg; pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); - msg.inquiry_time = ts; + msg.clock_time = clock_time; + msg.cutoff_time = cutoff_time; pgstat_send(&msg, sizeof(msg)); } @@ -3633,7 +3633,7 @@ pgstat_read_statsfile(Oid onlydb, bool permanent) /* * Set the current timestamp (will be kept only in case we can't load an - * existing statsfile. + * existing statsfile). */ globalStats.stat_reset_timestamp = GetCurrentTimestamp(); @@ -3922,8 +3922,8 @@ pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts) static void backend_read_statsfile(void) { - TimestampTz cur_ts; - TimestampTz min_ts; + TimestampTz min_ts = 0; + TimestampTz ref_ts = 0; int count; /* already read it? */ @@ -3931,26 +3931,6 @@ backend_read_statsfile(void) return; Assert(!pgStatRunningInCollector); - /* - * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL msec - * before now. This indirectly ensures that the collector needn't write - * the file more often than PGSTAT_STAT_INTERVAL. In an autovacuum - * worker, however, we want a lower delay to avoid using stale data, so we - * use PGSTAT_RETRY_DELAY (since the number of worker is low, this - * shouldn't be a problem). - * - * Note that we don't recompute min_ts after sleeping; so we might end up - * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In practice - * that shouldn't happen, though, as long as the sleep time is less than - * PGSTAT_STAT_INTERVAL; and we don't want to lie to the collector about - * what our cutoff time really is. - */ - cur_ts = GetCurrentTimestamp(); - if (IsAutoVacuumWorkerProcess()) - min_ts = TimestampTzPlusMilliseconds(cur_ts, -PGSTAT_RETRY_DELAY); - else - min_ts = TimestampTzPlusMilliseconds(cur_ts, -PGSTAT_STAT_INTERVAL); - /* * Loop until fresh enough stats file is available or we ran out of time. * The stats inquiry message is sent repeatedly in case collector drops @@ -3958,17 +3938,82 @@ backend_read_statsfile(void) */ for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++) { + bool ok; TimestampTz file_ts = 0; + TimestampTz cur_ts; CHECK_FOR_INTERRUPTS(); - if (pgstat_read_statsfile_timestamp(false, &file_ts) && - file_ts >= min_ts) + ok = pgstat_read_statsfile_timestamp(false, &file_ts); + + cur_ts = GetCurrentTimestamp(); + /* Calculate min acceptable timestamp, if we didn't already */ + if (count == 0 || cur_ts < ref_ts) + { + /* + * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL + * msec before now. This indirectly ensures that the collector + * needn't write the file more often than PGSTAT_STAT_INTERVAL. + * In an autovacuum worker, however, we want a lower delay to + * avoid using stale data, so we use PGSTAT_RETRY_DELAY (since the + * number of workers is low, this shouldn't be a problem). + * + * We don't recompute min_ts after sleeping, except in the + * unlikely case that cur_ts went backwards. So we might end up + * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In + * practice that shouldn't happen, though, as long as the sleep + * time is less than PGSTAT_STAT_INTERVAL; and we don't want to + * tell the collector that our cutoff time is less than what we'd + * actually accept. + */ + ref_ts = cur_ts; + if (IsAutoVacuumWorkerProcess()) + min_ts = TimestampTzPlusMilliseconds(ref_ts, + -PGSTAT_RETRY_DELAY); + else + min_ts = TimestampTzPlusMilliseconds(ref_ts, + -PGSTAT_STAT_INTERVAL); + } + + /* + * If the file timestamp is actually newer than cur_ts, we must have + * had a clock glitch (system time went backwards) or there is clock + * skew between our processor and the stats collector's processor. + * Accept the file, but send an inquiry message anyway to make + * pgstat_recv_inquiry do a sanity check on the collector's time. + */ + if (ok && file_ts > cur_ts) + { + /* + * A small amount of clock skew between processors isn't terribly + * surprising, but a large difference is worth logging. We + * arbitrarily define "large" as 1000 msec. + */ + if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000)) + { + char *filetime; + char *mytime; + + /* Copy because timestamptz_to_str returns a static buffer */ + filetime = pstrdup(timestamptz_to_str(file_ts)); + mytime = pstrdup(timestamptz_to_str(cur_ts)); + elog(LOG, "stats collector's time %s is later than backend local time %s", + filetime, mytime); + pfree(filetime); + pfree(mytime); + } + + pgstat_send_inquiry(cur_ts, min_ts); + break; + } + + /* Normal acceptance case: file is not older than cutoff time */ + if (ok && file_ts >= min_ts) break; /* Not there or too old, so kick the collector and wait a bit */ if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) - pgstat_send_inquiry(min_ts); + pgstat_send_inquiry(cur_ts, min_ts); pg_usleep(PGSTAT_RETRY_DELAY * 1000L); } @@ -4036,8 +4081,46 @@ pgstat_clear_snapshot(void) static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) { - if (msg->inquiry_time > last_statrequest) - last_statrequest = msg->inquiry_time; + /* + * Advance last_statrequest if this requestor has a newer cutoff time + * than any previous request. + */ + if (msg->cutoff_time > last_statrequest) + last_statrequest = msg->cutoff_time; + + /* + * If the requestor's local clock time is older than last_statwrite, we + * should suspect a clock glitch, ie system time going backwards; though + * the more likely explanation is just delayed message receipt. It is + * worth expending a GetCurrentTimestamp call to be sure, since a large + * retreat in the system clock reading could otherwise cause us to neglect + * to update the stats file for a long time. + */ + if (msg->clock_time < last_statwrite) + { + TimestampTz cur_ts = GetCurrentTimestamp(); + + if (cur_ts < last_statwrite) + { + /* + * Sure enough, time went backwards. Force a new stats file write + * to get back in sync; but first, log a complaint. + */ + char *writetime; + char *mytime; + + /* Copy because timestamptz_to_str returns a static buffer */ + writetime = pstrdup(timestamptz_to_str(last_statwrite)); + mytime = pstrdup(timestamptz_to_str(cur_ts)); + elog(LOG, "last_statwrite %s is later than collector's time %s", + writetime, mytime); + pfree(writetime); + pfree(mytime); + + last_statrequest = cur_ts; + last_statwrite = last_statrequest - 1; + } + } } diff --git a/src/include/pgstat.h b/src/include/pgstat.h index dd978d79c3..cad9b54e7b 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -203,7 +203,8 @@ typedef struct PgStat_MsgDummy typedef struct PgStat_MsgInquiry { PgStat_MsgHdr m_hdr; - TimestampTz inquiry_time; /* minimum acceptable file timestamp */ + TimestampTz clock_time; /* observed local clock time */ + TimestampTz cutoff_time; /* minimum acceptable file timestamp */ } PgStat_MsgInquiry;