Fix stats collector to recover nicely when system clock goes backwards.

Formerly, if the system clock went backwards, the stats collector would
fail to update the stats file any more until the clock reading again
exceeds whatever timestamp was last written into the stats file.  Such
glitches in the clock's behavior are not terribly unlikely on machines
not using NTP.  Such a scenario has been observed to cause regression test
failures in the buildfarm, and it could have bad effects on the behavior
of autovacuum, so it seems prudent to install some defenses.

We could directly detect the clock going backwards by adding
GetCurrentTimestamp calls in the stats collector's main loop, but that
would hurt performance on platforms where GetCurrentTimestamp is expensive.
To minimize the performance hit in normal cases, adopt a more complicated
scheme wherein backends check for clock skew when reading the stats file,
and if they see it, signal the stats collector by sending an extra stats
inquiry message.  The stats collector does an extra GetCurrentTimestamp
only when it receives an inquiry with an apparently out-of-order
timestamp.

To avoid unnecessary GetCurrentTimestamp calls, expand the inquiry messages
to carry the backend's current clock reading as well as its stats cutoff
time.  The latter, being intentionally slightly in-the-past, would trigger
more clock rechecks than we need if it were used for this purpose.

We might want to backpatch this change at some point, but let's let it
shake out in the buildfarm for awhile first.
This commit is contained in:
Tom Lane 2012-06-17 17:11:07 -04:00
parent 920febdad5
commit 9e18eacbdf
2 changed files with 116 additions and 32 deletions

View File

@ -1404,16 +1404,16 @@ pgstat_ping(void)
* pgstat_send_inquiry() -
*
* Notify collector that we need fresh data.
* ts specifies the minimum acceptable timestamp for the stats file.
* ----------
*/
static void
pgstat_send_inquiry(TimestampTz ts)
pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time)
{
PgStat_MsgInquiry msg;
pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
msg.inquiry_time = ts;
msg.clock_time = clock_time;
msg.cutoff_time = cutoff_time;
pgstat_send(&msg, sizeof(msg));
}
@ -3633,7 +3633,7 @@ pgstat_read_statsfile(Oid onlydb, bool permanent)
/*
* Set the current timestamp (will be kept only in case we can't load an
* existing statsfile.
* existing statsfile).
*/
globalStats.stat_reset_timestamp = GetCurrentTimestamp();
@ -3922,8 +3922,8 @@ pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts)
static void
backend_read_statsfile(void)
{
TimestampTz cur_ts;
TimestampTz min_ts;
TimestampTz min_ts = 0;
TimestampTz ref_ts = 0;
int count;
/* already read it? */
@ -3931,26 +3931,6 @@ backend_read_statsfile(void)
return;
Assert(!pgStatRunningInCollector);
/*
* We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL msec
* before now. This indirectly ensures that the collector needn't write
* the file more often than PGSTAT_STAT_INTERVAL. In an autovacuum
* worker, however, we want a lower delay to avoid using stale data, so we
* use PGSTAT_RETRY_DELAY (since the number of worker is low, this
* shouldn't be a problem).
*
* Note that we don't recompute min_ts after sleeping; so we might end up
* accepting a file a bit older than PGSTAT_STAT_INTERVAL. In practice
* that shouldn't happen, though, as long as the sleep time is less than
* PGSTAT_STAT_INTERVAL; and we don't want to lie to the collector about
* what our cutoff time really is.
*/
cur_ts = GetCurrentTimestamp();
if (IsAutoVacuumWorkerProcess())
min_ts = TimestampTzPlusMilliseconds(cur_ts, -PGSTAT_RETRY_DELAY);
else
min_ts = TimestampTzPlusMilliseconds(cur_ts, -PGSTAT_STAT_INTERVAL);
/*
* Loop until fresh enough stats file is available or we ran out of time.
* The stats inquiry message is sent repeatedly in case collector drops
@ -3958,17 +3938,82 @@ backend_read_statsfile(void)
*/
for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
{
bool ok;
TimestampTz file_ts = 0;
TimestampTz cur_ts;
CHECK_FOR_INTERRUPTS();
if (pgstat_read_statsfile_timestamp(false, &file_ts) &&
file_ts >= min_ts)
ok = pgstat_read_statsfile_timestamp(false, &file_ts);
cur_ts = GetCurrentTimestamp();
/* Calculate min acceptable timestamp, if we didn't already */
if (count == 0 || cur_ts < ref_ts)
{
/*
* We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
* msec before now. This indirectly ensures that the collector
* needn't write the file more often than PGSTAT_STAT_INTERVAL.
* In an autovacuum worker, however, we want a lower delay to
* avoid using stale data, so we use PGSTAT_RETRY_DELAY (since the
* number of workers is low, this shouldn't be a problem).
*
* We don't recompute min_ts after sleeping, except in the
* unlikely case that cur_ts went backwards. So we might end up
* accepting a file a bit older than PGSTAT_STAT_INTERVAL. In
* practice that shouldn't happen, though, as long as the sleep
* time is less than PGSTAT_STAT_INTERVAL; and we don't want to
* tell the collector that our cutoff time is less than what we'd
* actually accept.
*/
ref_ts = cur_ts;
if (IsAutoVacuumWorkerProcess())
min_ts = TimestampTzPlusMilliseconds(ref_ts,
-PGSTAT_RETRY_DELAY);
else
min_ts = TimestampTzPlusMilliseconds(ref_ts,
-PGSTAT_STAT_INTERVAL);
}
/*
* If the file timestamp is actually newer than cur_ts, we must have
* had a clock glitch (system time went backwards) or there is clock
* skew between our processor and the stats collector's processor.
* Accept the file, but send an inquiry message anyway to make
* pgstat_recv_inquiry do a sanity check on the collector's time.
*/
if (ok && file_ts > cur_ts)
{
/*
* A small amount of clock skew between processors isn't terribly
* surprising, but a large difference is worth logging. We
* arbitrarily define "large" as 1000 msec.
*/
if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000))
{
char *filetime;
char *mytime;
/* Copy because timestamptz_to_str returns a static buffer */
filetime = pstrdup(timestamptz_to_str(file_ts));
mytime = pstrdup(timestamptz_to_str(cur_ts));
elog(LOG, "stats collector's time %s is later than backend local time %s",
filetime, mytime);
pfree(filetime);
pfree(mytime);
}
pgstat_send_inquiry(cur_ts, min_ts);
break;
}
/* Normal acceptance case: file is not older than cutoff time */
if (ok && file_ts >= min_ts)
break;
/* Not there or too old, so kick the collector and wait a bit */
if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
pgstat_send_inquiry(min_ts);
pgstat_send_inquiry(cur_ts, min_ts);
pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
}
@ -4036,8 +4081,46 @@ pgstat_clear_snapshot(void)
static void
pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
{
if (msg->inquiry_time > last_statrequest)
last_statrequest = msg->inquiry_time;
/*
* Advance last_statrequest if this requestor has a newer cutoff time
* than any previous request.
*/
if (msg->cutoff_time > last_statrequest)
last_statrequest = msg->cutoff_time;
/*
* If the requestor's local clock time is older than last_statwrite, we
* should suspect a clock glitch, ie system time going backwards; though
* the more likely explanation is just delayed message receipt. It is
* worth expending a GetCurrentTimestamp call to be sure, since a large
* retreat in the system clock reading could otherwise cause us to neglect
* to update the stats file for a long time.
*/
if (msg->clock_time < last_statwrite)
{
TimestampTz cur_ts = GetCurrentTimestamp();
if (cur_ts < last_statwrite)
{
/*
* Sure enough, time went backwards. Force a new stats file write
* to get back in sync; but first, log a complaint.
*/
char *writetime;
char *mytime;
/* Copy because timestamptz_to_str returns a static buffer */
writetime = pstrdup(timestamptz_to_str(last_statwrite));
mytime = pstrdup(timestamptz_to_str(cur_ts));
elog(LOG, "last_statwrite %s is later than collector's time %s",
writetime, mytime);
pfree(writetime);
pfree(mytime);
last_statrequest = cur_ts;
last_statwrite = last_statrequest - 1;
}
}
}

View File

@ -203,7 +203,8 @@ typedef struct PgStat_MsgDummy
typedef struct PgStat_MsgInquiry
{
PgStat_MsgHdr m_hdr;
TimestampTz inquiry_time; /* minimum acceptable file timestamp */
TimestampTz clock_time; /* observed local clock time */
TimestampTz cutoff_time; /* minimum acceptable file timestamp */
} PgStat_MsgInquiry;