diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index baa43b203f..aab2f4ca70 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1602,9 +1602,10 @@ ServerLoop(void) fd_set readmask; int nSockets; time_t now, + last_lockfile_recheck_time, last_touch_time; - last_touch_time = time(NULL); + last_lockfile_recheck_time = last_touch_time = time(NULL); nSockets = initMasks(&readmask); @@ -1754,19 +1755,6 @@ ServerLoop(void) if (StartWorkerNeeded || HaveCrashedWorker) maybe_start_bgworker(); - /* - * Touch Unix socket and lock files every 58 minutes, to ensure that - * they are not removed by overzealous /tmp-cleaning tasks. We assume - * no one runs cleaners with cutoff times of less than an hour ... - */ - now = time(NULL); - if (now - last_touch_time >= 58 * SECS_PER_MINUTE) - { - TouchSocketFiles(); - TouchSocketLockFiles(); - last_touch_time = now; - } - #ifdef HAVE_PTHREAD_IS_THREADED_NP /* @@ -1793,6 +1781,48 @@ ServerLoop(void) /* reset flag so we don't SIGKILL again */ AbortStartTime = 0; } + + /* + * Lastly, check to see if it's time to do some things that we don't + * want to do every single time through the loop, because they're a + * bit expensive. Note that there's up to a minute of slop in when + * these tasks will be performed, since DetermineSleepTime() will let + * us sleep at most that long. + */ + now = time(NULL); + + /* + * Once a minute, verify that postmaster.pid hasn't been removed or + * overwritten. If it has, we force a shutdown. This avoids having + * postmasters and child processes hanging around after their database + * is gone, and maybe causing problems if a new database cluster is + * created in the same place. It also provides some protection + * against a DBA foolishly removing postmaster.pid and manually + * starting a new postmaster. Data corruption is likely to ensue from + * that anyway, but we can minimize the damage by aborting ASAP. + */ + if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE) + { + if (!RecheckDataDirLockFile()) + { + ereport(LOG, + (errmsg("performing immediate shutdown because data directory lock file is invalid"))); + kill(MyProcPid, SIGQUIT); + } + last_lockfile_recheck_time = now; + } + + /* + * Touch Unix socket and lock files every 58 minutes, to ensure that + * they are not removed by overzealous /tmp-cleaning tasks. We assume + * no one runs cleaners with cutoff times of less than an hour ... + */ + if (now - last_touch_time >= 58 * SECS_PER_MINUTE) + { + TouchSocketFiles(); + TouchSocketLockFiles(); + last_touch_time = now; + } } } diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index e871fef7fa..fb3cb6eb3d 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1218,6 +1218,76 @@ AddToDataDirLockFile(int target_line, const char *str) } +/* + * Recheck that the data directory lock file still exists with expected + * content. Return TRUE if the lock file appears OK, FALSE if it isn't. + * + * We call this periodically in the postmaster. The idea is that if the + * lock file has been removed or replaced by another postmaster, we should + * do a panic database shutdown. Therefore, we should return TRUE if there + * is any doubt: we do not want to cause a panic shutdown unnecessarily. + * Transient failures like EINTR or ENFILE should not cause us to fail. + * (If there really is something wrong, we'll detect it on a future recheck.) + */ +bool +RecheckDataDirLockFile(void) +{ + int fd; + int len; + long file_pid; + char buffer[BLCKSZ]; + + fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0); + if (fd < 0) + { + /* + * There are many foreseeable false-positive error conditions. For + * safety, fail only on enumerated clearly-something-is-wrong + * conditions. + */ + switch (errno) + { + case ENOENT: + case ENOTDIR: + /* disaster */ + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + DIRECTORY_LOCK_FILE))); + return false; + default: + /* non-fatal, at least for now */ + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m; continuing anyway", + DIRECTORY_LOCK_FILE))); + return true; + } + } + len = read(fd, buffer, sizeof(buffer) - 1); + if (len < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read from file \"%s\": %m", + DIRECTORY_LOCK_FILE))); + close(fd); + return true; /* treat read failure as nonfatal */ + } + buffer[len] = '\0'; + close(fd); + file_pid = atol(buffer); + if (file_pid == getpid()) + return true; /* all is well */ + + /* Trouble: someone's overwritten the lock file */ + ereport(LOG, + (errmsg("lock file \"%s\" contains wrong PID: %ld instead of %ld", + DIRECTORY_LOCK_FILE, file_pid, (long) getpid()))); + return false; +} + + /*------------------------------------------------------------------------- * Version checking support *------------------------------------------------------------------------- diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 77cf8d71de..e0d464e02f 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -453,6 +453,7 @@ extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster, const char *socketDir); extern void TouchSocketLockFiles(void); extern void AddToDataDirLockFile(int target_line, const char *str); +extern bool RecheckDataDirLockFile(void); extern void ValidatePgVersion(const char *path); extern void process_shared_preload_libraries(void); extern void process_session_preload_libraries(void);