Fix fsync-at-startup code to not treat errors as fatal.

Commit 2ce439f337 introduced a rather serious
regression, namely that if its scan of the data directory came across any
un-fsync-able files, it would fail and thereby prevent database startup.
Worse yet, symlinks to such files also caused the problem, which meant that
crash restart was guaranteed to fail on certain common installations such
as older Debian.

After discussion, we agreed that (1) failure to start is worse than any
consequence of not fsync'ing is likely to be, therefore treat all errors
in this code as nonfatal; (2) we should not chase symlinks other than
those that are expected to exist, namely pg_xlog/ and tablespace links
under pg_tblspc/.  The latter restriction avoids possibly fsync'ing a
much larger part of the filesystem than intended, if the user has left
random symlinks hanging about in the data directory.

This commit takes care of that and also does some code beautification,
mainly moving the relevant code into fd.c, which seems a much better place
for it than xlog.c, and making sure that the conditional compilation for
the pre_sync_fname pass has something to do with whether pg_flush_data
works.

I also relocated the call site in xlog.c down a few lines; it seems a
bit silly to be doing this before ValidateXLOGDirectoryStructure().

The similar logic in initdb.c ought to be made to match this, but that
change is noncritical and will be dealt with separately.

Back-patch to all active branches, like the prior commit.

Abhijit Menon-Sen and Tom Lane
This commit is contained in:
Tom Lane 2015-05-28 17:33:03 -04:00
parent d5442cb243
commit d8179b001a
3 changed files with 262 additions and 118 deletions

View File

@ -866,8 +866,6 @@ static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
static void fsync_pgdata(char *datadir);
/*
* Insert an XLOG record represented by an already-constructed chain of data
* chunks. This is a low-level routine; to construct the WAL record header
@ -5951,18 +5949,6 @@ StartupXLOG(void)
(errmsg("database system was interrupted; last known up at %s",
str_time(ControlFile->time))));
/*
* If we previously crashed, there might be data which we had written,
* intending to fsync it, but which we had not actually fsync'd yet.
* Therefore, a power failure in the near future might cause earlier
* unflushed writes to be lost, even though more recent data written to
* disk from here on would be persisted. To avoid that, fsync the entire
* data directory.
*/
if (ControlFile->state != DB_SHUTDOWNED &&
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
fsync_pgdata(data_directory);
/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
if (ControlFile->state != DB_SHUTDOWNED)
@ -5976,6 +5962,18 @@ StartupXLOG(void)
*/
ValidateXLOGDirectoryStructure();
/*
* If we previously crashed, there might be data which we had written,
* intending to fsync it, but which we had not actually fsync'd yet.
* Therefore, a power failure in the near future might cause earlier
* unflushed writes to be lost, even though more recent data written to
* disk from here on would be persisted. To avoid that, fsync the entire
* data directory.
*/
if (ControlFile->state != DB_SHUTDOWNED &&
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
SyncDataDirectory();
/*
* Initialize on the assumption we want to recover to the latest timeline
* that's active according to pg_control.
@ -11602,31 +11600,3 @@ SetWalWriterSleeping(bool sleeping)
XLogCtl->WalWriterSleeping = sleeping;
SpinLockRelease(&XLogCtl->info_lck);
}
/*
* Issue fsync recursively on PGDATA and all its contents.
*/
static void
fsync_pgdata(char *datadir)
{
if (!enableFsync)
return;
/*
* If possible, hint to the kernel that we're soon going to fsync the data
* directory and its contents.
*/
#if defined(HAVE_SYNC_FILE_RANGE) || \
(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
walkdir(datadir, pre_sync_fname);
#endif
/*
* Now we do the fsync()s in the same order.
*
* It's important to fsync the destination directory itself as individual
* file fsyncs don't guarantee that the directory entry for the file is
* synced.
*/
walkdir(datadir, fsync_fname);
}

View File

@ -79,6 +79,13 @@
#include "utils/resowner_private.h"
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
#if defined(HAVE_SYNC_FILE_RANGE)
#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
#define PG_FLUSH_DATA_WORKS 1
#endif
/*
* We must leave some file descriptors free for system(), the dynamic loader,
* and other code that tries to open files without consulting fd.c. This
@ -283,6 +290,8 @@ static int FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static bool reserveAllocatedDesc(void);
static int FreeDesc(AllocateDesc *desc);
static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname);
@ -290,6 +299,15 @@ static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name);
static void walkdir(const char *path,
void (*action) (const char *fname, bool isdir, int elevel),
bool process_symlinks,
int elevel);
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
/*
* pg_fsync --- do fsync with or without writethrough
@ -372,14 +390,18 @@ pg_fdatasync(int fd)
int
pg_flush_data(int fd, off_t offset, off_t amount)
{
#ifdef PG_FLUSH_DATA_WORKS
if (enableFsync)
{
#if defined(HAVE_SYNC_FILE_RANGE)
return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif
}
#endif
return 0;
}
@ -1942,22 +1964,35 @@ TryAgain:
*/
struct dirent *
ReadDir(DIR *dir, const char *dirname)
{
return ReadDirExtended(dir, dirname, ERROR);
}
/*
* Alternate version that allows caller to specify the elevel for any
* error report. If elevel < ERROR, returns NULL on any error.
*/
static struct dirent *
ReadDirExtended(DIR *dir, const char *dirname, int elevel)
{
struct dirent *dent;
/* Give a generic message for AllocateDir failure, if caller didn't */
if (dir == NULL)
ereport(ERROR,
{
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m",
dirname)));
return NULL;
}
errno = 0;
if ((dent = readdir(dir)) != NULL)
return dent;
if (errno)
ereport(ERROR,
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not read directory \"%s\": %m",
dirname)));
@ -2440,54 +2475,121 @@ looks_like_temp_rel_name(const char *name)
return true;
}
/*
* Hint to the OS that it should get ready to fsync() this file.
* Issue fsync recursively on PGDATA and all its contents.
*
* Adapted from pre_sync_fname in initdb.c
* We fsync regular files and directories wherever they are, but we
* follow symlinks only for pg_xlog and immediately under pg_tblspc.
* Other symlinks are presumed to point at files we're not responsible
* for fsyncing, and might not have privileges to write at all.
*
* Errors are logged but not considered fatal; that's because this is used
* only during database startup, to deal with the possibility that there are
* issued-but-unsynced writes pending against the data directory. We want to
* ensure that such writes reach disk before anything that's done in the new
* run. However, aborting on error would result in failure to start for
* harmless cases such as read-only files in the data directory, and that's
* not good either.
*
* Note we assume we're chdir'd into PGDATA to begin with.
*/
void
pre_sync_fname(char *fname, bool isdir)
SyncDataDirectory(void)
{
int fd;
bool xlog_is_symlink;
fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0);
/*
* Some OSs don't allow us to open directories at all (Windows returns
* EACCES)
*/
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
/* We can skip this whole thing if fsync is disabled. */
if (!enableFsync)
return;
if (fd < 0)
ereport(FATAL,
(errmsg("could not open file \"%s\": %m",
fname)));
/*
* If pg_xlog is a symlink, we'll need to recurse into it separately,
* because the first walkdir below will ignore it.
*/
xlog_is_symlink = false;
pg_flush_data(fd, 0, 0);
#ifndef WIN32
{
struct stat st;
close(fd);
if (lstat("pg_xlog", &st) < 0)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m",
"pg_xlog")));
else if (S_ISLNK(st.st_mode))
xlog_is_symlink = true;
}
#else
if (pgwin32_is_junction("pg_xlog"))
xlog_is_symlink = true;
#endif
/*
* If possible, hint to the kernel that we're soon going to fsync the data
* directory and its contents. Errors in this step are even less
* interesting than normal, so log them only at DEBUG1.
*/
#ifdef PG_FLUSH_DATA_WORKS
walkdir(".", pre_sync_fname, false, DEBUG1);
if (xlog_is_symlink)
walkdir("pg_xlog", pre_sync_fname, false, DEBUG1);
walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
#endif
/*
* Now we do the fsync()s in the same order.
*
* The main call ignores symlinks, so in addition to specially processing
* pg_xlog if it's a symlink, pg_tblspc has to be visited separately with
* process_symlinks = true. Note that if there are any plain directories
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
* so we don't worry about optimizing it.
*/
walkdir(".", fsync_fname_ext, false, LOG);
if (xlog_is_symlink)
walkdir("pg_xlog", fsync_fname_ext, false, LOG);
walkdir("pg_tblspc", fsync_fname_ext, true, LOG);
}
/*
* walkdir: recursively walk a directory, applying the action to each
* regular file and directory (including the named directory itself)
* and following symbolic links.
* regular file and directory (including the named directory itself).
*
* NB: There is another version of walkdir in initdb.c, but that version
* behaves differently with respect to symbolic links. Caveat emptor!
* If process_symlinks is true, the action and recursion are also applied
* to regular files and directories that are pointed to by symlinks in the
* given directory; otherwise symlinks are ignored. Symlinks are always
* ignored in subdirectories, ie we intentionally don't pass down the
* process_symlinks flag to recursive calls.
*
* Errors are reported at level elevel, which might be ERROR or less.
*
* See also walkdir in initdb.c, which is a frontend version of this logic.
*/
void
walkdir(char *path, void (*action) (char *fname, bool isdir))
static void
walkdir(const char *path,
void (*action) (const char *fname, bool isdir, int elevel),
bool process_symlinks,
int elevel)
{
DIR *dir;
struct dirent *de;
dir = AllocateDir(path);
while ((de = ReadDir(dir, path)) != NULL)
if (dir == NULL)
{
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", path)));
return;
}
while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
{
char subpath[MAXPGPATH];
struct stat fst;
int sret;
CHECK_FOR_INTERRUPTS();
@ -2497,59 +2599,132 @@ walkdir(char *path, void (*action) (char *fname, bool isdir))
snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
if (lstat(subpath, &fst) < 0)
ereport(ERROR,
if (process_symlinks)
sret = stat(subpath, &fst);
else
sret = lstat(subpath, &fst);
if (sret < 0)
{
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", subpath)));
continue;
}
if (S_ISREG(fst.st_mode))
(*action) (subpath, false);
(*action) (subpath, false, elevel);
else if (S_ISDIR(fst.st_mode))
walkdir(subpath, action);
#ifndef WIN32
else if (S_ISLNK(fst.st_mode))
#else
else if (pgwin32_is_junction(subpath))
#endif
{
#if defined(HAVE_READLINK) || defined(WIN32)
char linkpath[MAXPGPATH];
int len;
struct stat lst;
len = readlink(subpath, linkpath, sizeof(linkpath));
if (len < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read symbolic link \"%s\": %m",
subpath)));
if (len >= sizeof(linkpath))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("symbolic link \"%s\" target is too long",
subpath)));
linkpath[len] = '\0';
if (lstat(linkpath, &lst) == 0)
{
if (S_ISREG(lst.st_mode))
(*action) (linkpath, false);
else if (S_ISDIR(lst.st_mode))
walkdir(subpath, action);
}
else if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", linkpath)));
#else
ereport(WARNING,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("this platform does not support symbolic links; ignoring \"%s\"",
subpath)));
#endif
}
walkdir(subpath, action, false, elevel);
}
FreeDir(dir);
(*action) (path, true);
FreeDir(dir); /* we ignore any error here */
/*
* It's important to fsync the destination directory itself as individual
* file fsyncs don't guarantee that the directory entry for the file is
* synced.
*/
(*action) (path, true, elevel);
}
/*
* Hint to the OS that it should get ready to fsync() this file.
*
* Ignores errors trying to open unreadable files, and logs other errors at a
* caller-specified level.
*/
#ifdef PG_FLUSH_DATA_WORKS
static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
int fd;
fd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);
if (fd < 0)
{
if (errno == EACCES || (isdir && errno == EISDIR))
return;
#ifdef ETXTBSY
if (errno == ETXTBSY)
return;
#endif
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", fname)));
return;
}
(void) pg_flush_data(fd, 0, 0);
(void) CloseTransientFile(fd);
}
#endif /* PG_FLUSH_DATA_WORKS */
/*
* fsync_fname_ext -- Try to fsync a file or directory
*
* Ignores errors trying to open unreadable files, or trying to fsync
* directories on systems where that isn't allowed/required, and logs other
* errors at a caller-specified level.
*/
static void
fsync_fname_ext(const char *fname, bool isdir, int elevel)
{
int fd;
int flags;
int returncode;
/*
* Some OSs require directories to be opened read-only whereas other
* systems don't allow us to fsync files opened read-only; so we need both
* cases here. Using O_RDWR will cause us to fail to fsync files that are
* not writable by our userid, but we assume that's OK.
*/
flags = PG_BINARY;
if (!isdir)
flags |= O_RDWR;
else
flags |= O_RDONLY;
/*
* Open the file, silently ignoring errors about unreadable files (or
* unsupported operations, e.g. opening a directory under Windows), and
* logging others.
*/
fd = OpenTransientFile((char *) fname, flags, 0);
if (fd < 0)
{
if (errno == EACCES || (isdir && errno == EISDIR))
return;
#ifdef ETXTBSY
if (errno == ETXTBSY)
return;
#endif
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", fname)));
return;
}
returncode = pg_fsync(fd);
/*
* Some OSes don't allow us to fsync directories at all, so we can ignore
* those errors. Anything else needs to be logged.
*/
if (returncode != 0 && !(isdir && errno == EBADF))
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", fname)));
(void) CloseTransientFile(fd);
}

View File

@ -114,8 +114,7 @@ extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount);
extern void fsync_fname(char *fname, bool isdir);
extern void pre_sync_fname(char *fname, bool isdir);
extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
extern void SyncDataDirectory(void);
/* Filename components for OpenTemporaryFile */
#define PG_TEMP_FILES_DIR "pgsql_tmp"