diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6cf441534c..92822a192b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -845,6 +845,8 @@ static void WALInsertLockAcquireExclusive(void);
 static void WALInsertLockRelease(void);
 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 
+static void fsync_pgdata(char *datadir);
+
 /*
  * Insert an XLOG record represented by an already-constructed chain of data
  * chunks.  This is a low-level routine; to construct the WAL record header
@@ -5910,6 +5912,18 @@ StartupXLOG(void)
 				(errmsg("database system was interrupted; last known up at %s",
 						str_time(ControlFile->time))));
 
+	/*
+	 * If we previously crashed, there might be data which we had written,
+	 * intending to fsync it, but which we had not actually fsync'd yet.
+	 * Therefore, a power failure in the near future might cause earlier
+	 * unflushed writes to be lost, even though more recent data written to
+	 * disk from here on would be persisted.  To avoid that, fsync the entire
+	 * data directory.
+	 */
+	if (ControlFile->state != DB_SHUTDOWNED &&
+		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
+		fsync_pgdata(data_directory);
+
 	/* This is just to allow attaching to startup process with a debugger */
 #ifdef XLOG_REPLAY_DELAY
 	if (ControlFile->state != DB_SHUTDOWNED)
@@ -11123,3 +11137,31 @@ SetWalWriterSleeping(bool sleeping)
 	XLogCtl->WalWriterSleeping = sleeping;
 	SpinLockRelease(&XLogCtl->info_lck);
 }
+
+/*
+ * Issue fsync recursively on PGDATA and all its contents.
+ */
+static void
+fsync_pgdata(char *datadir)
+{
+	if (!enableFsync)
+		return;
+
+	/*
+	 * If possible, hint to the kernel that we're soon going to fsync
+	 * the data directory and its contents.
+	 */
+#if defined(HAVE_SYNC_FILE_RANGE) || \
+	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
+	walkdir(datadir, pre_sync_fname);
+#endif
+
+	/*
+	 * Now we do the fsync()s in the same order.
+	 *
+	 * It's important to fsync the destination directory itself as individual
+	 * file fsyncs don't guarantee that the directory entry for the file is
+	 * synced.
+	 */
+	walkdir(datadir, fsync_fname);
+}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index f7967178c6..99dac841ff 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2439,3 +2439,118 @@ looks_like_temp_rel_name(const char *name)
 		return false;
 	return true;
 }
+
+/*
+ * Hint to the OS that it should get ready to fsync() this file.
+ *
+ * fname is the path to open, and isdir says whether it names a directory.
+ * Failing to open a directory with EISDIR or EACCES is silently tolerated;
+ * any other open failure is reported at FATAL.  The result of the flush
+ * request is deliberately ignored, since it is only a hint.
+ *
+ * Adapted from pre_sync_fname in initdb.c
+ */
+void
+pre_sync_fname(char *fname, bool isdir)
+{
+	int			fd;
+
+	fd = open(fname, O_RDONLY | PG_BINARY);
+
+	if (fd < 0)
+	{
+		/*
+		 * Some OSs don't allow us to open directories at all (Windows returns
+		 * EACCES)
+		 */
+		if (isdir && (errno == EISDIR || errno == EACCES))
+			return;
+
+		/* Report the errno from open() so the cause is visible in the log */
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\" before fsync: %m",
+						fname)));
+	}
+
+	(void) pg_flush_data(fd, 0, 0);
+
+	(void) close(fd);
+}
+
+/*
+ * walkdir: recursively walk a directory, applying the action to each
+ * regular file and directory (including the named directory itself)
+ * and following symbolic links.
+ *
+ * The action is applied to a directory only after its contents have been
+ * processed.  A symbolic link's target is classified with stat() on the
+ * link's own path, so that a relative target is resolved against the
+ * directory holding the link and a chain of links is followed to its end.
+ * The action is always handed the path found under the walked tree.  Links
+ * whose target does not exist, and entries that are neither regular files,
+ * directories nor links, are skipped silently.
+ *
+ * NB: There is another version of walkdir in initdb.c, but that version
+ * behaves differently with respect to symbolic links.  Caveat emptor!
+ */
+void
+walkdir(char *path, void (*action) (char *fname, bool isdir))
+{
+	DIR		   *dir;
+	struct dirent *de;
+
+	dir = AllocateDir(path);
+	while ((de = ReadDir(dir, path)) != NULL)
+	{
+		char		subpath[MAXPGPATH];
+		struct stat fst;
+
+		CHECK_FOR_INTERRUPTS();
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+
+		if (lstat(subpath, &fst) < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", subpath)));
+
+		if (S_ISREG(fst.st_mode))
+			(*action) (subpath, false);
+		else if (S_ISDIR(fst.st_mode))
+			walkdir(subpath, action);
+#ifndef WIN32
+		else if (S_ISLNK(fst.st_mode))
+#else
+		else if (pg_win32_is_junction(subpath))
+#endif
+		{
+			struct stat tst;
+
+			/*
+			 * Look through the link with stat() rather than calling lstat()
+			 * on the text returned by readlink(): a relative target is only
+			 * meaningful relative to the link's directory, and the target
+			 * may itself be another link.
+			 */
+			if (stat(subpath, &tst) == 0)
+			{
+				if (S_ISREG(tst.st_mode))
+					(*action) (subpath, false);
+				else if (S_ISDIR(tst.st_mode))
+					walkdir(subpath, action);
+			}
+			else if (errno != ENOENT)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not stat file \"%s\": %m", subpath)));
+		}
+	}
+	FreeDir(dir);
+
+	(*action) (path, true);
+}
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 5e9571c9a1..5b563a6a47 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -114,6 +114,8 @@ extern int pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern int	pg_flush_data(int fd, off_t offset, off_t amount);
 extern void fsync_fname(char *fname, bool isdir);
+extern void pre_sync_fname(char *fname, bool isdir);
+extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"