diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index 0caf7a3bf3..7e2307f5f2 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -625,7 +625,7 @@ CheckPointReplicationOrigin(void) tmppath, path))); } - fsync_fname((char *) path, false); + fsync_fname(path, false); fsync_fname("pg_logical", true); } diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index affa9b9cb3..ead221d3c7 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -1095,7 +1095,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) START_CRIT_SECTION(); fsync_fname(path, false); - fsync_fname((char *) dir, true); + fsync_fname(dir, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 37a2ae6b64..c1076992a3 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -306,7 +306,10 @@ static void walkdir(const char *path, #ifdef PG_FLUSH_DATA_WORKS static void pre_sync_fname(const char *fname, bool isdir, int elevel); #endif -static void fsync_fname_ext(const char *fname, bool isdir, int elevel); +static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); + +static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); +static int fsync_parent_path(const char *fname, int elevel); /* @@ -413,54 +416,158 @@ pg_flush_data(int fd, off_t offset, off_t amount) * indicate the OS just doesn't allow/require fsyncing directories. 
*/ void -fsync_fname(char *fname, bool isdir) +fsync_fname(const char *fname, bool isdir) { - int fd; - int returncode; - - /* - * Some OSs require directories to be opened read-only whereas other - * systems don't allow us to fsync files opened read-only; so we need both - * cases here - */ - if (!isdir) - fd = OpenTransientFile(fname, - O_RDWR | PG_BINARY, - S_IRUSR | S_IWUSR); - else - fd = OpenTransientFile(fname, - O_RDONLY | PG_BINARY, - S_IRUSR | S_IWUSR); - - /* - * Some OSs don't allow us to open directories at all (Windows returns - * EACCES) - */ - if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) - return; - - else if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", fname))); - - returncode = pg_fsync(fd); - - /* Some OSs don't allow us to fsync directories at all */ - if (returncode != 0 && isdir && errno == EBADF) - { - CloseTransientFile(fd); - return; - } - - if (returncode != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", fname))); - - CloseTransientFile(fd); + fsync_fname_ext(fname, isdir, false, ERROR); } +/* + * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability + * + * This routine ensures that, after returning, the effect of renaming the file + * persists in case of a crash. A crash while this routine is running will + * leave you with either the pre-existing or the moved file in place of the + * new file; no mixed state or truncated files are possible. + * + * It does so by using fsync on the old filename and the possibly existing + * target filename before the rename, and the target file and directory after. + * + * Note that rename() cannot be used across arbitrary directories, as they + * might not be on the same filesystem. Therefore this routine does not + * support renaming across directories. + * + * Log errors with the caller specified severity. + * + * Returns 0 if the operation succeeded, -1 otherwise. 
Note that errno is not + * valid upon return. + */ +int +durable_rename(const char *oldfile, const char *newfile, int elevel) +{ + int fd; + + /* + * First fsync the old and target path (if it exists), to ensure that they + * are properly persistent on disk. Syncing the target file is not + * strictly necessary, but it makes it easier to reason about crashes; + * because it's then guaranteed that either source or target file exists + * after a crash. + */ + if (fsync_fname_ext(oldfile, false, false, elevel) != 0) + return -1; + + fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0); + if (fd < 0) + { + if (errno != ENOENT) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", newfile))); + return -1; + } + } + else + { + if (pg_fsync(fd) != 0) + { + int save_errno; + + /* close file upon error, might not be in transaction context */ + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", newfile))); + return -1; + } + CloseTransientFile(fd); + } + + /* Time to do the real deal... */ + if (rename(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + oldfile, newfile))); + return -1; + } + + /* + * To guarantee renaming the file is persistent, fsync the file with its + * new name, and its containing directory. + */ + if (fsync_fname_ext(newfile, false, false, elevel) != 0) + return -1; + + if (fsync_parent_path(newfile, elevel) != 0) + return -1; + + return 0; +} + +/* + * durable_link_or_rename -- rename a file in a durable manner. + * + * Similar to durable_rename(), except that this routine tries (but does not + * guarantee) not to overwrite the target file. + * + * Note that a crash in an unfortunate moment can leave you with two links to + * the target file. + * + * Log errors with the caller specified severity. 
+ * + * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not + * valid upon return. + */ +int +durable_link_or_rename(const char *oldfile, const char *newfile, int elevel) +{ + /* + * Ensure that, if we crash directly after the rename/link, a file with + * valid contents is moved into place. + */ + if (fsync_fname_ext(oldfile, false, false, elevel) != 0) + return -1; + +#if HAVE_WORKING_LINK + if (link(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not link file \"%s\" to \"%s\": %m", + oldfile, newfile))); + return -1; + } + unlink(oldfile); +#else + /* XXX: Add racy file existence check? */ + if (rename(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + oldfile, newfile))); + return -1; + } +#endif + + /* + * To make the change persistent in case of an OS crash, both the new + * entry and its parent directory need to be flushed. + */ + if (fsync_fname_ext(newfile, false, false, elevel) != 0) + return -1; + + /* Same for parent directory */ + if (fsync_parent_path(newfile, elevel) != 0) + return -1; + + return 0; +} /* * InitFileAccess --- initialize this module during backend startup @@ -2581,10 +2688,10 @@ SyncDataDirectory(void) * in pg_tblspc, they'll get fsync'd twice. That's not an expected case * so we don't worry about optimizing it. 
*/ - walkdir(".", fsync_fname_ext, false, LOG); + walkdir(".", datadir_fsync_fname, false, LOG); if (xlog_is_symlink) - walkdir("pg_xlog", fsync_fname_ext, false, LOG); - walkdir("pg_tblspc", fsync_fname_ext, true, LOG); + walkdir("pg_xlog", datadir_fsync_fname, false, LOG); + walkdir("pg_tblspc", datadir_fsync_fname, true, LOG); } /* @@ -2698,15 +2805,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel) #endif /* PG_FLUSH_DATA_WORKS */ +static void +datadir_fsync_fname(const char *fname, bool isdir, int elevel) +{ + /* + * We want to silently ignore errors about unreadable files. Pass that + * desire on to fsync_fname_ext(). + */ + fsync_fname_ext(fname, isdir, true, elevel); +} + /* * fsync_fname_ext -- Try to fsync a file or directory * - * Ignores errors trying to open unreadable files, or trying to fsync - * directories on systems where that isn't allowed/required, and logs other - * errors at a caller-specified level. + * If ignore_perm is true, ignore errors upon trying to open unreadable + * files. Logs other errors at a caller-specified level. + * + * Returns 0 if the operation succeeded, -1 otherwise. */ -static void -fsync_fname_ext(const char *fname, bool isdir, int elevel) +static int +fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel) { int fd; int flags; @@ -2724,20 +2842,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel) else flags |= O_RDONLY; - /* - * Open the file, silently ignoring errors about unreadable files (or - * unsupported operations, e.g. opening a directory under Windows), and - * logging others. - */ fd = OpenTransientFile((char *) fname, flags, 0); - if (fd < 0) + + /* + * Some OSs don't allow us to open directories at all (Windows returns + * EACCES); just ignore the error in that case. If desired, also silently + * ignore errors about unreadable files. Log others. 
+ */ + if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) + return 0; + else if (fd < 0 && ignore_perm && errno == EACCES) + return 0; + else if (fd < 0) { - if (errno == EACCES || (isdir && errno == EISDIR)) - return; ereport(elevel, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fname))); - return; + return -1; } returncode = pg_fsync(fd); @@ -2747,9 +2868,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel) * those errors. Anything else needs to be logged. */ if (returncode != 0 && !(isdir && errno == EBADF)) + { + int save_errno; + + /* close file upon error, might not be in transaction context */ + save_errno = errno; + (void) CloseTransientFile(fd); + errno = save_errno; + ereport(elevel, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", fname))); + return -1; + } (void) CloseTransientFile(fd); + + return 0; +} + +/* + * fsync_parent_path -- fsync the parent path of a file or directory + * + * This is aimed at making file operations persistent on disk in case of + * an OS crash or power failure. + */ +static int +fsync_parent_path(const char *fname, int elevel) +{ + char parentpath[MAXPGPATH]; + + strlcpy(parentpath, fname, MAXPGPATH); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument is + * just a file name (see comments in path.c), so handle that as being the + * current directory. 
+ */ + if (strlen(parentpath) == 0) + strlcpy(parentpath, ".", MAXPGPATH); + + if (fsync_fname_ext(parentpath, true, false, elevel) != 0) + return -1; + + return 0; } diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index 4ad853804b..7e8138b42a 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -385,7 +385,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) FreeDir(dbspace_dir); - fsync_fname((char *) dbspacedirname, true); + fsync_fname(dbspacedirname, true); } } diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 6faa8ad8a6..1a7f8ae7c5 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -116,7 +116,9 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); -extern void fsync_fname(char *fname, bool isdir); +extern void fsync_fname(const char *fname, bool isdir); +extern int durable_rename(const char *oldfile, const char *newfile, int loglevel); +extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel); extern void SyncDataDirectory(void); /* Filename components for OpenTemporaryFile */