From b966dd6c4228d696b291c1cdcb5ab8c8475fefa8 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 13 Jul 2012 17:16:58 -0400 Subject: [PATCH] Add fsync capability to initdb, and use sync_file_range() if available. Historically we have not worried about fsync'ing anything during initdb (in fact, initdb intentionally passes -F to each backend launch to prevent it from fsync'ing). But with filesystems getting more aggressive about caching data, that's not such a good plan anymore. Make initdb do a pass over the finished data directory tree to fsync everything. For testing purposes, the -N/--nosync flag can be used to restore the old behavior. Also, testing shows that on Linux, sync_file_range() is much faster than posix_fadvise() for hinting to the kernel that an fsync is coming, apparently because the latter blocks on a rather small request queue while the former doesn't. So use this function if available in initdb, and also in the backend's pg_flush_data() (where it currently will affect only the speed of CREATE DATABASE's cloning step). We will later make pg_regress invoke initdb with the --nosync flag to avoid slowing down cases such as "make check" in contrib. But let's not do so until we've shaken out any portability issues in this patch. 
Jeff Davis, reviewed by Andres Freund --- configure | 3 +- configure.in | 2 +- doc/src/sgml/ref/initdb.sgml | 15 +++ src/backend/storage/file/fd.c | 7 +- src/bin/initdb/initdb.c | 230 +++++++++++++++++++++++++++++++++- src/include/pg_config.h.in | 3 + src/include/pg_config.h.win32 | 3 + 7 files changed, 258 insertions(+), 5 deletions(-) diff --git a/configure b/configure index 9049ad015e..78a10c8c88 100755 --- a/configure +++ b/configure @@ -19254,7 +19254,8 @@ fi -for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink towlower utime utimes wcstombs wcstombs_l + +for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l do as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` { $as_echo "$as_me:$LINENO: checking for $ac_func" >&5 diff --git a/configure.in b/configure.in index a362cfdf5e..8ed09f58ab 100644 --- a/configure.in +++ b/configure.in @@ -1207,7 +1207,7 @@ PGAC_VAR_INT_TIMEZONE AC_FUNC_ACCEPT_ARGTYPES PGAC_FUNC_GETTIMEOFDAY_1ARG -AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink towlower utime utimes wcstombs wcstombs_l]) +AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 11484468dd..08ee37e7d8 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -219,6 +219,21 @@ PostgreSQL documentation + <varlistentry> + <term><option>-N</option></term> + <term><option>--nosync</option></term> + <listitem> + <para> + By default, <command>initdb</command> will wait for all files to be + written safely to disk. 
This option causes <command>initdb</command> + to return without waiting, which is faster, but means that a + subsequent operating system crash can leave the data directory + corrupt. Generally, this option is useful for testing, but should not + be used when creating a production installation. + </para> + </listitem> + </varlistentry> + diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index f79f4c6a36..9724f481dc 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -336,12 +336,15 @@ pg_fdatasync(int fd) /* * pg_flush_data --- advise OS that the data described won't be needed soon * - * Not all platforms have posix_fadvise; treat as noop if not available. + * Not all platforms have sync_file_range or posix_fadvise; treat as no-op + * if not available. */ int pg_flush_data(int fd, off_t offset, off_t amount) { -#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) +#if defined(HAVE_SYNC_FILE_RANGE) + return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE); +#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); #else return 0; diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 982d864cb3..4292231d0d 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -49,6 +49,7 @@ #include "postgres_fe.h" #include <dirent.h> +#include <fcntl.h> #include <sys/stat.h> #include <unistd.h> #include <locale.h> @@ -116,6 +117,7 @@ static const char *authmethodhost = ""; static const char *authmethodlocal = ""; static bool debug = false; static bool noclean = false; +static bool do_sync = true; static bool show_setting = false; static char *xlog_dir = ""; @@ -160,6 +162,9 @@ static char *authwarning = NULL; /* * Centralized knowledge of switches to pass to backend * + * Note: we run the backend with -F (fsync disabled) and then do a single + * pass of fsync'ing at the end. This is faster than fsync'ing each step. 
+ * * Note: in the shell-script version, we also passed PGDATA as a -D switch, * but here it is more convenient to pass it as an environment variable * (no quoting to worry about). @@ -182,6 +187,9 @@ static char **filter_lines_with_token(char **lines, const char *token); #endif static char **readfile(const char *path); static void writefile(char *path, char **lines); +static void walkdir(char *path, void (*action)(char *fname, bool isdir)); +static void pre_sync_fname(char *fname, bool isdir); +static void fsync_fname(char *fname, bool isdir); static FILE *popen_check(const char *command, const char *mode); static void exit_nicely(void); static char *get_id(void); @@ -209,6 +217,7 @@ static void load_plpgsql(void); static void vacuum_db(void); static void make_template0(void); static void make_postgres(void); +static void perform_fsync(void); static void trapsig(int signum); static void check_ok(void); static char *escape_quotes(const char *src); @@ -489,6 +498,174 @@ writefile(char *path, char **lines) } } +/* + * walkdir: recursively walk a directory, applying the action to each + * regular file and directory (including the named directory itself). + * + * Adapted from copydir() in copydir.c. 
+ */ +static void +walkdir(char *path, void (*action) (char *fname, bool isdir)) +{ + DIR *dir; + struct dirent *direntry; + char subpath[MAXPGPATH]; + + dir = opendir(path); + if (dir == NULL) + { + fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"), + progname, path, strerror(errno)); + exit_nicely(); + } + + while (errno = 0, (direntry = readdir(dir)) != NULL) + { + struct stat fst; + + if (strcmp(direntry->d_name, ".") == 0 || + strcmp(direntry->d_name, "..") == 0) + continue; + + snprintf(subpath, MAXPGPATH, "%s/%s", path, direntry->d_name); + + if (lstat(subpath, &fst) < 0) + { + fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"), + progname, subpath, strerror(errno)); + exit_nicely(); + } + + if (S_ISDIR(fst.st_mode)) + walkdir(subpath, action); + else if (S_ISREG(fst.st_mode)) + (*action) (subpath, false); + } + +#ifdef WIN32 + /* + * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in + * released version + */ + if (GetLastError() == ERROR_NO_MORE_FILES) + errno = 0; +#endif + + if (errno) + { + fprintf(stderr, _("%s: could not read directory \"%s\": %s\n"), + progname, path, strerror(errno)); + exit_nicely(); + } + + closedir(dir); + + /* + * It's important to fsync the destination directory itself as individual + * file fsyncs don't guarantee that the directory entry for the file is + * synced. Recent versions of ext4 have made the window much wider but + * it's been an issue for ext3 and other filesystems in the past. + */ + (*action) (path, true); +} + +/* + * Hint to the OS that it should get ready to fsync() this file. 
+ */ +static void +pre_sync_fname(char *fname, bool isdir) +{ +#if defined(HAVE_SYNC_FILE_RANGE) || \ + (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)) + int fd; + + fd = open(fname, O_RDONLY | PG_BINARY); + + /* + * Some OSs don't allow us to open directories at all (Windows returns + * EACCES) + */ + if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) + return; + + if (fd < 0) + { + fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), + progname, fname, strerror(errno)); + exit_nicely(); + } + + /* + * Prefer sync_file_range, else use posix_fadvise. We ignore any error + * here since this operation is only a hint anyway. + */ +#if defined(HAVE_SYNC_FILE_RANGE) + sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE); +#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + + close(fd); +#endif +} + +/* + * fsync a file or directory + * + * Try to fsync directories but ignore errors that indicate the OS + * just doesn't allow/require fsyncing directories. + * + * Adapted from fsync_fname() in copydir.c. 
+ */ +static void +fsync_fname(char *fname, bool isdir) +{ + int fd; + int returncode; + + /* + * Some OSs require directories to be opened read-only whereas other + * systems don't allow us to fsync files opened read-only; so we need both + * cases here + */ + if (!isdir) + fd = open(fname, O_RDWR | PG_BINARY); + else + fd = open(fname, O_RDONLY | PG_BINARY); + + /* + * Some OSs don't allow us to open directories at all (Windows returns + * EACCES) + */ + if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) + return; + + else if (fd < 0) + { + fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), + progname, fname, strerror(errno)); + exit_nicely(); + } + + returncode = fsync(fd); + + /* Some OSs don't allow us to fsync directories at all */ + if (returncode != 0 && isdir && errno == EBADF) + { + close(fd); + return; + } + + if (returncode != 0) + { + fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), + progname, fname, strerror(errno)); + exit_nicely(); + } + + close(fd); +} + /* * Open a subcommand with suitable error messaging */ @@ -2092,6 +2269,47 @@ make_postgres(void) check_ok(); } +/* + * fsync everything down to disk + */ +static void +perform_fsync(void) +{ + char pdir[MAXPGPATH]; + + fputs(_("syncing data to disk ... "), stdout); + fflush(stdout); + + /* + * We need to name the parent of PGDATA. get_parent_directory() isn't + * enough here, because it can result in an empty string. + */ + snprintf(pdir, MAXPGPATH, "%s/..", pg_data); + canonicalize_path(pdir); + + /* + * Hint to the OS so that we're going to fsync each of these files soon. + */ + + /* first the parent of the PGDATA directory */ + pre_sync_fname(pdir, true); + + /* then recursively through the directory */ + walkdir(pg_data, pre_sync_fname); + + /* + * Now, do the fsync()s in the same order. 
+ */ + + /* first the parent of the PGDATA directory */ + fsync_fname(pdir, true); + + /* then recursively through the directory */ + walkdir(pg_data, fsync_fname); + + check_ok(); +} + /* * signal handler in case we are interrupted. @@ -2532,6 +2750,7 @@ usage(const char *progname) printf(_(" -d, --debug generate lots of debugging output\n")); printf(_(" -L DIRECTORY where to find the input files\n")); printf(_(" -n, --noclean do not clean up after errors\n")); + printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n")); printf(_(" -s, --show show internal settings\n")); printf(_("\nOther options:\n")); printf(_(" -V, --version output version information, then exit\n")); @@ -2621,6 +2840,7 @@ main(int argc, char *argv[]) {"debug", no_argument, NULL, 'd'}, {"show", no_argument, NULL, 's'}, {"noclean", no_argument, NULL, 'n'}, + {"nosync", no_argument, NULL, 'N'}, {"xlogdir", required_argument, NULL, 'X'}, {NULL, 0, NULL, 0} }; @@ -2676,7 +2896,7 @@ main(int argc, char *argv[]) /* process command-line options */ - while ((c = getopt_long(argc, argv, "dD:E:L:nU:WA:sT:X:", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "dD:E:L:nNU:WA:sT:X:", long_options, &option_index)) != -1) { switch (c) { @@ -2719,6 +2939,9 @@ main(int argc, char *argv[]) noclean = true; printf(_("Running in noclean mode. 
Mistakes will not be cleaned up.\n")); break; + case 'N': + do_sync = false; + break; case 'L': share_path = xstrdup(optarg); break; @@ -3310,6 +3533,11 @@ main(int argc, char *argv[]) make_postgres(); + if (do_sync) + perform_fsync(); + else + printf(_("\nSync to disk skipped.\nThe data directory might become corrupt if the operating system crashes.\n")); + if (authwarning != NULL) fprintf(stderr, "%s", authwarning); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 6521c6d5b9..915c318bd2 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -511,6 +511,9 @@ /* Define to 1 if you have the `symlink' function. */ #undef HAVE_SYMLINK +/* Define to 1 if you have the `sync_file_range' function. */ +#undef HAVE_SYNC_FILE_RANGE + /* Define to 1 if you have the syslog interface. */ #undef HAVE_SYSLOG diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32 index 8c232f67a0..e6fc482534 100644 --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@ -420,6 +420,9 @@ /* Define to 1 if you have the `symlink' function. */ #define HAVE_SYMLINK 1 +/* Define to 1 if you have the `sync_file_range' function. */ +/* #undef HAVE_SYNC_FILE_RANGE */ + /* Define to 1 if you have the `sysconf' function. */ /* #undef HAVE_SYSCONF */