diff --git a/configure b/configure index 8176e99756..6d34243dca 100755 --- a/configure +++ b/configure @@ -15409,7 +15409,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l writev +for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink syncfs sync_file_range uselocale wcstombs_l writev do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.ac b/configure.ac index 54efbb22a3..e54e2fb632 100644 --- a/configure.ac +++ b/configure.ac @@ -1701,6 +1701,7 @@ AC_CHECK_FUNCS(m4_normalize([ strchrnul strsignal symlink + syncfs sync_file_range uselocale wcstombs_l diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 863ac31c6b..ee4925d6d9 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -9721,6 +9721,41 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir' + + recovery_init_sync_method (enum) + + recovery_init_sync_method configuration parameter + + + + + When set to fsync, which is the default, + PostgreSQL will recursively open and + synchronize all files in the data directory before crash recovery + begins. The search for files will follow symbolic links for the WAL + directory and each configured tablespace (but not any other symbolic + links). This is intended to make sure that all WAL and data files are + durably stored on disk before replaying changes. This applies whenever + starting a database cluster that did not shut down cleanly, including + copies created with pg_basebackup. + + + On Linux, syncfs may be used instead, to ask the + operating system to synchronize the whole file systems that contain the + data directory, the WAL files and each tablespace (but not any other + file systems that may be reachable through symbolic links). This may + be a lot faster than the fsync setting, because it + doesn't need to open each file one by one. On the other hand, it may + be slower if a file system is shared by other applications that + modify a lot of files, since those files will also be written to disk. + Furthermore, on versions of Linux before 5.8, I/O errors encountered + while writing data to disk may not be reported to + PostgreSQL, and relevant error messages may + appear only in kernel logs. + + + + diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 110ba31517..28933f8bbe 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -72,9 +72,11 @@ #include "postgres.h" +#include #include #include #include +#include #ifndef WIN32 #include #endif @@ -158,6 +160,9 @@ int max_safe_fds = FD_MINFREE; /* default if not changed */ /* Whether it is safe to continue running after fsync() fails. */ bool data_sync_retry = false; +/* How SyncDataDirectory() should do its job. */ +int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; + /* Debugging.... */ #ifdef FDDEBUG @@ -3265,9 +3270,31 @@ looks_like_temp_rel_name(const char *name) return true; } +#ifdef HAVE_SYNCFS +static void +do_syncfs(const char *path) +{ + int fd; + + fd = OpenTransientFile(path, O_RDONLY); + if (fd < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open %s: %m", path))); + return; + } + if (syncfs(fd) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not sync filesystem for \"%s\": %m", path))); + CloseTransientFile(fd); +} +#endif /* - * Issue fsync recursively on PGDATA and all its contents. + * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for + * all potential filesystem, depending on recovery_init_sync_method setting. * * We fsync regular files and directories wherever they are, but we * follow symlinks only for pg_wal and immediately under pg_tblspc. @@ -3319,6 +3346,42 @@ SyncDataDirectory(void) xlog_is_symlink = true; #endif +#ifdef HAVE_SYNCFS + if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS) + { + DIR *dir; + struct dirent *de; + + /* + * On Linux, we don't have to open every single file one by one. We + * can use syncfs() to sync whole filesystems. We only expect + * filesystem boundaries to exist where we tolerate symlinks, namely + * pg_wal and the tablespaces, so we call syncfs() for each of those + * directories. + */ + + /* Sync the top level pgdata directory. */ + do_syncfs("."); + /* If any tablespaces are configured, sync each of those. */ + dir = AllocateDir("pg_tblspc"); + while ((de = ReadDirExtended(dir, "pg_tblspc", LOG))) + { + char path[MAXPGPATH]; + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name); + do_syncfs(path); + } + FreeDir(dir); + /* If pg_wal is a symlink, process that too. */ + if (xlog_is_symlink) + do_syncfs("pg_wal"); + return; + } +#endif /* !HAVE_SYNCFS */ + /* * If possible, hint to the kernel that we're soon going to fsync the data * directory and its contents. Errors in this step are even less diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f720b093fe..2964efda96 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -488,6 +488,14 @@ const struct config_enum_entry ssl_protocol_versions_info[] = { StaticAssertDecl(lengthof(ssl_protocol_versions_info) == (PG_TLS1_3_VERSION + 2), "array length mismatch"); +static struct config_enum_entry recovery_init_sync_method_options[] = { + {"fsync", RECOVERY_INIT_SYNC_METHOD_FSYNC, false}, +#ifdef HAVE_SYNCFS + {"syncfs", RECOVERY_INIT_SYNC_METHOD_SYNCFS, false}, +#endif + {NULL, 0, false} +}; + static struct config_enum_entry shared_memory_options[] = { #ifndef WIN32 {"sysv", SHMEM_TYPE_SYSV, false}, @@ -4871,6 +4879,15 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"recovery_init_sync_method", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS, + gettext_noop("Sets the method for synchronizing the data directory before crash recovery."), + }, + &recovery_init_sync_method, + RECOVERY_INIT_SYNC_METHOD_FSYNC, recovery_init_sync_method_options, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index b0b49b3823..86425965d0 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -761,6 +761,7 @@ #restart_after_crash = on # reinitialize after backend crash? #remove_temp_files_after_crash = on # remove temporary files after # backend crash? +#recovery_init_sync_method = fsync # fsync, syncfs (Linux 5.8+) #data_sync_retry = off # retry or panic on failure to fsync # data? # (change requires restart) diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 0a6422da4f..d873c177cb 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -590,6 +590,9 @@ /* Define to 1 if you have the `symlink' function. */ #undef HAVE_SYMLINK +/* Define to 1 if you have the `syncfs' function. */ +#undef HAVE_SYNCFS + /* Define to 1 if you have the `sync_file_range' function. */ #undef HAVE_SYNC_FILE_RANGE diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 30bf7d2193..328473bdc9 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -45,6 +45,11 @@ #include +typedef enum RecoveryInitSyncMethod { + RECOVERY_INIT_SYNC_METHOD_FSYNC, + RECOVERY_INIT_SYNC_METHOD_SYNCFS +} RecoveryInitSyncMethod; + struct iovec; /* avoid including port/pg_iovec.h here */ typedef int File; @@ -53,6 +58,7 @@ typedef int File; /* GUC parameter */ extern PGDLLIMPORT int max_files_per_process; extern PGDLLIMPORT bool data_sync_retry; +extern int recovery_init_sync_method; /* * This is private to fd.c, but exported for save/restore_backend_variables() diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index 14605371bb..ea8ed4be30 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -388,6 +388,7 @@ sub GenerateFiles HAVE_STRUCT_TM_TM_ZONE => undef, HAVE_SYNC_FILE_RANGE => undef, HAVE_SYMLINK => 1, + HAVE_SYNCFS => undef, HAVE_SYSLOG => undef, HAVE_SYS_EPOLL_H => undef, HAVE_SYS_EVENT_H => undef,