From d4e71df6d757fd21c363164a3a4d3b5681462662 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sat, 8 Apr 2023 11:04:49 +1200 Subject: [PATCH] Add io_direct setting (developer-only). Provide a way to ask the kernel to use O_DIRECT (or local equivalent) where available for data and WAL files, to avoid or minimize kernel caching. This hurts performance currently and is not intended for end users yet. Later proposed work would introduce our own I/O clustering, read-ahead, etc to replace the facilities the kernel disables with this option. The only user-visible change, if the developer-only GUC is not used, is that this commit also removes the obscure logic that would activate O_DIRECT for the WAL when wal_sync_method=open_[data]sync and wal_level=minimal (which also requires max_wal_senders=0). Those are non-default and unlikely settings, and this behavior wasn't (correctly) documented. The same effect can be achieved with io_direct=wal. Author: Thomas Munro Author: Andres Freund Author: Bharath Rupireddy Reviewed-by: Justin Pryzby Reviewed-by: Bharath Rupireddy Discussion: https://postgr.es/m/CA%2BhUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg%40mail.gmail.com --- doc/src/sgml/config.sgml | 33 ++++++- src/backend/access/transam/xlog.c | 37 +++---- src/backend/access/transam/xlogprefetcher.c | 2 +- src/backend/storage/buffer/bufmgr.c | 16 ++- src/backend/storage/buffer/localbuf.c | 7 +- src/backend/storage/file/fd.c | 98 +++++++++++++++++++ src/backend/storage/smgr/md.c | 24 ++++- src/backend/storage/smgr/smgr.c | 1 + src/backend/utils/misc/guc_tables.c | 12 +++ src/include/storage/fd.h | 7 ++ src/include/storage/smgr.h | 1 + src/include/utils/guc_hooks.h | 2 + src/test/modules/test_misc/meson.build | 1 + src/test/modules/test_misc/t/004_io_direct.pl | 57 +++++++++++ 14 files changed, 263 insertions(+), 35 deletions(-) create mode 100644 src/test/modules/test_misc/t/004_io_direct.pl diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 
1966ecc162..091a79d4f3 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3172,7 +3172,6 @@ include_dir 'conf.d' - The open_* options also use O_DIRECT if available. Not all of these choices are available on all platforms. The default is the first method in the above list that is supported by the platform, except that fdatasync is the default on @@ -11256,6 +11255,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir' + + io_direct (string) + + io_direct configuration parameter + + + + + Ask the kernel to minimize caching effects for relation data and WAL + files using O_DIRECT (most Unix-like systems), + F_NOCACHE (macOS) or + FILE_FLAG_NO_BUFFERING (Windows). + + + May be set to an empty string (the default) to disable use of direct + I/O, or a comma-separated list of operations that should use direct I/O. + The valid options are data for + main data files, wal for WAL files, and + wal_init for WAL files when being initially + allocated. + + + Some operating systems and file systems do not support direct I/O, so + non-default settings may be rejected at startup or cause errors. + + + Currently this feature reduces performance, and is intended for + developer testing only. 
+ + + + post_auth_delay (integer) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a5c74fdab8..18e16ae5b3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2926,6 +2926,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, XLogSegNo max_segno; int fd; int save_errno; + int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY; Assert(logtli != 0); @@ -2959,8 +2960,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, unlink(tmppath); + if (io_direct_flags & IO_DIRECT_WAL_INIT) + open_flags |= PG_O_DIRECT; + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ - fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + fd = BasicOpenFile(tmppath, open_flags); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -3354,7 +3358,7 @@ XLogFileClose(void) * use the cache to read the WAL segment. */ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - if (!XLogIsNeeded()) + if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0) (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); #endif @@ -4445,7 +4449,6 @@ show_in_hot_standby(void) return RecoveryInProgress() ? "on" : "off"; } - /* * Read the control file, set respective GUCs. * @@ -8029,35 +8032,27 @@ xlog_redo(XLogReaderState *record) } /* - * Return the (possible) sync flag used for opening a file, depending on the - * value of the GUC wal_sync_method. + * Return the extra open flags used for opening a file, depending on the + * value of the GUCs wal_sync_method, fsync and io_direct. */ static int get_sync_bit(int method) { int o_direct_flag = 0; - /* If fsync is disabled, never open in sync mode */ - if (!enableFsync) - return 0; - /* - * Optimize writes by bypassing kernel cache with O_DIRECT when using - * O_SYNC and O_DSYNC. 
But only if archiving and streaming are disabled, - * otherwise the archive command or walsender process will read the WAL - * soon after writing it, which is guaranteed to cause a physical read if - * we bypassed the kernel cache. We also skip the - * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same - * reason. - * - * Never use O_DIRECT in walreceiver process for similar reasons; the WAL + * Use O_DIRECT if requested, except in walreceiver process. The WAL * written by walreceiver is normally read by the startup process soon - * after it's written. Also, walreceiver performs unaligned writes, which + * after it's written. Also, walreceiver performs unaligned writes, which * don't work with O_DIRECT, so it is required for correctness too. */ - if (!XLogIsNeeded() && !AmWalReceiverProcess()) + if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess()) o_direct_flag = PG_O_DIRECT; + /* If fsync is disabled, never open in sync mode */ + if (!enableFsync) + return o_direct_flag; + switch (method) { /* @@ -8069,7 +8064,7 @@ get_sync_bit(int method) case SYNC_METHOD_FSYNC: case SYNC_METHOD_FSYNC_WRITETHROUGH: case SYNC_METHOD_FDATASYNC: - return 0; + return o_direct_flag; #ifdef O_SYNC case SYNC_METHOD_OPEN: return O_SYNC | o_direct_flag; diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 046e40d143..7ba18f2a76 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -785,7 +785,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) block->prefetch_buffer = InvalidBuffer; return LRQ_NEXT_IO; } - else + else if ((io_direct_flags & IO_DIRECT_DATA) == 0) { /* * This shouldn't be possible, because we already determined diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5a237d5606..7778dde3e5 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ 
b/src/backend/storage/buffer/bufmgr.c @@ -541,8 +541,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, * Try to initiate an asynchronous read. This returns false in * recovery if the relation file doesn't exist. */ - if (smgrprefetch(smgr_reln, forkNum, blockNum)) + if ((io_direct_flags & IO_DIRECT_DATA) == 0 && + smgrprefetch(smgr_reln, forkNum, blockNum)) + { result.initiated_io = true; + } #endif /* USE_PREFETCH */ } else @@ -588,11 +591,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, * the kernel and therefore didn't really initiate I/O, and no way to know when * the I/O completes other than using synchronous ReadBuffer(). * - * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either + * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and * USE_PREFETCH is not defined (this build doesn't support prefetching due to - * lack of a kernel facility), or the underlying relation file wasn't found and - * we are in recovery. (If the relation file wasn't found and we are not in - * recovery, an error is raised). + * lack of a kernel facility), direct I/O is enabled, or the underlying + * relation file wasn't found and we are in recovery. (If the relation file + * wasn't found and we are not in recovery, an error is raised). */ PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) @@ -5440,6 +5443,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag) { PendingWriteback *pending; + if (io_direct_flags & IO_DIRECT_DATA) + return; + /* * Add buffer to the pending writeback array, unless writeback control is * disabled. 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3c6382456a..f684862d98 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -92,8 +92,11 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, { #ifdef USE_PREFETCH /* Not in buffers, so initiate prefetch */ - smgrprefetch(smgr, forkNum, blockNum); - result.initiated_io = true; + if ((io_direct_flags & IO_DIRECT_DATA) == 0 && + smgrprefetch(smgr, forkNum, blockNum)) + { + result.initiated_io = true; + } #endif /* USE_PREFETCH */ } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index a280a1e7be..277a28fc13 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -98,7 +98,9 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "utils/guc.h" +#include "utils/guc_hooks.h" #include "utils/resowner_private.h" +#include "utils/varlena.h" /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ #if defined(HAVE_SYNC_FILE_RANGE) @@ -162,6 +164,9 @@ bool data_sync_retry = false; /* How SyncDataDirectory() should do its job. */ int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; +/* Which kinds of files should be opened with PG_O_DIRECT. */ +int io_direct_flags; + /* Debugging.... */ #ifdef FDDEBUG @@ -2022,6 +2027,9 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) if (nbytes <= 0) return; + if (VfdCache[file].fileFlags & PG_O_DIRECT) + return; + returnCode = FileAccess(file); if (returnCode < 0) return; @@ -3826,3 +3834,93 @@ data_sync_elevel(int elevel) { return data_sync_retry ? 
elevel : PANIC; } + +bool +check_io_direct(char **newval, void **extra, GucSource source) +{ + bool result = true; + int flags; + +#if PG_O_DIRECT == 0 + if (strcmp(*newval, "") != 0) + { + GUC_check_errdetail("io_direct is not supported on this platform."); + result = false; + } + flags = 0; +#else + List *elemlist; + ListCell *l; + char *rawstring; + + /* Need a modifiable copy of string */ + rawstring = pstrdup(*newval); + + if (!SplitGUCList(rawstring, ',', &elemlist)) + { + GUC_check_errdetail("invalid list syntax in parameter \"%s\"", + "io_direct"); + pfree(rawstring); + list_free(elemlist); + return false; + } + + flags = 0; + foreach(l, elemlist) + { + char *item = (char *) lfirst(l); + + if (pg_strcasecmp(item, "data") == 0) + flags |= IO_DIRECT_DATA; + else if (pg_strcasecmp(item, "wal") == 0) + flags |= IO_DIRECT_WAL; + else if (pg_strcasecmp(item, "wal_init") == 0) + flags |= IO_DIRECT_WAL_INIT; + else + { + GUC_check_errdetail("invalid option \"%s\"", item); + result = false; + break; + } + } + + /* + * It's possible to configure block sizes smaller than our assumed I/O + * alignment size, which could result in invalid I/O requests. 
+ */ +#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE + if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT))) + { + GUC_check_errdetail("io_direct is not supported for WAL because XLOG_BLCKSZ is too small"); + result = false; + } +#endif +#if BLCKSZ < PG_IO_ALIGN_SIZE + if (result && (flags & IO_DIRECT_DATA)) + { + GUC_check_errdetail("io_direct is not supported for data because BLCKSZ is too small"); + result = false; + } +#endif + + pfree(rawstring); + list_free(elemlist); +#endif + + if (!result) + return result; + + /* Save the flags in *extra, for use by assign_io_direct */ + *extra = guc_malloc(ERROR, sizeof(int)); + *((int *) *extra) = flags; + + return result; +} + +extern void +assign_io_direct(const char *newval, void *extra) +{ + int *flags = (int *) extra; + + io_direct_flags = *flags; +} diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index d1124d46f4..f1316eb4ce 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -142,6 +142,16 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); +static inline int +_mdfd_open_flags(void) +{ + int flags = O_RDWR | PG_BINARY; + + if (io_direct_flags & IO_DIRECT_DATA) + flags |= PG_O_DIRECT; + + return flags; +} /* * mdinit() -- Initialize private state for magnetic disk storage manager. 
@@ -205,14 +215,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) path = relpath(reln->smgr_rlocator, forknum); - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL); if (fd < 0) { int save_errno = errno; if (isRedo) - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags()); if (fd < 0) { /* be sure to report the error reported by create, not open */ @@ -635,7 +645,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) path = relpath(reln->smgr_rlocator, forknum); - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags()); if (fd < 0) { @@ -706,6 +716,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) off_t seekpos; MdfdVec *v; + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + v = _mdfd_getseg(reln, forknum, blocknum, false, InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); if (v == NULL) @@ -731,6 +743,8 @@ void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + /* * Issue flush requests in as few requests as possible; have to split at * segment boundaries though, since those are actually separate files. 
@@ -1335,7 +1349,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, fullpath = _mdfd_segpath(reln, forknum, segno); /* open the file */ - fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags); + fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags); pfree(fullpath); @@ -1546,7 +1560,7 @@ mdsyncfiletag(const FileTag *ftag, char *path) strlcpy(path, p, MAXPGPATH); pfree(p); - file = PathNameOpenFile(path, O_RDWR | PG_BINARY); + file = PathNameOpenFile(path, _mdfd_open_flags()); if (file < 0) return -1; need_to_close = true; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index c37c246b77..70d0d570b1 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -20,6 +20,7 @@ #include "access/xlogutils.h" #include "lib/ilist.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/md.h" #include "storage/smgr.h" diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 97edc61a14..cab3ddbe11 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -568,6 +568,7 @@ static char *locale_ctype; static char *server_encoding_string; static char *server_version_string; static int server_version_num; +static char *io_direct_string; #ifdef HAVE_SYSLOG #define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0 @@ -4575,6 +4576,17 @@ struct config_string ConfigureNamesString[] = check_backtrace_functions, assign_backtrace_functions, NULL }, + { + {"io_direct", PGC_POSTMASTER, DEVELOPER_OPTIONS, + gettext_noop("Use direct I/O for file access."), + NULL, + GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE + }, + &io_direct_string, + "", + check_io_direct, assign_io_direct, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index faac4914fe..6791a406fc 100644 --- a/src/include/storage/fd.h +++ 
b/src/include/storage/fd.h @@ -44,6 +44,7 @@ #define FD_H #include <dirent.h> +#include <fcntl.h> typedef enum RecoveryInitSyncMethod { @@ -54,10 +55,16 @@ typedef enum RecoveryInitSyncMethod typedef int File; +#define IO_DIRECT_DATA 0x01 +#define IO_DIRECT_WAL 0x02 +#define IO_DIRECT_WAL_INIT 0x04 + + /* GUC parameter */ extern PGDLLIMPORT int max_files_per_process; extern PGDLLIMPORT bool data_sync_retry; extern PGDLLIMPORT int recovery_init_sync_method; +extern PGDLLIMPORT int io_direct_flags; /* * This is private to fd.c, but exported for save/restore_backend_variables() diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a9a179aaba..17fba6f91a 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -17,6 +17,7 @@ #include "lib/ilist.h" #include "storage/block.h" #include "storage/relfilelocator.h" +#include "utils/guc.h" /* * smgr.c maintains a table of SMgrRelation objects, which are essentially diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index f722fb250a..a82a85c940 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -156,5 +156,7 @@ extern bool check_wal_consistency_checking(char **newval, void **extra, GucSource source); extern void assign_wal_consistency_checking(const char *newval, void *extra); extern void assign_xlog_sync_method(int new_sync_method, void *extra); +extern bool check_io_direct(char **newval, void **extra, GucSource source); +extern void assign_io_direct(const char *newval, void *extra); #endif /* GUC_HOOKS_H */ diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build index 21bde427b4..911084ac0f 100644 --- a/src/test/modules/test_misc/meson.build +++ b/src/test/modules/test_misc/meson.build @@ -9,6 +9,7 @@ tests += { 't/001_constraint_validation.pl', 't/002_tablespace.pl', 't/003_check_guc.pl', + 't/004_io_direct.pl', ], }, } diff --git a/src/test/modules/test_misc/t/004_io_direct.pl
b/src/test/modules/test_misc/t/004_io_direct.pl new file mode 100644 index 0000000000..f5bf0b11e4 --- /dev/null +++ b/src/test/modules/test_misc/t/004_io_direct.pl @@ -0,0 +1,57 @@ +# Very simple exercise of direct I/O GUC. + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Systems that we know to have direct I/O support, and whose typical local +# filesystems support it or at least won't fail with an error. (illumos should +# probably be in this list, but perl reports it as solaris. Solaris should not +# be in the list because we don't support its way of turning on direct I/O, and +# even if we did, its version of ZFS rejects it, and OpenBSD just doesn't have +# it.) +if (!grep { $^O eq $_ } qw(aix darwin dragonfly freebsd linux MSWin32 netbsd)) +{ + plan skip_all => "no direct I/O support"; +} + +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->append_conf( + 'postgresql.conf', qq{ +io_direct = 'data,wal,wal_init' +shared_buffers = '256kB' # tiny to force I/O +}); +$node->start; + +# Do some work that is bound to generate shared and local writes and reads as a +# simple exercise. 
+$node->safe_psql('postgres', + 'create table t1 as select 1 as i from generate_series(1, 10000)'); +$node->safe_psql('postgres', 'create table t2count (i int)'); +$node->safe_psql( + 'postgres', qq{ +begin; +create temporary table t2 as select 1 as i from generate_series(1, 10000); +update t2 set i = i; +insert into t2count select count(*) from t2; +commit; +}); +$node->safe_psql('postgres', 'update t1 set i = i'); +is( '10000', + $node->safe_psql('postgres', 'select count(*) from t1'), + "read back from shared"); +is( '10000', + $node->safe_psql('postgres', 'select * from t2count'), + "read back from local"); +$node->stop('immediate'); + +$node->start; +is( '10000', + $node->safe_psql('postgres', 'select count(*) from t1'), + "read back from shared after crash recovery"); +$node->stop; + +done_testing();