Add io_direct setting (developer-only).

Provide a way to ask the kernel to use O_DIRECT (or local equivalent)
where available for data and WAL files, to avoid or minimize kernel
caching.  This hurts performance currently and is not intended for end
users yet.  Later proposed work would introduce our own I/O clustering,
read-ahead, etc to replace the facilities the kernel disables with this
option.

The only user-visible change, if the developer-only GUC is not used, is
that this commit also removes the obscure logic that would activate
O_DIRECT for the WAL when wal_sync_method=open_[data]sync and
wal_level=minimal (which also requires max_wal_senders=0).  Those are
non-default and unlikely settings, and this behavior wasn't (correctly)
documented.  The same effect can be achieved with io_direct=wal.

Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Author: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg%40mail.gmail.com
This commit is contained in:
Thomas Munro 2023-04-08 11:04:49 +12:00
parent faeedbcefd
commit d4e71df6d7
14 changed files with 263 additions and 35 deletions

View File

@ -3172,7 +3172,6 @@ include_dir 'conf.d'
</listitem>
</itemizedlist>
<para>
The <literal>open_</literal>* options also use <literal>O_DIRECT</literal> if available.
Not all of these choices are available on all platforms.
The default is the first method in the above list that is supported
by the platform, except that <literal>fdatasync</literal> is the default on
@ -11256,6 +11255,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
</listitem>
</varlistentry>
<varlistentry id="guc-io-direct" xreflabel="io_direct">
<term><varname>io_direct</varname> (<type>string</type>)
<indexterm>
<primary><varname>io_direct</varname> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
Ask the kernel to minimize caching effects for relation data and WAL
files using <literal>O_DIRECT</literal> (most Unix-like systems),
<literal>F_NOCACHE</literal> (macOS) or
<literal>FILE_FLAG_NO_BUFFERING</literal> (Windows).
</para>
<para>
May be set to an empty string (the default) to disable use of direct
I/O, or a comma-separated list of operations that should use direct I/O.
The valid options are <literal>data</literal> for
main data files, <literal>wal</literal> for WAL files, and
<literal>wal_init</literal> for WAL files when being initially
allocated.
</para>
<para>
Some operating systems and file systems do not support direct I/O, so
non-default settings may be rejected at startup or cause errors.
</para>
<para>
Currently this feature reduces performance, and is intended for
developer testing only.
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-post-auth-delay" xreflabel="post_auth_delay">
<term><varname>post_auth_delay</varname> (<type>integer</type>)
<indexterm>

View File

@ -2926,6 +2926,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
XLogSegNo max_segno;
int fd;
int save_errno;
int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
Assert(logtli != 0);
@ -2959,8 +2960,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
unlink(tmppath);
if (io_direct_flags & IO_DIRECT_WAL_INIT)
open_flags |= PG_O_DIRECT;
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
fd = BasicOpenFile(tmppath, open_flags);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
@ -3354,7 +3358,7 @@ XLogFileClose(void)
* use the cache to read the WAL segment.
*/
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
if (!XLogIsNeeded())
if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif
@ -4445,7 +4449,6 @@ show_in_hot_standby(void)
return RecoveryInProgress() ? "on" : "off";
}
/*
* Read the control file, set respective GUCs.
*
@ -8029,35 +8032,27 @@ xlog_redo(XLogReaderState *record)
}
/*
* Return the (possible) sync flag used for opening a file, depending on the
* value of the GUC wal_sync_method.
* Return the extra open flags used for opening a file, depending on the
* value of the GUCs wal_sync_method, fsync and io_direct.
*/
static int
get_sync_bit(int method)
{
int o_direct_flag = 0;
/* If fsync is disabled, never open in sync mode */
if (!enableFsync)
return 0;
/*
* Optimize writes by bypassing kernel cache with O_DIRECT when using
* O_SYNC and O_DSYNC. But only if archiving and streaming are disabled,
* otherwise the archive command or walsender process will read the WAL
* soon after writing it, which is guaranteed to cause a physical read if
* we bypassed the kernel cache. We also skip the
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
* reason.
*
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
* Use O_DIRECT if requested, except in walreceiver process. The WAL
* written by walreceiver is normally read by the startup process soon
* after it's written. Also, walreceiver performs unaligned writes, which
* after it's written. Also, walreceiver performs unaligned writes, which
* don't work with O_DIRECT, so it is required for correctness too.
*/
if (!XLogIsNeeded() && !AmWalReceiverProcess())
if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
o_direct_flag = PG_O_DIRECT;
/* If fsync is disabled, never open in sync mode */
if (!enableFsync)
return o_direct_flag;
switch (method)
{
/*
@ -8069,7 +8064,7 @@ get_sync_bit(int method)
case SYNC_METHOD_FSYNC:
case SYNC_METHOD_FSYNC_WRITETHROUGH:
case SYNC_METHOD_FDATASYNC:
return 0;
return o_direct_flag;
#ifdef O_SYNC
case SYNC_METHOD_OPEN:
return O_SYNC | o_direct_flag;

View File

@ -785,7 +785,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
block->prefetch_buffer = InvalidBuffer;
return LRQ_NEXT_IO;
}
else
else if ((io_direct_flags & IO_DIRECT_DATA) == 0)
{
/*
* This shouldn't be possible, because we already determined

View File

@ -541,8 +541,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
* Try to initiate an asynchronous read. This returns false in
* recovery if the relation file doesn't exist.
*/
if (smgrprefetch(smgr_reln, forkNum, blockNum))
if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
smgrprefetch(smgr_reln, forkNum, blockNum))
{
result.initiated_io = true;
}
#endif /* USE_PREFETCH */
}
else
@ -588,11 +591,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
* the kernel and therefore didn't really initiate I/O, and no way to know when
* the I/O completes other than using synchronous ReadBuffer().
*
* 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
* 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
* USE_PREFETCH is not defined (this build doesn't support prefetching due to
* lack of a kernel facility), or the underlying relation file wasn't found and
* we are in recovery. (If the relation file wasn't found and we are not in
* recovery, an error is raised).
* lack of a kernel facility), direct I/O is enabled, or the underlying
* relation file wasn't found and we are in recovery. (If the relation file
* wasn't found and we are not in recovery, an error is raised).
*/
PrefetchBufferResult
PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
@ -5440,6 +5443,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
{
PendingWriteback *pending;
if (io_direct_flags & IO_DIRECT_DATA)
return;
/*
* Add buffer to the pending writeback array, unless writeback control is
* disabled.

View File

@ -92,8 +92,11 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
{
#ifdef USE_PREFETCH
/* Not in buffers, so initiate prefetch */
smgrprefetch(smgr, forkNum, blockNum);
result.initiated_io = true;
if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
smgrprefetch(smgr, forkNum, blockNum))
{
result.initiated_io = true;
}
#endif /* USE_PREFETCH */
}

View File

@ -98,7 +98,9 @@
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/guc_hooks.h"
#include "utils/resowner_private.h"
#include "utils/varlena.h"
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
#if defined(HAVE_SYNC_FILE_RANGE)
@ -162,6 +164,9 @@ bool data_sync_retry = false;
/* How SyncDataDirectory() should do its job. */
int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
/* Which kinds of files should be opened with PG_O_DIRECT. */
int io_direct_flags;
/* Debugging.... */
#ifdef FDDEBUG
@ -2022,6 +2027,9 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
if (nbytes <= 0)
return;
if (VfdCache[file].fileFlags & PG_O_DIRECT)
return;
returnCode = FileAccess(file);
if (returnCode < 0)
return;
@ -3826,3 +3834,93 @@ data_sync_elevel(int elevel)
{
return data_sync_retry ? elevel : PANIC;
}
/*
 * GUC check hook for io_direct.
 *
 * Parses the comma-separated list in *newval into a bitmask of IO_DIRECT_*
 * flags.  On success, the bitmask is stored in a guc_malloc'd int handed back
 * through *extra, for use by assign_io_direct, and true is returned.
 *
 * Returns false if:
 *  - PG_O_DIRECT is 0 in this build and a non-empty value was given,
 *  - the list cannot be split, or names an option other than "data", "wal"
 *    or "wal_init" (matched case-insensitively),
 *  - a requested kind of direct I/O is unusable because XLOG_BLCKSZ or BLCKSZ
 *    is smaller than PG_IO_ALIGN_SIZE, or
 *  - memory for *extra cannot be allocated.
 */
bool
check_io_direct(char **newval, void **extra, GucSource source)
{
	bool		result = true;
	int			flags;

#if PG_O_DIRECT == 0
	/* Only the empty (disabled) setting is acceptable without PG_O_DIRECT. */
	if (strcmp(*newval, "") != 0)
	{
		GUC_check_errdetail("io_direct is not supported on this platform.");
		result = false;
	}
	flags = 0;
#else
	List	   *elemlist;
	ListCell   *l;
	char	   *rawstring;

	/* Need a modifiable copy of string */
	rawstring = pstrdup(*newval);

	if (!SplitGUCList(rawstring, ',', &elemlist))
	{
		GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
							"io_direct");
		pfree(rawstring);
		list_free(elemlist);
		return false;
	}

	/* Translate each list element into its flag bit. */
	flags = 0;
	foreach(l, elemlist)
	{
		char	   *item = (char *) lfirst(l);

		if (pg_strcasecmp(item, "data") == 0)
			flags |= IO_DIRECT_DATA;
		else if (pg_strcasecmp(item, "wal") == 0)
			flags |= IO_DIRECT_WAL;
		else if (pg_strcasecmp(item, "wal_init") == 0)
			flags |= IO_DIRECT_WAL_INIT;
		else
		{
			GUC_check_errdetail("invalid option \"%s\"", item);
			result = false;
			break;
		}
	}

	/*
	 * It's possible to configure block sizes smaller than our assumed I/O
	 * alignment size, which could result in invalid I/O requests.
	 */
#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
	if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
	{
		GUC_check_errdetail("io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
		result = false;
	}
#endif
#if BLCKSZ < PG_IO_ALIGN_SIZE
	if (result && (flags & IO_DIRECT_DATA))
	{
		GUC_check_errdetail("io_direct is not supported for data because BLCKSZ is too small");
		result = false;
	}
#endif

	pfree(rawstring);
	list_free(elemlist);
#endif

	if (!result)
		return result;

	/*
	 * Save the flags in *extra, for use by assign_io_direct.
	 *
	 * Allocate at LOG rather than ERROR: a check hook reports failure by
	 * returning false, so an out-of-memory condition must reject the value
	 * instead of throwing out of the hook.
	 */
	*extra = guc_malloc(LOG, sizeof(int));
	if (*extra == NULL)
		return false;
	*((int *) *extra) = flags;

	return result;
}
/*
 * GUC assign hook for io_direct.
 *
 * "extra" points at the int bitmask of IO_DIRECT_* flags prepared by
 * check_io_direct; copy it into io_direct_flags.  newval is not examined.
 */
void
assign_io_direct(const char *newval, void *extra)
{
	io_direct_flags = *((int *) extra);
}

View File

@ -142,6 +142,16 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
/*
 * _mdfd_open_flags() -- Base open flags for relation files.
 *
 * Always O_RDWR | PG_BINARY; PG_O_DIRECT is added when the IO_DIRECT_DATA
 * bit is set in io_direct_flags.  Callers OR in anything else they need.
 */
static inline int
_mdfd_open_flags(void)
{
	return (io_direct_flags & IO_DIRECT_DATA)
		? (O_RDWR | PG_BINARY | PG_O_DIRECT)
		: (O_RDWR | PG_BINARY);
}
/*
* mdinit() -- Initialize private state for magnetic disk storage manager.
@ -205,14 +215,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
path = relpath(reln->smgr_rlocator, forknum);
fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL);
if (fd < 0)
{
int save_errno = errno;
if (isRedo)
fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
fd = PathNameOpenFile(path, _mdfd_open_flags());
if (fd < 0)
{
/* be sure to report the error reported by create, not open */
@ -635,7 +645,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
path = relpath(reln->smgr_rlocator, forknum);
fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
fd = PathNameOpenFile(path, _mdfd_open_flags());
if (fd < 0)
{
@ -706,6 +716,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
off_t seekpos;
MdfdVec *v;
Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
v = _mdfd_getseg(reln, forknum, blocknum, false,
InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
if (v == NULL)
@ -731,6 +743,8 @@ void
mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks)
{
Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
/*
* Issue flush requests in as few requests as possible; have to split at
* segment boundaries though, since those are actually separate files.
@ -1335,7 +1349,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
fullpath = _mdfd_segpath(reln, forknum, segno);
/* open the file */
fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags);
pfree(fullpath);
@ -1546,7 +1560,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
strlcpy(path, p, MAXPGPATH);
pfree(p);
file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
file = PathNameOpenFile(path, _mdfd_open_flags());
if (file < 0)
return -1;
need_to_close = true;

View File

@ -20,6 +20,7 @@
#include "access/xlogutils.h"
#include "lib/ilist.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/smgr.h"

View File

@ -568,6 +568,7 @@ static char *locale_ctype;
static char *server_encoding_string;
static char *server_version_string;
static int server_version_num;
static char *io_direct_string;
#ifdef HAVE_SYSLOG
#define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0
@ -4575,6 +4576,17 @@ struct config_string ConfigureNamesString[] =
check_backtrace_functions, assign_backtrace_functions, NULL
},
{
{"io_direct", PGC_POSTMASTER, DEVELOPER_OPTIONS,
gettext_noop("Use direct I/O for file access."),
NULL,
GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE
},
&io_direct_string,
"",
check_io_direct, assign_io_direct, NULL
},
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL

View File

@ -44,6 +44,7 @@
#define FD_H
#include <dirent.h>
#include <fcntl.h>
typedef enum RecoveryInitSyncMethod
{
@ -54,10 +55,16 @@ typedef enum RecoveryInitSyncMethod
typedef int File;
#define IO_DIRECT_DATA 0x01
#define IO_DIRECT_WAL 0x02
#define IO_DIRECT_WAL_INIT 0x04
/* GUC parameter */
extern PGDLLIMPORT int max_files_per_process;
extern PGDLLIMPORT bool data_sync_retry;
extern PGDLLIMPORT int recovery_init_sync_method;
extern PGDLLIMPORT int io_direct_flags;
/*
* This is private to fd.c, but exported for save/restore_backend_variables()

View File

@ -17,6 +17,7 @@
#include "lib/ilist.h"
#include "storage/block.h"
#include "storage/relfilelocator.h"
#include "utils/guc.h"
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially

View File

@ -156,5 +156,7 @@ extern bool check_wal_consistency_checking(char **newval, void **extra,
GucSource source);
extern void assign_wal_consistency_checking(const char *newval, void *extra);
extern void assign_xlog_sync_method(int new_sync_method, void *extra);
extern bool check_io_direct(char **newval, void **extra, GucSource source);
extern void assign_io_direct(const char *newval, void *extra);
#endif /* GUC_HOOKS_H */

View File

@ -9,6 +9,7 @@ tests += {
't/001_constraint_validation.pl',
't/002_tablespace.pl',
't/003_check_guc.pl',
't/004_io_direct.pl',
],
},
}

View File

@ -0,0 +1,57 @@
# Very simple exercise of direct I/O GUC.
#
# Starts a node with io_direct enabled for data, WAL and WAL initialization,
# runs a small workload against shared and temporary tables, and checks that
# the rows can be read back, including after an immediate shutdown and
# restart.
use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Systems that we know to have direct I/O support, and whose typical local
# filesystems support it or at least won't fail with an error.
#
# illumos should probably be in this list, but perl reports it as solaris.
# Solaris should not be in the list because we don't support its way of
# turning on direct I/O, and even if we did, its version of ZFS rejects it.
# OpenBSD just doesn't have it.
if (!grep { $^O eq $_ } qw(aix darwin dragonfly freebsd linux MSWin32 netbsd))
{
	plan skip_all => "no direct I/O support";
}

my $node = PostgreSQL::Test::Cluster->new('main');
$node->init;
$node->append_conf(
	'postgresql.conf', qq{
io_direct = 'data,wal,wal_init'
shared_buffers = '256kB' # tiny to force I/O
});
$node->start;

# Do some work that is bound to generate shared and local writes and reads as a
# simple exercise.
$node->safe_psql('postgres',
	'create table t1 as select 1 as i from generate_series(1, 10000)');
$node->safe_psql('postgres', 'create table t2count (i int)');
$node->safe_psql(
	'postgres', qq{
begin;
create temporary table t2 as select 1 as i from generate_series(1, 10000);
update t2 set i = i;
insert into t2count select count(*) from t2;
commit;
});
$node->safe_psql('postgres', 'update t1 set i = i');

# is() takes (got, expected, name); pass the query result first so that a
# failure reports the actual and expected values under the right labels.
is( $node->safe_psql('postgres', 'select count(*) from t1'),
	'10000', "read back from shared");
is( $node->safe_psql('postgres', 'select * from t2count'),
	'10000', "read back from local");

# An immediate stop skips the shutdown checkpoint, so the next start has to
# go through crash recovery.
$node->stop('immediate');
$node->start;
is( $node->safe_psql('postgres', 'select count(*) from t1'),
	'10000', "read back from shared after crash recovery");
$node->stop;

done_testing();