Restore pg_pread and friends.

Commits cf112c12 and a0dc8271 were a little too hasty in getting rid of
the pg_ prefixes where we use pread(), pwrite() and vectored variants.

We dropped support for ancient Unixes where we needed to use lseek() to
implement replacements for those, but it turns out that Windows also
changes the current position even when you pass in an offset to
ReadFile() and WriteFile() if the file handle is synchronous, despite
its documentation saying otherwise.

Switching to asynchronous file handles would fix that, but have other
complications.  For now let's just put back the pg_ prefix and add some
comments to highlight the non-standard side-effect, which we can now
describe as Windows-only.

Reported-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Discussion: https://postgr.es/m/20220923202439.GA1156054%40nathanxps13
This commit is contained in:
Thomas Munro 2022-09-29 13:12:11 +13:00
parent 3a5817695a
commit b6d8a60aba
19 changed files with 75 additions and 55 deletions

View File

@ -2103,9 +2103,9 @@ qtext_store(const char *query, int query_len,
if (fd < 0)
goto error;
if (pwrite(fd, query, query_len, off) != query_len)
if (pg_pwrite(fd, query, query_len, off) != query_len)
goto error;
if (pwrite(fd, "\0", 1, off + query_len) != 1)
if (pg_pwrite(fd, "\0", 1, off + query_len) != 1)
goto error;
CloseTransientFile(fd);

View File

@ -1150,7 +1150,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
/* write out tail end of mapping file (again) */
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE);
if (pwrite(fd, data, len, xlrec->offset) != len)
if (pg_pwrite(fd, data, len, xlrec->offset) != len)
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)

View File

@ -718,7 +718,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
if (pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
{
pgstat_report_wait_end();
slru_errcause = SLRU_READ_FAILED;
@ -873,7 +873,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata)
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
{
pgstat_report_wait_end();
/* if write didn't set errno, assume problem is no disk space */

View File

@ -2196,7 +2196,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
INSTR_TIME_SET_CURRENT(start);
pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = pwrite(openLogFile, from, nleft, startoffset);
written = pg_pwrite(openLogFile, from, nleft, startoffset);
pgstat_report_wait_end();
/*
@ -3018,7 +3018,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
* enough.
*/
errno = 0;
if (pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
{
/* if write didn't set errno, assume no disk space */
save_errno = errno ? errno : ENOSPC;

View File

@ -1544,7 +1544,7 @@ WALRead(XLogReaderState *state,
/* Reset errno first; eases reporting non-errno-affecting errors */
errno = 0;
readbytes = pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
#ifndef FRONTEND
pgstat_report_wait_end();

View File

@ -3271,7 +3271,7 @@ retry:
readOff = targetPageOff;
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
r = pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
if (r != XLOG_BLCKSZ)
{
char fname[MAXFNAMELEN];

View File

@ -1828,7 +1828,7 @@ basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
int rc;
pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
rc = pread(fd, buf, nbytes, offset);
rc = pg_pread(fd, buf, nbytes, offset);
pgstat_report_wait_end();
if (rc < 0)

View File

@ -915,7 +915,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli)
/* OK to write the logs */
errno = 0;
byteswritten = pwrite(recvFile, buf, segbytes, (off_t) startoff);
byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff);
if (byteswritten <= 0)
{
char xlogfname[MAXFNAMELEN];

View File

@ -2053,7 +2053,7 @@ FileRead(File file, char *buffer, int amount, off_t offset,
retry:
pgstat_report_wait_start(wait_event_info);
returnCode = pread(vfdP->fd, buffer, amount, offset);
returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
pgstat_report_wait_end();
if (returnCode < 0)
@ -2135,7 +2135,7 @@ FileWrite(File file, char *buffer, int amount, off_t offset,
retry:
errno = 0;
pgstat_report_wait_start(wait_event_info);
returnCode = pwrite(VfdCache[file].fd, buffer, amount, offset);
returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
pgstat_report_wait_end();
/* if write didn't set errno, assume problem is no disk space */
@ -3740,7 +3740,7 @@ data_sync_elevel(int elevel)
}
/*
* A convenience wrapper for pwritev() that retries on partial write. If an
* A convenience wrapper for pg_pwritev() that retries on partial write. If an
* error is returned, it is unspecified how much has been written.
*/
ssize_t
@ -3760,7 +3760,7 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
for (;;)
{
/* Write as much as we can. */
part = pwritev(fd, iov, iovcnt, offset);
part = pg_pwritev(fd, iov, iovcnt, offset);
if (part < 0)
return -1;

View File

@ -1527,7 +1527,7 @@ AddToDataDirLockFile(int target_line, const char *str)
len = strlen(destbuffer);
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE);
if (pwrite(fd, destbuffer, len, 0) != len)
if (pg_pwrite(fd, destbuffer, len, 0) != len)
{
pgstat_report_wait_end();
/* if write didn't set errno, assume problem is no disk space */

View File

@ -312,10 +312,10 @@ test_sync(int writes_per_op)
for (ops = 0; alarm_triggered == false; ops++)
{
for (writes = 0; writes < writes_per_op; writes++)
if (pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
if (pg_pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
die("write failed");
}
STOP_TIMER;
@ -337,10 +337,10 @@ test_sync(int writes_per_op)
for (ops = 0; alarm_triggered == false; ops++)
{
for (writes = 0; writes < writes_per_op; writes++)
if (pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
if (pg_pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
die("write failed");
fdatasync(tmpfile);
}
@ -359,10 +359,10 @@ test_sync(int writes_per_op)
for (ops = 0; alarm_triggered == false; ops++)
{
for (writes = 0; writes < writes_per_op; writes++)
if (pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
if (pg_pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
die("write failed");
if (fsync(tmpfile) != 0)
die("fsync failed");
@ -383,10 +383,10 @@ test_sync(int writes_per_op)
for (ops = 0; alarm_triggered == false; ops++)
{
for (writes = 0; writes < writes_per_op; writes++)
if (pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
if (pg_pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
die("write failed");
if (pg_fsync_writethrough(tmpfile) != 0)
die("fsync failed");
@ -415,10 +415,10 @@ test_sync(int writes_per_op)
for (ops = 0; alarm_triggered == false; ops++)
{
for (writes = 0; writes < writes_per_op; writes++)
if (pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
if (pg_pwrite(tmpfile,
buf,
XLOG_BLCKSZ,
writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
/*
* This can generate write failures if the filesystem has
@ -480,10 +480,10 @@ test_open_sync(const char *msg, int writes_size)
for (ops = 0; alarm_triggered == false; ops++)
{
for (writes = 0; writes < 16 / writes_size; writes++)
if (pwrite(tmpfile,
buf,
writes_size * 1024,
writes * writes_size * 1024) !=
if (pg_pwrite(tmpfile,
buf,
writes_size * 1024,
writes * writes_size * 1024) !=
writes_size * 1024)
die("write failed");
}
@ -582,7 +582,7 @@ test_non_sync(void)
START_TIMER;
for (ops = 0; alarm_triggered == false; ops++)
{
if (pwrite(tmpfile, buf, XLOG_BLCKSZ, 0) != XLOG_BLCKSZ)
if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, 0) != XLOG_BLCKSZ)
die("write failed");
}
STOP_TIMER;

View File

@ -378,11 +378,11 @@ extern void XLogReaderResetError(XLogReaderState *state);
/*
* Error information from WALRead that both backend and frontend caller can
* process. Currently only errors from pread can be reported.
* process. Currently only errors from pg_pread can be reported.
*/
typedef struct WALReadError
{
int wre_errno; /* errno set by the last pread() */
int wre_errno; /* errno set by the last pg_pread() */
int wre_off; /* Offset we tried to read from. */
int wre_req; /* Bytes requested to be read. */
int wre_read; /* Bytes read by the last read(). */

View File

@ -213,6 +213,15 @@ extern int pg_fprintf(FILE *stream, const char *fmt,...) pg_attribute_printf(2,
extern int pg_vprintf(const char *fmt, va_list args) pg_attribute_printf(1, 0);
extern int pg_printf(const char *fmt,...) pg_attribute_printf(1, 2);
#ifndef WIN32
/*
* We add a pg_ prefix as a warning that the Windows implementations have the
* non-standard side-effect of changing the current file position.
*/
#define pg_pread pread
#define pg_pwrite pwrite
#endif
/*
* We use __VA_ARGS__ for printf to prevent replacing references to
* the "printf" format archetype in format() attribute declarations.

View File

@ -35,12 +35,21 @@ struct iovec
/* Define a reasonable maximum that is safe to use on the stack. */
#define PG_IOV_MAX Min(IOV_MAX, 32)
#if !HAVE_DECL_PREADV
extern ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
/*
* Note that pg_preadv and pg_writev have a pg_ prefix as a warning that the
* Windows implementations have the side-effect of changing the file position.
*/
#if HAVE_DECL_PREADV
#define pg_preadv preadv
#else
extern ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
#endif
#if !HAVE_DECL_PWRITEV
extern ssize_t pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
#if HAVE_DECL_PWRITEV
#define pg_pwritev pwritev
#else
extern ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
#endif
#endif /* PG_IOVEC_H */

View File

@ -564,9 +564,9 @@ typedef unsigned short mode_t;
#endif
/* in port/win32pread.c */
extern ssize_t pread(int fd, void *buf, size_t nbyte, off_t offset);
extern ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset);
/* in port/win32pwrite.c */
extern ssize_t pwrite(int fd, const void *buf, size_t nbyte, off_t offset);
extern ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset);
#endif /* PG_WIN32_PORT_H */

View File

@ -19,14 +19,14 @@
#include "port/pg_iovec.h"
ssize_t
preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
ssize_t sum = 0;
ssize_t part;
for (int i = 0; i < iovcnt; ++i)
{
part = pread(fd, iov[i].iov_base, iov[i].iov_len, offset);
part = pg_pread(fd, iov[i].iov_base, iov[i].iov_len, offset);
if (part < 0)
{
if (i == 0)

View File

@ -19,14 +19,14 @@
#include "port/pg_iovec.h"
ssize_t
pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
ssize_t sum = 0;
ssize_t part;
for (int i = 0; i < iovcnt; ++i)
{
part = pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
part = pg_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
if (part < 0)
{
if (i == 0)

View File

@ -17,7 +17,7 @@
#include <windows.h>
ssize_t
pread(int fd, void *buf, size_t size, off_t offset)
pg_pread(int fd, void *buf, size_t size, off_t offset)
{
OVERLAPPED overlapped = {0};
HANDLE handle;
@ -30,6 +30,7 @@ pread(int fd, void *buf, size_t size, off_t offset)
return -1;
}
/* Note that this changes the file position, despite not using it. */
overlapped.Offset = offset;
if (!ReadFile(handle, buf, size, &result, &overlapped))
{

View File

@ -17,7 +17,7 @@
#include <windows.h>
ssize_t
pwrite(int fd, const void *buf, size_t size, off_t offset)
pg_pwrite(int fd, const void *buf, size_t size, off_t offset)
{
OVERLAPPED overlapped = {0};
HANDLE handle;
@ -30,6 +30,7 @@ pwrite(int fd, const void *buf, size_t size, off_t offset)
return -1;
}
/* Note that this changes the file position, despite not using it. */
overlapped.Offset = offset;
if (!WriteFile(handle, buf, size, &result, &overlapped))
{