mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-10-01 12:21:18 +02:00
faeedbcefd
In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a
later commit, we need the addresses of user space buffers to be well
aligned. The exact requirements vary by OS and file system (typically
sectors and/or memory pages). The address alignment size is set to
4096, which is enough for currently known systems: it matches modern
sectors and common memory page size. There is no standard governing
O_DIRECT's requirements so we might eventually have to reconsider this
with more information from the field or future systems.
Aligning I/O buffers on memory pages is also known to improve regular
buffered I/O performance.
Three classes of I/O buffers for regular data pages are adjusted:
(1) Heap buffers are now allocated with the new palloc_aligned() or
MemoryContextAllocAligned() functions introduced by commit 439f6175.
(2) Stack buffers now use a new struct PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if possible with this compiler. (3) The buffer
pool is also aligned in shared memory.
WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for
XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus
for O_DIRECT WAL writes to fail to be well aligned, but that's a
pre-existing condition and will be addressed by a later commit.
BufFiles are not yet addressed (there's no current plan to use O_DIRECT
for those, but they could potentially get some incidental speedup even
in plain buffered I/O operations through better alignment).
If we can't align stack objects suitably using the compiler extensions
we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to
0. This avoids the need to consider systems that have O_DIRECT but
can't align stack objects the way we want; such systems could in theory
be supported with more work but we don't currently know of any such
machines, so it's easier to pretend there is no O_DIRECT support
instead. That's an existing and tested class of system.
Add assertions that all buffers passed into smgrread(), smgrwrite() and
smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack
alignment tricks may be unavailable) or the block size has been set too
small to allow arrays of buffers to be all aligned.
Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
378 lines
11 KiB
C
378 lines
11 KiB
C
/*
|
|
* file.c
|
|
*
|
|
* file system operations
|
|
*
|
|
* Copyright (c) 2010-2023, PostgreSQL Global Development Group
|
|
* src/bin/pg_upgrade/file.c
|
|
*/
|
|
|
|
#include "postgres_fe.h"
|
|
|
|
#include <sys/stat.h>
|
|
#include <fcntl.h>
|
|
#ifdef HAVE_COPYFILE_H
|
|
#include <copyfile.h>
|
|
#endif
|
|
#ifdef __linux__
|
|
#include <sys/ioctl.h>
|
|
#include <linux/fs.h>
|
|
#endif
|
|
|
|
#include "access/visibilitymapdefs.h"
|
|
#include "common/file_perm.h"
|
|
#include "pg_upgrade.h"
|
|
#include "storage/bufpage.h"
|
|
#include "storage/checksum.h"
|
|
#include "storage/checksum_impl.h"
|
|
|
|
|
|
/*
 * cloneFile()
 *
 * Create dst as a clone (reflink) of the relation file src.
 *
 * The mechanism is picked at compile time: copyfile() with
 * COPYFILE_CLONE_FORCE where that is available, otherwise the Linux
 * FICLONE ioctl.  When neither is available the function body is empty.
 *
 * schemaName and relName give the relation's SQL name; they are used only
 * to build error messages.  Every failure is reported with pg_fatal().
 */
void
cloneFile(const char *src, const char *dst,
		  const char *schemaName, const char *relName)
{
#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
	int			rc;

	rc = copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE);
	if (rc < 0)
		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s",
				 schemaName, relName, src, dst, strerror(errno));
#elif defined(__linux__) && defined(FICLONE)
	int			in_fd;
	int			out_fd;

	in_fd = open(src, O_RDONLY | PG_BINARY, 0);
	if (in_fd < 0)
		pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s",
				 schemaName, relName, src, strerror(errno));

	/* O_EXCL: the destination must not exist yet */
	out_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
				  pg_file_create_mode);
	if (out_fd < 0)
		pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s",
				 schemaName, relName, dst, strerror(errno));

	if (ioctl(out_fd, FICLONE, in_fd) < 0)
	{
		/* remember the ioctl's errno; unlink() below may overwrite it */
		int			clone_errno = errno;

		/* remove the destination file we just created */
		unlink(dst);

		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s",
				 schemaName, relName, src, dst, strerror(clone_errno));
	}

	close(in_fd);
	close(out_fd);
#endif
}
|
|
|
|
|
|
/*
|
|
* copyFile()
|
|
*
|
|
* Copies a relation file from src to dst.
|
|
* schemaName/relName are relation's SQL name (used for error messages only).
|
|
*/
|
|
void
|
|
copyFile(const char *src, const char *dst,
|
|
const char *schemaName, const char *relName)
|
|
{
|
|
#ifndef WIN32
|
|
int src_fd;
|
|
int dest_fd;
|
|
char *buffer;
|
|
|
|
if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
|
|
pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s",
|
|
schemaName, relName, src, strerror(errno));
|
|
|
|
if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
|
|
pg_file_create_mode)) < 0)
|
|
pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s",
|
|
schemaName, relName, dst, strerror(errno));
|
|
|
|
/* copy in fairly large chunks for best efficiency */
|
|
#define COPY_BUF_SIZE (50 * BLCKSZ)
|
|
|
|
buffer = (char *) pg_malloc(COPY_BUF_SIZE);
|
|
|
|
/* perform data copying i.e read src source, write to destination */
|
|
while (true)
|
|
{
|
|
ssize_t nbytes = read(src_fd, buffer, COPY_BUF_SIZE);
|
|
|
|
if (nbytes < 0)
|
|
pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s",
|
|
schemaName, relName, src, strerror(errno));
|
|
|
|
if (nbytes == 0)
|
|
break;
|
|
|
|
errno = 0;
|
|
if (write(dest_fd, buffer, nbytes) != nbytes)
|
|
{
|
|
/* if write didn't set errno, assume problem is no disk space */
|
|
if (errno == 0)
|
|
errno = ENOSPC;
|
|
pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s",
|
|
schemaName, relName, dst, strerror(errno));
|
|
}
|
|
}
|
|
|
|
pg_free(buffer);
|
|
close(src_fd);
|
|
close(dest_fd);
|
|
|
|
#else /* WIN32 */
|
|
|
|
if (CopyFile(src, dst, true) == 0)
|
|
{
|
|
_dosmaperr(GetLastError());
|
|
pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s",
|
|
schemaName, relName, src, dst, strerror(errno));
|
|
}
|
|
|
|
#endif /* WIN32 */
|
|
}
|
|
|
|
|
|
/*
 * linkFile()
 *
 * Make dst a hard link to the relation file src.
 *
 * schemaName and relName give the relation's SQL name; they are used only
 * to build the error message.  Failure of link() is reported with
 * pg_fatal().
 */
void
linkFile(const char *src, const char *dst,
		 const char *schemaName, const char *relName)
{
	int			rc;

	rc = link(src, dst);
	if (rc < 0)
		pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s",
				 schemaName, relName, src, dst, strerror(errno));
}
|
|
|
|
|
|
/*
|
|
* rewriteVisibilityMap()
|
|
*
|
|
* Transform a visibility map file, copying from src to dst.
|
|
* schemaName/relName are relation's SQL name (used for error messages only).
|
|
*
|
|
* In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
|
|
* visibility map included one bit per heap page; it now includes two.
|
|
* When upgrading a cluster from before that time to a current PostgreSQL
|
|
* version, we could refuse to copy visibility maps from the old cluster
|
|
* to the new cluster; the next VACUUM would recreate them, but at the
|
|
* price of scanning the entire table. So, instead, we rewrite the old
|
|
* visibility maps in the new format. That way, the all-visible bits
|
|
* remain set for the pages for which they were set previously. The
|
|
* all-frozen bits are never set by this conversion; we leave that to VACUUM.
|
|
*/
|
|
void
|
|
rewriteVisibilityMap(const char *fromfile, const char *tofile,
|
|
const char *schemaName, const char *relName)
|
|
{
|
|
int src_fd;
|
|
int dst_fd;
|
|
PGIOAlignedBlock buffer;
|
|
PGIOAlignedBlock new_vmbuf;
|
|
ssize_t totalBytesRead = 0;
|
|
ssize_t src_filesize;
|
|
int rewriteVmBytesPerPage;
|
|
BlockNumber new_blkno = 0;
|
|
struct stat statbuf;
|
|
|
|
/* Compute number of old-format bytes per new page */
|
|
rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
|
|
|
|
if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0)
|
|
pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s",
|
|
schemaName, relName, fromfile, strerror(errno));
|
|
|
|
if (fstat(src_fd, &statbuf) != 0)
|
|
pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s",
|
|
schemaName, relName, fromfile, strerror(errno));
|
|
|
|
if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
|
|
pg_file_create_mode)) < 0)
|
|
pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s",
|
|
schemaName, relName, tofile, strerror(errno));
|
|
|
|
/* Save old file size */
|
|
src_filesize = statbuf.st_size;
|
|
|
|
/*
|
|
* Turn each visibility map page into 2 pages one by one. Each new page
|
|
* has the same page header as the old one. If the last section of the
|
|
* last page is empty, we skip it, mostly to avoid turning one-page
|
|
* visibility maps for small relations into two pages needlessly.
|
|
*/
|
|
while (totalBytesRead < src_filesize)
|
|
{
|
|
ssize_t bytesRead;
|
|
char *old_cur;
|
|
char *old_break;
|
|
char *old_blkend;
|
|
PageHeaderData pageheader;
|
|
bool old_lastblk;
|
|
|
|
if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ)
|
|
{
|
|
if (bytesRead < 0)
|
|
pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s",
|
|
schemaName, relName, fromfile, strerror(errno));
|
|
else
|
|
pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"",
|
|
schemaName, relName, fromfile);
|
|
}
|
|
|
|
totalBytesRead += BLCKSZ;
|
|
old_lastblk = (totalBytesRead == src_filesize);
|
|
|
|
/* Save the page header data */
|
|
memcpy(&pageheader, buffer.data, SizeOfPageHeaderData);
|
|
|
|
/*
|
|
* These old_* variables point to old visibility map page. old_cur
|
|
* points to current position on old page. old_blkend points to end of
|
|
* old block. old_break is the end+1 position on the old page for the
|
|
* data that will be transferred to the current new page.
|
|
*/
|
|
old_cur = buffer.data + SizeOfPageHeaderData;
|
|
old_blkend = buffer.data + bytesRead;
|
|
old_break = old_cur + rewriteVmBytesPerPage;
|
|
|
|
while (old_break <= old_blkend)
|
|
{
|
|
char *new_cur;
|
|
bool empty = true;
|
|
bool old_lastpart;
|
|
|
|
/* First, copy old page header to new page */
|
|
memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData);
|
|
|
|
/* Rewriting the last part of the last old page? */
|
|
old_lastpart = old_lastblk && (old_break == old_blkend);
|
|
|
|
new_cur = new_vmbuf.data + SizeOfPageHeaderData;
|
|
|
|
/* Process old page bytes one by one, and turn it into new page. */
|
|
while (old_cur < old_break)
|
|
{
|
|
uint8 byte = *(uint8 *) old_cur;
|
|
uint16 new_vmbits = 0;
|
|
int i;
|
|
|
|
/* Generate new format bits while keeping old information */
|
|
for (i = 0; i < BITS_PER_BYTE; i++)
|
|
{
|
|
if (byte & (1 << i))
|
|
{
|
|
empty = false;
|
|
new_vmbits |=
|
|
VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i);
|
|
}
|
|
}
|
|
|
|
/* Copy new visibility map bytes to new-format page */
|
|
new_cur[0] = (char) (new_vmbits & 0xFF);
|
|
new_cur[1] = (char) (new_vmbits >> 8);
|
|
|
|
old_cur++;
|
|
new_cur += BITS_PER_HEAPBLOCK;
|
|
}
|
|
|
|
/* If the last part of the last page is empty, skip writing it */
|
|
if (old_lastpart && empty)
|
|
break;
|
|
|
|
/* Set new checksum for visibility map page, if enabled */
|
|
if (new_cluster.controldata.data_checksum_version != 0)
|
|
((PageHeader) new_vmbuf.data)->pd_checksum =
|
|
pg_checksum_page(new_vmbuf.data, new_blkno);
|
|
|
|
errno = 0;
|
|
if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ)
|
|
{
|
|
/* if write didn't set errno, assume problem is no disk space */
|
|
if (errno == 0)
|
|
errno = ENOSPC;
|
|
pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s",
|
|
schemaName, relName, tofile, strerror(errno));
|
|
}
|
|
|
|
/* Advance for next new page */
|
|
old_break += rewriteVmBytesPerPage;
|
|
new_blkno++;
|
|
}
|
|
}
|
|
|
|
/* Clean up */
|
|
close(dst_fd);
|
|
close(src_fd);
|
|
}
|
|
|
|
void
|
|
check_file_clone(void)
|
|
{
|
|
char existing_file[MAXPGPATH];
|
|
char new_link_file[MAXPGPATH];
|
|
|
|
snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
|
|
snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata);
|
|
unlink(new_link_file); /* might fail */
|
|
|
|
#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
|
|
if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0)
|
|
pg_fatal("could not clone file between old and new data directories: %s",
|
|
strerror(errno));
|
|
#elif defined(__linux__) && defined(FICLONE)
|
|
{
|
|
int src_fd;
|
|
int dest_fd;
|
|
|
|
if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0)
|
|
pg_fatal("could not open file \"%s\": %s",
|
|
existing_file, strerror(errno));
|
|
|
|
if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
|
|
pg_file_create_mode)) < 0)
|
|
pg_fatal("could not create file \"%s\": %s",
|
|
new_link_file, strerror(errno));
|
|
|
|
if (ioctl(dest_fd, FICLONE, src_fd) < 0)
|
|
pg_fatal("could not clone file between old and new data directories: %s",
|
|
strerror(errno));
|
|
|
|
close(src_fd);
|
|
close(dest_fd);
|
|
}
|
|
#else
|
|
pg_fatal("file cloning not supported on this platform");
|
|
#endif
|
|
|
|
unlink(new_link_file);
|
|
}
|
|
|
|
void
|
|
check_hard_link(void)
|
|
{
|
|
char existing_file[MAXPGPATH];
|
|
char new_link_file[MAXPGPATH];
|
|
|
|
snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
|
|
snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata);
|
|
unlink(new_link_file); /* might fail */
|
|
|
|
if (link(existing_file, new_link_file) < 0)
|
|
pg_fatal("could not create hard link between old and new data directories: %s\n"
|
|
"In link mode the old and new data directories must be on the same file system.",
|
|
strerror(errno));
|
|
|
|
unlink(new_link_file);
|
|
}
|