/*-------------------------------------------------------------------------
*
* copydir.c
* copies a directory
*
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* While "xcopy /e /i /q" works fine for copying directories, on Windows XP
* it requires a Window handle which prevents it from working when invoked
* as a service.
*
* IDENTIFICATION
* src/backend/storage/file/copydir.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include "storage/copydir.h"
#include "storage/fd.h"
#include "miscadmin.h"
/*
* copydir: copy a directory
*
* If recurse is false, subdirectories are ignored. Anything that's not
* a directory or a regular file is ignored.
*/
void
copydir(char *fromdir, char *todir, bool recurse)
{
DIR *xldir;
struct dirent *xlde;
char fromfile[MAXPGPATH];
char tofile[MAXPGPATH];
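/* Create the destination directory with owner-only permissions (S_IRWXU = 0700) */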
if (mkdir(todir, S_IRWXU) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create directory \"%s\": %m", todir)));
xldir = AllocateDir(fromdir);
if (xldir == NULL)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", fromdir)));
while ((xlde = ReadDir(xldir, fromdir)) != NULL)
{
struct stat fst;
/* If we got a cancel signal during the copy of the directory, quit */
CHECK_FOR_INTERRUPTS();
if (strcmp(xlde->d_name, ".") == 0 ||
strcmp(xlde->d_name, "..") == 0)
continue;
snprintf(fromfile, MAXPGPATH, "%s/%s", fromdir, xlde->d_name);
snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
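/* lstat() the entry itself, so a symlink shows up as a symlink and is skipped below */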
if (lstat(fromfile, &fst) < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", fromfile)));
if (S_ISDIR(fst.st_mode))
{
/* recurse to handle subdirectories */
if (recurse)
copydir(fromfile, tofile, true);
}
else if (S_ISREG(fst.st_mode))
copy_file(fromfile, tofile);
}
FreeDir(xldir);
/*
* Be paranoid here and fsync all files to ensure the copy is really done.
* But if fsync is disabled, we're done.
*/
if (!enableFsync)
return;
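/* Second pass over the destination directory: fsync each regular file we copied */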
xldir = AllocateDir(todir);
if (xldir == NULL)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", todir)));
while ((xlde = ReadDir(xldir, todir)) != NULL)
{
struct stat fst;
if (strcmp(xlde->d_name, ".") == 0 ||
strcmp(xlde->d_name, "..") == 0)
continue;
snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
/*
* We don't need to sync subdirectories here since the recursive
* copydir will do it before it returns
*/
if (lstat(tofile, &fst) < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", tofile)));
if (S_ISREG(fst.st_mode))
fsync_fname(tofile, false);
}
FreeDir(xldir);
/*
* It's important to fsync the destination directory itself as individual
* file fsyncs don't guarantee that the directory entry for the file is
* synced. Recent versions of ext4 have made the window much wider but
* it's been an issue for ext3 and other filesystems in the past.
*/
fsync_fname(todir, true);
}

/*
* copy one file
*/
void
copy_file(char *fromfile, char *tofile)
{
char *buffer;
int srcfd;
int dstfd;
int nbytes;
off_t offset;
/* Use palloc to ensure we get a maxaligned buffer */
#define COPY_BUF_SIZE (8 * BLCKSZ)
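/* copy in 8-block chunks: 64 kB with the default 8 kB BLCKSZ */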
buffer = palloc(COPY_BUF_SIZE);
/*
* Open the files
*/
srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY, 0);
if (srcfd < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", fromfile)));
dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
S_IRUSR | S_IWUSR);
if (dstfd < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m", tofile)));
/*
* Do the data copying.
*/
for (offset = 0;; offset += nbytes)
{
/* If we got a cancel signal during the copy of the file, quit */
CHECK_FOR_INTERRUPTS();
nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
if (nbytes < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m", fromfile)));
if (nbytes == 0)
break;
errno = 0;
if ((int) write(dstfd, buffer, nbytes) != nbytes)
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
errno = ENOSPC;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tofile)));
}
/*
* We fsync the files later but first flush them to avoid spamming the
* cache and hopefully get the kernel to start writing them out before
* the fsync comes.
*/
pg_flush_data(dstfd, offset, nbytes);
}
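/*
* Check the result of closing the destination file; on some filesystems
* close() can report write errors that were deferred by the kernel.
*/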
if (CloseTransientFile(dstfd))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", tofile)));
CloseTransientFile(srcfd);
pfree(buffer);
}