From f8ce4ed78ca6e35bf135545e34bd49cd65d88ea2 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Fri, 5 Apr 2024 18:01:26 +0200 Subject: [PATCH] Allow copying files using clone/copy_file_range Adds --clone/--copy-file-range options to pg_combinebackup, to allow copying files using file cloning or copy_file_range(). These methods may be faster than the standard block-by-block copy, but the main advantage is that they enable various features provided by CoW filesystems. This commit only uses these copy methods for files that did not change and can be copied as a whole from a single backup. These new copy methods may not be available on all platforms, in which case the command throws an error (immediately, even if no files would be copied as a whole). This early failure seems better than failing later when trying to copy the first file, after performing a lot of work on earlier files. If the requested copy method is available, but a checksum needs to be recalculated (e.g. because of a different checksum type), the file is still copied using the requested method, but it is also read for the checksum calculation. Depending on the filesystem this may be more expensive than just performing the simple copy, but it does enable the CoW benefits. Initial patch by Jakub Wartak, various reworks and improvements by me. 
Author: Tomas Vondra, Jakub Wartak Reviewed-by: Thomas Munro, Jakub Wartak, Robert Haas Discussion: https://postgr.es/m/3024283a-7491-4240-80d0-421575f6bb23%40enterprisedb.com --- doc/src/sgml/ref/pg_combinebackup.sgml | 45 +++++ src/bin/pg_combinebackup/copy_file.c | 206 ++++++++++++++++---- src/bin/pg_combinebackup/copy_file.h | 18 +- src/bin/pg_combinebackup/pg_combinebackup.c | 45 ++++- src/bin/pg_combinebackup/reconstruct.c | 3 +- src/bin/pg_combinebackup/reconstruct.h | 1 + src/tools/pgindent/typedefs.list | 1 + 7 files changed, 278 insertions(+), 41 deletions(-) diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml index 6f90dba281..658e9a759c 100644 --- a/doc/src/sgml/ref/pg_combinebackup.sgml +++ b/doc/src/sgml/ref/pg_combinebackup.sgml @@ -185,6 +185,51 @@ PostgreSQL documentation + + + + + Use efficient file cloning (also known as reflinks on + some systems) instead of copying files to the new data directory, + which can result in near-instantaneous copying of the data files. + + + + If a backup manifest is not available or does not contain checksum of + the right type, file cloning will be used to copy the file, but the + file will be also read block-by-block for the checksum calculation. + + + + File cloning is only supported on some operating systems and file + systems. If it is selected but not supported, the + pg_combinebackup run will error. At present, + it is supported on Linux (kernel 4.5 or later) with Btrfs and XFS (on + file systems created with reflink support), and on macOS with APFS. + + + + + + + + + Use the copy_file_range system call for efficient + copying. On some file systems this gives results similar to + --clone, sharing physical disk blocks, while on others + it may still copy blocks, but do so via an optimized path. At present, + it is supported on Linux and FreeBSD. 
+ + + + If a backup manifest is not available or does not contain checksum of + the right type, copy_file_range will be used to + copy the file, but the file will be also read block-by-block for the + checksum calculation. + + + + diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c index e6d2423278..844896d66e 100644 --- a/src/bin/pg_combinebackup/copy_file.c +++ b/src/bin/pg_combinebackup/copy_file.c @@ -14,6 +14,7 @@ #include #endif #include +#include #include #include @@ -24,8 +25,15 @@ static void copy_file_blocks(const char *src, const char *dst, pg_checksum_context *checksum_ctx); +static void copy_file_clone(const char *src, const char *dst, + pg_checksum_context *checksum_ctx); + +static void copy_file_by_range(const char *src, const char *dst, + pg_checksum_context *checksum_ctx); + #ifdef WIN32 -static void copy_file_copyfile(const char *src, const char *dst); +static void copy_file_copyfile(const char *src, const char *dst, + pg_checksum_context *checksum_ctx); #endif /* @@ -35,8 +43,13 @@ static void copy_file_copyfile(const char *src, const char *dst); */ void copy_file(const char *src, const char *dst, - pg_checksum_context *checksum_ctx, bool dry_run) + pg_checksum_context *checksum_ctx, + CopyMethod copy_method, bool dry_run) { + char *strategy_name = NULL; + void (*strategy_implementation) (const char *, const char *, + pg_checksum_context *checksum_ctx) = NULL; + /* * In dry-run mode, we don't actually copy anything, nor do we read any * data from the source file, but we do verify that we can open it. @@ -51,61 +64,94 @@ copy_file(const char *src, const char *dst, pg_fatal("could not close \"%s\": %m", src); } - /* - * If we don't need to compute a checksum, then we can use any special - * operating system primitives that we know about to copy the file; this - * may be quicker than a naive block copy. 
- */ - if (checksum_ctx->type == CHECKSUM_TYPE_NONE) - { - char *strategy_name = NULL; - void (*strategy_implementation) (const char *, const char *) = NULL; - #ifdef WIN32 - strategy_name = "CopyFile"; - strategy_implementation = copy_file_copyfile; + copy_method = COPY_METHOD_COPYFILE; #endif - if (strategy_name != NULL) - { - if (dry_run) - pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s", - src, dst, strategy_name); - else - { - pg_log_debug("copying \"%s\" to \"%s\" using strategy %s", - src, dst, strategy_name); - (*strategy_implementation) (src, dst); - } - return; - } + /* Determine the name of the copy strategy for use in log messages. */ + switch (copy_method) + { + case COPY_METHOD_CLONE: + strategy_name = "clone"; + strategy_implementation = copy_file_clone; + break; + case COPY_METHOD_COPY: + /* leave NULL for simple block-by-block copy */ + strategy_implementation = copy_file_blocks; + break; + case COPY_METHOD_COPY_FILE_RANGE: + strategy_name = "copy_file_range"; + strategy_implementation = copy_file_by_range; + break; +#ifdef WIN32 + case COPY_METHOD_COPYFILE: + strategy_name = "CopyFile"; + strategy_implementation = copy_file_copyfile; + break; +#endif } - /* - * Fall back to the simple approach of reading and writing all the blocks, - * feeding them into the checksum context as we go. 
-	 */
 	if (dry_run)
 	{
-		if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
+		if (strategy_name)
+			pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
+						 src, dst, strategy_name);
+		else
 			pg_log_debug("would copy \"%s\" to \"%s\"",
 						 src, dst);
-		else
-			pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s",
-						 src, dst, pg_checksum_type_name(checksum_ctx->type));
 	}
 	else
 	{
-		if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
+		if (strategy_name)
+			pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
+						 src, dst, strategy_name);
+		else if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
 			pg_log_debug("copying \"%s\" to \"%s\"",
 						 src, dst);
 		else
 			pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s",
 						 src, dst, pg_checksum_type_name(checksum_ctx->type));
-		copy_file_blocks(src, dst, checksum_ctx);
+
+		strategy_implementation(src, dst, checksum_ctx);
 	}
 }
 
+/*
+ * Calculate checksum for the src file.
+ *
+ * Reads src in chunks of 50 * BLCKSZ bytes, feeding each chunk into
+ * checksum_ctx.  Returns immediately for CHECKSUM_TYPE_NONE.
+ */
+static void
+checksum_file(const char *src, pg_checksum_context *checksum_ctx)
+{
+	int			src_fd;
+	uint8	   *buffer;
+	const int	buffer_size = 50 * BLCKSZ;
+	ssize_t		rb;
+
+	/* bail out if no checksum needed */
+	if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
+		return;
+
+	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+		pg_fatal("could not open file \"%s\": %m", src);
+
+	buffer = pg_malloc(buffer_size);
+
+	while ((rb = read(src_fd, buffer, buffer_size)) > 0)
+	{
+		if (pg_checksum_update(checksum_ctx, buffer, rb) < 0)
+			pg_fatal("could not update checksum of file \"%s\"", src);
+	}
+
+	if (rb < 0)
+		pg_fatal("could not read file \"%s\": %m", src);
+
+	pg_free(buffer);
+	close(src_fd);
+}
+
 /*
  * Copy a file block by block, and optionally compute a checksum as we go.
  */
@@ -156,14 +202,120 @@ copy_file_blocks(const char *src, const char *dst,
 	close(dest_fd);
 }
 
+/*
+ * copy_file_clone
+ *		Clones/reflinks a file from src to dest.
+ *
+ * Uses copyfile(COPYFILE_CLONE_FORCE) where that is available, else the
+ * Linux FICLONE ioctl; with neither, fails via pg_fatal().
+ *
+ * If needed, also reads the file and calculates the checksum.
+ */
+static void
+copy_file_clone(const char *src, const char *dest,
+				pg_checksum_context *checksum_ctx)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0)
+		pg_fatal("error while cloning file \"%s\" to \"%s\": %m", src, dest);
+#elif defined(__linux__) && defined(FICLONE)
+	{
+		int			src_fd;
+		int			dest_fd;
+
+		/*
+		 * NOTE(review): FICLONE is normally provided by <linux/fs.h> and
+		 * ioctl() by <sys/ioctl.h>; confirm both are included, otherwise
+		 * this branch is compiled out.
+		 */
+		if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %m", src);
+
+		if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							pg_file_create_mode)) < 0)
+			pg_fatal("could not create file \"%s\": %m", dest);
+
+		if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+		{
+			/* save errno before unlink() can overwrite it */
+			int			save_errno = errno;
+
+			/* remove the destination file we just created */
+			unlink(dest);
+
+			pg_fatal("error while cloning file \"%s\" to \"%s\": %s",
+					 src, dest, strerror(save_errno));
+		}
+
+		close(src_fd);
+		close(dest_fd);
+	}
+#else
+	pg_fatal("file cloning not supported on this platform");
+#endif
+
+	/* if needed, calculate checksum of the file */
+	checksum_file(src, checksum_ctx);
+}
+
+/*
+ * copy_file_by_range
+ *		Copies a file from src to dest using copy_file_range system call.
+ *
+ * Where HAVE_COPY_FILE_RANGE is not defined, fails via pg_fatal().
+ *
+ * If needed, also reads the file and calculates the checksum.
+ */
+static void
+copy_file_by_range(const char *src, const char *dest,
+				   pg_checksum_context *checksum_ctx)
+{
+#if defined(HAVE_COPY_FILE_RANGE)
+	int			src_fd;
+	int			dest_fd;
+	ssize_t		nbytes;
+
+	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+		pg_fatal("could not open file \"%s\": %m", src);
+
+	if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+						pg_file_create_mode)) < 0)
+		pg_fatal("could not create file \"%s\": %m", dest);
+
+	/*
+	 * NULL offsets make the kernel use and advance each descriptor's own
+	 * file position, so keep calling until nothing more is copied.
+	 */
+	do
+	{
+		nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+		if (nbytes < 0)
+			pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+					 src, dest);
+	} while (nbytes > 0);
+
+	close(src_fd);
+	close(dest_fd);
+#else
+	pg_fatal("copy_file_range not supported on this platform");
+#endif
+
+	/* if needed, calculate checksum of the file */
+	checksum_file(src, checksum_ctx);
+}
+
 #ifdef WIN32
 static void
-copy_file_copyfile(const char *src, const char *dst)
+copy_file_copyfile(const char *src, const char *dst,
+				   pg_checksum_context *checksum_ctx)
 {
 	if (CopyFile(src, dst, true) == 0)
 	{
 		_dosmaperr(GetLastError());
 		pg_fatal("could not copy \"%s\" to \"%s\": %m", src, dst);
 	}
+
+	/* if needed, calculate checksum of the file */
+	checksum_file(src, checksum_ctx);
 }
 #endif							/* WIN32 */
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 0f6bc09403..cedb760738 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -11,9 +11,25 @@
 #ifndef COPY_FILE_H
 #define COPY_FILE_H
 
+#include "c.h"
 #include "common/checksum_helper.h"
+#include "common/file_utils.h"
+
+/*
+ * Enumeration to denote copy modes.
+ */ +typedef enum CopyMethod +{ + COPY_METHOD_CLONE, /* clone (reflink) files; selected by --clone */ + COPY_METHOD_COPY, /* plain block-by-block copy; the default */ + COPY_METHOD_COPY_FILE_RANGE, /* copy_file_range(); selected by --copy-file-range */ +#ifdef WIN32 + COPY_METHOD_COPYFILE, /* Windows CopyFile(); forced by copy_file() on WIN32 */ +#endif +} CopyMethod; extern void copy_file(const char *src, const char *dst, - pg_checksum_context *checksum_ctx, bool dry_run); + pg_checksum_context *checksum_ctx, + CopyMethod copy_method, bool dry_run); #endif /* COPY_FILE_H */ diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 74f8be9eea..1b07ca3fb6 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -69,6 +69,7 @@ typedef struct cb_options pg_checksum_type manifest_checksums; bool no_manifest; DataDirSyncMethod sync_method; + CopyMethod copy_method; } cb_options; /* @@ -129,6 +130,8 @@ main(int argc, char *argv[]) {"manifest-checksums", required_argument, NULL, 1}, {"no-manifest", no_argument, NULL, 2}, {"sync-method", required_argument, NULL, 3}, + {"clone", no_argument, NULL, 4}, + {"copy-file-range", no_argument, NULL, 5}, {NULL, 0, NULL, 0} }; @@ -156,6 +159,7 @@ main(int argc, char *argv[]) memset(&opt, 0, sizeof(opt)); opt.manifest_checksums = CHECKSUM_TYPE_CRC32C; opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC; + opt.copy_method = COPY_METHOD_COPY; /* process command-line options */ while ((c = getopt_long(argc, argv, "dnNPo:T:", @@ -192,6 +196,12 @@ main(int argc, char *argv[]) if (!parse_sync_method(optarg, &opt.sync_method)) exit(1); break; + case 4: + opt.copy_method = COPY_METHOD_CLONE; + break; + case 5: + opt.copy_method = COPY_METHOD_COPY_FILE_RANGE; + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -213,6 +223,35 @@ main(int argc, char *argv[]) if (opt.no_manifest) opt.manifest_checksums = CHECKSUM_TYPE_NONE; + /* Check that the platform supports the requested copy method. 
*/ + if (opt.copy_method == COPY_METHOD_CLONE) + { +#if (defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)) || \ + (defined(__linux__) && defined(FICLONE)) + + if (opt.dry_run) + pg_log_debug("would use cloning to copy files"); + else + pg_log_debug("will use cloning to copy files"); + +#else + pg_fatal("file cloning not supported on this platform"); +#endif + } + else if (opt.copy_method == COPY_METHOD_COPY_FILE_RANGE) + { +#if defined(HAVE_COPY_FILE_RANGE) + + if (opt.dry_run) + pg_log_debug("would use copy_file_range to copy blocks"); + else + pg_log_debug("will use copy_file_range to copy blocks"); + +#else + pg_fatal("copy_file_range not supported on this platform"); +#endif + } + /* Read the server version from the final backup. */ version = read_pg_version_file(argv[argc - 1]); @@ -696,6 +735,8 @@ help(const char *progname) " use algorithm for manifest checksums\n")); printf(_(" --no-manifest suppress generation of backup manifest\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); + printf(_(" --clone clone (reflink) instead of copying files\n")); + printf(_(" --copy-file-range copy using copy_file_range() syscall\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); @@ -936,6 +977,7 @@ process_directory_recursively(Oid tsoid, checksum_type, &checksum_length, &checksum_payload, + opt->copy_method, opt->debug, opt->dry_run); } @@ -993,7 +1035,8 @@ process_directory_recursively(Oid tsoid, /* Actually copy the file. 
*/ snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name); - copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run); + copy_file(ifullpath, ofullpath, &checksum_ctx, + opt->copy_method, opt->dry_run); /* * If copy_file() performed a checksum calculation for us, then diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c index 33c6da02a8..b083c5ce15 100644 --- a/src/bin/pg_combinebackup/reconstruct.c +++ b/src/bin/pg_combinebackup/reconstruct.c @@ -89,6 +89,7 @@ reconstruct_from_incremental_file(char *input_filename, pg_checksum_type checksum_type, int *checksum_length, uint8 **checksum_payload, + CopyMethod copy_method, bool debug, bool dry_run) { @@ -319,7 +320,7 @@ reconstruct_from_incremental_file(char *input_filename, */ if (copy_source != NULL) copy_file(copy_source->filename, output_filename, - &checksum_ctx, dry_run); + &checksum_ctx, copy_method, dry_run); else { write_reconstructed_file(input_filename, output_filename, diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h index 8e33a8a95a..902a8e9abb 100644 --- a/src/bin/pg_combinebackup/reconstruct.h +++ b/src/bin/pg_combinebackup/reconstruct.h @@ -27,6 +27,7 @@ extern void reconstruct_from_incremental_file(char *input_filename, pg_checksum_type checksum_type, int *checksum_length, uint8 **checksum_payload, + CopyMethod copy_method, bool debug, bool dry_run); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index f3b8641d76..01845ee71d 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -480,6 +480,7 @@ CopyFromState CopyFromStateData CopyHeaderChoice CopyInsertMethod +CopyMethod CopyLogVerbosityChoice CopyMultiInsertBuffer CopyMultiInsertInfo