postgresql/src/bin/pg_combinebackup/reconstruct.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

779 lines
22 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* reconstruct.c
* Reconstruct full file from incremental file and backup chain.
*
* Copyright (c) 2017-2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/bin/pg_combinebackup/reconstruct.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include <unistd.h>
#include "backup/basebackup_incremental.h"
#include "common/file_perm.h"
#include "common/logging.h"
#include "copy_file.h"
#include "lib/stringinfo.h"
#include "reconstruct.h"
#include "storage/block.h"
/*
* An rfile stores the data that we need in order to be able to use some file
* on disk for reconstruction. For any given output file, we create one rfile
* per backup that we need to consult when we are constructing that output
* file.
*
* If we find a full version of the file in the backup chain, then only
* filename and fd are initialized; the remaining fields are 0 or NULL.
* For an incremental file, header_length, num_blocks, relative_block_numbers,
* and truncation_block_length are also set.
*
* num_blocks_read and highest_offset_read always start out as 0.
*/
typedef struct rfile
{
/* Private copy of the path this rfile was opened from. */
char *filename;
/* Read-only file descriptor for the file. */
int fd;
/* Offset of the first data block; 0 means this is a full file. */
size_t header_length;
/* Number of blocks stored in an incremental file. */
unsigned num_blocks;
/* Block number of each stored block, in on-disk order (num_blocks entries). */
BlockNumber *relative_block_numbers;
/* Truncation block length read from the incremental file header. */
unsigned truncation_block_length;
/* Count of output blocks sourced from this file during reconstruction. */
unsigned num_blocks_read;
/* One past the last byte offset that reconstruction needs from this file. */
off_t highest_offset_read;
} rfile;
/* Forward declarations for the file-local helper functions defined below. */
static void debug_reconstruction(int n_source,
rfile **sources,
bool dry_run);
static unsigned find_reconstructed_block_length(rfile *s);
static rfile *make_incremental_rfile(char *filename);
static rfile *make_rfile(char *filename, bool missing_ok);
static void write_reconstructed_file(char *input_filename,
char *output_filename,
unsigned block_length,
rfile **sourcemap,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
CopyMethod copy_method,
bool debug,
bool dry_run);
static void read_bytes(rfile *rf, void *buffer, unsigned length);
static void write_block(int wfd, char *output_filename,
uint8 *buffer,
pg_checksum_context *checksum_ctx);
static void read_block(rfile *s, off_t off, uint8 *buffer);
/*
 * Reconstruct a full file from an incremental file and a chain of prior
 * backups.
 *
 * input_filename should be the path to the incremental file, and
 * output_filename should be the path where the reconstructed file is to be
 * written.
 *
 * relative_path should be the path to the directory containing this file,
 * relative to the root of the backup (NOT relative to the root of the
 * tablespace). bare_file_name should be the name of the file within that
 * directory, without "INCREMENTAL.".
 *
 * n_prior_backups is the number of prior backups, and prior_backup_dirs is
 * an array of pathnames where those backups can be found.
 *
 * manifests is indexed like prior_backup_dirs; an entry may be NULL, in
 * which case no checksum can be reused from that backup. manifest_path is
 * the key under which this file is looked up in a manifest.
 *
 * checksum_type is the kind of checksum the caller wants for the output
 * file. If it is not CHECKSUM_TYPE_NONE, *checksum_payload is set to a newly
 * allocated buffer and *checksum_length to the number of valid bytes in it;
 * otherwise both are left untouched.
 *
 * copy_method selects how data is transferred, debug enables printing of the
 * reconstruction plan, and dry_run suppresses all writes and block reads.
 */
void
reconstruct_from_incremental_file(char *input_filename,
                                  char *output_filename,
                                  char *relative_path,
                                  char *bare_file_name,
                                  int n_prior_backups,
                                  char **prior_backup_dirs,
                                  manifest_data **manifests,
                                  char *manifest_path,
                                  pg_checksum_type checksum_type,
                                  int *checksum_length,
                                  uint8 **checksum_payload,
                                  CopyMethod copy_method,
                                  bool debug,
                                  bool dry_run)
{
    rfile     **source;
    rfile      *latest_source = NULL;
    rfile     **sourcemap;
    off_t      *offsetmap;
    unsigned    block_length;
    unsigned    i;
    unsigned    sidx = n_prior_backups;
    bool        full_copy_possible = true;
    int         copy_source_index = -1;
    rfile      *copy_source = NULL;
    pg_checksum_context checksum_ctx;

    /*
     * Every block must come either from the latest version of the file or
     * from one of the prior backups.
     */
    source = pg_malloc0(sizeof(rfile *) * (1 + n_prior_backups));

    /*
     * Use the information from the latest incremental file to figure out how
     * long the reconstructed file should be.
     */
    latest_source = make_incremental_rfile(input_filename);
    source[n_prior_backups] = latest_source;
    block_length = find_reconstructed_block_length(latest_source);

    /*
     * For each block in the output file, we need to know from which file we
     * need to obtain it and at what offset in that file it's stored.
     * sourcemap gives us the first of these things, and offsetmap the latter.
     */
    sourcemap = pg_malloc0(sizeof(rfile *) * block_length);
    offsetmap = pg_malloc0(sizeof(off_t) * block_length);

    /*
     * Every block that is present in the newest incremental file should be
     * sourced from that file. If it precedes the truncation_block_length,
     * it's a block that we would otherwise have had to find in an older
     * backup and thus reduces the number of blocks remaining to be found by
     * one; otherwise, it's an extra block that needs to be included in the
     * output but would not have needed to be found in an older backup if it
     * had not been present.
     */
    for (i = 0; i < latest_source->num_blocks; ++i)
    {
        BlockNumber b = latest_source->relative_block_numbers[i];

        Assert(b < block_length);
        sourcemap[b] = latest_source;

        /*
         * Widen to off_t before multiplying, so that the byte offset cannot
         * wrap around in 32-bit arithmetic when segments are large.
         */
        offsetmap[b] = (off_t) latest_source->header_length +
            ((off_t) i) * BLCKSZ;

        /*
         * A full copy of a file from an earlier backup is only possible if no
         * blocks are needed from any later incremental file.
         */
        full_copy_possible = false;
    }

    while (1)
    {
        char        source_filename[MAXPGPATH];
        rfile      *s;

        /*
         * Move to the next backup in the chain. If there are no more, then
         * we're done.
         */
        if (sidx == 0)
            break;
        --sidx;

        /*
         * Look for the full file in the previous backup. If not found, then
         * look for an incremental file instead.
         */
        snprintf(source_filename, MAXPGPATH, "%s/%s/%s",
                 prior_backup_dirs[sidx], relative_path, bare_file_name);
        if ((s = make_rfile(source_filename, true)) == NULL)
        {
            snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s",
                     prior_backup_dirs[sidx], relative_path, bare_file_name);
            s = make_incremental_rfile(source_filename);
        }
        source[sidx] = s;

        /*
         * If s->header_length == 0, then this is a full file; otherwise, it's
         * an incremental file.
         */
        if (s->header_length == 0)
        {
            struct stat sb;
            BlockNumber b;
            BlockNumber blocklength;

            /* We need to know the length of the file. */
            if (fstat(s->fd, &sb) < 0)
                pg_fatal("could not stat \"%s\": %m", s->filename);

            /*
             * Since we found a full file, source all blocks from it that
             * exist in the file.
             *
             * Note that there may be blocks that don't exist either in this
             * file or in any incremental file but that precede
             * truncation_block_length. These are, presumably, zero-filled
             * blocks that result from the server extending the file but
             * taking no action on those blocks that generated any WAL.
             *
             * Sadly, we have no way of validating that this is really what
             * happened, and neither does the server. From its perspective,
             * an unmodified block that contains data looks exactly the same
             * as a zero-filled block that never had any data: either way,
             * it's not mentioned in any WAL summary and the server has no
             * reason to read it. From our perspective, all we know is that
             * nobody had a reason to back up the block. That certainly means
             * that the block didn't exist at the time of the full backup, but
             * the supposition that it was all zeroes at the time of every
             * later backup is one that we can't validate.
             */
            blocklength = sb.st_size / BLCKSZ;
            for (b = 0; b < latest_source->truncation_block_length; ++b)
            {
                if (sourcemap[b] == NULL && b < blocklength)
                {
                    sourcemap[b] = s;
                    /* Widen before multiplying; see above. */
                    offsetmap[b] = ((off_t) b) * BLCKSZ;
                }
            }

            /*
             * If a full copy looks possible, check whether the resulting file
             * should be exactly as long as the source file is. If so, a full
             * copy is acceptable, otherwise not.
             */
            if (full_copy_possible)
            {
                uint64      expected_length;

                expected_length =
                    (uint64) latest_source->truncation_block_length;
                expected_length *= BLCKSZ;
                if (expected_length == sb.st_size)
                {
                    copy_source = s;
                    copy_source_index = sidx;
                }
            }

            /* We don't need to consider any further sources. */
            break;
        }

        /*
         * Since we found another incremental file, source all blocks from it
         * that we need but don't yet have.
         */
        for (i = 0; i < s->num_blocks; ++i)
        {
            BlockNumber b = s->relative_block_numbers[i];

            if (b < latest_source->truncation_block_length &&
                sourcemap[b] == NULL)
            {
                sourcemap[b] = s;
                /* Widen before multiplying; see above. */
                offsetmap[b] = (off_t) s->header_length +
                    ((off_t) i) * BLCKSZ;

                /*
                 * A full copy of a file from an earlier backup is only
                 * possible if no blocks are needed from any later incremental
                 * file.
                 */
                full_copy_possible = false;
            }
        }
    }

    /*
     * If a checksum of the required type already exists in the
     * backup_manifest for the relevant input directory, we can save some work
     * by reusing that checksum instead of computing a new one.
     */
    if (copy_source_index >= 0 && manifests[copy_source_index] != NULL &&
        checksum_type != CHECKSUM_TYPE_NONE)
    {
        manifest_file *mfile;

        mfile = manifest_files_lookup(manifests[copy_source_index]->files,
                                      manifest_path);
        if (mfile == NULL)
        {
            char       *path = psprintf("%s/backup_manifest",
                                        prior_backup_dirs[copy_source_index]);

            /*
             * The directory is out of sync with the backup_manifest, so emit
             * a warning.
             */
            /*- translator: the first %s is a backup manifest file, the second is a file absent therein */
            pg_log_warning("\"%s\" contains no entry for \"%s\"",
                           path,
                           manifest_path);
            pfree(path);
        }
        else if (mfile->checksum_type == checksum_type)
        {
            *checksum_length = mfile->checksum_length;
            *checksum_payload = pg_malloc(*checksum_length);
            memcpy(*checksum_payload, mfile->checksum_payload,
                   *checksum_length);

            /* The result is already known, so skip the computation below. */
            checksum_type = CHECKSUM_TYPE_NONE;
        }
    }

    /* Prepare for checksum calculation, if required. */
    pg_checksum_init(&checksum_ctx, checksum_type);

    /*
     * If the full file can be created by copying a file from an older backup
     * in the chain without needing to overwrite any blocks or truncate the
     * result, then forget about performing reconstruction and just copy that
     * file in its entirety.
     *
     * Otherwise, reconstruct.
     */
    if (copy_source != NULL)
        copy_file(copy_source->filename, output_filename,
                  &checksum_ctx, copy_method, dry_run);
    else
    {
        write_reconstructed_file(input_filename, output_filename,
                                 block_length, sourcemap, offsetmap,
                                 &checksum_ctx, copy_method,
                                 debug, dry_run);
        debug_reconstruction(n_prior_backups + 1, source, dry_run);
    }

    /* Save results of checksum calculation. */
    if (checksum_type != CHECKSUM_TYPE_NONE)
    {
        *checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
        *checksum_length = pg_checksum_final(&checksum_ctx,
                                             *checksum_payload);
    }

    /*
     * Close files and release memory.
     */
    for (i = 0; i <= n_prior_backups; ++i)
    {
        rfile      *s = source[i];

        if (s == NULL)
            continue;
        if (close(s->fd) != 0)
            pg_fatal("could not close \"%s\": %m", s->filename);
        if (s->relative_block_numbers != NULL)
            pfree(s->relative_block_numbers);
        pg_free(s->filename);
        /* Release the rfile itself, not just the things it points to. */
        pg_free(s);
    }
    pfree(sourcemap);
    pfree(offsetmap);
    pfree(source);
}
/*
 * Report, for each source file that contributed at least one block, how many
 * blocks were (or would have been) taken from it, and in dry-run mode verify
 * that the file is long enough to have supplied them.
 */
static void
debug_reconstruction(int n_source, rfile **sources, bool dry_run)
{
    unsigned    idx;

    for (idx = 0; idx < n_source; ++idx)
    {
        rfile      *src = sources[idx];
        struct stat st;

        /* Skip slots that were never filled in or never read from. */
        if (src == NULL || src->num_blocks_read == 0)
            continue;

        if (!dry_run)
        {
            pg_log_debug("read %u blocks from \"%s\"",
                         src->num_blocks_read, src->filename);
            continue;
        }

        pg_log_debug("would have read %u blocks from \"%s\"",
                     src->num_blocks_read, src->filename);

        /*
         * No data was actually read in dry-run mode, so check the file size
         * instead. A file that is too short here is one that a real run
         * would have failed to read the required bytes from.
         */
        if (fstat(src->fd, &st) < 0)
            pg_fatal("could not stat \"%s\": %m", src->filename);
        if (src->highest_offset_read > st.st_size)
            pg_fatal("file \"%s\" is too short: expected %llu, found %llu",
                     src->filename,
                     (unsigned long long) src->highest_offset_read,
                     (unsigned long long) st.st_size);
    }
}
/*
 * Compute the length, in blocks, of the file to be reconstructed from the
 * given incremental file.
 *
 * The result is never less than truncation_block_length, and it is extended
 * as needed so that every block stored in the incremental file fits.
 */
static unsigned
find_reconstructed_block_length(rfile *s)
{
    unsigned    result = s->truncation_block_length;
    unsigned    n = 0;

    while (n < s->num_blocks)
    {
        BlockNumber blkno = s->relative_block_numbers[n++];

        if (blkno >= result)
            result = blkno + 1;
    }

    return result;
}
/*
* Initialize an incremental rfile, reading the header so that we know which
* blocks it contains.
*/
static rfile *
make_incremental_rfile(char *filename)
{
rfile *rf;
unsigned magic;
rf = make_rfile(filename, false);
/* Read and validate magic number. */
read_bytes(rf, &magic, sizeof(magic));
if (magic != INCREMENTAL_MAGIC)
pg_fatal("file \"%s\" has bad incremental magic number (0x%x not 0x%x)",
filename, magic, INCREMENTAL_MAGIC);
/* Read block count. */
read_bytes(rf, &rf->num_blocks, sizeof(rf->num_blocks));
if (rf->num_blocks > RELSEG_SIZE)
pg_fatal("file \"%s\" has block count %u in excess of segment size %u",
filename, rf->num_blocks, RELSEG_SIZE);
/* Read truncation block length. */
read_bytes(rf, &rf->truncation_block_length,
sizeof(rf->truncation_block_length));
if (rf->truncation_block_length > RELSEG_SIZE)
pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u",
filename, rf->truncation_block_length, RELSEG_SIZE);
/* Read block numbers if there are any. */
if (rf->num_blocks > 0)
{
rf->relative_block_numbers =
pg_malloc0(sizeof(BlockNumber) * rf->num_blocks);
read_bytes(rf, rf->relative_block_numbers,
sizeof(BlockNumber) * rf->num_blocks);
}
/* Remember length of header. */
rf->header_length = sizeof(magic) + sizeof(rf->num_blocks) +
sizeof(rf->truncation_block_length) +
sizeof(BlockNumber) * rf->num_blocks;
Align blocks in incremental backups to BLCKSZ Align blocks stored in incremental files to BLCKSZ, so that the incremental backups work well with CoW filesystems. The header of the incremental file is padded with \0 to a multiple of BLCKSZ, so that the block data (also BLCKSZ) is aligned to BLCKSZ. The padding is added only to files containing block data, so files with just the header remain small. This adds a bit of extra space, but as the number of blocks increases the overhead gets negligible very quickly. And as the padding is \0 bytes, it does compress extremely well. The alignment is important for CoW filesystems that usually require the blocks to be aligned to filesystem page size for features like block sharing, deduplication etc. to work well. With the variable sized header the blocks in the increments were not aligned at all, negating the benefits of the CoW filesystems. This matters even for non-CoW filesystems, for example when placed on a RAID array. If the block is not aligned, it may easily span multiple devices, causing read and write amplification. It might be better to align the blocks to the filesystem page, not BLCKSZ, but we have no good way to determine that. Even if we determine the page size at the time of taking the backup, the backup may move. For now the BLCKSZ seems sufficient - the filesystem page is usually 4K, so the default BLCKSZ (8K by default) is aligned to that. Author: Tomas Vondra Reviewed-by: Robert Haas, Jakub Wartak Discussion: https://postgr.es/m/3024283a-7491-4240-80d0-421575f6bb23%40enterprisedb.com
2024-04-05 16:28:21 +02:00
/*
* Round header length to a multiple of BLCKSZ, so that blocks contents
* are properly aligned. Only do this when the file actually has data for
* some blocks.
*/
if ((rf->num_blocks > 0) && ((rf->header_length % BLCKSZ) != 0))
rf->header_length += (BLCKSZ - (rf->header_length % BLCKSZ));
return rf;
}
/*
 * Allocate and perform basic initialization of an rfile.
 *
 * The file is opened read-only and the rfile keeps its own copy of filename.
 * If the file does not exist and missing_ok is true, returns NULL; any other
 * failure to open the file is a fatal error.
 */
static rfile *
make_rfile(char *filename, bool missing_ok)
{
    rfile      *rf;

    rf = pg_malloc0(sizeof(rfile));
    rf->filename = pstrdup(filename);
    if ((rf->fd = open(filename, O_RDONLY | PG_BINARY, 0)) < 0)
    {
        if (missing_ok && errno == ENOENT)
        {
            /* Free the copied name too, not just the struct. */
            pg_free(rf->filename);
            pg_free(rf);
            return NULL;
        }
        pg_fatal("could not open file \"%s\": %m", filename);
    }

    return rf;
}
/*
 * Read the indicated number of bytes from an rfile into the buffer.
 *
 * The read starts at the current position of the rfile's file descriptor.
 * Both an I/O error and a short read are fatal errors.
 */
static void
read_bytes(rfile *rf, void *buffer, unsigned length)
{
    int         rb = read(rf->fd, buffer, length);

    /* Test for failure first, so the comparison below is between unsigneds. */
    if (rb < 0)
        pg_fatal("could not read file \"%s\": %m", rf->filename);

    if ((unsigned) rb != length)
        pg_fatal("could not read file \"%s\": read only %d of %u bytes",
                 rf->filename, rb, length);
}
/*
 * Write out a reconstructed file.
 *
 * output_filename is created and must not already exist. It will contain
 * block_length blocks: block i is taken from sourcemap[i] at byte offset
 * offsetmap[i], or is zero-filled if sourcemap[i] is NULL.
 *
 * checksum_ctx is updated with every block written. copy_method selects
 * between read/write copying and copy_file_range. If debug is true, the
 * reconstruction plan is logged first. If dry_run is true, nothing is opened,
 * read or written, but the per-source accounting (num_blocks_read and
 * highest_offset_read) is still updated.
 *
 * input_filename is not used here; error messages name the source file that
 * is actually being read.
 */
static void
write_reconstructed_file(char *input_filename,
                         char *output_filename,
                         unsigned block_length,
                         rfile **sourcemap,
                         off_t *offsetmap,
                         pg_checksum_context *checksum_ctx,
                         CopyMethod copy_method,
                         bool debug,
                         bool dry_run)
{
    int         wfd = -1;
    unsigned    i;
    unsigned    zero_blocks = 0;

    /* Debugging output. */
    if (debug)
    {
        StringInfoData debug_buf;
        unsigned    start_of_range = 0;
        unsigned    current_block = 0;

        /* Basic information about the output file to be produced. */
        if (dry_run)
            pg_log_debug("would reconstruct \"%s\" (%u blocks, checksum %s)",
                         output_filename, block_length,
                         pg_checksum_type_name(checksum_ctx->type));
        else
            pg_log_debug("reconstructing \"%s\" (%u blocks, checksum %s)",
                         output_filename, block_length,
                         pg_checksum_type_name(checksum_ctx->type));

        /* Print out the plan for reconstructing this file. */
        initStringInfo(&debug_buf);
        while (current_block < block_length)
        {
            rfile      *s = sourcemap[current_block];

            /* Extend range, if possible. */
            if (current_block + 1 < block_length &&
                s == sourcemap[current_block + 1])
            {
                ++current_block;
                continue;
            }

            /* Add details about this range. */
            if (s == NULL)
            {
                if (current_block == start_of_range)
                    appendStringInfo(&debug_buf, " %u:zero", current_block);
                else
                    appendStringInfo(&debug_buf, " %u-%u:zero",
                                     start_of_range, current_block);
            }
            else
            {
                if (current_block == start_of_range)
                    appendStringInfo(&debug_buf, " %u:%s@" UINT64_FORMAT,
                                     current_block, s->filename,
                                     (uint64) offsetmap[current_block]);
                else
                    appendStringInfo(&debug_buf, " %u-%u:%s@" UINT64_FORMAT,
                                     start_of_range, current_block,
                                     s->filename,
                                     (uint64) offsetmap[current_block]);
            }

            /* Begin new range. */
            start_of_range = ++current_block;

            /* If the output is very long or we are done, dump it now. */
            if (current_block == block_length || debug_buf.len > 1024)
            {
                pg_log_debug("reconstruction plan:%s", debug_buf.data);
                resetStringInfo(&debug_buf);
            }
        }

        /* Free memory. */
        pfree(debug_buf.data);
    }

    /* Open the output file, except in dry_run mode. */
    if (!dry_run &&
        (wfd = open(output_filename,
                    O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
                    pg_file_create_mode)) < 0)
        pg_fatal("could not open file \"%s\": %m", output_filename);

    /* Read and write the blocks as required. */
    for (i = 0; i < block_length; ++i)
    {
        uint8       buffer[BLCKSZ];
        rfile      *s = sourcemap[i];

        /* Update accounting information. */
        if (s == NULL)
            ++zero_blocks;
        else
        {
            s->num_blocks_read++;
            s->highest_offset_read = Max(s->highest_offset_read,
                                         offsetmap[i] + BLCKSZ);
        }

        /* Skip the rest of this in dry-run mode. */
        if (dry_run)
            continue;

        /* Read or zero-fill the block as appropriate. */
        if (s == NULL)
        {
            /*
             * New block not mentioned in the WAL summary. Should have been an
             * uninitialized block, so just zero-fill it.
             */
            memset(buffer, 0, BLCKSZ);

            /* Write out the block, update the checksum if needed. */
            write_block(wfd, output_filename, buffer, checksum_ctx);

            /* Nothing else to do for zero-filled blocks. */
            continue;
        }

        /* Copy the block using the appropriate copy method. */
        if (copy_method != COPY_METHOD_COPY_FILE_RANGE)
        {
            /*
             * Read the block from the correct source file, and then write it
             * out, possibly with a checksum update.
             */
            read_block(s, offsetmap[i], buffer);
            write_block(wfd, output_filename, buffer, checksum_ctx);
        }
        else                    /* use copy_file_range */
        {
#if defined(HAVE_COPY_FILE_RANGE)
            /* copy_file_range modifies the offset, so use a local copy */
            off_t       off = offsetmap[i];
            size_t      nwritten = 0;

            /*
             * Retry until we've written all the bytes (the offset is updated
             * by copy_file_range, and so is the wfd file offset).
             */
            do
            {
                int         wb;

                wb = copy_file_range(s->fd, &off, wfd, NULL, BLCKSZ - nwritten, 0);

                if (wb < 0)
                    pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
                             s->filename, output_filename);

                /*
                 * A return value of zero means the source ended before the
                 * whole block was copied. Fail rather than retry forever.
                 */
                if (wb == 0)
                    pg_fatal("unexpected end of file while copying file range from \"%s\" to \"%s\"",
                             s->filename, output_filename);

                nwritten += wb;

            } while (BLCKSZ > nwritten);

            /*
             * When checksum calculation not needed, we're done, otherwise
             * read the block and pass it to the checksum calculation.
             */
            if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
                continue;

            read_block(s, offsetmap[i], buffer);

            if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
                pg_fatal("could not update checksum of file \"%s\"",
                         output_filename);
#else
            pg_fatal("copy_file_range not supported on this platform");
#endif
        }
    }

    /* Debugging output. */
    if (zero_blocks > 0)
    {
        if (dry_run)
            pg_log_debug("would have zero-filled %u blocks", zero_blocks);
        else
            pg_log_debug("zero-filled %u blocks", zero_blocks);
    }

    /* Close the output file. */
    if (wfd >= 0 && close(wfd) != 0)
        pg_fatal("could not close \"%s\": %m", output_filename);
}
/*
 * Append one block to the output file and feed it to the checksum
 * calculation.
 *
 * buffer must hold BLCKSZ bytes. output_filename is used only in error
 * messages. A failed or short write, or a checksum update failure, is a
 * fatal error.
 */
static void
write_block(int fd, char *output_filename,
            uint8 *buffer, pg_checksum_context *checksum_ctx)
{
    int         nwritten = write(fd, buffer, BLCKSZ);

    if (nwritten < 0)
        pg_fatal("could not write file \"%s\": %m", output_filename);

    if (nwritten != BLCKSZ)
        pg_fatal("could not write file \"%s\": wrote only %d of %d bytes",
                 output_filename, nwritten, BLCKSZ);

    /* The block is on its way to disk; now account for it in the checksum. */
    if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
        pg_fatal("could not update checksum of file \"%s\"",
                 output_filename);
}
/*
 * Read one block (BLCKSZ bytes) from the given source file, starting at
 * byte offset off, into the buffer.
 *
 * A failed or short read is a fatal error.
 */
static void
read_block(rfile *s, off_t off, uint8 *buffer)
{
    int         nread = pg_pread(s->fd, buffer, BLCKSZ, off);

    if (nread == BLCKSZ)
        return;

    if (nread < 0)
        pg_fatal("could not read file \"%s\": %m", s->filename);
    else
        pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %llu",
                 s->filename, nread, BLCKSZ,
                 (unsigned long long) off);
}