/*------------------------------------------------------------------------- * * reconstruct.c * Reconstruct full file from incremental file and backup chain. * * Copyright (c) 2017-2024, PostgreSQL Global Development Group * * IDENTIFICATION * src/bin/pg_combinebackup/reconstruct.c * *------------------------------------------------------------------------- */ #include "postgres_fe.h" #include #include "backup/basebackup_incremental.h" #include "common/file_perm.h" #include "common/logging.h" #include "copy_file.h" #include "lib/stringinfo.h" #include "reconstruct.h" #include "storage/block.h" /* * An rfile stores the data that we need in order to be able to use some file * on disk for reconstruction. For any given output file, we create one rfile * per backup that we need to consult when we are constructing that output file. * * If we find a full version of the file in the backup chain, then only * filename and fd are initialized; the remaining fields are 0 or NULL. * For an incremental file, header_length, num_blocks, relative_block_numbers, * and truncation_block_length are also set. * * num_blocks_read and highest_offset_read always start out as 0. 
*/ typedef struct rfile { char *filename; int fd; size_t header_length; unsigned num_blocks; BlockNumber *relative_block_numbers; unsigned truncation_block_length; unsigned num_blocks_read; off_t highest_offset_read; } rfile; static void debug_reconstruction(int n_source, rfile **sources, bool dry_run); static unsigned find_reconstructed_block_length(rfile *s); static rfile *make_incremental_rfile(char *filename); static rfile *make_rfile(char *filename, bool missing_ok); static void write_reconstructed_file(char *input_filename, char *output_filename, unsigned block_length, rfile **sourcemap, off_t *offsetmap, pg_checksum_context *checksum_ctx, CopyMethod copy_method, bool debug, bool dry_run); static void read_bytes(rfile *rf, void *buffer, unsigned length); static void write_block(int wfd, char *output_filename, uint8 *buffer, pg_checksum_context *checksum_ctx); static void read_block(rfile *s, off_t off, uint8 *buffer); /* * Reconstruct a full file from an incremental file and a chain of prior * backups. * * input_filename should be the path to the incremental file, and * output_filename should be the path where the reconstructed file is to be * written. * * relative_path should be the relative path to the directory containing this * file. bare_file_name should be the name of the file within that directory, * without "INCREMENTAL.". * * n_prior_backups is the number of prior backups, and prior_backup_dirs is * an array of pathnames where those backups can be found. 
*/
void
reconstruct_from_incremental_file(char *input_filename,
                                  char *output_filename,
                                  char *relative_path,
                                  char *bare_file_name,
                                  int n_prior_backups,
                                  char **prior_backup_dirs,
                                  manifest_data **manifests,
                                  char *manifest_path,
                                  pg_checksum_type checksum_type,
                                  int *checksum_length,
                                  uint8 **checksum_payload,
                                  CopyMethod copy_method,
                                  bool debug,
                                  bool dry_run)
{
    rfile     **source;
    rfile      *latest_source = NULL;
    rfile     **sourcemap;
    off_t      *offsetmap;
    unsigned    block_length;
    unsigned    i;
    unsigned    sidx = n_prior_backups;
    bool        full_copy_possible = true;
    int         copy_source_index = -1;
    rfile      *copy_source = NULL;
    pg_checksum_context checksum_ctx;

    /*
     * Every block must come either from the latest version of the file or
     * from one of the prior backups.  source[] has one slot per prior backup
     * plus a final slot for the latest incremental file; slots for backups
     * we never needed to open stay NULL.
     */
    source = pg_malloc0(sizeof(rfile *) * (1 + n_prior_backups));

    /*
     * Use the information from the latest incremental file to figure out how
     * long the reconstructed file should be.
     */
    latest_source = make_incremental_rfile(input_filename);
    source[n_prior_backups] = latest_source;
    block_length = find_reconstructed_block_length(latest_source);

    /*
     * For each block in the output file, we need to know from which file we
     * need to obtain it and at what offset in that file it's stored.
     * sourcemap gives us the first of these things, and offsetmap the latter.
     * A NULL sourcemap entry means that no source has been found (yet).
     */
    sourcemap = pg_malloc0(sizeof(rfile *) * block_length);
    offsetmap = pg_malloc0(sizeof(off_t) * block_length);

    /*
     * Every block that is present in the newest incremental file should be
     * sourced from that file. If it precedes the truncation_block_length,
     * it's a block that we would otherwise have had to find in an older
     * backup and thus reduces the number of blocks remaining to be found by
     * one; otherwise, it's an extra block that needs to be included in the
     * output but would not have needed to be found in an older backup if it
     * had not been present.
     */
    for (i = 0; i < latest_source->num_blocks; ++i)
    {
        BlockNumber b = latest_source->relative_block_numbers[i];

        Assert(b < block_length);
        sourcemap[b] = latest_source;

        /*
         * Widen to off_t before multiplying, so that the byte offset cannot
         * wrap around in 32-bit arithmetic when segments are large.
         */
        offsetmap[b] = latest_source->header_length + (off_t) i * BLCKSZ;

        /*
         * A full copy of a file from an earlier backup is only possible if no
         * blocks are needed from any later incremental file.
         */
        full_copy_possible = false;
    }

    while (1)
    {
        char        source_filename[MAXPGPATH];
        rfile      *s;

        /*
         * Move to the next backup in the chain. If there are no more, then
         * we're done.
         */
        if (sidx == 0)
            break;
        --sidx;

        /*
         * Look for the full file in the previous backup. If not found, then
         * look for an incremental file instead.
         */
        snprintf(source_filename, MAXPGPATH, "%s/%s/%s",
                 prior_backup_dirs[sidx], relative_path, bare_file_name);
        if ((s = make_rfile(source_filename, true)) == NULL)
        {
            snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s",
                     prior_backup_dirs[sidx], relative_path, bare_file_name);
            s = make_incremental_rfile(source_filename);
        }
        source[sidx] = s;

        /*
         * If s->header_length == 0, then this is a full file; otherwise, it's
         * an incremental file.
         */
        if (s->header_length == 0)
        {
            struct stat sb;
            BlockNumber b;
            BlockNumber blocklength;

            /* We need to know the length of the file. */
            if (fstat(s->fd, &sb) < 0)
                pg_fatal("could not stat \"%s\": %m", s->filename);

            /*
             * Since we found a full file, source all blocks from it that
             * exist in the file.
             *
             * Note that there may be blocks that don't exist either in this
             * file or in any incremental file but that precede
             * truncation_block_length. These are, presumably, zero-filled
             * blocks that result from the server extending the file but
             * taking no action on those blocks that generated any WAL.
             *
             * Sadly, we have no way of validating that this is really what
             * happened, and neither does the server.  From its perspective,
             * an unmodified block that contains data looks exactly the same
             * as a zero-filled block that never had any data: either way,
             * it's not mentioned in any WAL summary and the server has no
             * reason to read it. From our perspective, all we know is that
             * nobody had a reason to back up the block. That certainly means
             * that the block didn't exist at the time of the full backup, but
             * the supposition that it was all zeroes at the time of every
             * later backup is one that we can't validate.
             */
            blocklength = sb.st_size / BLCKSZ;
            for (b = 0; b < latest_source->truncation_block_length; ++b)
            {
                if (sourcemap[b] == NULL && b < blocklength)
                {
                    sourcemap[b] = s;
                    /* Widen first; see the comment on the latest source. */
                    offsetmap[b] = (off_t) b * BLCKSZ;
                }
            }

            /*
             * If a full copy looks possible, check whether the resulting file
             * should be exactly as long as the source file is. If so, a full
             * copy is acceptable, otherwise not.
             */
            if (full_copy_possible)
            {
                uint64      expected_length;

                expected_length =
                    (uint64) latest_source->truncation_block_length;
                expected_length *= BLCKSZ;
                if (expected_length == sb.st_size)
                {
                    copy_source = s;
                    copy_source_index = sidx;
                }
            }

            /* We don't need to consider any further sources. */
            break;
        }

        /*
         * Since we found another incremental file, source all blocks from it
         * that we need but don't yet have.
         */
        for (i = 0; i < s->num_blocks; ++i)
        {
            BlockNumber b = s->relative_block_numbers[i];

            if (b < latest_source->truncation_block_length &&
                sourcemap[b] == NULL)
            {
                sourcemap[b] = s;
                offsetmap[b] = s->header_length + (off_t) i * BLCKSZ;

                /*
                 * A full copy of a file from an earlier backup is only
                 * possible if no blocks are needed from any later incremental
                 * file.
                 */
                full_copy_possible = false;
            }
        }
    }

    /*
     * If a checksum of the required type already exists in the
     * backup_manifest for the relevant input directory, we can save some work
     * by reusing that checksum instead of computing a new one.
     */
    if (copy_source_index >= 0 && manifests[copy_source_index] != NULL &&
        checksum_type != CHECKSUM_TYPE_NONE)
    {
        manifest_file *mfile;

        mfile = manifest_files_lookup(manifests[copy_source_index]->files,
                                      manifest_path);
        if (mfile == NULL)
        {
            char       *path = psprintf("%s/backup_manifest",
                                        prior_backup_dirs[copy_source_index]);

            /*
             * The directory is out of sync with the backup_manifest, so emit
             * a warning.
             */
            /*- translator: the first %s is a backup manifest file, the second is a file absent therein */
            pg_log_warning("\"%s\" contains no entry for \"%s\"",
                           path,
                           manifest_path);
            pfree(path);
        }
        else if (mfile->checksum_type == checksum_type)
        {
            *checksum_length = mfile->checksum_length;
            *checksum_payload = pg_malloc(*checksum_length);
            memcpy(*checksum_payload, mfile->checksum_payload,
                   *checksum_length);

            /* The checksum is settled, so don't compute another one below. */
            checksum_type = CHECKSUM_TYPE_NONE;
        }
    }

    /* Prepare for checksum calculation, if required. */
    pg_checksum_init(&checksum_ctx, checksum_type);

    /*
     * If the full file can be created by copying a file from an older backup
     * in the chain without needing to overwrite any blocks or truncate the
     * result, then forget about performing reconstruction and just copy that
     * file in its entirety.
     *
     * Otherwise, reconstruct.
     */
    if (copy_source != NULL)
        copy_file(copy_source->filename, output_filename,
                  &checksum_ctx, copy_method, dry_run);
    else
    {
        write_reconstructed_file(input_filename, output_filename,
                                 block_length, sourcemap, offsetmap,
                                 &checksum_ctx, copy_method,
                                 debug, dry_run);
        debug_reconstruction(n_prior_backups + 1, source, dry_run);
    }

    /* Save results of checksum calculation. */
    if (checksum_type != CHECKSUM_TYPE_NONE)
    {
        *checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
        *checksum_length = pg_checksum_final(&checksum_ctx,
                                             *checksum_payload);
    }

    /*
     * Close files and release memory.  Each rfile owns its filename and its
     * block-number array, and the rfile itself was separately allocated, so
     * all three must be released.
     */
    for (i = 0; i <= n_prior_backups; ++i)
    {
        rfile      *s = source[i];

        if (s == NULL)
            continue;
        if (close(s->fd) != 0)
            pg_fatal("could not close \"%s\": %m", s->filename);
        if (s->relative_block_numbers != NULL)
            pfree(s->relative_block_numbers);
        pg_free(s->filename);
        pg_free(s);
    }
    pfree(sourcemap);
    pfree(offsetmap);
    pfree(source);
}

/*
 * Perform post-reconstruction logging and sanity checks.
 *
 * sources is an array of n_source rfile pointers, some of which may be NULL.
 * For every source that contributed at least one block, log how many blocks
 * were (or, in dry-run mode, would have been) read from it.  In dry-run mode,
 * additionally verify that each such file is long enough to have supplied
 * the data.
 */
static void
debug_reconstruction(int n_source, rfile **sources, bool dry_run)
{
    int         i;

    for (i = 0; i < n_source; ++i)
    {
        rfile      *s = sources[i];

        /* Ignore source if not used. */
        if (s == NULL)
            continue;

        /* If no data is needed from this file, we can ignore it. */
        if (s->num_blocks_read == 0)
            continue;

        /* Debug logging. */
        if (dry_run)
            pg_log_debug("would have read %u blocks from \"%s\"",
                         s->num_blocks_read, s->filename);
        else
            pg_log_debug("read %u blocks from \"%s\"",
                         s->num_blocks_read, s->filename);

        /*
         * In dry-run mode, we don't actually try to read data from the file,
         * but we do try to verify that the file is long enough that we could
         * have read the data if we'd tried.
         *
         * If this fails, then it means that a non-dry-run attempt would fail,
         * complaining of not being able to read the required bytes from the
         * file.
         */
        if (dry_run)
        {
            struct stat sb;

            if (fstat(s->fd, &sb) < 0)
                pg_fatal("could not stat \"%s\": %m", s->filename);
            if (sb.st_size < s->highest_offset_read)
                pg_fatal("file \"%s\" is too short: expected %llu, found %llu",
                         s->filename,
                         (unsigned long long) s->highest_offset_read,
                         (unsigned long long) sb.st_size);
        }
    }
}

/*
 * When we perform reconstruction using an incremental file, the output file
 * should be at least as long as the truncation_block_length. Any blocks
 * present in the incremental file increase the output length as far as is
 * necessary to include those blocks.
*/ static unsigned find_reconstructed_block_length(rfile *s) { unsigned block_length = s->truncation_block_length; unsigned i; for (i = 0; i < s->num_blocks; ++i) if (s->relative_block_numbers[i] >= block_length) block_length = s->relative_block_numbers[i] + 1; return block_length; } /* * Initialize an incremental rfile, reading the header so that we know which * blocks it contains. */ static rfile * make_incremental_rfile(char *filename) { rfile *rf; unsigned magic; rf = make_rfile(filename, false); /* Read and validate magic number. */ read_bytes(rf, &magic, sizeof(magic)); if (magic != INCREMENTAL_MAGIC) pg_fatal("file \"%s\" has bad incremental magic number (0x%x not 0x%x)", filename, magic, INCREMENTAL_MAGIC); /* Read block count. */ read_bytes(rf, &rf->num_blocks, sizeof(rf->num_blocks)); if (rf->num_blocks > RELSEG_SIZE) pg_fatal("file \"%s\" has block count %u in excess of segment size %u", filename, rf->num_blocks, RELSEG_SIZE); /* Read truncation block length. */ read_bytes(rf, &rf->truncation_block_length, sizeof(rf->truncation_block_length)); if (rf->truncation_block_length > RELSEG_SIZE) pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u", filename, rf->truncation_block_length, RELSEG_SIZE); /* Read block numbers if there are any. */ if (rf->num_blocks > 0) { rf->relative_block_numbers = pg_malloc0(sizeof(BlockNumber) * rf->num_blocks); read_bytes(rf, rf->relative_block_numbers, sizeof(BlockNumber) * rf->num_blocks); } /* Remember length of header. */ rf->header_length = sizeof(magic) + sizeof(rf->num_blocks) + sizeof(rf->truncation_block_length) + sizeof(BlockNumber) * rf->num_blocks; /* * Round header length to a multiple of BLCKSZ, so that blocks contents * are properly aligned. Only do this when the file actually has data for * some blocks. 
*/ if ((rf->num_blocks > 0) && ((rf->header_length % BLCKSZ) != 0)) rf->header_length += (BLCKSZ - (rf->header_length % BLCKSZ)); return rf; } /* * Allocate and perform basic initialization of an rfile. */ static rfile * make_rfile(char *filename, bool missing_ok) { rfile *rf; rf = pg_malloc0(sizeof(rfile)); rf->filename = pstrdup(filename); if ((rf->fd = open(filename, O_RDONLY | PG_BINARY, 0)) < 0) { if (missing_ok && errno == ENOENT) { pg_free(rf); return NULL; } pg_fatal("could not open file \"%s\": %m", filename); } return rf; } /* * Read the indicated number of bytes from an rfile into the buffer. */ static void read_bytes(rfile *rf, void *buffer, unsigned length) { int rb = read(rf->fd, buffer, length); if (rb != length) { if (rb < 0) pg_fatal("could not read file \"%s\": %m", rf->filename); else pg_fatal("could not read file \"%s\": read only %d of %u bytes", rf->filename, rb, length); } } /* * Write out a reconstructed file. */ static void write_reconstructed_file(char *input_filename, char *output_filename, unsigned block_length, rfile **sourcemap, off_t *offsetmap, pg_checksum_context *checksum_ctx, CopyMethod copy_method, bool debug, bool dry_run) { int wfd = -1; unsigned i; unsigned zero_blocks = 0; /* Debugging output. */ if (debug) { StringInfoData debug_buf; unsigned start_of_range = 0; unsigned current_block = 0; /* Basic information about the output file to be produced. */ if (dry_run) pg_log_debug("would reconstruct \"%s\" (%u blocks, checksum %s)", output_filename, block_length, pg_checksum_type_name(checksum_ctx->type)); else pg_log_debug("reconstructing \"%s\" (%u blocks, checksum %s)", output_filename, block_length, pg_checksum_type_name(checksum_ctx->type)); /* Print out the plan for reconstructing this file. */ initStringInfo(&debug_buf); while (current_block < block_length) { rfile *s = sourcemap[current_block]; /* Extend range, if possible. 
*/ if (current_block + 1 < block_length && s == sourcemap[current_block + 1]) { ++current_block; continue; } /* Add details about this range. */ if (s == NULL) { if (current_block == start_of_range) appendStringInfo(&debug_buf, " %u:zero", current_block); else appendStringInfo(&debug_buf, " %u-%u:zero", start_of_range, current_block); } else { if (current_block == start_of_range) appendStringInfo(&debug_buf, " %u:%s@" UINT64_FORMAT, current_block, s->filename, (uint64) offsetmap[current_block]); else appendStringInfo(&debug_buf, " %u-%u:%s@" UINT64_FORMAT, start_of_range, current_block, s->filename, (uint64) offsetmap[current_block]); } /* Begin new range. */ start_of_range = ++current_block; /* If the output is very long or we are done, dump it now. */ if (current_block == block_length || debug_buf.len > 1024) { pg_log_debug("reconstruction plan:%s", debug_buf.data); resetStringInfo(&debug_buf); } } /* Free memory. */ pfree(debug_buf.data); } /* Open the output file, except in dry_run mode. */ if (!dry_run && (wfd = open(output_filename, O_RDWR | PG_BINARY | O_CREAT | O_EXCL, pg_file_create_mode)) < 0) pg_fatal("could not open file \"%s\": %m", output_filename); /* Read and write the blocks as required. */ for (i = 0; i < block_length; ++i) { uint8 buffer[BLCKSZ]; rfile *s = sourcemap[i]; /* Update accounting information. */ if (s == NULL) ++zero_blocks; else { s->num_blocks_read++; s->highest_offset_read = Max(s->highest_offset_read, offsetmap[i] + BLCKSZ); } /* Skip the rest of this in dry-run mode. */ if (dry_run) continue; /* Read or zero-fill the block as appropriate. */ if (s == NULL) { /* * New block not mentioned in the WAL summary. Should have been an * uninitialized block, so just zero-fill it. */ memset(buffer, 0, BLCKSZ); /* Write out the block, update the checksum if needed. */ write_block(wfd, output_filename, buffer, checksum_ctx); /* Nothing else to do for zero-filled blocks. */ continue; } /* Copy the block using the appropriate copy method. 
*/ if (copy_method != COPY_METHOD_COPY_FILE_RANGE) { /* * Read the block from the correct source file, and then write it * out, possibly with a checksum update. */ read_block(s, offsetmap[i], buffer); write_block(wfd, output_filename, buffer, checksum_ctx); } else /* use copy_file_range */ { /* copy_file_range modifies the offset, so use a local copy */ off_t off = offsetmap[i]; size_t nwritten = 0; /* * Retry until we've written all the bytes (the offset is updated * by copy_file_range, and so is the wfd file offset). */ do { int wb; wb = copy_file_range(s->fd, &off, wfd, NULL, BLCKSZ - nwritten, 0); if (wb < 0) pg_fatal("error while copying file range from \"%s\" to \"%s\": %m", input_filename, output_filename); nwritten += wb; } while (BLCKSZ > nwritten); /* * When checksum calculation not needed, we're done, otherwise * read the block and pass it to the checksum calculation. */ if (checksum_ctx->type == CHECKSUM_TYPE_NONE) continue; read_block(s, offsetmap[i], buffer); if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0) pg_fatal("could not update checksum of file \"%s\"", output_filename); } } /* Debugging output. */ if (zero_blocks > 0) { if (dry_run) pg_log_debug("would have zero-filled %u blocks", zero_blocks); else pg_log_debug("zero-filled %u blocks", zero_blocks); } /* Close the output file. */ if (wfd >= 0 && close(wfd) != 0) pg_fatal("could not close \"%s\": %m", output_filename); } /* * Write the block into the file (using the file descriptor), and * if needed update the checksum calculation. * * The buffer is expected to contain BLCKSZ bytes. The filename is * provided only for the error message. 
*/
static void
write_block(int fd, char *output_filename,
            uint8 *buffer, pg_checksum_context *checksum_ctx)
{
    int         nbytes = write(fd, buffer, BLCKSZ);

    /* Distinguish an outright failure from a partial write. */
    if (nbytes < 0)
        pg_fatal("could not write file \"%s\": %m", output_filename);
    if (nbytes != BLCKSZ)
        pg_fatal("could not write file \"%s\": wrote only %d of %d bytes",
                 output_filename, nbytes, BLCKSZ);

    /* Fold the block that was just written into the running checksum. */
    if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
        pg_fatal("could not update checksum of file \"%s\"",
                 output_filename);
}

/*
 * Read one block (BLCKSZ bytes) from the given source file, starting at byte
 * offset off, into the buffer.
 *
 * A read error or a short read is fatal.
 */
static void
read_block(rfile *s, off_t off, uint8 *buffer)
{
    int         nbytes = pg_pread(s->fd, buffer, BLCKSZ, off);

    /* Distinguish an outright failure from a partial read. */
    if (nbytes < 0)
        pg_fatal("could not read file \"%s\": %m", s->filename);
    if (nbytes != BLCKSZ)
        pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %llu",
                 s->filename, nbytes, BLCKSZ,
                 (unsigned long long) off);
}