/*-------------------------------------------------------------------------
 *
 * basebackup_incremental.c
 *	  code for incremental backup support
 *
 * This code isn't actually in charge of taking an incremental backup;
 * the actual construction of the incremental backup happens in
 * basebackup.c. Here, we're concerned with providing the necessary
 * supports for that operation. In particular, we need to parse the
 * backup manifest supplied by the user taking the incremental backup
 * and extract the required information from it.
 *
 * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/backup/basebackup_incremental.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/timeline.h"
#include "access/xlog.h"
#include "access/xlogrecovery.h"
#include "backup/basebackup_incremental.h"
#include "backup/walsummary.h"
#include "common/blkreftable.h"
#include "common/parse_manifest.h"
#include "common/hashfn.h"
#include "common/int.h"
#include "postmaster/walsummarizer.h"

/*
 * Maximum number of block numbers fetched from a block reference table
 * reader per call while merging WAL summary files.
 */
#define	BLOCKS_PER_READ			512

/*
 * Details extracted from the WAL ranges present in the supplied backup manifest.
 */
typedef struct
{
	TimeLineID	tli;
	XLogRecPtr	start_lsn;
	XLogRecPtr	end_lsn;
} backup_wal_range;

/*
 * Details extracted from the file list present in the supplied backup manifest.
 */
typedef struct
{
	uint32		status;			/* hash table bookkeeping (simplehash) */
	const char *path;			/* hash key: pathname from the manifest */
	size_t		size;			/* file size from the manifest */
} backup_file_entry;

static uint32 hash_string_pointer(const char *s);

/*
 * Instantiate a simplehash table ("backup_file_*" functions) mapping
 * manifest pathnames to backup_file_entry objects. Keys are compared with
 * strcmp and hashed with hash_string_pointer.
 */
#define SH_PREFIX		backup_file
#define SH_ELEMENT_TYPE	backup_file_entry
#define SH_KEY_TYPE		const char *
#define	SH_KEY			path
#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
#define SH_SCOPE		static inline
#define SH_DECLARE
#define SH_DEFINE
#include "lib/simplehash.h"

struct IncrementalBackupInfo
{
	/* Memory context for this object and its subsidiary objects. */
	MemoryContext mcxt;

	/* Temporary buffer for storing the manifest while parsing it. */
	StringInfoData buf;

	/* WAL ranges extracted from the backup manifest. */
	List	   *manifest_wal_ranges;

	/*
	 * Files extracted from the backup manifest.
	 *
	 * We don't really need this information, because we use WAL summaries to
	 * figure out what's changed. It would be unsafe to just rely on the list
	 * of files that existed before, because it's possible for a file to be
	 * removed and a new one created with the same name and different
	 * contents. In such cases, the whole file must still be sent. We can tell
	 * from the WAL summaries whether that happened, but not from the file
	 * list.
	 *
	 * Nonetheless, this data is useful for sanity checking. If a file that we
	 * think we shouldn't need to send is not present in the manifest for the
	 * prior backup, something has gone terribly wrong. We retain the file
	 * names and sizes, but not the checksums or last modified times, for
	 * which we have no use.
	 *
	 * One significant downside of storing this data is that it consumes
	 * memory. If that turns out to be a problem, we might have to decide not
	 * to retain this information, or to make it optional.
	 */
	backup_file_hash *manifest_files;

	/*
	 * Block-reference table for the incremental backup.
	 *
	 * It's possible that storing the entire block-reference table in memory
	 * will be a problem for some users. The in-memory format that we're using
	 * here is pretty efficient, converging to little more than 1 bit per
	 * block for relation forks with large numbers of modified blocks. It's
	 * possible, however, that if you try to perform an incremental backup of
	 * a database with a sufficiently large number of relations on a
	 * sufficiently small machine, you could run out of memory here. If that
	 * turns out to be a problem in practice, we'll need to be more clever.
	 */
	BlockRefTable *brtab;
};

static void manifest_process_file(JsonManifestParseContext *context,
								  char *pathname,
								  size_t size,
								  pg_checksum_type checksum_type,
								  int checksum_length,
								  uint8 *checksum_payload);
static void manifest_process_wal_range(JsonManifestParseContext *context,
									   TimeLineID tli,
									   XLogRecPtr start_lsn,
									   XLogRecPtr end_lsn);
static void manifest_report_error(JsonManifestParseContext *context,
								  const char *fmt,...)
			pg_attribute_printf(2, 3) pg_attribute_noreturn();
static int	compare_block_numbers(const void *a, const void *b);

/*
 * Create a new object for storing information extracted from the manifest
 * supplied when creating an incremental backup.
 *
 * The object, its manifest buffer, and its file hash table are all allocated
 * in the supplied memory context, which is remembered for later allocations.
 */
IncrementalBackupInfo *
CreateIncrementalBackupInfo(MemoryContext mcxt)
{
	IncrementalBackupInfo *ib;
	MemoryContext oldcontext;

	oldcontext = MemoryContextSwitchTo(mcxt);

	ib = palloc0(sizeof(IncrementalBackupInfo));
	ib->mcxt = mcxt;
	initStringInfo(&ib->buf);

	/*
	 * It's hard to guess how many files a "typical" installation will have in
	 * the data directory, but a fresh initdb creates almost 1000 files as of
	 * this writing, so it seems to make sense for our estimate to be
	 * substantially higher.
	 */
	ib->manifest_files = backup_file_create(mcxt, 10000, NULL);

	MemoryContextSwitchTo(oldcontext);

	return ib;
}

/*
 * Before taking an incremental backup, the caller must supply the backup
 * manifest from a prior backup. Each chunk of manifest data received
 * from the client should be passed to this function.
 */
void
AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data,
							  int len)
{
	MemoryContext oldcontext;

	/* Switch to our memory context. */
	oldcontext = MemoryContextSwitchTo(ib->mcxt);

	/*
	 * XXX. Our json parser is at present incapable of parsing json blobs
	 * incrementally, so we have to accumulate the entire backup manifest
	 * before we can do anything with it. This should really be fixed, since
	 * some users might have very large numbers of files in the data
	 * directory.
	 */
	appendBinaryStringInfo(&ib->buf, data, len);

	/* Switch back to previous memory context. */
	MemoryContextSwitchTo(oldcontext);
}

/*
 * Finalize an IncrementalBackupInfo object after all manifest data has
 * been supplied via calls to AppendIncrementalManifestData.
 *
 * The accumulated manifest is parsed, which populates the file hash table
 * and the list of WAL ranges through the callbacks registered below, and
 * then the raw manifest buffer is released.
 */
void
FinalizeIncrementalManifest(IncrementalBackupInfo *ib)
{
	JsonManifestParseContext context;
	MemoryContext oldcontext;

	/* Switch to our memory context. */
	oldcontext = MemoryContextSwitchTo(ib->mcxt);

	/* Parse the manifest. */
	context.private_data = ib;
	context.per_file_cb = manifest_process_file;
	context.per_wal_range_cb = manifest_process_wal_range;
	context.error_cb = manifest_report_error;
	json_parse_manifest(&context, ib->buf.data, ib->buf.len);

	/*
	 * Done with the buffer, so release memory. Other functions in this file
	 * assert that buf.data is NULL to verify that finalization has happened.
	 */
	pfree(ib->buf.data);
	ib->buf.data = NULL;

	/* Switch back to previous memory context. */
	MemoryContextSwitchTo(oldcontext);
}

/*
 * Prepare to take an incremental backup.
 *
 * Before this function is called, AppendIncrementalManifestData and
 * FinalizeIncrementalManifest should have already been called to pass all
 * the manifest data to this object.
 *
 * This function performs sanity checks on the data extracted from the
 * manifest and figures out for which WAL ranges we need summaries, and
 * whether those summaries are available. Then, it reads and combines the
 * data from those summary files. It also updates the backup_state with the
 * reference TLI and LSN for the prior backup.
 */
void
PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
							BackupState *backup_state)
{
	MemoryContext oldcontext;
	List	   *expectedTLEs;
	List	   *all_wslist,
			   *required_wslist = NIL;
	ListCell   *lc;
	TimeLineHistoryEntry **tlep;
	int			num_wal_ranges;
	int			i;
	bool		found_backup_start_tli = false;
	TimeLineID	earliest_wal_range_tli = 0;
	XLogRecPtr	earliest_wal_range_start_lsn = InvalidXLogRecPtr;
	TimeLineID	latest_wal_range_tli = 0;
	XLogRecPtr	summarized_lsn;
	XLogRecPtr	pending_lsn;
	XLogRecPtr	prior_pending_lsn = InvalidXLogRecPtr;
	int			deadcycles = 0;
	TimestampTz initial_time,
				current_time;

	Assert(ib->buf.data == NULL);

	/* Switch to our memory context. */
	oldcontext = MemoryContextSwitchTo(ib->mcxt);

	/*
	 * A valid backup manifest must always contain at least one WAL range
	 * (usually exactly one, unless the backup spanned a timeline switch).
	 */
	num_wal_ranges = list_length(ib->manifest_wal_ranges);
	if (num_wal_ranges == 0)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("manifest contains no required WAL ranges")));

	/*
	 * Match up the TLIs that appear in the WAL ranges of the backup manifest
	 * with those that appear in this server's timeline history. We expect
	 * every backup_wal_range to match to a TimeLineHistoryEntry; if it does
	 * not, that's an error.
	 *
	 * This loop also decides which of the WAL ranges in the manifest is most
	 * ancient and which one is the newest, according to the timeline history
	 * of this server, and stores TLIs of those WAL ranges into
	 * earliest_wal_range_tli and latest_wal_range_tli. It also updates
	 * earliest_wal_range_start_lsn to the start LSN of the WAL range for
	 * earliest_wal_range_tli.
	 *
	 * Note that the return value of readTimeLineHistory puts the latest
	 * timeline at the beginning of the list, not the end. Hence, the earliest
	 * TLI is the one that occurs nearest the end of the list returned by
	 * readTimeLineHistory, and the latest TLI is the one that occurs closest
	 * to the beginning.
	 */
	expectedTLEs = readTimeLineHistory(backup_state->starttli);
	tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *));
	for (i = 0; i < num_wal_ranges; ++i)
	{
		backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
		bool		saw_earliest_wal_range_tli = false;
		bool		saw_latest_wal_range_tli = false;

		/* Search this server's history for this WAL range's TLI. */
		foreach(lc, expectedTLEs)
		{
			TimeLineHistoryEntry *tle = lfirst(lc);

			if (tle->tli == range->tli)
			{
				tlep[i] = tle;
				break;
			}

			if (tle->tli == earliest_wal_range_tli)
				saw_earliest_wal_range_tli = true;
			if (tle->tli == latest_wal_range_tli)
				saw_latest_wal_range_tli = true;
		}

		/*
		 * An incremental backup can only be taken relative to a backup that
		 * represents a previous state of this server. If the backup requires
		 * WAL from a timeline that's not in our history, that definitely
		 * isn't the case.
		 */
		if (tlep[i] == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("timeline %u found in manifest, but not in this server's history",
							range->tli)));

		/*
		 * If we found this TLI in the server's history before encountering
		 * the latest TLI seen so far in the server's history, then this TLI
		 * is the latest one seen so far.
		 *
		 * If on the other hand we saw the earliest TLI seen so far before
		 * finding this TLI, this TLI is earlier than the earliest one seen so
		 * far. And if this is the first TLI for which we've searched, it's
		 * also the earliest one seen so far.
		 *
		 * On the first loop iteration, both things should necessarily be
		 * true.
		 */
		if (!saw_latest_wal_range_tli)
			latest_wal_range_tli = range->tli;
		if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli)
		{
			earliest_wal_range_tli = range->tli;
			earliest_wal_range_start_lsn = range->start_lsn;
		}
	}

	/*
	 * Propagate information about the prior backup into the backup_label that
	 * will be generated for this backup.
	 */
	backup_state->istartpoint = earliest_wal_range_start_lsn;
	backup_state->istarttli = earliest_wal_range_tli;

	/*
	 * Sanity check start and end LSNs for the WAL ranges in the manifest.
	 *
	 * Commonly, there won't be any timeline switches during the prior backup
	 * at all, but if there are, they should happen at the same LSNs that this
	 * server switched timelines.
	 *
	 * Whether there are any timeline switches during the prior backup or not,
	 * the prior backup shouldn't require any WAL from a timeline prior to the
	 * start of that timeline. It also shouldn't require any WAL from later
	 * than the start of this backup.
	 *
	 * If any of these sanity checks fail, one possible explanation is that
	 * the user has generated WAL on the same timeline with the same LSNs more
	 * than once. For instance, if two standbys running on timeline 1 were
	 * both promoted and (due to a broken archiving setup) both selected new
	 * timeline ID 2, then it's possible that one of these checks might trip.
	 *
	 * Note that there are lots of ways for the user to do something very bad
	 * without tripping any of these checks, and they are not intended to be
	 * comprehensive. It's pretty hard to see how we could be certain of
	 * anything here. However, if there's a problem staring us right in the
	 * face, it's best to report it, so we do.
	 */
	for (i = 0; i < num_wal_ranges; ++i)
	{
		backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);

		if (range->tli == earliest_wal_range_tli)
		{
			if (range->start_lsn < tlep[i]->begin)
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X",
								range->tli,
								LSN_FORMAT_ARGS(range->start_lsn),
								LSN_FORMAT_ARGS(tlep[i]->begin))));
		}
		else
		{
			if (range->start_lsn != tlep[i]->begin)
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X",
								range->tli,
								LSN_FORMAT_ARGS(range->start_lsn),
								LSN_FORMAT_ARGS(tlep[i]->begin))));
		}

		if (range->tli == latest_wal_range_tli)
		{
			if (range->end_lsn > backup_state->startpoint)
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X",
								range->tli,
								LSN_FORMAT_ARGS(range->end_lsn),
								LSN_FORMAT_ARGS(backup_state->startpoint))));
		}
		else
		{
			if (range->end_lsn != tlep[i]->end)
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X",
								range->tli,
								LSN_FORMAT_ARGS(range->end_lsn),
								LSN_FORMAT_ARGS(tlep[i]->end))));
		}
	}

	/*
	 * Wait for WAL summarization to catch up to the backup start LSN (but
	 * time out if it doesn't do so quickly enough).
	 */
	initial_time = current_time = GetCurrentTimestamp();
	while (1)
	{
		long		timeout_in_ms = 10000;

		/*
		 * Signed, so that it matches the %d conversion used in the warning
		 * message below.
		 */
		int			elapsed_seconds;

		/*
		 * Align the wait time to prevent drift. This doesn't really matter,
		 * but we'd like the warnings about how long we've been waiting to say
		 * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
		 * drifting to something that is not a multiple of ten.
		 */
		timeout_in_ms -=
			TimestampDifferenceMilliseconds(initial_time, current_time) %
			timeout_in_ms;

		/* Wait for up to 10 seconds. */
		summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
												 timeout_in_ms, &pending_lsn);

		/* If WAL summarization has progressed sufficiently, stop waiting. */
		if (summarized_lsn >= backup_state->startpoint)
			break;

		/*
		 * Keep track of the number of cycles during which there has been no
		 * progression of pending_lsn. If pending_lsn is not advancing, that
		 * means that not only are no new files appearing on disk, but we're
		 * not even incorporating new records into the in-memory state.
		 */
		if (pending_lsn > prior_pending_lsn)
		{
			prior_pending_lsn = pending_lsn;
			deadcycles = 0;
		}
		else
			++deadcycles;

		/*
		 * If we've managed to wait for an entire minute without the WAL
		 * summarizer absorbing a single WAL record, error out; probably
		 * something is wrong.
		 *
		 * We could consider also erroring out if the summarizer is taking too
		 * long to catch up, but it's not clear what rate of progress would be
		 * acceptable and what would be too slow. So instead, we just try to
		 * error out in the case where there's no progress at all. That seems
		 * likely to catch a reasonable number of the things that can go wrong
		 * in practice (e.g. the summarizer process is completely hung, say
		 * because somebody hooked up a debugger to it or something) without
		 * giving up too quickly when the system is just slow.
		 */
		if (deadcycles >= 6)
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("WAL summarization is not progressing"),
					 errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
							   LSN_FORMAT_ARGS(backup_state->startpoint),
							   LSN_FORMAT_ARGS(summarized_lsn),
							   LSN_FORMAT_ARGS(pending_lsn))));

		/*
		 * Otherwise, just let the user know what's happening.
		 */
		current_time = GetCurrentTimestamp();
		elapsed_seconds = (int)
			(TimestampDifferenceMilliseconds(initial_time, current_time) / 1000);
		ereport(WARNING,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("still waiting for WAL summarization through %X/%X after %d seconds",
						LSN_FORMAT_ARGS(backup_state->startpoint),
						elapsed_seconds),
				 errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
						   LSN_FORMAT_ARGS(summarized_lsn),
						   LSN_FORMAT_ARGS(pending_lsn))));
	}

	/*
	 * Retrieve a list of all WAL summaries on any timeline that overlap with
	 * the LSN range of interest. We could instead call GetWalSummaries() once
	 * per timeline in the loop that follows, but that would involve reading
	 * the directory multiple times. It should be mildly faster - and perhaps
	 * a bit safer - to do it just once.
	 */
	all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn,
								 backup_state->startpoint);

	/*
	 * We need WAL summaries for everything that happened during the prior
	 * backup and everything that happened afterward up until the point where
	 * the current backup started.
	 */
	foreach(lc, expectedTLEs)
	{
		TimeLineHistoryEntry *tle = lfirst(lc);
		XLogRecPtr	tli_start_lsn = tle->begin;
		XLogRecPtr	tli_end_lsn = tle->end;
		XLogRecPtr	tli_missing_lsn = InvalidXLogRecPtr;
		List	   *tli_wslist;

		/*
		 * Working through the history of this server from the current
		 * timeline backwards, we skip everything until we find the timeline
		 * where this backup started. Most of the time, this means we won't
		 * skip anything at all, as it's unlikely that the timeline has
		 * changed since the beginning of the backup moments ago.
		 */
		if (tle->tli == backup_state->starttli)
		{
			found_backup_start_tli = true;
			tli_end_lsn = backup_state->startpoint;
		}
		else if (!found_backup_start_tli)
			continue;

		/*
		 * Find the summaries that overlap the LSN range of interest for this
		 * timeline. If this is the earliest timeline involved, the range of
		 * interest begins with the start LSN of the prior backup; otherwise,
		 * it begins at the LSN at which this timeline came into existence. If
		 * this is the latest TLI involved, the range of interest ends at the
		 * start LSN of the current backup; otherwise, it ends at the point
		 * where we switched from this timeline to the next one.
		 */
		if (tle->tli == earliest_wal_range_tli)
			tli_start_lsn = earliest_wal_range_start_lsn;
		tli_wslist = FilterWalSummaries(all_wslist, tle->tli,
										tli_start_lsn, tli_end_lsn);

		/*
		 * There is no guarantee that the WAL summaries we found cover the
		 * entire range of LSNs for which summaries are required, or indeed
		 * that we found any WAL summaries at all. Check whether we have a
		 * problem of that sort.
		 */
		if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn,
									 &tli_missing_lsn))
		{
			if (XLogRecPtrIsInvalid(tli_missing_lsn))
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist",
								tle->tli,
								LSN_FORMAT_ARGS(tli_start_lsn),
								LSN_FORMAT_ARGS(tli_end_lsn))));
			else
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete",
								tle->tli,
								LSN_FORMAT_ARGS(tli_start_lsn),
								LSN_FORMAT_ARGS(tli_end_lsn)),
						 errdetail("The first unsummarized LSN in this range is %X/%X.",
								   LSN_FORMAT_ARGS(tli_missing_lsn))));
		}

		/*
		 * Remember that we need to read these summaries.
		 *
		 * Technically, it's possible that this could read more files than
		 * required, since tli_wslist in theory could contain redundant
		 * summaries. For instance, if we have a summary from 0/10000000 to
		 * 0/20000000 and also one from 0/00000000 to 0/30000000, then the
		 * latter subsumes the former and the former could be ignored.
		 *
		 * We ignore this possibility because the WAL summarizer only tries to
		 * generate summaries that do not overlap. If somehow they exist,
		 * we'll do a bit of extra work but the results should still be
		 * correct.
		 */
		required_wslist = list_concat(required_wslist, tli_wslist);

		/*
		 * Timelines earlier than the one in which the prior backup began are
		 * not relevant.
		 */
		if (tle->tli == earliest_wal_range_tli)
			break;
	}

	/*
	 * Read all of the required block reference table files and merge all of
	 * the data into a single in-memory block reference table.
	 *
	 * See the comments for struct IncrementalBackupInfo for some thoughts on
	 * memory usage.
	 */
	ib->brtab = CreateEmptyBlockRefTable();
	foreach(lc, required_wslist)
	{
		WalSummaryFile *ws = lfirst(lc);
		WalSummaryIO wsio;
		BlockRefTableReader *reader;
		RelFileLocator rlocator;
		ForkNumber	forknum;
		BlockNumber limit_block;
		BlockNumber blocks[BLOCKS_PER_READ];

		wsio.file = OpenWalSummaryFile(ws, false);
		wsio.filepos = 0;
		ereport(DEBUG1,
				(errmsg_internal("reading WAL summary file \"%s\"",
								 FilePathName(wsio.file))));
		reader = CreateBlockRefTableReader(ReadWalSummary, &wsio,
										   FilePathName(wsio.file),
										   ReportWalSummaryError, NULL);
		while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
											   &limit_block))
		{
			BlockRefTableSetLimitBlock(ib->brtab, &rlocator,
									   forknum, limit_block);

			/* Drain this relation fork's blocks, one batch at a time. */
			while (1)
			{
				unsigned	nblocks;
				unsigned	i;

				nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
													   BLOCKS_PER_READ);
				if (nblocks == 0)
					break;

				for (i = 0; i < nblocks; ++i)
					BlockRefTableMarkBlockModified(ib->brtab, &rlocator,
												   forknum, blocks[i]);
			}
		}
		DestroyBlockRefTableReader(reader);
		FileClose(wsio.file);
	}

	/* Switch back to previous memory context. */
	MemoryContextSwitchTo(oldcontext);
}

/*
 * Get the pathname that should be used when a file is sent incrementally.
 *
 * The incremental name is formed by prefixing the last component of the
 * relation's path with "INCREMENTAL." and, for segno > 0, appending the
 * segment number.
 *
 * The result is a palloc'd string.
 */
char *
GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
					   ForkNumber forknum, unsigned segno)
{
	char	   *path;
	char	   *lastslash;
	char	   *ipath;

	path = GetRelationPath(dboid, spcoid, relfilenumber, InvalidBackendId,
						   forknum);

	/* Split the path in place into directory part and file name. */
	lastslash = strrchr(path, '/');
	Assert(lastslash != NULL);
	*lastslash = '\0';

	if (segno > 0)
		ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno);
	else
		ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1);

	pfree(path);

	return ipath;
}

/*
 * How should we back up a particular file as part of an incremental backup?
 *
 * If the return value is BACK_UP_FILE_FULLY, caller should back up the whole
 * file just as if this were not an incremental backup.
 *
 * If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include
 * an incremental file in the backup instead of the entire file. On return,
 * *num_blocks_required will be set to the number of blocks that need to be
 * sent, and the actual block numbers will have been stored in
 * relative_block_numbers, which should be an array of at least RELSEG_SIZE.
 * In addition, *truncation_block_length will be set to the value that should
 * be included in the incremental file.
 */
FileBackupMethod
GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path,
					Oid dboid, Oid spcoid,
					RelFileNumber relfilenumber, ForkNumber forknum,
					unsigned segno, size_t size,
					unsigned *num_blocks_required,
					BlockNumber *relative_block_numbers,
					unsigned *truncation_block_length)
{
	BlockNumber absolute_block_numbers[RELSEG_SIZE];
	BlockNumber limit_block;
	BlockNumber start_blkno;
	BlockNumber stop_blkno;
	RelFileLocator rlocator;
	BlockRefTableEntry *brtentry;
	unsigned	i;
	unsigned	nblocks;

	/* Should only be called after PrepareForIncrementalBackup. */
	Assert(ib->buf.data == NULL);

	/*
	 * dboid could be InvalidOid if shared rel, but spcoid and relfilenumber
	 * should have legal values.
	 */
	Assert(OidIsValid(spcoid));
	Assert(RelFileNumberIsValid(relfilenumber));

	/*
	 * If the file size is too large or not a multiple of BLCKSZ, then
	 * something weird is happening, so give up and send the whole file.
	 */
	if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE)
		return BACK_UP_FILE_FULLY;

	/*
	 * The free-space map fork is not properly WAL-logged, so we need to
	 * backup the entire file every time.
	 */
	if (forknum == FSM_FORKNUM)
		return BACK_UP_FILE_FULLY;

	/*
	 * If this file was not part of the prior backup, back it up fully.
	 *
	 * If this file was created after the prior backup and before the start of
	 * the current backup, then the WAL summary information will tell us to
	 * back up the whole file. However, if this file was created after the
	 * start of the current backup, then the WAL summary won't know anything
	 * about it. Without this logic, we would erroneously conclude that it was
	 * OK to send it incrementally.
	 *
	 * Note that the file could have existed at the time of the prior backup,
	 * gotten deleted, and then a new file with the same name could have been
	 * created. In that case, this logic won't prevent the file from being
	 * backed up incrementally. But, if the deletion happened before the start
	 * of the current backup, the limit block will be 0, inducing a full
	 * backup. If the deletion happened after the start of the current backup,
	 * reconstruction will erroneously combine blocks from the current
	 * lifespan of the file with blocks from the previous lifespan -- but in
	 * this type of case, WAL replay to reach backup consistency should remove
	 * and recreate the file anyway, so the initial bogus contents should not
	 * matter.
	 */
	if (backup_file_lookup(ib->manifest_files, path) == NULL)
	{
		char	   *ipath;
		bool		in_manifest;

		/*
		 * The prior backup may itself have been incremental, in which case
		 * the file appears in its manifest under the incremental name.
		 */
		ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber,
									   forknum, segno);
		in_manifest = (backup_file_lookup(ib->manifest_files, ipath) != NULL);

		/* The path was only needed for the lookup; don't leak it. */
		pfree(ipath);

		if (!in_manifest)
			return BACK_UP_FILE_FULLY;
	}

	/* Look up the block reference table entry. */
	rlocator.spcOid = spcoid;
	rlocator.dbOid = dboid;
	rlocator.relNumber = relfilenumber;
	brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum,
									 &limit_block);

	/*
	 * If there is no entry, then there have been no WAL-logged changes to the
	 * relation since the predecessor backup was taken, so we can back it up
	 * incrementally and need not include any modified blocks.
	 *
	 * However, if the file is zero-length, we should do a full backup,
	 * because an incremental file is always more than zero length, and it's
	 * silly to take an incremental backup when a full backup would be
	 * smaller.
	 */
	if (brtentry == NULL)
	{
		if (size == 0)
			return BACK_UP_FILE_FULLY;
		*num_blocks_required = 0;
		*truncation_block_length = size / BLCKSZ;
		return BACK_UP_FILE_INCREMENTALLY;
	}

	/*
	 * If the limit_block is less than or equal to the point where this
	 * segment starts, send the whole file.
	 */
	if (limit_block <= segno * RELSEG_SIZE)
		return BACK_UP_FILE_FULLY;

	/*
	 * Get relevant entries from the block reference table entry.
	 *
	 * We shouldn't overflow computing the start or stop block numbers, but if
	 * it manages to happen somehow, detect it and throw an error.
	 */
	start_blkno = segno * RELSEG_SIZE;
	stop_blkno = start_blkno + (size / BLCKSZ);
	if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno)
		ereport(ERROR,
				errcode(ERRCODE_INTERNAL_ERROR),
				errmsg_internal("overflow computing block number bounds for segment %u with size %zu",
								segno, size));
	nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno,
										  absolute_block_numbers, RELSEG_SIZE);
	Assert(nblocks <= RELSEG_SIZE);

	/*
	 * If we're going to have to send nearly all of the blocks, then just send
	 * the whole file, because that won't require much extra storage or
	 * transfer and will speed up and simplify backup restoration. It's not
	 * clear what threshold is most appropriate here and perhaps it ought to
	 * be configurable, but for now we're just going to say that if we'd need
	 * to send 90% of the blocks anyway, give up and send the whole file.
	 *
	 * NB: If you change the threshold here, at least make sure to back up the
	 * file fully when every single block must be sent, because there's
	 * nothing good about sending an incremental file in that case.
	 *
	 * The multiplication is done in size_t rather than unsigned so that it
	 * cannot wrap around in builds configured with a large segment size.
	 */
	if ((size_t) nblocks * BLCKSZ > size * 0.9)
		return BACK_UP_FILE_FULLY;

	/*
	 * Looks like we can send an incremental file, so sort the absolute block
	 * numbers and then transpose absolute block numbers to relative block
	 * numbers.
	 *
	 * NB: If the block reference table was using the bitmap representation
	 * for a given chunk, the block numbers in that chunk will already be
	 * sorted, but when the array-of-offsets representation is used, we can
	 * receive block numbers here out of order.
	 */
	qsort(absolute_block_numbers, nblocks, sizeof(BlockNumber),
		  compare_block_numbers);
	for (i = 0; i < nblocks; ++i)
		relative_block_numbers[i] = absolute_block_numbers[i] - start_blkno;
	*num_blocks_required = nblocks;

	/*
	 * The truncation block length is the minimum length of the reconstructed
	 * file. Any block numbers below this threshold that are not present in
	 * the backup need to be fetched from the prior backup. At or above this
	 * threshold, blocks should only be included in the result if they are
	 * present in the backup. (This may require inserting zero blocks if the
	 * blocks included in the backup are non-consecutive.)
	 */
	*truncation_block_length = size / BLCKSZ;
	if (BlockNumberIsValid(limit_block))
	{
		unsigned	relative_limit = limit_block - segno * RELSEG_SIZE;

		if (*truncation_block_length < relative_limit)
			*truncation_block_length = relative_limit;
	}

	/* Send it incrementally. */
	return BACK_UP_FILE_INCREMENTALLY;
}

/*
 * Compute the size for an incremental file containing a given number of blocks.
 */
size_t
GetIncrementalFileSize(unsigned num_blocks_required)
{
	size_t		result;

	/* Make sure we're not going to overflow. */
	Assert(num_blocks_required <= RELSEG_SIZE);

	/*
	 * Three four byte quantities (magic number, truncation block length,
	 * block count) followed by block numbers followed by block contents.
	 */
	result = 3 * sizeof(uint32);
	result += (BLCKSZ + sizeof(BlockNumber)) * num_blocks_required;

	return result;
}

/*
 * Helper function for the backup_file hash table: hash a NUL-terminated
 * string by its contents.
 */
static uint32
hash_string_pointer(const char *s)
{
	unsigned char *ss = (unsigned char *) s;

	return hash_bytes(ss, strlen(s));
}

/*
 * This callback is invoked for each file mentioned in the backup manifest.
 *
 * We store the path to each file and the size of each file for sanity-checking
 * purposes. For further details, see comments for IncrementalBackupInfo.
 *
 * The checksum arguments are ignored. If the same pathname is reported more
 * than once, the entry created for the first occurrence is kept unchanged.
 */
static void
manifest_process_file(JsonManifestParseContext *context,
					  char *pathname, size_t size,
					  pg_checksum_type checksum_type,
					  int checksum_length,
					  uint8 *checksum_payload)
{
	IncrementalBackupInfo *ib = context->private_data;
	backup_file_entry *entry;
	bool		found;

	entry = backup_file_insert(ib->manifest_files, pathname, &found);
	if (!found)
	{
		/* Replace the key with a copy that lives in the table's context. */
		entry->path = MemoryContextStrdup(ib->manifest_files->ctx,
										  pathname);
		entry->size = size;
	}
}

/*
 * This callback is invoked for each WAL range mentioned in the backup
 * manifest.
 *
 * Every range is appended to ib->manifest_wal_ranges as-is; working out
 * which range is the earliest and which is the latest is left to
 * PrepareForIncrementalBackup.
 */
static void
manifest_process_wal_range(JsonManifestParseContext *context,
						   TimeLineID tli,
						   XLogRecPtr start_lsn, XLogRecPtr end_lsn)
{
	IncrementalBackupInfo *ib = context->private_data;
	backup_wal_range *range = palloc(sizeof(backup_wal_range));

	range->tli = tli;
	range->start_lsn = start_lsn;
	range->end_lsn = end_lsn;
	ib->manifest_wal_ranges = lappend(ib->manifest_wal_ranges, range);
}

/*
 * This callback is invoked if an error occurs while parsing the backup
 * manifest.
 *
 * The printf-style message is formatted into a buffer and then raised as an
 * ERROR, so this function does not return.
 */
static void
manifest_report_error(JsonManifestParseContext *context, const char *fmt,...)
{
	StringInfoData errbuf;

	initStringInfo(&errbuf);

	/* Retry with a larger buffer until the whole message fits. */
	for (;;)
	{
		va_list		ap;
		int			needed;

		va_start(ap, fmt);
		needed = appendStringInfoVA(&errbuf, fmt, ap);
		va_end(ap);
		if (needed == 0)
			break;
		enlargeStringInfo(&errbuf, needed);
	}

	ereport(ERROR,
			errmsg_internal("%s", errbuf.data));
}

/*
 * Quicksort comparator for block numbers.
 */
static int
compare_block_numbers(const void *a, const void *b)
{
	BlockNumber aa = *(BlockNumber *) a;
	BlockNumber bb = *(BlockNumber *) b;

	return pg_cmp_u32(aa, bb);
}