postgresql/src/bin/pg_rewind/pg_rewind.c

1204 lines
34 KiB
C

/*-------------------------------------------------------------------------
*
* pg_rewind.c
* Synchronizes a PostgreSQL data directory to a new timeline
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include "access/timeline.h"
#include "access/xlog_internal.h"
#include "catalog/catversion.h"
#include "catalog/pg_control.h"
#include "common/controldata_utils.h"
#include "common/file_perm.h"
#include "common/restricted_token.h"
#include "common/string.h"
#include "fe_utils/option_utils.h"
#include "fe_utils/recovery_gen.h"
#include "fe_utils/string_utils.h"
#include "file_ops.h"
#include "filemap.h"
#include "getopt_long.h"
#include "pg_rewind.h"
#include "rewind_source.h"
#include "storage/bufpage.h"
/* Forward declarations of file-local functions defined further down. */
static void usage(const char *progname);
static void perform_rewind(filemap_t *filemap, rewind_source *source,
XLogRecPtr chkptrec,
TimeLineID chkpttli,
XLogRecPtr chkptredo);
static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
XLogRecPtr checkpointloc);
static void digestControlFile(ControlFileData *ControlFile,
const char *content, size_t size);
static void getRestoreCommand(const char *argv0);
static void sanityChecks(void);
static TimeLineHistoryEntry *getTimelineHistory(TimeLineID tli, bool is_source,
int *nentries);
static void findCommonAncestorTimeline(TimeLineHistoryEntry *a_history,
int a_nentries,
TimeLineHistoryEntry *b_history,
int b_nentries,
XLogRecPtr *recptr, int *tliIndex);
static void ensureCleanShutdown(const char *argv0);
static void disconnect_atexit(void);
/*
* Digested control files: the target's, the source's as read at startup in
* main(), and the source's as re-read at the end of perform_rewind().
*/
static ControlFileData ControlFile_target;
static ControlFileData ControlFile_source;
static ControlFileData ControlFile_source_after;
/* Program name, set from argv[0] in main() */
const char *progname;
/* WAL segment size; overwritten by every call to digestControlFile() */
int WalSegSz;
/* Configuration options */
char *datadir_target = NULL;
char *datadir_source = NULL;
char *connstr_source = NULL;
char *restore_command = NULL;
char *config_file = NULL;
static bool debug = false;
bool showprogress = false;
bool dry_run = false;
bool do_sync = true;
bool restore_wal = false;
DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
/* Target history */
TimeLineHistoryEntry *targetHistory;
int targetNentries;
/* Progress counters */
uint64 fetch_size;
uint64 fetch_done;
/*
* Connection to the source server (stays NULL unless --source-server is
* used) and the source abstraction created from it or from --source-pgdata.
*/
static PGconn *conn;
static rewind_source *source;
/*
* Print the command-line help text to stdout.
*
* progname is interpolated into the summary and usage lines. Note that this
* parameter shadows the file-level variable of the same name.
*/
static void
usage(const char *progname)
{
printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
printf(_("Usage:\n %s [OPTION]...\n\n"), progname);
printf(_("Options:\n"));
printf(_(" -c, --restore-target-wal use restore_command in target configuration to\n"
" retrieve WAL files from archives\n"));
printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n"));
printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n"));
printf(_(" --source-server=CONNSTR source server to synchronize with\n"));
printf(_(" -n, --dry-run stop before modifying anything\n"));
printf(_(" -N, --no-sync do not wait for changes to be written\n"
" safely to disk\n"));
printf(_(" -P, --progress write progress messages\n"));
printf(_(" -R, --write-recovery-conf write configuration for replication\n"
" (requires --source-server)\n"));
printf(_(" --config-file=FILENAME use specified main server configuration\n"
" file when running target cluster\n"));
printf(_(" --debug write a lot of debug messages\n"));
printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n"));
printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
/* Trailer: bug-report address and project home page from build-time macros */
printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
}
/*
* Program entry point.
*
* Parses and validates the command line, opens the source (libpq connection
* or local data directory), reads both control files, determines whether the
* target has diverged from the source, and if so builds the file map and
* calls perform_rewind().
*
* Returns 0 after a completed rewind. Exits directly with status 0 when no
* rewind is required, and with status 1 (or through pg_fatal) on errors.
*/
int
main(int argc, char **argv)
{
/*
* Long options. Options without a single-letter form are mapped to the
* small integers 1..6, which are handled in the switch below.
*/
static struct option long_options[] = {
{"help", no_argument, NULL, '?'},
{"target-pgdata", required_argument, NULL, 'D'},
{"write-recovery-conf", no_argument, NULL, 'R'},
{"source-pgdata", required_argument, NULL, 1},
{"source-server", required_argument, NULL, 2},
{"no-ensure-shutdown", no_argument, NULL, 4},
{"config-file", required_argument, NULL, 5},
{"version", no_argument, NULL, 'V'},
{"restore-target-wal", no_argument, NULL, 'c'},
{"dry-run", no_argument, NULL, 'n'},
{"no-sync", no_argument, NULL, 'N'},
{"progress", no_argument, NULL, 'P'},
{"debug", no_argument, NULL, 3},
{"sync-method", required_argument, NULL, 6},
{NULL, 0, NULL, 0}
};
int option_index;
int c;
XLogRecPtr divergerec;
int lastcommontliIndex;
XLogRecPtr chkptrec;
TimeLineID chkpttli;
XLogRecPtr chkptredo;
TimeLineID source_tli;
TimeLineID target_tli;
XLogRecPtr target_wal_endrec;
size_t size;
char *buffer;
bool no_ensure_shutdown = false;
bool rewind_needed;
bool writerecoveryconf = false;
filemap_t *filemap;
pg_logging_init(argv[0]);
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
progname = get_progname(argv[0]);
/* Process command-line arguments */
/* --help and --version are only recognized as the first argument */
if (argc > 1)
{
if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
{
usage(progname);
exit(0);
}
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
{
puts("pg_rewind (PostgreSQL) " PG_VERSION);
exit(0);
}
}
while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1)
{
switch (c)
{
case 'c':
restore_wal = true;
break;
case 'P':
showprogress = true;
break;
case 'n':
dry_run = true;
break;
case 'N':
do_sync = false;
break;
case 'R':
writerecoveryconf = true;
break;
/* --debug */
case 3:
debug = true;
pg_logging_increase_verbosity();
break;
case 'D': /* -D or --target-pgdata */
datadir_target = pg_strdup(optarg);
break;
case 1: /* --source-pgdata */
datadir_source = pg_strdup(optarg);
break;
case 2: /* --source-server */
connstr_source = pg_strdup(optarg);
break;
/* --no-ensure-shutdown */
case 4:
no_ensure_shutdown = true;
break;
/* --config-file */
case 5:
config_file = pg_strdup(optarg);
break;
/* --sync-method */
case 6:
if (!parse_sync_method(optarg, &sync_method))
exit(1);
break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
exit(1);
}
}
/* Exactly one kind of source must be given */
if (datadir_source == NULL && connstr_source == NULL)
{
pg_log_error("no source specified (--source-pgdata or --source-server)");
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
exit(1);
}
if (datadir_source != NULL && connstr_source != NULL)
{
pg_log_error("only one of --source-pgdata or --source-server can be specified");
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
exit(1);
}
if (datadir_target == NULL)
{
pg_log_error("no target data directory specified (--target-pgdata)");
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
exit(1);
}
/* -R needs a connection string to write into the recovery configuration */
if (writerecoveryconf && connstr_source == NULL)
{
pg_log_error("no source server information (--source-server) specified for --write-recovery-conf");
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
exit(1);
}
/* No positional arguments are accepted */
if (optind < argc)
{
pg_log_error("too many command-line arguments (first is \"%s\")",
argv[optind]);
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
exit(1);
}
/*
* Don't allow pg_rewind to be run as root, to avoid overwriting the
* ownership of files in the data directory. We need only check for root
* -- any other user won't have sufficient permissions to modify files in
* the data directory.
*/
#ifndef WIN32
if (geteuid() == 0)
{
pg_log_error("cannot be executed by \"root\"");
pg_log_error_hint("You must run %s as the PostgreSQL superuser.",
progname);
exit(1);
}
#endif
get_restricted_token();
/* Set mask based on PGDATA permissions */
if (!GetDataDirectoryCreatePerm(datadir_target))
pg_fatal("could not read permissions of directory \"%s\": %m",
datadir_target);
umask(pg_mode_mask);
/* Does nothing unless -c/--restore-target-wal was given */
getRestoreCommand(argv[0]);
/* Make sure any libpq connection is closed on every exit path */
atexit(disconnect_atexit);
/*
* Ok, we have all the options and we're ready to start. First, connect to
* remote server.
*/
if (connstr_source)
{
conn = PQconnectdb(connstr_source);
if (PQstatus(conn) == CONNECTION_BAD)
pg_fatal("%s", PQerrorMessage(conn));
if (showprogress)
pg_log_info("connected to server");
source = init_libpq_source(conn);
}
else
source = init_local_source(datadir_source);
/*
* Check the status of the target instance.
*
* If the target instance was not cleanly shut down, start and stop the
* target cluster once in single-user mode to enforce recovery to finish,
* ensuring that the cluster can be used by pg_rewind. Note that if
* no_ensure_shutdown is specified, pg_rewind ignores this step, and users
* need to make sure by themselves that the target cluster is in a clean
* state.
*/
buffer = slurpFile(datadir_target, "global/pg_control", &size);
digestControlFile(&ControlFile_target, buffer, size);
pg_free(buffer);
if (!no_ensure_shutdown &&
ControlFile_target.state != DB_SHUTDOWNED &&
ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
{
ensureCleanShutdown(argv[0]);
/* Re-read the control file after the single-user run */
buffer = slurpFile(datadir_target, "global/pg_control", &size);
digestControlFile(&ControlFile_target, buffer, size);
pg_free(buffer);
}
buffer = source->fetch_file(source, "global/pg_control", &size);
digestControlFile(&ControlFile_source, buffer, size);
pg_free(buffer);
sanityChecks();
/*
* Usually, the TLI can be found in the latest checkpoint record. But if
* the source server is just being promoted (or it's a standby that's
* following a primary that's just being promoted), and the checkpoint
* requested by the promotion hasn't completed yet, the latest timeline is
* in minRecoveryPoint. So we check which is later, the TLI of the
* minRecoveryPoint or the latest checkpoint.
*/
source_tli = Max(ControlFile_source.minRecoveryPointTLI,
ControlFile_source.checkPointCopy.ThisTimeLineID);
/* Similarly for the target. */
target_tli = Max(ControlFile_target.minRecoveryPointTLI,
ControlFile_target.checkPointCopy.ThisTimeLineID);
/*
* Find the common ancestor timeline between the clusters.
*
* If both clusters are already on the same timeline, there's nothing to
* do.
*/
if (target_tli == source_tli)
{
pg_log_info("source and target cluster are on the same timeline");
rewind_needed = false;
target_wal_endrec = 0;
}
else
{
XLogRecPtr chkptendrec;
TimeLineHistoryEntry *sourceHistory;
int sourceNentries;
/*
* Retrieve timelines for both source and target, and find the point
* where they diverged.
*/
sourceHistory = getTimelineHistory(source_tli, true, &sourceNentries);
targetHistory = getTimelineHistory(target_tli, false, &targetNentries);
findCommonAncestorTimeline(sourceHistory, sourceNentries,
targetHistory, targetNentries,
&divergerec, &lastcommontliIndex);
pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
LSN_FORMAT_ARGS(divergerec),
targetHistory[lastcommontliIndex].tli);
/*
* Don't need the source history anymore. The target history is still
* needed by the routines in parsexlog.c, when we read the target WAL.
*/
pfree(sourceHistory);
/*
* Determine the end-of-WAL on the target.
*
* The WAL ends at the last shutdown checkpoint, or at
* minRecoveryPoint if it was a standby. (If we supported rewinding a
* server that was not shut down cleanly, we would need to replay
* until we reach the first invalid record, like crash recovery does.)
*/
/* read the checkpoint record on the target to see where it ends. */
chkptendrec = readOneRecord(datadir_target,
ControlFile_target.checkPoint,
targetNentries - 1,
restore_command);
/* The end of WAL is whichever of the two positions is later */
if (ControlFile_target.minRecoveryPoint > chkptendrec)
{
target_wal_endrec = ControlFile_target.minRecoveryPoint;
}
else
{
target_wal_endrec = chkptendrec;
}
/*
* Check for the possibility that the target is in fact a direct
* ancestor of the source. In that case, there is no divergent history
* in the target that needs rewinding.
*/
if (target_wal_endrec > divergerec)
{
rewind_needed = true;
}
else
{
/* the last common checkpoint record must be part of target WAL */
Assert(target_wal_endrec == divergerec);
rewind_needed = false;
}
}
if (!rewind_needed)
{
pg_log_info("no rewind required");
/* -R is still honored even though nothing is rewound */
if (writerecoveryconf && !dry_run)
WriteRecoveryConfig(conn, datadir_target,
GenerateRecoveryConfig(conn, NULL, NULL));
exit(0);
}
/*
* From here on a rewind is needed, so divergerec and lastcommontliIndex
* were set in the else branch above.
*/
findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
&chkptrec, &chkpttli, &chkptredo, restore_command);
pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u",
LSN_FORMAT_ARGS(chkptrec), chkpttli);
/* Initialize the hash table to track the status of each file */
filehash_init();
/*
* Collect information about all files in both data directories.
*/
if (showprogress)
pg_log_info("reading source file list");
source->traverse_files(source, &process_source_file);
if (showprogress)
pg_log_info("reading target file list");
traverse_datadir(datadir_target, &process_target_file);
/*
* Read the target WAL from last checkpoint before the point of fork, to
* extract all the pages that were modified on the target cluster after
* the fork.
*/
if (showprogress)
pg_log_info("reading WAL in target");
extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
target_wal_endrec, restore_command);
/*
* We have collected all information we need from both systems. Decide
* what to do with each file.
*/
filemap = decide_file_actions();
if (showprogress)
calculate_totals(filemap);
/* this is too verbose even for verbose mode */
if (debug)
print_filemap(filemap);
/*
* Ok, we're ready to start copying things over.
*/
if (showprogress)
{
pg_log_info("need to copy %lu MB (total source directory size is %lu MB)",
(unsigned long) (filemap->fetch_size / (1024 * 1024)),
(unsigned long) (filemap->total_size / (1024 * 1024)));
/* Seed the counters that progress_report() prints */
fetch_size = filemap->fetch_size;
fetch_done = 0;
}
/*
* We have now collected all the information we need from both systems,
* and we are ready to start modifying the target directory.
*
* This is the point of no return. Once we start copying things, there is
* no turning back!
*/
perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo);
if (showprogress)
pg_log_info("syncing target data directory");
sync_target_dir();
/* Also update the standby configuration, if requested. */
if (writerecoveryconf && !dry_run)
WriteRecoveryConfig(conn, datadir_target,
GenerateRecoveryConfig(conn, NULL, NULL));
/* don't need the source connection anymore */
source->destroy(source);
if (conn)
{
PQfinish(conn);
/* Cleared so that disconnect_atexit() does not finish it a second time */
conn = NULL;
}
pg_log_info("Done!");
return 0;
}
/*
 * Perform the rewind.
 *
 * We have already collected all the information we need from the
 * target and the source.
 *
 * filemap is the list of per-file actions to execute; source is where file
 * contents and the final copy of the control file are fetched from.
 * chkptrec, chkpttli and chkptredo describe the checkpoint that main()
 * reports as the last common checkpoint; they are used for the backup label
 * unless the source's own redo point is earlier (see below).
 *
 * Side effects: modifies files in the target data directory through the
 * target-file helpers, fills in ControlFile_source_after, writes a
 * backup_label and, unless dry_run is set, overwrites the target's control
 * file. Errors are reported through pg_fatal().
 */
static void
perform_rewind(filemap_t *filemap, rewind_source *source,
			   XLogRecPtr chkptrec,
			   TimeLineID chkpttli,
			   XLogRecPtr chkptredo)
{
	XLogRecPtr	endrec;
	TimeLineID	endtli;
	ControlFileData ControlFile_new;
	size_t		size;
	char	   *buffer;

	/*
	 * Execute the actions in the file map, fetching data from the source
	 * system as needed.
	 */
	for (int i = 0; i < filemap->nentries; i++)
	{
		file_entry_t *entry = filemap->entries[i];

		/*
		 * If this is a relation file, copy the modified blocks.
		 *
		 * This is in addition to any other changes.
		 */
		if (entry->target_pages_to_overwrite.bitmapsize > 0)
		{
			datapagemap_iterator_t *iter;
			BlockNumber blkno;
			off_t		offset;

			iter = datapagemap_iterate(&entry->target_pages_to_overwrite);
			while (datapagemap_next(iter, &blkno))
			{
				/*
				 * Widen the block number before multiplying, so that the
				 * byte offset is computed in off_t rather than in the
				 * (presumably narrower) width of BlockNumber and BLCKSZ.
				 */
				offset = (off_t) blkno * BLCKSZ;

				source->queue_fetch_range(source, entry->path, offset, BLCKSZ);
			}
			pg_free(iter);
		}

		switch (entry->action)
		{
			case FILE_ACTION_NONE:
				/* nothing else to do */
				break;

			case FILE_ACTION_COPY:
				source->queue_fetch_file(source, entry->path, entry->source_size);
				break;

			case FILE_ACTION_TRUNCATE:
				truncate_target_file(entry->path, entry->source_size);
				break;

			case FILE_ACTION_COPY_TAIL:
				/* fetch only the part of the source beyond the target's size */
				source->queue_fetch_range(source, entry->path,
										  entry->target_size,
										  entry->source_size - entry->target_size);
				break;

			case FILE_ACTION_REMOVE:
				remove_target(entry);
				break;

			case FILE_ACTION_CREATE:
				create_target(entry);
				break;

			case FILE_ACTION_UNDECIDED:
				pg_fatal("no action decided for file \"%s\"", entry->path);
				break;
		}
	}

	/* Complete any remaining range-fetches that we queued up above. */
	source->finish_fetch(source);

	close_target_file();

	progress_report(true);

	/*
	 * Fetch the control file from the source last. This ensures that the
	 * minRecoveryPoint is up-to-date.
	 */
	buffer = source->fetch_file(source, "global/pg_control", &size);
	digestControlFile(&ControlFile_source_after, buffer, size);
	pg_free(buffer);

	/*
	 * Sanity check: If the source is a local system, the control file should
	 * not have changed since we started.
	 *
	 * XXX: We assume it hasn't been modified, but actually, what could go
	 * wrong? The logic handles a libpq source that's modified concurrently,
	 * why not a local datadir?
	 */
	if (datadir_source &&
		memcmp(&ControlFile_source, &ControlFile_source_after,
			   sizeof(ControlFileData)) != 0)
	{
		pg_fatal("source system was modified while pg_rewind was running");
	}

	if (showprogress)
		pg_log_info("creating backup label and updating control file");

	/*
	 * Create a backup label file, to tell the target where to begin the WAL
	 * replay. Normally, from the last common checkpoint between the source
	 * and the target. But if the source is a standby server, it's possible
	 * that the last common checkpoint is *after* the standby's restartpoint.
	 * That implies that the source server has applied the checkpoint record,
	 * but hasn't performed a corresponding restartpoint yet. Make sure we
	 * start at the restartpoint's redo point in that case.
	 *
	 * Use the old version of the source's control file for this. The server
	 * might have finished the restartpoint after we started copying files,
	 * but we must begin from the redo point at the time that we started
	 * copying.
	 */
	if (ControlFile_source.checkPointCopy.redo < chkptredo)
	{
		chkptredo = ControlFile_source.checkPointCopy.redo;
		chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
		chkptrec = ControlFile_source.checkPoint;
	}
	createBackupLabel(chkptredo, chkpttli, chkptrec);

	/*
	 * Update control file of target, to tell the target how far it must
	 * replay the WAL (minRecoveryPoint).
	 */
	if (connstr_source)
	{
		/*
		 * The source is a live server. Like in an online backup, it's
		 * important that we recover all the WAL that was generated while we
		 * were copying files.
		 */
		if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
		{
			/*
			 * Source is a standby server. We must replay to its
			 * minRecoveryPoint.
			 */
			endrec = ControlFile_source_after.minRecoveryPoint;
			endtli = ControlFile_source_after.minRecoveryPointTLI;
		}
		else
		{
			/*
			 * Source is a production, non-standby, server. We must replay to
			 * the last WAL insert location.
			 */
			if (ControlFile_source_after.state != DB_IN_PRODUCTION)
				pg_fatal("source system was in unexpected state at end of rewind");

			endrec = source->get_current_wal_insert_lsn(source);
			endtli = Max(ControlFile_source_after.checkPointCopy.ThisTimeLineID,
						 ControlFile_source_after.minRecoveryPointTLI);
		}
	}
	else
	{
		/*
		 * Source is a local data directory. It should've shut down cleanly,
		 * and we must replay to the latest shutdown checkpoint.
		 */
		endrec = ControlFile_source_after.checkPoint;
		endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
	}

	/*
	 * The new target control file is the source's final control file with the
	 * replay end point and the state overridden.
	 */
	memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
	ControlFile_new.minRecoveryPoint = endrec;
	ControlFile_new.minRecoveryPointTLI = endtli;
	ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
	if (!dry_run)
		update_controlfile(datadir_target, &ControlFile_new, do_sync);
}
static void
sanityChecks(void)
{
/* TODO Check that there's no backup_label in either cluster */
/* Check system_identifier match */
if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)
pg_fatal("source and target clusters are from different systems");
/* check version */
if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION ||
ControlFile_source.pg_control_version != PG_CONTROL_VERSION ||
ControlFile_target.catalog_version_no != CATALOG_VERSION_NO ||
ControlFile_source.catalog_version_no != CATALOG_VERSION_NO)
{
pg_fatal("clusters are not compatible with this version of pg_rewind");
}
/*
* Target cluster need to use checksums or hint bit wal-logging, this to
* prevent from data corruption that could occur because of hint bits.
*/
if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
!ControlFile_target.wal_log_hints)
{
pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\"");
}
/*
* Target cluster better not be running. This doesn't guard against
* someone starting the cluster concurrently. Also, this is probably more
* strict than necessary; it's OK if the target node was not shut down
* cleanly, as long as it isn't running at the moment.
*/
if (ControlFile_target.state != DB_SHUTDOWNED &&
ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
pg_fatal("target server must be shut down cleanly");
/*
* When the source is a data directory, also require that the source
* server is shut down. There isn't any very strong reason for this
* limitation, but better safe than sorry.
*/
if (datadir_source &&
ControlFile_source.state != DB_SHUTDOWNED &&
ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
pg_fatal("source data directory must be shut down cleanly");
}
/*
* Print a progress report based on the fetch_size and fetch_done variables.
*
* Progress report is written at maximum once per second, except that the
* last progress report is always printed.
*
* If finished is set to true, this is the last progress report. The cursor
* is moved to the next line.
*/
void
progress_report(bool finished)
{
static pg_time_t last_progress_report = 0;
int percent;
char fetch_done_str[32];
char fetch_size_str[32];
pg_time_t now;
if (!showprogress)
return;
now = time(NULL);
if (now == last_progress_report && !finished)
return; /* Max once per second */
last_progress_report = now;
percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0;
/*
* Avoid overflowing past 100% or the full size. This may make the total
* size number change as we approach the end of the backup (the estimate
* will always be wrong if WAL is included), but that's better than having
* the done column be bigger than the total.
*/
if (percent > 100)
percent = 100;
if (fetch_done > fetch_size)
fetch_size = fetch_done;
snprintf(fetch_done_str, sizeof(fetch_done_str), UINT64_FORMAT,
fetch_done / 1024);
snprintf(fetch_size_str, sizeof(fetch_size_str), UINT64_FORMAT,
fetch_size / 1024);
fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
(int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
percent);
/*
* Stay on the same line if reporting to a terminal and we're not done
* yet.
*/
fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
}
/*
 * Return the smaller of two WAL locations, where InvalidXLogRecPtr stands
 * for infinity (as src/include/access/timeline.h states). Consequently an
 * invalid argument yields the other argument. Only meant for comparing WAL
 * locations that come from timeline history files.
 */
static XLogRecPtr
MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
{
	/* "infinity" on one side: the other side wins, whatever it is */
	if (XLogRecPtrIsInvalid(a))
		return b;
	if (XLogRecPtrIsInvalid(b))
		return a;

	return Min(a, b);
}
/*
 * Retrieve timeline history for the source or target system.
 *
 * tli is the timeline whose history is wanted. is_source selects where the
 * history file is read from: the rewind source when true, the target data
 * directory when false. The number of entries in the returned array is
 * stored in *nentries (set directly for timeline 1; otherwise the pointer is
 * handed to rewind_parseTimeLineHistory(), which is expected to fill it in).
 *
 * Returns the history array. With --debug, the retrieved history is also
 * written to the debug log.
 */
static TimeLineHistoryEntry *
getTimelineHistory(TimeLineID tli, bool is_source, int *nentries)
{
	TimeLineHistoryEntry *history;

	/*
	 * Timeline 1 does not have a history file, so there is nothing to read;
	 * fake a single entry with infinite start and end positions.
	 */
	if (tli == 1)
	{
		history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
		history->tli = tli;
		history->begin = history->end = InvalidXLogRecPtr;
		*nentries = 1;
	}
	else
	{
		char		path[MAXPGPATH];
		char	   *histfile;

		TLHistoryFilePath(path, tli);

		/* Get history file from appropriate source */
		if (is_source)
			histfile = source->fetch_file(source, path, NULL);
		else
			histfile = slurpFile(datadir_target, path, NULL);

		history = rewind_parseTimeLineHistory(histfile, tli, nentries);
		pg_free(histfile);
	}

	if (debug)
	{
		int			i;

		if (is_source)
			pg_log_debug("Source timeline history:");
		else
			pg_log_debug("Target timeline history:");

		/*
		 * Print the history that was just retrieved. The bound must be the
		 * length of this array (*nentries), not the global targetNentries:
		 * that global describes only the target history and may not even be
		 * set yet when the source history is fetched.
		 */
		for (i = 0; i < *nentries; i++)
		{
			TimeLineHistoryEntry *entry;

			entry = &history[i];
			pg_log_debug("%u: %X/%X - %X/%X", entry->tli,
						 LSN_FORMAT_ARGS(entry->begin),
						 LSN_FORMAT_ARGS(entry->end));
		}
	}

	return history;
}
/*
 * Locate the last timeline shared by two timeline histories.
 *
 * On success *tliIndex receives the array index of the last common timeline
 * and *recptr the position at which the histories diverged (i.e. the first
 * WAL record that is not the same in both clusters). If the histories do not
 * even share their first entry, the failure is reported via pg_fatal().
 */
static void
findCommonAncestorTimeline(TimeLineHistoryEntry *a_history, int a_nentries,
						   TimeLineHistoryEntry *b_history, int b_nentries,
						   XLogRecPtr *recptr, int *tliIndex)
{
	int			limit = Min(a_nentries, b_nentries);
	int			matched = 0;

	/*
	 * Walk both histories forward while they agree. Two nodes may have used
	 * the same timeline number with different start positions, depending on
	 * which history files each fetched during earlier recoveries, so an entry
	 * only counts as common when both its TLI and its start position match.
	 */
	while (matched < limit &&
		   a_history[matched].tli == b_history[matched].tli &&
		   a_history[matched].begin == b_history[matched].begin)
		matched++;

	if (matched == 0)
	{
		pg_fatal("could not find common ancestor of the source and target cluster's timelines");
	}
	else
	{
		/* The last matching entry is the common ancestor */
		int			last = matched - 1;

		*recptr = MinXLogRecPtr(a_history[last].end, b_history[last].end);
		*tliIndex = last;
	}
}
/*
 * Write a backup_label file into the target, so that recovery starts at the
 * given location instead of wherever the control file says.
 *
 * startpoint and starttli give the WAL location and timeline at which replay
 * is to begin; checkpointloc is recorded as the checkpoint location.
 */
static void
createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
{
	XLogSegNo	segno;
	char		walfile[MAXFNAMELEN];
	time_t		now;
	char		timestr[128];
	char		label[1000];
	int			label_len;

	/* Name of the WAL segment file that contains the start point */
	XLByteToSeg(startpoint, segno, WalSegSz);
	XLogFileName(walfile, starttli, segno, WalSegSz);

	/* Current local time, for the START TIME line */
	now = time(NULL);
	strftime(timestr, sizeof(timestr), "%Y-%m-%d %H:%M:%S %Z",
			 localtime(&now));

	/* Assemble the file contents; the LABEL: line is deliberately omitted */
	label_len = snprintf(label, sizeof(label),
						 "START WAL LOCATION: %X/%X (file %s)\n"
						 "CHECKPOINT LOCATION: %X/%X\n"
						 "BACKUP METHOD: pg_rewind\n"
						 "BACKUP FROM: standby\n"
						 "START TIME: %s\n",
						 LSN_FORMAT_ARGS(startpoint), walfile,
						 LSN_FORMAT_ARGS(checkpointloc),
						 timestr);
	if (label_len >= sizeof(label))
		pg_fatal("backup label buffer too small");	/* shouldn't happen */

	/* TODO: move old file out of the way, if any. */
	open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
	write_target_range(label, 0, label_len);
	close_target_file();
}
/*
 * Verify the CRC stored in a control file.
 *
 * The checksum is computed over every byte that precedes the crc field and
 * compared with that field; a mismatch is reported through pg_fatal().
 */
static void
checkControlFile(ControlFileData *ControlFile)
{
	pg_crc32c	computed;

	INIT_CRC32C(computed);
	COMP_CRC32C(computed, (char *) ControlFile, offsetof(ControlFileData, crc));
	FIN_CRC32C(computed);

	if (EQ_CRC32C(computed, ControlFile->crc))
		return;

	pg_fatal("unexpected control file CRC");
}
/*
 * Validate the raw control file image in 'content' ('size' bytes) and copy
 * it into *ControlFile.
 *
 * Also sets the global WalSegSz from the copied data. A wrong file size or a
 * bad CRC is reported through pg_fatal(); an invalid WAL segment size is
 * logged and the program exits with status 1.
 */
static void
digestControlFile(ControlFileData *ControlFile, const char *content,
				  size_t size)
{
	if (size != PG_CONTROL_FILE_SIZE)
		pg_fatal("unexpected control file size %d, expected %d",
				 (int) size, PG_CONTROL_FILE_SIZE);

	memcpy(ControlFile, content, sizeof(ControlFileData));

	/* Pick up the WAL segment size and make sure it is sane */
	WalSegSz = ControlFile->xlog_seg_size;
	if (IsValidWalSegSize(WalSegSz))
	{
		/* Size is fine; finish with the remaining control file checks */
		checkControlFile(ControlFile);
		return;
	}

	pg_log_error(ngettext("invalid WAL segment size in control file (%d byte)",
						  "invalid WAL segment size in control file (%d bytes)",
						  WalSegSz),
				 WalSegSz);
	pg_log_error_detail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
	exit(1);
}
/*
 * Fetch the restore_command setting of the target cluster into the global
 * restore_command.
 *
 * Does nothing unless restore_wal is set. Otherwise the value is obtained by
 * running "postgres -C restore_command" against the target data directory
 * (plus the custom configuration file, if one was given). A missing or
 * mismatched postgres binary, a failure to read the value, or an empty value
 * is reported through pg_fatal().
 */
static void
getRestoreCommand(const char *argv0)
{
	char		backend_path[MAXPGPATH];
	PQExpBuffer cmd;
	int			found;

	if (!restore_wal)
		return;

	/* Locate the postgres executable that belongs to this installation */
	found = find_other_exec(argv0, "postgres",
							PG_BACKEND_VERSIONSTR,
							backend_path);
	if (found < 0)
	{
		char		self_path[MAXPGPATH];

		/* Fall back to the bare program name if our own path is unknown */
		if (find_my_exec(argv0, self_path) < 0)
			strlcpy(self_path, progname, sizeof(self_path));

		if (found != -1)
		{
			pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
					 "postgres", self_path, progname);
		}
		else
		{
			pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
					 "postgres", progname, self_path);
		}
	}

	/*
	 * Assemble the command line: quoted postgres path, -D with the quoted
	 * data directory, the optional config_file setting, and finally -C for
	 * the parameter we are after.
	 */
	cmd = createPQExpBuffer();
	appendShellString(cmd, backend_path);
	appendPQExpBufferStr(cmd, " -D ");
	appendShellString(cmd, datadir_target);
	if (config_file != NULL)
	{
		appendPQExpBufferStr(cmd, " -c config_file=");
		appendShellString(cmd, config_file);
	}
	appendPQExpBufferStr(cmd, " -C restore_command");

	/* Run it and take the first line of output as the value */
	restore_command = pipe_read_line(cmd->data);
	if (restore_command == NULL)
		pg_fatal("unable to read restore_command from target cluster");

	(void) pg_strip_crlf(restore_command);

	if (restore_command[0] == '\0')
		pg_fatal("restore_command is not set in the target cluster");

	pg_log_debug("using for rewind restore_command = \'%s\'",
				 restore_command);

	destroyPQExpBuffer(cmd);
}
/*
 * Bring the target instance to a clean shutdown state by running postgres
 * once in single-user mode, which performs crash recovery.
 *
 * The postgres binary is always looked up (and its absence is fatal), but in
 * dry-run mode the command itself is not executed. A non-zero result from
 * the command is logged and the program exits with status 1.
 */
static void
ensureCleanShutdown(const char *argv0)
{
	char		backend_path[MAXPGPATH];
	PQExpBuffer cmd;
	int			found;

	/* Locate the postgres executable that belongs to this installation */
	found = find_other_exec(argv0, "postgres",
							PG_BACKEND_VERSIONSTR,
							backend_path);
	if (found < 0)
	{
		char		self_path[MAXPGPATH];

		/* Fall back to the bare program name if our own path is unknown */
		if (find_my_exec(argv0, self_path) < 0)
			strlcpy(self_path, progname, sizeof(self_path));

		if (found != -1)
		{
			pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
					 "postgres", self_path, progname);
		}
		else
		{
			pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
					 "postgres", progname, self_path);
		}
	}

	pg_log_info("executing \"%s\" for target server to complete crash recovery",
				backend_path);

	/*
	 * In dry-run mode stop here; the lookup above has still verified that
	 * postgres is available.
	 */
	if (dry_run)
		return;

	/*
	 * Build the single-user command line. -F turns fsync off: that speeds up
	 * recovery, and the target data directory is synced at the end anyway.
	 * Standard input is redirected from the (quoted) null device.
	 */
	cmd = createPQExpBuffer();
	appendShellString(cmd, backend_path);
	appendPQExpBufferStr(cmd, " --single -F -D ");
	appendShellString(cmd, datadir_target);
	if (config_file != NULL)
	{
		appendPQExpBufferStr(cmd, " -c config_file=");
		appendShellString(cmd, config_file);
	}
	appendPQExpBufferStr(cmd, " template1 < ");
	appendShellString(cmd, DEVNULL);

	/* Flush our own buffered output before handing over to the child */
	fflush(NULL);
	if (system(cmd->data) != 0)
	{
		pg_log_error("postgres single-user mode in target cluster failed");
		pg_log_error_detail("Command was: %s", cmd->data);
		exit(1);
	}

	destroyPQExpBuffer(cmd);
}
/*
 * atexit() callback registered by main(): close the connection to the source
 * server if one is still open.
 */
static void
disconnect_atexit(void)
{
	if (conn == NULL)
		return;

	PQfinish(conn);
}