From fd4ced5230162b50a5c9d33b4bf9cfb1231aa62e Mon Sep 17 00:00:00 2001 From: Simon Riggs Date: Tue, 29 Jan 2013 00:06:15 +0000 Subject: [PATCH] Fast promote mode skips checkpoint at end of recovery. pg_ctl promote -m fast will skip the checkpoint at end of recovery so that we can achieve very fast failover when the apply delay is low. Write new WAL record XLOG_END_OF_RECOVERY to allow us to switch timeline correctly for downstream log readers. If we skip synchronous end of recovery checkpoint we request a normal spread checkpoint so that the window of re-recovery is low. Simon Riggs and Kyotaro Horiguchi, with input from Fujii Masao. Review by Heikki Linnakangas --- src/backend/access/rmgrdesc/xlogdesc.c | 10 ++ src/backend/access/transam/xlog.c | 192 +++++++++++++++++++++---- src/bin/pg_ctl/pg_ctl.c | 18 ++- src/include/access/xlog_internal.h | 6 + src/include/catalog/pg_control.h | 1 + 5 files changed, 195 insertions(+), 32 deletions(-) diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 506b208c9c..6901298516 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -18,6 +18,7 @@ #include "access/xlog_internal.h" #include "catalog/pg_control.h" #include "utils/guc.h" +#include "utils/timestamp.h" /* * GUC support @@ -119,6 +120,15 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec) memcpy(&fpw, rec, sizeof(bool)); appendStringInfo(buf, "full_page_writes: %s", fpw ? 
"true" : "false"); } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, rec, sizeof(xl_end_of_recovery)); + appendStringInfo(buf, "end_of_recovery: tli %u; time %s", + xlrec.ThisTimeLineID, + timestamptz_to_str(xlrec.end_time)); + } else appendStringInfo(buf, "UNKNOWN"); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index cf2f6e70cf..bcd379dca7 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -66,6 +66,7 @@ #define RECOVERY_COMMAND_FILE "recovery.conf" #define RECOVERY_COMMAND_DONE "recovery.done" #define PROMOTE_SIGNAL_FILE "promote" +#define FAST_PROMOTE_SIGNAL_FILE "fast_promote" /* User-settable parameters */ @@ -210,6 +211,9 @@ bool StandbyMode = false; static char *PrimaryConnInfo = NULL; static char *TriggerFile = NULL; +/* whether request for fast promotion has been made yet */ +static bool fast_promote = false; + /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */ static TransactionId recoveryStopXid; static TimestampTz recoveryStopTime; @@ -611,6 +615,7 @@ static void CheckRequiredParameterValues(void); static void XLogReportParameters(void); static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI); static void LocalSetXLogInsertAllowed(void); +static void CreateEndOfRecoveryRecord(void); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); @@ -642,7 +647,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, bool fetching_ckpt); static void CheckRecoveryConsistency(void); static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, - XLogRecPtr RecPtr, int whichChkpt); + XLogRecPtr RecPtr, int whichChkpt, bool report); static bool rescanLatestTimeLine(void); static void WriteControlFile(void); static void ReadControlFile(void); @@ -4848,7 +4853,7 @@ StartupXLOG(void) * When 
a backup_label file is present, we want to roll forward from * the checkpoint it identifies, rather than using pg_control. */ - record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0); + record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true); if (record != NULL) { memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); @@ -4890,7 +4895,7 @@ StartupXLOG(void) */ checkPointLoc = ControlFile->checkPoint; RedoStartLSN = ControlFile->checkPointCopy.redo; - record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1); + record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true); if (record != NULL) { ereport(DEBUG1, @@ -4909,7 +4914,7 @@ StartupXLOG(void) else { checkPointLoc = ControlFile->prevCheckPoint; - record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2); + record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true); if (record != NULL) { ereport(LOG, @@ -5393,22 +5398,33 @@ StartupXLOG(void) } /* - * Before replaying this record, check if it is a shutdown - * checkpoint record that causes the current timeline to - * change. The checkpoint record is already considered to be - * part of the new timeline, so we update ThisTimeLineID - * before replaying it. That's important so that replayEndTLI, - * which is recorded as the minimum recovery point's TLI if + * Before replaying this record, check if this record + * causes the current timeline to change. The record is + * already considered to be part of the new timeline, + * so we update ThisTimeLineID before replaying it. + * That's important so that replayEndTLI, which is + * recorded as the minimum recovery point's TLI if * recovery stops after this record, is set correctly. 
*/ - if (record->xl_rmid == RM_XLOG_ID && - (record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN) + if (record->xl_rmid == RM_XLOG_ID) { - CheckPoint checkPoint; - TimeLineID newTLI; + TimeLineID newTLI = ThisTimeLineID; + uint8 info = record->xl_info & ~XLR_INFO_MASK; - memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); - newTLI = checkPoint.ThisTimeLineID; + if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint checkPoint; + + memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + newTLI = checkPoint.ThisTimeLineID; + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery)); + newTLI = xlrec.ThisTimeLineID; + } if (newTLI != ThisTimeLineID) { @@ -5729,9 +5745,36 @@ StartupXLOG(void) * allows some extra error checking in xlog_redo. */ if (bgwriterLaunched) - RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | - CHECKPOINT_IMMEDIATE | - CHECKPOINT_WAIT); + { + bool checkpoint_wait = true; + + /* + * If we've been explicitly promoted with fast option, + * end of recovery without a checkpoint if possible. + */ + if (fast_promote) + { + checkPointLoc = ControlFile->prevCheckPoint; + record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false); + if (record != NULL) + { + checkpoint_wait = false; + CreateEndOfRecoveryRecord(); + } + } + + /* + * In most cases we will wait for a full checkpoint to complete. + * + * If not, issue a normal, non-immediate checkpoint but don't wait. 
+ */ + if (checkpoint_wait) + RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | + CHECKPOINT_IMMEDIATE | + CHECKPOINT_WAIT); + else + RequestCheckpoint(0); /* No flags */ + } else CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE); @@ -6060,12 +6103,15 @@ LocalSetXLogInsertAllowed(void) */ static XLogRecord * ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, - int whichChkpt) + int whichChkpt, bool report) { XLogRecord *record; if (!XRecOffIsValid(RecPtr)) { + if (!report) + return NULL; + switch (whichChkpt) { case 1: @@ -6088,6 +6134,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, if (record == NULL) { + if (!report) + return NULL; + switch (whichChkpt) { case 1: @@ -6882,6 +6931,44 @@ CreateCheckPoint(int flags) LWLockRelease(CheckpointLock); } +/* + * Mark the end of recovery in WAL though without running a full checkpoint. + * We can expect that a restartpoint is likely to be in progress as we + * do this, though we are unwilling to wait for it to complete. So be + * careful to avoid taking the CheckpointLock anywhere here. + * + * CreateRestartPoint() allows for the case where recovery may end before + * the restartpoint completes so there is no concern of concurrent behaviour. 
+ */ +static void +CreateEndOfRecoveryRecord(void) +{ + xl_end_of_recovery xlrec; + XLogRecData rdata; + + /* sanity check: this record may only be written while recovery is still in progress */ + if (!RecoveryInProgress()) + elog(ERROR, "can only be used to end recovery"); + + xlrec.end_time = GetCurrentTimestamp(); /* end_time is a TimestampTz, not a time_t; xlog_desc prints it with timestamptz_to_str() */ + xlrec.ThisTimeLineID = ThisTimeLineID; + + LocalSetXLogInsertAllowed(); + + START_CRIT_SECTION(); + + rdata.data = (char *) &xlrec; + rdata.len = sizeof(xl_end_of_recovery); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + + (void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata); /* NOTE(review): no XLogFlush() follows this insert -- confirm the record need not be durable here */ + + END_CRIT_SECTION(); + + LocalXLogInsertAllowed = -1; /* return to "check" state */ +} + /* * Flush all data in shared memory to disk, and fsync * @@ -7613,6 +7700,27 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) RecoveryRestartPoint(&checkPoint); } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery)); + + /* + * For Hot Standby, we could treat this like a Shutdown Checkpoint, + * but this case is rarer and harder to test, so the benefit doesn't + * outweigh the potential extra cost of maintenance. + */ + + /* + * We should've already switched to the new TLI before replaying this + * record. + */ + if (xlrec.ThisTimeLineID != ThisTimeLineID) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record", + xlrec.ThisTimeLineID, ThisTimeLineID))); + } else if (info == XLOG_NOOP) { /* nothing to do here */ @@ -9405,8 +9513,39 @@ CheckForStandbyTrigger(void) if (IsPromoteTriggered()) { - ereport(LOG, + /* + * In 9.1 and 9.2 the postmaster unlinked the promote file + * inside the signal handler. We now leave the file in place + * and let the Startup process do the unlink. This allows + * Startup to know whether we're doing fast or normal + * promotion. Fast promotion takes precedence. 
+ */ + if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0) + { + unlink(FAST_PROMOTE_SIGNAL_FILE); + unlink(PROMOTE_SIGNAL_FILE); + fast_promote = true; + } + else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) + { + unlink(PROMOTE_SIGNAL_FILE); + fast_promote = false; + } + + /* + * We only look for fast promote via the pg_ctl promote option. + * It would be possible to extend trigger file support for the + * fast promotion option but that wouldn't be backwards compatible + * anyway and we're looking to focus further work on the promote + * option as the right way to signal end of recovery. + */ + if (fast_promote) + ereport(LOG, + (errmsg("received fast promote request"))); + else + ereport(LOG, (errmsg("received promote request"))); + ResetPromoteTriggered(); triggered = true; return true; @@ -9435,15 +9574,10 @@ CheckPromoteSignal(void) { struct stat stat_buf; - if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) - { - /* - * Since we are in a signal handler, it's not safe to elog. We - * silently ignore any error from unlink. - */ - unlink(PROMOTE_SIGNAL_FILE); + if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 || + stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0) return true; - } + return false; } diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index e412d71dcf..e086b1244c 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -1136,6 +1136,15 @@ do_promote(void) exit(1); } + /* + * Use two different kinds of promotion file so we can understand + * the difference between smart and fast promotion. 
+ */ + if (shutdown_mode >= FAST_MODE) + snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data); + else + snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data); + if ((prmfile = fopen(promote_file, "w")) == NULL) { write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"), @@ -1799,7 +1808,7 @@ do_help(void) " [-o \"OPTIONS\"]\n"), progname); printf(_(" %s reload [-D DATADIR] [-s]\n"), progname); printf(_(" %s status [-D DATADIR]\n"), progname); - printf(_(" %s promote [-D DATADIR] [-s]\n"), progname); + printf(_(" %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname); printf(_(" %s kill SIGNALNAME PID\n"), progname); #if defined(WIN32) || defined(__CYGWIN__) printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n" @@ -1828,7 +1837,7 @@ do_help(void) printf(_(" -o OPTIONS command line options to pass to postgres\n" " (PostgreSQL server executable) or initdb\n")); printf(_(" -p PATH-TO-POSTGRES normally not necessary\n")); - printf(_("\nOptions for stop or restart:\n")); + printf(_("\nOptions for stop, restart or promote:\n")); printf(_(" -m, --mode=MODE MODE can be \"smart\", \"fast\", or \"immediate\"\n")); printf(_("\nShutdown modes are:\n")); @@ -1836,6 +1845,10 @@ do_help(void) printf(_(" fast quit directly, with proper shutdown\n")); printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n")); + printf(_("\nPromotion modes are:\n")); + printf(_(" smart promote after performing a checkpoint\n")); + printf(_(" fast promote quickly without waiting for checkpoint completion\n")); + printf(_("\nAllowed signal names for kill:\n")); printf(" ABRT HUP INT QUIT TERM USR1 USR2\n"); @@ -2271,7 +2284,6 @@ main(int argc, char **argv) snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data); snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data); snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data); - snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data); } 
switch (ctl_command) diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 43e1e60f9b..ce9957e618 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -217,6 +217,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */ +typedef struct xl_end_of_recovery +{ + TimestampTz end_time; /* set by CreateEndOfRecoveryRecord() when the record is built */ + TimeLineID ThisTimeLineID; /* writer's timeline; replay switches to it before applying the record */ +} xl_end_of_recovery; /* * XLogRecord is defined in xlog.h, but we avoid #including that to keep diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index e4a9abe7bc..ec8cea7c86 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -64,6 +64,7 @@ typedef struct CheckPoint #define XLOG_PARAMETER_CHANGE 0x60 #define XLOG_RESTORE_POINT 0x70 #define XLOG_FPW_CHANGE 0x80 +#define XLOG_END_OF_RECOVERY 0x90 /*