Fast promote mode skips checkpoint at end of recovery.

pg_ctl promote -m fast will skip the checkpoint at the end of recovery so that
we can achieve very fast failover when the apply delay is low. Write a new WAL
record, XLOG_END_OF_RECOVERY, to allow us to switch timeline correctly for
downstream log readers. If we skip the synchronous end-of-recovery checkpoint,
we request a normal spread checkpoint so that the window of re-recovery is low.

Simon Riggs and Kyotaro Horiguchi, with input from Fujii Masao.
Review by Heikki Linnakangas
This commit is contained in:
Simon Riggs 2013-01-29 00:06:15 +00:00
parent ee22c55f5a
commit fd4ced5230
5 changed files with 195 additions and 32 deletions

View File

@ -18,6 +18,7 @@
#include "access/xlog_internal.h"
#include "catalog/pg_control.h"
#include "utils/guc.h"
#include "utils/timestamp.h"
/*
* GUC support
@ -119,6 +120,15 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
memcpy(&fpw, rec, sizeof(bool));
appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
}
else if (info == XLOG_END_OF_RECOVERY)
{
xl_end_of_recovery xlrec;
memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
appendStringInfo(buf, "end_of_recovery: tli %u; time %s",
xlrec.ThisTimeLineID,
timestamptz_to_str(xlrec.end_time));
}
else
appendStringInfo(buf, "UNKNOWN");
}

View File

@ -66,6 +66,7 @@
#define RECOVERY_COMMAND_FILE "recovery.conf"
#define RECOVERY_COMMAND_DONE "recovery.done"
#define PROMOTE_SIGNAL_FILE "promote"
#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
/* User-settable parameters */
@ -210,6 +211,9 @@ bool StandbyMode = false;
static char *PrimaryConnInfo = NULL;
static char *TriggerFile = NULL;
/* whether request for fast promotion has been made yet */
static bool fast_promote = false;
/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
@ -611,6 +615,7 @@ static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
@ -642,7 +647,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
XLogRecPtr RecPtr, int whichChkpt);
XLogRecPtr RecPtr, int whichChkpti, bool report);
static bool rescanLatestTimeLine(void);
static void WriteControlFile(void);
static void ReadControlFile(void);
@ -4848,7 +4853,7 @@ StartupXLOG(void)
* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control.
*/
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
if (record != NULL)
{
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
@ -4890,7 +4895,7 @@ StartupXLOG(void)
*/
checkPointLoc = ControlFile->checkPoint;
RedoStartLSN = ControlFile->checkPointCopy.redo;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
if (record != NULL)
{
ereport(DEBUG1,
@ -4909,7 +4914,7 @@ StartupXLOG(void)
else
{
checkPointLoc = ControlFile->prevCheckPoint;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
if (record != NULL)
{
ereport(LOG,
@ -5393,22 +5398,33 @@ StartupXLOG(void)
}
/*
* Before replaying this record, check if it is a shutdown
* checkpoint record that causes the current timeline to
* change. The checkpoint record is already considered to be
* part of the new timeline, so we update ThisTimeLineID
* before replaying it. That's important so that replayEndTLI,
* which is recorded as the minimum recovery point's TLI if
* Before replaying this record, check if this record
* causes the current timeline to change. The record is
* already considered to be part of the new timeline,
* so we update ThisTimeLineID before replaying it.
* That's important so that replayEndTLI, which is
* recorded as the minimum recovery point's TLI if
* recovery stops after this record, is set correctly.
*/
if (record->xl_rmid == RM_XLOG_ID &&
(record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN)
if (record->xl_rmid == RM_XLOG_ID)
{
CheckPoint checkPoint;
TimeLineID newTLI;
TimeLineID newTLI = ThisTimeLineID;
uint8 info = record->xl_info & ~XLR_INFO_MASK;
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
newTLI = checkPoint.ThisTimeLineID;
if (info == XLOG_CHECKPOINT_SHUTDOWN)
{
CheckPoint checkPoint;
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
newTLI = checkPoint.ThisTimeLineID;
}
else if (info == XLOG_END_OF_RECOVERY)
{
xl_end_of_recovery xlrec;
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
newTLI = xlrec.ThisTimeLineID;
}
if (newTLI != ThisTimeLineID)
{
@ -5729,9 +5745,36 @@ StartupXLOG(void)
* allows some extra error checking in xlog_redo.
*/
if (bgwriterLaunched)
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_IMMEDIATE |
CHECKPOINT_WAIT);
{
bool checkpoint_wait = true;
/*
* If we've been explicitly promoted with fast option,
* end of recovery without a checkpoint if possible.
*/
if (fast_promote)
{
checkPointLoc = ControlFile->prevCheckPoint;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false);
if (record != NULL)
{
checkpoint_wait = false;
CreateEndOfRecoveryRecord();
}
}
/*
* In most cases we will wait for a full checkpoint to complete.
*
* If not, issue a normal, non-immediate checkpoint but don't wait.
*/
if (checkpoint_wait)
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_IMMEDIATE |
CHECKPOINT_WAIT);
else
RequestCheckpoint(0); /* No flags */
}
else
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
@ -6060,12 +6103,15 @@ LocalSetXLogInsertAllowed(void)
*/
static XLogRecord *
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
int whichChkpt)
int whichChkpt, bool report)
{
XLogRecord *record;
if (!XRecOffIsValid(RecPtr))
{
if (!report)
return NULL;
switch (whichChkpt)
{
case 1:
@ -6088,6 +6134,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
if (record == NULL)
{
if (!report)
return NULL;
switch (whichChkpt)
{
case 1:
@ -6882,6 +6931,44 @@ CreateCheckPoint(int flags)
LWLockRelease(CheckpointLock);
}
/*
 * Mark the end of recovery in WAL without running a full checkpoint.
 *
 * Writes a single XLOG_END_OF_RECOVERY record carrying the current time and
 * ThisTimeLineID.  Must only be called while recovery is still in progress;
 * otherwise an ERROR is raised.
 *
 * We can expect that a restartpoint is likely to be in progress as we
 * do this, though we are unwilling to wait for it to complete. So be
 * careful to avoid taking the CheckpointLock anywhere here.
 *
 * CreateRestartPoint() allows for the case where recovery may end before
 * the restartpoint completes, so there is no concern about concurrent
 * behaviour.
 */
static void
CreateEndOfRecoveryRecord(void)
{
	xl_end_of_recovery xlrec;
	XLogRecData rdata;

	/* sanity check */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used to end recovery");

	/*
	 * end_time is declared as TimestampTz and is later printed with
	 * timestamptz_to_str(), so it must be filled from the timestamp API;
	 * a raw time_t from time(NULL) would be in the wrong representation.
	 */
	xlrec.end_time = GetCurrentTimestamp();
	xlrec.ThisTimeLineID = ThisTimeLineID;

	/* Recovery is still in progress, so WAL insertion must be enabled locally */
	LocalSetXLogInsertAllowed();

	START_CRIT_SECTION();

	rdata.data = (char *) &xlrec;
	rdata.len = sizeof(xl_end_of_recovery);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;

	(void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);

	END_CRIT_SECTION();

	LocalXLogInsertAllowed = -1;	/* return to "check" state */
}
/*
* Flush all data in shared memory to disk, and fsync
*
@ -7613,6 +7700,27 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
RecoveryRestartPoint(&checkPoint);
}
else if (info == XLOG_END_OF_RECOVERY)
{
xl_end_of_recovery xlrec;
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
/*
* For Hot Standby, we could treat this like a Shutdown Checkpoint,
* but this case is rarer and harder to test, so the benefit doesn't
* outweigh the potential extra cost of maintenance.
*/
/*
* We should've already switched to the new TLI before replaying this
* record.
*/
if (xlrec.ThisTimeLineID != ThisTimeLineID)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
xlrec.ThisTimeLineID, ThisTimeLineID)));
}
else if (info == XLOG_NOOP)
{
/* nothing to do here */
@ -9405,8 +9513,39 @@ CheckForStandbyTrigger(void)
if (IsPromoteTriggered())
{
ereport(LOG,
/*
* In 9.1 and 9.2 the postmaster unlinked the promote file
* inside the signal handler. We now leave the file in place
* and let the Startup process do the unlink. This allows
* Startup to know whether we're doing fast or normal
* promotion. Fast promotion takes precedence.
*/
if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
{
unlink(FAST_PROMOTE_SIGNAL_FILE);
unlink(PROMOTE_SIGNAL_FILE);
fast_promote = true;
}
else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
{
unlink(PROMOTE_SIGNAL_FILE);
fast_promote = false;
}
/*
* We only look for fast promote via the pg_ctl promote option.
* It would be possible to extend trigger file support for the
* fast promotion option but that wouldn't be backwards compatible
* anyway and we're looking to focus further work on the promote
* option as the right way to signal end of recovery.
*/
if (fast_promote)
ereport(LOG,
(errmsg("received fast promote request")));
else
ereport(LOG,
(errmsg("received promote request")));
ResetPromoteTriggered();
triggered = true;
return true;
@ -9435,15 +9574,10 @@ CheckPromoteSignal(void)
{
struct stat stat_buf;
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
{
/*
* Since we are in a signal handler, it's not safe to elog. We
* silently ignore any error from unlink.
*/
unlink(PROMOTE_SIGNAL_FILE);
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
return true;
}
return false;
}

View File

@ -1136,6 +1136,15 @@ do_promote(void)
exit(1);
}
/*
* Use two different kinds of promotion file so we can understand
* the difference between smart and fast promotion.
*/
if (shutdown_mode >= FAST_MODE)
snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data);
else
snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
if ((prmfile = fopen(promote_file, "w")) == NULL)
{
write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
@ -1799,7 +1808,7 @@ do_help(void)
" [-o \"OPTIONS\"]\n"), progname);
printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
printf(_(" %s status [-D DATADIR]\n"), progname);
printf(_(" %s promote [-D DATADIR] [-s]\n"), progname);
printf(_(" %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname);
printf(_(" %s kill SIGNALNAME PID\n"), progname);
#if defined(WIN32) || defined(__CYGWIN__)
printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
@ -1828,7 +1837,7 @@ do_help(void)
printf(_(" -o OPTIONS command line options to pass to postgres\n"
" (PostgreSQL server executable) or initdb\n"));
printf(_(" -p PATH-TO-POSTGRES normally not necessary\n"));
printf(_("\nOptions for stop or restart:\n"));
printf(_("\nOptions for stop, restart or promote:\n"));
printf(_(" -m, --mode=MODE MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
printf(_("\nShutdown modes are:\n"));
@ -1836,6 +1845,10 @@ do_help(void)
printf(_(" fast quit directly, with proper shutdown\n"));
printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n"));
printf(_("\nPromotion modes are:\n"));
printf(_(" smart promote after performing a checkpoint\n"));
printf(_(" fast promote quickly without waiting for checkpoint completion\n"));
printf(_("\nAllowed signal names for kill:\n"));
printf(" ABRT HUP INT QUIT TERM USR1 USR2\n");
@ -2271,7 +2284,6 @@ main(int argc, char **argv)
snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data);
snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data);
snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data);
snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
}
switch (ctl_command)

View File

@ -217,6 +217,12 @@ typedef struct xl_restore_point
char rp_name[MAXFNAMELEN];
} xl_restore_point;
/*
 * End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint.
 *
 * This is the payload of an XLOG_END_OF_RECOVERY WAL record. It is copied
 * in and out of the record with memcpy()/sizeof, so the field order and
 * types are part of the record layout.
 */
typedef struct xl_end_of_recovery
{
/* timestamp taken when the record is created */
TimestampTz end_time;
/* timeline in effect at end of recovery; replay switches to it before applying the record */
TimeLineID ThisTimeLineID;
} xl_end_of_recovery;
/*
* XLogRecord is defined in xlog.h, but we avoid #including that to keep

View File

@ -64,6 +64,7 @@ typedef struct CheckPoint
#define XLOG_PARAMETER_CHANGE 0x60
#define XLOG_RESTORE_POINT 0x70
#define XLOG_FPW_CHANGE 0x80
#define XLOG_END_OF_RECOVERY 0x90
/*