Start background writer during archive recovery. Background writer now performs

its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 15:58:41 +00:00 · 2009-02-18 15:58:41 +00:00 · cdd46c7654
parent 36a9cf38c5
commit cdd46c7654
8 changed files with 920 additions and 191 deletions
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.330 2009/02/07 10:49:36 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.331 2009/02/18 15:58:40 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -36,6 +36,7 @@
 #include "catalog/pg_control.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
+#include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
@ -47,6 +48,7 @@
 #include "storage/smgr.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/flatfiles.h"
 #include "utils/guc.h"
 #include "utils/ps_status.h"
 #include "pg_trace.h"
@ -119,12 +121,27 @@ CheckpointStatsData CheckpointStats;
 */
 TimeLineID	ThisTimeLineID = 0;

-/* Are we doing recovery from XLOG? */
+/*
+ * Are we doing recovery from XLOG? 
+ *
+ * This is only ever true in the startup process, even if the system is still
+ * in recovery. Prior to 8.4, all activity during recovery were carried out
+ * by Startup process. This local variable continues to be used in functions
+ * that need to act differently when called from a redo function (e.g skip
+ * WAL logging). To check whether the system is in recovery regardless of what
+ * process you're running in, use RecoveryInProgress().
+ */
 bool		InRecovery = false;

 /* Are we recovering using offline XLOG archives? */
 static bool InArchiveRecovery = false;

+/*
+ * Local copy of SharedRecoveryInProgress variable. True actually means "not
+ * known, need to check the shared state"
+ */
+static bool LocalRecoveryInProgress = true;
+
 /* Was the last xlog file restored from archive, or local? */
 static bool restoredFromArchive = false;

@ -133,7 +150,6 @@ static char *recoveryRestoreCommand = NULL;
 static bool recoveryTarget = false;
 static bool recoveryTargetExact = false;
 static bool recoveryTargetInclusive = true;
-static bool recoveryLogRestartpoints = false;
 static TransactionId recoveryTargetXid;
 static TimestampTz recoveryTargetTime;
 static TimestampTz recoveryLastXTime = 0;
@ -242,9 +258,8 @@ static XLogRecPtr RedoRecPtr;
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
- * CheckpointLock: must be held to do a checkpoint (ensures only one
- * checkpointer at a time; currently, with all checkpoints done by the
- * bgwriter, this is just pro forma).
+ * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
+ * only one checkpointer at a time)
 *
 *----------
 */
@ -313,6 +328,25 @@ typedef struct XLogCtlData
 	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
 	TimeLineID	ThisTimeLineID;

+	/*
+	 * SharedRecoveryInProgress indicates if we're still in crash or archive
+	 * recovery.  It's checked by RecoveryInProgress().
+	 */
+	bool		SharedRecoveryInProgress;
+
+	/*
+	 * During recovery, we keep a copy of the latest checkpoint record
+	 * here.  Used by the background writer when it wants to create
+	 * a restartpoint.
+	 *
+	 * Protected by info_lck.
+	 */
+	XLogRecPtr	lastCheckPointRecPtr;
+	CheckPoint	lastCheckPoint;
+
+	/* end+1 of the last record replayed (or being replayed) */
+	XLogRecPtr	replayEndRecPtr;
+
 	slock_t		info_lck;		/* locks shared variables shown above */
 } XLogCtlData;

@ -387,9 +421,21 @@ static XLogRecPtr ReadRecPtr;	/* start of last record read */
 static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
 static XLogRecord *nextRecord = NULL;
 static TimeLineID lastPageTLI = 0;
+static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */
+static bool	updateMinRecoveryPoint = true;

 static bool InRedo = false;

+/*
+ * Flag set by interrupt handlers for later service in the redo loop.
+ */
+static volatile sig_atomic_t shutdown_requested = false;
+/*
+ * Flag set when executing a restore command, to tell SIGTERM signal handler
+ * that it's safe to just proc_exit(0).
+ */
+static volatile sig_atomic_t in_restore_command = false;
+

 static void XLogArchiveNotify(const char *xlog);
 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
@ -420,6 +466,7 @@ static void PreallocXlogFiles(XLogRecPtr endptr);
 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
 static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
+static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
@ -484,6 +531,10 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		doPageWrites;
 	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);

+	/* cross-check on whether we should be here or not */
+	if (RecoveryInProgress())
+		elog(FATAL, "cannot make new WAL entries during recovery");
+
 	/* info's high bits are reserved for use by me */
 	if (info & XLR_INFO_MASK)
 		elog(PANIC, "invalid xlog info mask %02X", info);
@ -1717,6 +1768,63 @@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
 	SpinLockRelease(&xlogctl->info_lck);
 }

+/*
+ * Advance minRecoveryPoint in control file.
+ *
+ * If we crash during recovery, we must reach this point again before the
+ * database is consistent. 
+ * 
+ * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
+ * is is only updated if it's not already greater than or equal to 'lsn'.
+ */
+static void
+UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
+{
+	/* Quick check using our local copy of the variable */
+	if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
+		return;
+
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+	/* update local copy */
+	minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+	/*
+	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
+	 * ie. crash recovery. Don't update the control file in that case.
+	 */
+	if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+		updateMinRecoveryPoint = false;
+	else if (force || XLByteLT(minRecoveryPoint, lsn))
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile XLogCtlData *xlogctl = XLogCtl;
+		XLogRecPtr newMinRecoveryPoint;
+
+		/*
+		 * To avoid having to update the control file too often, we update it
+		 * all the way to the last record being replayed, even though 'lsn'
+		 * would suffice for correctness.
+		 */
+		SpinLockAcquire(&xlogctl->info_lck);
+		newMinRecoveryPoint = xlogctl->replayEndRecPtr;
+		SpinLockRelease(&xlogctl->info_lck);
+
+		/* update control file */
+		if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
+		{
+			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
+			UpdateControlFile();
+			minRecoveryPoint = newMinRecoveryPoint;
+
+			ereport(DEBUG2,
+					(errmsg("updated min recovery point to %X/%X",
+						minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+		}
+	}
+	LWLockRelease(ControlFileLock);
+}
+
 /*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
@ -1729,9 +1837,15 @@ XLogFlush(XLogRecPtr record)
 	XLogRecPtr	WriteRqstPtr;
 	XLogwrtRqst WriteRqst;

-	/* Disabled during REDO */
-	if (InRedo)
+	/*
+	 * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
+	 * instead.
+	 */
+	if (RecoveryInProgress())
+	{
+		UpdateMinRecoveryPoint(record, false);
 		return;
+	}

 	/* Quick exit if already known flushed */
 	if (XLByteLE(record, LogwrtResult.Flush))
@ -1818,9 +1932,9 @@ XLogFlush(XLogRecPtr record)
 	 * the bad page is encountered again during recovery then we would be
 	 * unable to restart the database at all!  (This scenario has actually
 	 * happened in the field several times with 7.1 releases. Note that we
-	 * cannot get here while InRedo is true, but if the bad page is brought in
-	 * and marked dirty during recovery then CreateCheckPoint will try to
-	 * flush it at the end of recovery.)
+	 * cannot get here while RecoveryInProgress(), but if the bad page is
+	 * brought in and marked dirty during recovery then if a checkpoint were
+	 * performed at the end of recovery it will try to flush it.
 	 *
 	 * The current approach is to ERROR under normal conditions, but only
 	 * WARNING during recovery, so that the system can be brought up even if
@ -1857,6 +1971,10 @@ XLogBackgroundFlush(void)
 	XLogRecPtr	WriteRqstPtr;
 	bool		flexible = true;

+	/* XLOG doesn't need flushing during recovery */
+	if (RecoveryInProgress())
+		return;
+
 	/* read LogwrtResult and update local state */
 	{
 		/* use volatile pointer to prevent code rearrangement */
@ -1928,6 +2046,10 @@ XLogAsyncCommitFlush(void)
 	/* use volatile pointer to prevent code rearrangement */
 	volatile XLogCtlData *xlogctl = XLogCtl;

+	/* There's no asynchronously committed transactions during recovery */
+	if (RecoveryInProgress())
+		return;
+
 	SpinLockAcquire(&xlogctl->info_lck);
 	WriteRqstPtr = xlogctl->asyncCommitLSN;
 	SpinLockRelease(&xlogctl->info_lck);
@ -1944,6 +2066,10 @@ XLogAsyncCommitFlush(void)
 bool
 XLogNeedsFlush(XLogRecPtr record)
 {
+	/* XLOG doesn't need flushing during recovery */
+	if (RecoveryInProgress())
+		return false;
+
 	/* Quick exit if already known flushed */
 	if (XLByteLE(record, LogwrtResult.Flush))
 		return false;
@ -2618,10 +2744,23 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 			(errmsg_internal("executing restore command \"%s\"",
 							 xlogRestoreCmd)));

+	/*
+	 * Set in_restore_command to tell the signal handler that we should exit
+	 * right away on SIGTERM. We know that we're in a safe point to do that.
+	 * Check if we had already received the signal, so that we don't miss a
+	 * shutdown request received just before this.
+	 */
+	in_restore_command = true;
+	if (shutdown_requested)
+		proc_exit(0);
+
 	/*
 	 * Copy xlog from archival storage to XLOGDIR
 	 */
 	rc = system(xlogRestoreCmd);
+
+	in_restore_command = false;
+
 	if (rc == 0)
 	{
 		/*
@ -2674,14 +2813,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	 * assume that recovery is complete and start up the database!) It's
 	 * essential to abort on child SIGINT and SIGQUIT, because per spec
 	 * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
-	 * those it's a good bet we should have gotten it too.  Aborting on other
-	 * signals such as SIGTERM seems a good idea as well.
+	 * those it's a good bet we should have gotten it too.
+	 *
+	 * On SIGTERM, assume we have received a fast shutdown request, and exit
+	 * cleanly. It's pure chance whether we receive the SIGTERM first, or the
+	 * child process. If we receive it first, the signal handler will call
+	 * proc_exit(0), otherwise we do it here. If we or the child process
+	 * received SIGTERM for any other reason than a fast shutdown request,
+	 * postmaster will perform an immediate shutdown when it sees us exiting
+	 * unexpectedly.
 	 *
 	 * Per the Single Unix Spec, shells report exit status > 128 when a called
 	 * command died on a signal.  Also, 126 and 127 are used to report
 	 * problems such as an unfindable command; treat those as fatal errors
 	 * too.
 	 */
+	if (WTERMSIG(rc) == SIGTERM)
+		proc_exit(0);
+
 	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

 	ereport(signaled ? FATAL : DEBUG2,
@ -4584,18 +4733,6 @@ readRecoveryCommandFile(void)
 			ereport(LOG,
 					(errmsg("recovery_target_inclusive = %s", tok2)));
 		}
-		else if (strcmp(tok1, "log_restartpoints") == 0)
-		{
-			/*
-			 * does nothing if a recovery_target is not also set
-			 */
-			if (!parse_bool(tok2, &recoveryLogRestartpoints))
-				  ereport(ERROR,
-							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					  errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
-			ereport(LOG,
-					(errmsg("log_restartpoints = %s", tok2)));
-		}
 		else
 			ereport(FATAL,
 					(errmsg("unrecognized recovery parameter \"%s\"",
@ -4877,7 +5014,7 @@ StartupXLOG(void)
 	XLogRecPtr	RecPtr,
 				LastRec,
 				checkPointLoc,
-				minRecoveryLoc,
+				backupStopLoc,
 				EndOfLog;
 	uint32		endLogId;
 	uint32		endLogSeg;
@ -4885,6 +5022,8 @@ StartupXLOG(void)
 	uint32		freespace;
 	TransactionId oldestActiveXID;

+	XLogCtl->SharedRecoveryInProgress = true;
+
 	/*
 	 * Read control file and check XLOG status looks valid.
 	 *
@ -4964,7 +5103,7 @@ StartupXLOG(void)
 						recoveryTargetTLI,
 						ControlFile->checkPointCopy.ThisTimeLineID)));

-	if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
+	if (read_backup_label(&checkPointLoc, &backupStopLoc))
 	{
 		/*
 		 * When a backup_label file is present, we want to roll forward from
@ -5102,11 +5241,23 @@ StartupXLOG(void)
 		ControlFile->prevCheckPoint = ControlFile->checkPoint;
 		ControlFile->checkPoint = checkPointLoc;
 		ControlFile->checkPointCopy = checkPoint;
-		if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
-			ControlFile->minRecoveryPoint = minRecoveryLoc;
+		if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0)
+		{
+			if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc))
+				ControlFile->minRecoveryPoint = backupStopLoc;
+		}
 		ControlFile->time = (pg_time_t) time(NULL);
+		/* No need to hold ControlFileLock yet, we aren't up far enough */
 		UpdateControlFile();

+		/* update our local copy of minRecoveryPoint */
+		minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+		/*
+		 * Reset pgstat data, because it may be invalid after recovery.
+		 */
+		pgstat_reset_all();
+
 		/*
 		 * If there was a backup label file, it's done its job and the info
 		 * has now been propagated into pg_control.  We must get rid of the
@ -5151,12 +5302,41 @@ StartupXLOG(void)
 		{
 			bool		recoveryContinue = true;
 			bool		recoveryApply = true;
+			bool		reachedMinRecoveryPoint = false;
 			ErrorContextCallback errcontext;
+			/* use volatile pointer to prevent code rearrangement */
+			volatile XLogCtlData *xlogctl = XLogCtl;
+
+			/* Update shared replayEndRecPtr */
+			SpinLockAcquire(&xlogctl->info_lck);
+			xlogctl->replayEndRecPtr = ReadRecPtr;
+			SpinLockRelease(&xlogctl->info_lck);

 			InRedo = true;
-			ereport(LOG,
-					(errmsg("redo starts at %X/%X",
-							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+
+			if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+				ereport(LOG,
+						(errmsg("redo starts at %X/%X",
+								ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+			else
+				ereport(LOG,
+						(errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
+						ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
+						minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+
+			/*
+			 * Let postmaster know we've started redo now, so that it can
+			 * launch bgwriter to perform restartpoints.  We don't bother
+			 * during crash recovery as restartpoints can only be performed
+			 * during archive recovery.  And we'd like to keep crash recovery
+			 * simple, to avoid introducing bugs that could you from
+			 * recovering after crash.
+			 *
+			 * After this point, we can no longer assume that we're the only
+			 * process in addition to postmaster!
+			 */
+			if (InArchiveRecovery && IsUnderPostmaster)
+				SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);

 			/*
 			 * main redo apply loop
@ -5182,6 +5362,30 @@ StartupXLOG(void)
 				}
 #endif

+				/*
+				 * Check if we were requested to exit without finishing
+				 * recovery.
+				 */
+				if (shutdown_requested)
+					proc_exit(0);
+
+				/*
+				 * Have we reached our safe starting point? If so, we can
+				 * tell postmaster that the database is consistent now.
+				 */
+				if (!reachedMinRecoveryPoint && 
+					 XLByteLE(minRecoveryPoint, EndRecPtr))
+				{
+					reachedMinRecoveryPoint = true;
+					if (InArchiveRecovery)
+					{
+						ereport(LOG,
+								(errmsg("consistent recovery state reached")));
+						if (IsUnderPostmaster)
+							SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+					}
+				}
+
 				/*
 				 * Have we reached our recovery target?
 				 */
@ -5207,6 +5411,15 @@ StartupXLOG(void)
 					TransactionIdAdvance(ShmemVariableCache->nextXid);
 				}

+				/*
+				 * Update shared replayEndRecPtr before replaying this
+				 * record, so that XLogFlush will update minRecoveryPoint
+				 * correctly.
+				 */
+				SpinLockAcquire(&xlogctl->info_lck);
+				xlogctl->replayEndRecPtr = EndRecPtr;
+				SpinLockRelease(&xlogctl->info_lck);
+
 				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);

 				/* Pop the error context stack */
@ -5250,14 +5463,14 @@ StartupXLOG(void)
 	 * Complain if we did not roll forward far enough to render the backup
 	 * dump consistent.
 	 */
-	if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
+	if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
 	{
 		if (reachedStopPoint)	/* stopped because of stop request */
 			ereport(FATAL,
-					(errmsg("requested recovery stop point is before end time of backup dump")));
+					(errmsg("requested recovery stop point is before consistent recovery point")));
 		else	/* ran off end of WAL */
 			ereport(FATAL,
-					(errmsg("WAL ends before end time of backup dump")));
+					(errmsg("WAL ends before consistent recovery point")));
 	}

 	/*
@ -5352,6 +5565,12 @@ StartupXLOG(void)
 	/* Pre-scan prepared transactions to find out the range of XIDs present */
 	oldestActiveXID = PrescanPreparedTransactions();

+	/*
+	 * Allow writing WAL for us, so that we can create a checkpoint record.
+	 * But not yet for other backends!
+	 */
+	LocalRecoveryInProgress = false;
+
 	if (InRecovery)
 	{
 		int			rmid;
@ -5371,11 +5590,6 @@ StartupXLOG(void)
 		 */
 		XLogCheckInvalidPages();

-		/*
-		 * Reset pgstat data, because it may be invalid after recovery.
-		 */
-		pgstat_reset_all();
-
 		/*
 		 * Perform a checkpoint to update all our recovery activity to disk.
 		 *
@ -5398,12 +5612,14 @@ StartupXLOG(void)
 	 */
 	InRecovery = false;

+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 	ControlFile->state = DB_IN_PRODUCTION;
 	ControlFile->time = (pg_time_t) time(NULL);
 	UpdateControlFile();
+	LWLockRelease(ControlFileLock);

 	/* start the archive_timeout timer running */
-	XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
+	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);

 	/* initialize shared-memory copy of latest checkpoint XID/epoch */
 	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
@ -5438,6 +5654,45 @@ StartupXLOG(void)
 		readRecordBuf = NULL;
 		readRecordBufSize = 0;
 	}
+
+	/*
+	 * All done. Allow others to write WAL.
+	 */
+	XLogCtl->SharedRecoveryInProgress = false;
+}
+
+/*
+ * Is the system still in recovery?
+ *
+ * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
+ * variables the first time we see that recovery is finished.
+ */
+bool
+RecoveryInProgress(void)
+{
+	/*
+	 * We check shared state each time only until we leave recovery mode.
+	 * We can't re-enter recovery, so we rely on the local state variable
+	 * after that.
+	 */
+	if (!LocalRecoveryInProgress)
+		return false;
+	else
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile XLogCtlData *xlogctl = XLogCtl;
+
+		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+
+		/*
+		 * Initialize TimeLineID and RedoRecPtr the first time we see that
+		 * recovery is finished.
+		 */
+		if (!LocalRecoveryInProgress)
+			InitXLOGAccess();
+
+		return LocalRecoveryInProgress;
+	}
 }

 /*
@ -5569,6 +5824,8 @@ InitXLOGAccess(void)
 {
 	/* ThisTimeLineID doesn't change so we need no lock to copy it */
 	ThisTimeLineID = XLogCtl->ThisTimeLineID;
+	Assert(ThisTimeLineID != 0);
+
 	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
 	(void) GetRedoRecPtr();
 }
@ -5680,7 +5937,10 @@ ShutdownXLOG(int code, Datum arg)
 	ereport(LOG,
 			(errmsg("shutting down")));

-	CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+	if (RecoveryInProgress())
+		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+	else
+		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
 	ShutdownCLOG();
 	ShutdownSUBTRANS();
 	ShutdownMultiXact();
@ -5693,9 +5953,20 @@ ShutdownXLOG(int code, Datum arg)
 * Log start of a checkpoint.
 */
 static void
-LogCheckpointStart(int flags)
+LogCheckpointStart(int flags, bool restartpoint)
 {
-	elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
+	char *msg;
+
+	/*
+	 * XXX: This is hopelessly untranslatable. We could call gettext_noop
+	 * for the main message, but what about all the flags?
+	 */
+	if (restartpoint)
+		msg = "restartpoint starting:%s%s%s%s%s%s";
+	else
+		msg = "checkpoint starting:%s%s%s%s%s%s";
+
+	elog(LOG, msg,
 		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
 		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
 		 (flags & CHECKPOINT_FORCE) ? " force" : "",
@ -5708,7 +5979,7 @@ LogCheckpointStart(int flags)
 * Log end of a checkpoint.
 */
 static void
-LogCheckpointEnd(void)
+LogCheckpointEnd(bool restartpoint)
 {
 	long		write_secs,
 				sync_secs,
@ -5731,17 +6002,26 @@ LogCheckpointEnd(void)
 						CheckpointStats.ckpt_sync_end_t,
 						&sync_secs, &sync_usecs);

-	elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
-		 "%d transaction log file(s) added, %d removed, %d recycled; "
-		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
-		 CheckpointStats.ckpt_bufs_written,
-		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-		 CheckpointStats.ckpt_segs_added,
-		 CheckpointStats.ckpt_segs_removed,
-		 CheckpointStats.ckpt_segs_recycled,
-		 write_secs, write_usecs / 1000,
-		 sync_secs, sync_usecs / 1000,
-		 total_secs, total_usecs / 1000);
+	if (restartpoint)
+		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 CheckpointStats.ckpt_bufs_written,
+			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+			 write_secs, write_usecs / 1000,
+			 sync_secs, sync_usecs / 1000,
+			 total_secs, total_usecs / 1000);
+	else
+		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
+			 "%d transaction log file(s) added, %d removed, %d recycled; "
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 CheckpointStats.ckpt_bufs_written,
+			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+			 CheckpointStats.ckpt_segs_added,
+			 CheckpointStats.ckpt_segs_removed,
+			 CheckpointStats.ckpt_segs_recycled,
+			 write_secs, write_usecs / 1000,
+			 sync_secs, sync_usecs / 1000,
+			 total_secs, total_usecs / 1000);
 }

 /*
@ -5772,13 +6052,33 @@ CreateCheckPoint(int flags)
 	TransactionId *inCommitXids;
 	int			nInCommit;

+	/* shouldn't happen */
+	if (RecoveryInProgress())
+		elog(ERROR, "can't create a checkpoint during recovery");
+
 	/*
 	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
-	 * (This is just pro forma, since in the present system structure there is
-	 * only one process that is allowed to issue checkpoints at any given
-	 * time.)
+	 * During normal operation, bgwriter is the only process that creates
+	 * checkpoints, but at the end of archive recovery, the bgwriter can be
+	 * busy creating a restartpoint while the startup process tries to perform
+	 * the startup checkpoint.
 	 */
-	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+	if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
+	{
+		Assert(InRecovery);
+
+		/*
+		 * A restartpoint is in progress. Wait until it finishes. This can
+		 * cause an extra restartpoint to be performed, but that's OK because
+		 * we're just about to perform a checkpoint anyway. Flushing the
+		 * buffers in this restartpoint can take some time, but that time is
+		 * saved from the upcoming checkpoint so the net effect is zero.
+		 */
+		ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint")));
+		RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
+
+		LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+	}

 	/*
 	 * Prepare to accumulate statistics.
@ -5797,9 +6097,11 @@ CreateCheckPoint(int flags)

 	if (shutdown)
 	{
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 		ControlFile->state = DB_SHUTDOWNING;
 		ControlFile->time = (pg_time_t) time(NULL);
 		UpdateControlFile();
+		LWLockRelease(ControlFileLock);
 	}

 	/*
@ -5903,7 +6205,7 @@ CreateCheckPoint(int flags)
 	 * to log anything if we decided to skip the checkpoint.
 	 */
 	if (log_checkpoints)
-		LogCheckpointStart(flags);
+		LogCheckpointStart(flags, false);

 	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

@ -6070,7 +6372,7 @@ CreateCheckPoint(int flags)

 	/* All real work is done, but log before releasing lock. */
 	if (log_checkpoints)
-		LogCheckpointEnd();
+		LogCheckpointEnd(false);

        TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                NBuffers, CheckpointStats.ckpt_segs_added,
@ -6098,32 +6400,17 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 }

 /*
- * Set a recovery restart point if appropriate
- *
- * This is similar to CreateCheckPoint, but is used during WAL recovery
- * to establish a point from which recovery can roll forward without
- * replaying the entire recovery log.  This function is called each time
- * a checkpoint record is read from XLOG; it must determine whether a
- * restartpoint is needed or not.
+ * This is used during WAL recovery to establish a point from which recovery
+ * can roll forward without replaying the entire recovery log.  This function
+ * is called each time a checkpoint record is read from XLOG. It is stored
+ * in shared memory, so that it can be used as a restartpoint later on.
 */
 static void
 RecoveryRestartPoint(const CheckPoint *checkPoint)
 {
-	int			elapsed_secs;
 	int			rmid;
-
-	/*
-	 * Do nothing if the elapsed time since the last restartpoint is less than
-	 * half of checkpoint_timeout.	(We use a value less than
-	 * checkpoint_timeout so that variations in the timing of checkpoints on
-	 * the master, or speed of transmission of WAL segments to a slave, won't
-	 * make the slave skip a restartpoint once it's synced with the master.)
-	 * Checking true elapsed time keeps us from doing restartpoints too often
-	 * while rapidly scanning large amounts of WAL.
-	 */
-	elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
-	if (elapsed_secs < CheckPointTimeout / 2)
-		return;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile XLogCtlData *xlogctl = XLogCtl;

 	/*
 	 * Is it safe to checkpoint?  We must ask each of the resource managers
@ -6145,28 +6432,128 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
 	}

 	/*
-	 * OK, force data out to disk
+	 * Copy the checkpoint record to shared memory, so that bgwriter can
+	 * use it the next time it wants to perform a restartpoint.
 	 */
-	CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
+	SpinLockAcquire(&xlogctl->info_lck);
+	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
+	memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
+	SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
+ * This is similar to CreateCheckPoint, but is used during WAL recovery
+ * to establish a point from which recovery can roll forward without
+ * replaying the entire recovery log.
+ *
+ * Returns true if a new restartpoint was established. We can only establish
+ * a restartpoint if we have replayed a checkpoint record since last
+ * restartpoint.
+ */
+bool
+CreateRestartPoint(int flags)
+{
+	XLogRecPtr lastCheckPointRecPtr;
+	CheckPoint lastCheckPoint;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile XLogCtlData *xlogctl = XLogCtl;

 	/*
-	 * Update pg_control so that any subsequent crash will restart from this
-	 * checkpoint.	Note: ReadRecPtr gives the XLOG address of the checkpoint
-	 * record itself.
+	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
+	 * happens at a time.
 	 */
+	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+	/* Get the a local copy of the last checkpoint record. */
+	SpinLockAcquire(&xlogctl->info_lck);
+	lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
+	memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
+	SpinLockRelease(&xlogctl->info_lck);
+
+	/* 
+	 * Check that we're still in recovery mode. It's ok if we exit recovery
+	 * mode after this check, the restart point is valid anyway.
+	 */
+	if (!RecoveryInProgress())
+	{
+		ereport(DEBUG2,
+				(errmsg("skipping restartpoint, recovery has already ended")));
+		LWLockRelease(CheckpointLock);
+		return false;
+	}
+
+	/*
+	 * If the last checkpoint record we've replayed is already our last
+	 * restartpoint, we can't perform a new restart point. We still update
+	 * minRecoveryPoint in that case, so that if this is a shutdown restart
+	 * point, we won't start up earlier than before. That's not strictly
+	 * necessary, but when we get hot standby capability, it would be rather
+	 * weird if the database opened up for read-only connections at a
+	 * point-in-time before the last shutdown. Such time travel is still
+	 * possible in case of immediate shutdown, though.
+	 *
+	 * We don't explicitly advance minRecoveryPoint when we do create a
+	 * restartpoint. It's assumed that flushing the buffers will do that
+	 * as a side-effect.
+	 */
+	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
+		XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
+	{
+		XLogRecPtr InvalidXLogRecPtr = {0, 0};
+		ereport(DEBUG2,
+				(errmsg("skipping restartpoint, already performed at %X/%X",
+						lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+
+		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+		LWLockRelease(CheckpointLock);
+		return false;
+	}
+
+	if (log_checkpoints)
+	{
+		/*
+		 * Prepare to accumulate statistics.
+		 */
+		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+		LogCheckpointStart(flags, true);
+	}
+
+	CheckPointGuts(lastCheckPoint.redo, flags);
+
+	/*
+	 * Update pg_control, using current time
+	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 	ControlFile->prevCheckPoint = ControlFile->checkPoint;
-	ControlFile->checkPoint = ReadRecPtr;
-	ControlFile->checkPointCopy = *checkPoint;
+	ControlFile->checkPoint = lastCheckPointRecPtr;
+	ControlFile->checkPointCopy = lastCheckPoint;
 	ControlFile->time = (pg_time_t) time(NULL);
 	UpdateControlFile();
+	LWLockRelease(ControlFileLock);

-	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
+	/*
+	 * Currently, there is no need to truncate pg_subtrans during recovery.
+	 * If we did do that, we will need to have called StartupSUBTRANS()
+	 * already and then TruncateSUBTRANS() would go here.
+	 */
+
+	/* All real work is done, but log before releasing lock. */
+	if (log_checkpoints)
+		LogCheckpointEnd(true);
+
+	ereport((log_checkpoints ? LOG : DEBUG2),
 			(errmsg("recovery restart point at %X/%X",
-					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
+					lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+
 	if (recoveryLastXTime)
-		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
-				(errmsg("last completed transaction was at log time %s",
-						timestamptz_to_str(recoveryLastXTime))));
+		ereport((log_checkpoints ? LOG : DEBUG2),
+			(errmsg("last completed transaction was at log time %s",
+					timestamptz_to_str(recoveryLastXTime))));
+
+	LWLockRelease(CheckpointLock);
+	return true;
 }

 /*
@ -6232,6 +6619,9 @@ RequestXLogSwitch(void)

 /*
 * XLOG resource manager's routines
+ *
+ * Definitions of info values are in include/catalog/pg_control.h, though
+ * not all records types are related to control file processing.
 */
 void
 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
@ -6278,9 +6668,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 								 (int) checkPoint.ThisTimeLineID))
 				ereport(PANIC,
 						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
-								checkPoint.ThisTimeLineID, ThisTimeLineID)));
-			/* Following WAL records should be run with new TLI */
-			ThisTimeLineID = checkPoint.ThisTimeLineID;
+                               checkPoint.ThisTimeLineID, ThisTimeLineID)));
+           /* Following WAL records should be run with new TLI */
+           ThisTimeLineID = checkPoint.ThisTimeLineID;
 		}

 		RecoveryRestartPoint(&checkPoint);
@ -7221,3 +7611,92 @@ CancelBackup(void)
 	}
 }

+/* ------------------------------------------------------
+ *  Startup Process main entry point and signal handlers
+ * ------------------------------------------------------
+ */
+
+/*
+ * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
+ *
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+startupproc_quickdie(SIGNAL_ARGS)
+{
+	PG_SETMASK(&BlockSig);
+
+	/*
+	 * DO NOT proc_exit() -- we're here because shared memory may be
+	 * corrupted, so we don't want to try to clean up our transaction. Just
+	 * nail the windows shut and get out of town.
+	 *
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
+	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+	 * backend.  This is necessary precisely because we don't clean up our
+	 * shared memory state.
+	 */
+	exit(2);
+}
+
+
+/* SIGTERM: set flag to abort redo and exit */
+static void
+StartupProcShutdownHandler(SIGNAL_ARGS)
+{
+	if (in_restore_command)
+		proc_exit(0);
+	else
+		shutdown_requested = true;
+}
+
+/* Main entry point for startup process */
+void
+StartupProcessMain(void)
+{
+	/*
+	 * If possible, make this process a group leader, so that the postmaster
+	 * can signal any child processes too.
+	 */
+#ifdef HAVE_SETSID
+	if (setsid() < 0)
+		elog(FATAL, "setsid() failed: %m");
+#endif
+
+	/*
+	 * Properly accept or ignore signals the postmaster might send us
+	 */
+	pqsignal(SIGHUP, SIG_IGN);	/* ignore config file updates */
+	pqsignal(SIGINT, SIG_IGN);		/* ignore query cancel */
+	pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */
+	pqsignal(SIGQUIT, startupproc_quickdie);		/* hard crash time */
+	pqsignal(SIGALRM, SIG_IGN);
+	pqsignal(SIGPIPE, SIG_IGN);
+	pqsignal(SIGUSR1, SIG_IGN);
+	pqsignal(SIGUSR2, SIG_IGN);
+
+	/*
+	 * Reset some signals that are accepted by postmaster but not here
+	 */
+	pqsignal(SIGCHLD, SIG_DFL);
+	pqsignal(SIGTTIN, SIG_DFL);
+	pqsignal(SIGTTOU, SIG_DFL);
+	pqsignal(SIGCONT, SIG_DFL);
+	pqsignal(SIGWINCH, SIG_DFL);
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	StartupXLOG();	
+
+	BuildFlatFiles(false);
+
+	/* Let postmaster know that startup is finished */
+	SendPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED);
+
+	/* exit normally */
+	proc_exit(0);
+}
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@ -8,7 +8,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.249 2009/01/22 20:16:00 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.250 2009/02/18 15:58:41 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -37,7 +37,6 @@
 #include "storage/proc.h"
 #include "tcop/tcopprot.h"
 #include "utils/builtins.h"
-#include "utils/flatfiles.h"
 #include "utils/fmgroids.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
@ -416,14 +415,12 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			proc_exit(1);		/* should never return */

 		case StartupProcess:
-			bootstrap_signals();
-			StartupXLOG();
-			BuildFlatFiles(false);
-			proc_exit(0);		/* startup done */
+			/* don't set signals, startup process has its own agenda */
+			StartupProcessMain();
+			proc_exit(1);		/* should never return */

 		case BgWriterProcess:
 			/* don't set signals, bgwriter has its own agenda */
-			InitXLOGAccess();
 			BackgroundWriterMain();
 			proc_exit(1);		/* should never return */

--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.55 2009/01/01 17:23:46 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.56 2009/02/18 15:58:41 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -49,6 +49,7 @@
 #include <unistd.h>

 #include "access/xlog_internal.h"
+#include "catalog/pg_control.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@ -423,9 +424,19 @@ BackgroundWriterMain(void)
 		 */
 		if (do_checkpoint)
 		{
+			bool	ckpt_performed = false;
+			bool	do_restartpoint;
+
 			/* use volatile pointer to prevent code rearrangement */
 			volatile BgWriterShmemStruct *bgs = BgWriterShmem;

+			/*
+			 * Check if we should perform a checkpoint or a restartpoint.
+			 * As a side-effect, RecoveryInProgress() initializes
+			 * TimeLineID if it's not set yet.
+			 */
+			do_restartpoint = RecoveryInProgress();
+
 			/*
 			 * Atomically fetch the request flags to figure out what kind of a
 			 * checkpoint we should perform, and increase the started-counter
@ -444,7 +455,8 @@ BackgroundWriterMain(void)
 			 * implementation will not generate warnings caused by
 			 * CheckPointTimeout < CheckPointWarning.
 			 */
-			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
+			if (!do_restartpoint &&
+				(flags & CHECKPOINT_CAUSE_XLOG) &&
 				elapsed_secs < CheckPointWarning)
 				ereport(LOG,
 						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
@ -455,14 +467,21 @@ BackgroundWriterMain(void)
 			 * Initialize bgwriter-private variables used during checkpoint.
 			 */
 			ckpt_active = true;
-			ckpt_start_recptr = GetInsertRecPtr();
+			if (!do_restartpoint)
+				ckpt_start_recptr = GetInsertRecPtr();
 			ckpt_start_time = now;
 			ckpt_cached_elapsed = 0;

 			/*
 			 * Do the checkpoint.
 			 */
-			CreateCheckPoint(flags);
+			if (!do_restartpoint)
+			{
+				CreateCheckPoint(flags);
+				ckpt_performed = true;
+			}
+			else
+				ckpt_performed = CreateRestartPoint(flags);

 			/*
 			 * After any checkpoint, close all smgr files.	This is so we
@ -477,14 +496,27 @@ BackgroundWriterMain(void)
 			bgs->ckpt_done = bgs->ckpt_started;
 			SpinLockRelease(&bgs->ckpt_lck);

-			ckpt_active = false;
+			if (ckpt_performed)
+			{
+				/*
+				 * Note we record the checkpoint start time not end time as
+				 * last_checkpoint_time.  This is so that time-driven
+				 * checkpoints happen at a predictable spacing.
+				 */
+				last_checkpoint_time = now;
+			}
+			else
+			{
+				/*
+				 * We were not able to perform the restartpoint (checkpoints
+				 * throw an ERROR in case of error).  Most likely because we
+				 * have not received any new checkpoint WAL records since the
+				 * last restartpoint. Try again in 15 s.
+				 */
+				last_checkpoint_time = now - CheckPointTimeout + 15;
+			}

-			/*
-			 * Note we record the checkpoint start time not end time as
-			 * last_checkpoint_time.  This is so that time-driven checkpoints
-			 * happen at a predictable spacing.
-			 */
-			last_checkpoint_time = now;
+			ckpt_active = false;
 		}
 		else
 			BgBufferSync();
@ -507,7 +539,7 @@ CheckArchiveTimeout(void)
 	pg_time_t	now;
 	pg_time_t	last_time;

-	if (XLogArchiveTimeout <= 0)
+	if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
 		return;

 	now = (pg_time_t) time(NULL);
@ -714,16 +746,19 @@ IsCheckpointOnSchedule(double progress)
 	 * However, it's good enough for our purposes, we're only calculating an
 	 * estimate anyway.
 	 */
-	recptr = GetInsertRecPtr();
-	elapsed_xlogs =
-		(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
-		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
-		CheckPointSegments;
-
-	if (progress < elapsed_xlogs)
+	if (!RecoveryInProgress())
 	{
-		ckpt_cached_elapsed = elapsed_xlogs;
-		return false;
+		recptr = GetInsertRecPtr();
+		elapsed_xlogs =
+			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
+			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+			CheckPointSegments;
+
+		if (progress < elapsed_xlogs)
+		{
+			ckpt_cached_elapsed = elapsed_xlogs;
+			return false;
+		}
 	}

 	/*
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.570 2009/01/04 22:19:59 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.571 2009/02/18 15:58:41 heikki Exp $
 *
 * NOTES
 *
@ -225,11 +225,38 @@ static pid_t StartupPID = 0,
 static int	Shutdown = NoShutdown;

 static bool FatalError = false; /* T if recovering from backend crash */
+static bool RecoveryError = false; /* T if recovery failed */
+
+/* State of WAL redo */
+#define			NoRecovery			0
+#define			RecoveryStarted		1
+#define			RecoveryConsistent	2
+#define			RecoveryCompleted	3
+
+static int	RecoveryStatus = NoRecovery;

 /*
 * We use a simple state machine to control startup, shutdown, and
 * crash recovery (which is rather like shutdown followed by startup).
 *
+ * After doing all the postmaster initialization work, we enter PM_STARTUP
+ * state and the startup process is launched. The startup process begins by
+ * reading the control file and other preliminary initialization steps. When
+ * it's ready to start WAL redo, it signals postmaster, and we switch to
+ * PM_RECOVERY phase. The background writer is launched, while the startup
+ * process continues applying WAL. 
+ * 
+ * After reaching a consistent point in WAL redo, startup process signals
+ * us again, and we switch to PM_RECOVERY_CONSISTENT phase. There's currently
+ * no difference between PM_RECOVERY and PM_RECOVERY_CONSISTENT, but we
+ * could start accepting connections to perform read-only queries at this
+ * point, if we had the infrastructure to do that.
+ *
+ * When the WAL redo is finished, the startup process signals us the third
+ * time, and we switch to PM_RUN state. The startup process can also skip the
+ * recovery and consistent recovery phases altogether, as it will during
+ * normal startup when there's no recovery to be done, for example.
+ *
 * Normal child backends can only be launched when we are in PM_RUN state.
 * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
 * In other states we handle connection requests by launching "dead_end"
@ -245,15 +272,19 @@ static bool FatalError = false; /* T if recovering from backend crash */
 *
 * Notice that this state variable does not distinguish *why* we entered
 * states later than PM_RUN --- Shutdown and FatalError must be consulted
- * to find that out.  FatalError is never true in PM_RUN state, nor in
- * PM_SHUTDOWN states (because we don't enter those states when trying to
- * recover from a crash).  It can be true in PM_STARTUP state, because we
- * don't clear it until we've successfully recovered.
+ * to find that out.  FatalError is never true in PM_RECOVERY_* or PM_RUN
+ * states, nor in PM_SHUTDOWN states (because we don't enter those states
+ * when trying to recover from a crash).  It can be true in PM_STARTUP state,
+ * because we don't clear it until we've successfully started WAL redo.
+ * Similarly, RecoveryError means that we have crashed during recovery, and
+ * should not try to restart.
 */
 typedef enum
 {
 	PM_INIT,					/* postmaster starting */
 	PM_STARTUP,					/* waiting for startup subprocess */
+	PM_RECOVERY,				/* in recovery mode */
+	PM_RECOVERY_CONSISTENT,		/* consistent recovery mode */
 	PM_RUN,						/* normal "database is alive" state */
 	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
 	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
@ -307,6 +338,7 @@ static void pmdie(SIGNAL_ARGS);
 static void reaper(SIGNAL_ARGS);
 static void sigusr1_handler(SIGNAL_ARGS);
 static void dummy_handler(SIGNAL_ARGS);
+static void CheckRecoverySignals(void);
 static void CleanupBackend(int pid, int exitstatus);
 static void HandleChildCrash(int pid, int exitstatus, const char *procname);
 static void LogChildExit(int lev, const char *procname,
@ -1302,7 +1334,9 @@ ServerLoop(void)
 		 * state that prevents it, start one.  It doesn't matter if this
 		 * fails, we'll just try again later.
 		 */
-		if (BgWriterPID == 0 && pmState == PM_RUN)
+		if (BgWriterPID == 0 &&
+			(pmState == PM_RUN || pmState == PM_RECOVERY || 
+			 pmState == PM_RECOVERY_CONSISTENT))
 			BgWriterPID = StartBackgroundWriter();

 		/*
@ -1752,7 +1786,10 @@ canAcceptConnections(void)
 			return CAC_WAITBACKUP;	/* allow superusers only */
 		if (Shutdown > NoShutdown)
 			return CAC_SHUTDOWN;	/* shutdown is pending */
-		if (pmState == PM_STARTUP && !FatalError)
+		if (!FatalError &&
+			(pmState == PM_STARTUP ||
+			 pmState == PM_RECOVERY ||
+			 pmState == PM_RECOVERY_CONSISTENT))
 			return CAC_STARTUP; /* normal startup */
 		return CAC_RECOVERY;	/* else must be crash recovery */
 	}
@ -1982,7 +2019,7 @@ pmdie(SIGNAL_ARGS)
 			ereport(LOG,
 					(errmsg("received smart shutdown request")));

-			if (pmState == PM_RUN)
+			if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
 			{
 				/* autovacuum workers are told to shut down immediately */
 				SignalAutovacWorkers(SIGTERM);
@ -2019,7 +2056,14 @@ pmdie(SIGNAL_ARGS)

 			if (StartupPID != 0)
 				signal_child(StartupPID, SIGTERM);
-			if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP)
+			if (pmState == PM_RECOVERY)
+			{
+				/* only bgwriter is active in this state */
+				pmState = PM_WAIT_BACKENDS;
+			}
+			if (pmState == PM_RUN ||
+				pmState == PM_WAIT_BACKUP ||
+				pmState == PM_RECOVERY_CONSISTENT)
 			{
 				ereport(LOG,
 						(errmsg("aborting any active transactions")));
@ -2116,10 +2160,22 @@ reaper(SIGNAL_ARGS)
 		if (pid == StartupPID)
 		{
 			StartupPID = 0;
-			Assert(pmState == PM_STARTUP);

-			/* FATAL exit of startup is treated as catastrophic */
-			if (!EXIT_STATUS_0(exitstatus))
+			/*
+			 * Check if we've received a signal from the startup process
+			 * first. This can change pmState. If the startup process sends
+			 * a signal and exits immediately after that, we might not have
+			 * processed the signal yet. We need to know if it completed
+			 * recovery before it exited.
+			 */
+			CheckRecoverySignals();
+
+			/*
+			 * Unexpected exit of startup process (including FATAL exit)
+			 * during PM_STARTUP is treated as catastrophic. There is no
+			 * other processes running yet.
+			 */
+			if (pmState == PM_STARTUP)
 			{
 				LogChildExit(LOG, _("startup process"),
 							 pid, exitstatus);
@ -2127,60 +2183,30 @@ reaper(SIGNAL_ARGS)
 				(errmsg("aborting startup due to startup process failure")));
 				ExitPostmaster(1);
 			}
-
 			/*
-			 * Startup succeeded - we are done with system startup or
-			 * recovery.
+			 * Any unexpected exit (including FATAL exit) of the startup
+			 * process is treated as a crash, except that we don't want
+			 * to reinitialize.
 			 */
-			FatalError = false;
-
-			/*
-			 * Go to shutdown mode if a shutdown request was pending.
-			 */
-			if (Shutdown > NoShutdown)
+			if (!EXIT_STATUS_0(exitstatus))
 			{
-				pmState = PM_WAIT_BACKENDS;
-				/* PostmasterStateMachine logic does the rest */
+				RecoveryError = true;
+				HandleChildCrash(pid, exitstatus,
+								 _("startup process"));
 				continue;
 			}
-
 			/*
-			 * Otherwise, commence normal operations.
+			 * Startup process exited normally, but didn't finish recovery.
+			 * This can happen if someone else than postmaster kills the
+			 * startup process with SIGTERM. Treat it like a crash.
 			 */
-			pmState = PM_RUN;
-
-			/*
-			 * Load the flat authorization file into postmaster's cache. The
-			 * startup process has recomputed this from the database contents,
-			 * so we wait till it finishes before loading it.
-			 */
-			load_role();
-
-			/*
-			 * Crank up the background writer.	It doesn't matter if this
-			 * fails, we'll just try again later.
-			 */
-			Assert(BgWriterPID == 0);
-			BgWriterPID = StartBackgroundWriter();
-
-			/*
-			 * Likewise, start other special children as needed.  In a restart
-			 * situation, some of them may be alive already.
-			 */
-			if (WalWriterPID == 0)
-				WalWriterPID = StartWalWriter();
-			if (AutoVacuumingActive() && AutoVacPID == 0)
-				AutoVacPID = StartAutoVacLauncher();
-			if (XLogArchivingActive() && PgArchPID == 0)
-				PgArchPID = pgarch_start();
-			if (PgStatPID == 0)
-				PgStatPID = pgstat_start();
-
-			/* at this point we are really open for business */
-			ereport(LOG,
-				 (errmsg("database system is ready to accept connections")));
-
-			continue;
+			if (pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
+			{
+				RecoveryError = true;
+				HandleChildCrash(pid, exitstatus,
+								 _("startup process"));
+				continue;
+			}
 		}

 		/*
@ -2443,6 +2469,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		}
 	}

+	/* Take care of the startup process too */
+	if (pid == StartupPID)
+		StartupPID = 0;
+	else if (StartupPID != 0 && !FatalError)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) StartupPID)));
+		signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
+
 	/* Take care of the bgwriter too */
 	if (pid == BgWriterPID)
 		BgWriterPID = 0;
@ -2514,7 +2552,9 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)

 	FatalError = true;
 	/* We now transit into a state of waiting for children to die */
-	if (pmState == PM_RUN ||
+	if (pmState == PM_RECOVERY ||
+		pmState == PM_RECOVERY_CONSISTENT ||
+		pmState == PM_RUN ||
 		pmState == PM_WAIT_BACKUP ||
 		pmState == PM_SHUTDOWN)
 		pmState = PM_WAIT_BACKENDS;
@ -2582,6 +2622,127 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 static void
 PostmasterStateMachine(void)
 {
+	/* Startup states */
+
+	if (pmState == PM_STARTUP && RecoveryStatus > NoRecovery)
+	{
+		/* WAL redo has started. We're out of reinitialization. */
+		FatalError = false;
+
+		/*
+		 * Go to shutdown mode if a shutdown request was pending.
+		 */
+		if (Shutdown > NoShutdown)
+		{
+			pmState = PM_WAIT_BACKENDS;
+			/* PostmasterStateMachine logic does the rest */
+		}
+		else
+		{
+			/*
+			 * Crank up the background writer.	It doesn't matter if this
+			 * fails, we'll just try again later.
+			 */
+			Assert(BgWriterPID == 0);
+			BgWriterPID = StartBackgroundWriter();
+
+			pmState = PM_RECOVERY;
+		}
+	}
+	if (pmState == PM_RECOVERY && RecoveryStatus >= RecoveryConsistent)
+	{
+		/*
+		 * Go to shutdown mode if a shutdown request was pending.
+		 */
+		if (Shutdown > NoShutdown)
+		{
+			pmState = PM_WAIT_BACKENDS;
+			/* PostmasterStateMachine logic does the rest */
+		}
+		else
+		{
+			/*
+			 * Startup process has entered recovery. We consider that good
+			 * enough to reset FatalError.
+			 */
+			pmState = PM_RECOVERY_CONSISTENT;
+
+			/*
+			 * Load the flat authorization file into postmaster's cache. The
+			 * startup process won't have recomputed this from the database yet,
+			 * so we it may change following recovery. 
+			 */
+			load_role();
+
+			/*
+			 * Likewise, start other special children as needed.
+			 */
+			Assert(PgStatPID == 0);
+			PgStatPID = pgstat_start();
+
+			/* XXX at this point we could accept read-only connections */
+			ereport(DEBUG1,
+				 (errmsg("database system is in consistent recovery mode")));
+		}
+	}
+	if ((pmState == PM_RECOVERY || 
+		 pmState == PM_RECOVERY_CONSISTENT ||
+		 pmState == PM_STARTUP) &&
+		RecoveryStatus == RecoveryCompleted)
+	{
+		/*
+		 * Startup succeeded.
+		 *
+		 * Go to shutdown mode if a shutdown request was pending.
+		 */
+		if (Shutdown > NoShutdown)
+		{
+			pmState = PM_WAIT_BACKENDS;
+			/* PostmasterStateMachine logic does the rest */
+		}
+		else
+		{
+			/*
+			 * Otherwise, commence normal operations.
+			 */
+			pmState = PM_RUN;
+
+			/*
+			 * Load the flat authorization file into postmaster's cache. The
+			 * startup process has recomputed this from the database contents,
+			 * so we wait till it finishes before loading it.
+			 */
+			load_role();
+
+			/*
+			 * Crank up the background writer, if we didn't do that already
+			 * when we entered consistent recovery phase.  It doesn't matter
+			 * if this fails, we'll just try again later.
+			 */
+			if (BgWriterPID == 0)
+				BgWriterPID = StartBackgroundWriter();
+
+			/*
+			 * Likewise, start other special children as needed.  In a restart
+			 * situation, some of them may be alive already.
+			 */
+			if (WalWriterPID == 0)
+				WalWriterPID = StartWalWriter();
+			if (AutoVacuumingActive() && AutoVacPID == 0)
+				AutoVacPID = StartAutoVacLauncher();
+			if (XLogArchivingActive() && PgArchPID == 0)
+				PgArchPID = pgarch_start();
+			if (PgStatPID == 0)
+				PgStatPID = pgstat_start();
+
+			/* at this point we are really open for business */
+			ereport(LOG,
+				(errmsg("database system is ready to accept connections")));
+		}
+	}
+
+	/* Shutdown states */
+
 	if (pmState == PM_WAIT_BACKUP)
 	{
 		/*
@ -2722,6 +2883,15 @@ PostmasterStateMachine(void)
 		}
 	}

+	/*
+	 * If recovery failed, wait for all non-syslogger children to exit,
+	 * and then exit postmaster. We don't try to reinitialize when recovery
+	 * fails, because more than likely it will just fail again and we will
+	 * keep trying forever.
+	 */
+	if (RecoveryError && pmState == PM_NO_CHILDREN)
+		ExitPostmaster(1);		
+
 	/*
 	 * If we need to recover from a crash, wait for all non-syslogger
 	 * children to exit, then reset shmem and StartupDataBase.
@ -2734,6 +2904,8 @@ PostmasterStateMachine(void)
 		shmem_exit(1);
 		reset_shared(PostPortNumber);

+		RecoveryStatus = NoRecovery;
+
 		StartupPID = StartupDataBase();
 		Assert(StartupPID != 0);
 		pmState = PM_STARTUP;
@ -3837,6 +4009,37 @@ ExitPostmaster(int status)
 	proc_exit(status);
 }

+/*
+ * common code used in sigusr1_handler() and reaper() to handle
+ * recovery-related signals from startup process
+ */
+static void
+CheckRecoverySignals(void)
+{
+	bool changed = false;
+
+	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED))
+	{
+		Assert(pmState == PM_STARTUP);
+
+		RecoveryStatus = RecoveryStarted;
+		changed = true;
+	}
+	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT))
+	{
+		RecoveryStatus = RecoveryConsistent;
+		changed = true;
+	}
+	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED))
+	{
+		RecoveryStatus = RecoveryCompleted;
+		changed = true;
+	}
+
+	if (changed)
+		PostmasterStateMachine();
+}
+
 /*
 * sigusr1_handler - handle signal conditions from child processes
 */
@ -3847,6 +4050,8 @@ sigusr1_handler(SIGNAL_ARGS)

 	PG_SETMASK(&BlockSig);

+	CheckRecoverySignals();
+
 	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
 	{
 		/*
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.15 2008/11/06 20:51:14 tgl Exp $
+$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.16 2009/02/18 15:58:41 heikki Exp $

 Notes About Shared Buffer Access Rules
 ======================================
@ -268,3 +268,8 @@ out (and anyone else who flushes buffer contents to disk must do so too).
 This ensures that the page image transferred to disk is reasonably consistent.
 We might miss a hint-bit update or two but that isn't a problem, for the same
 reasons mentioned under buffer access rules.
+
+As of 8.4, background writer starts during recovery mode when there is
+some form of potentially extended recovery to perform. It performs an
+identical service to normal processing, except that checkpoints it
+writes are technically restartpoints.
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/init/postinit.c,v 1.187 2009/01/01 17:23:51 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/init/postinit.c,v 1.188 2009/02/18 15:58:41 heikki Exp $
 *
 *
 *-------------------------------------------------------------------------
@ -324,7 +324,7 @@ InitCommunication(void)
 * If you're wondering why this is separate from InitPostgres at all:
 * the critical distinction is that this stuff has to happen before we can
 * run XLOG-related initialization, which is done before InitPostgres --- in
- * fact, for cases such as checkpoint creation processes, InitPostgres may
+ * fact, for cases such as the background writer process, InitPostgres may
 * never be done at all.
 */
 void
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@ -6,7 +6,7 @@
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.90 2009/01/20 18:59:37 heikki Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.91 2009/02/18 15:58:41 heikki Exp $
 */
 #ifndef XLOG_H
 #define XLOG_H
@ -199,6 +199,8 @@ extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
 extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);

+extern bool RecoveryInProgress(void);
+
 extern void UpdateControlFile(void);
 extern Size XLOGShmemSize(void);
 extern void XLOGShmemInit(void);
@ -207,9 +209,12 @@ extern void StartupXLOG(void);
 extern void ShutdownXLOG(int code, Datum arg);
 extern void InitXLOGAccess(void);
 extern void CreateCheckPoint(int flags);
+extern bool CreateRestartPoint(int flags);
 extern void XLogPutNextOid(Oid nextOid);
 extern XLogRecPtr GetRedoRecPtr(void);
 extern XLogRecPtr GetInsertRecPtr(void);
 extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);

+extern void StartupProcessMain(void);
+
 #endif   /* XLOG_H */
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/storage/pmsignal.h,v 1.21 2009/01/01 17:24:01 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/pmsignal.h,v 1.22 2009/02/18 15:58:41 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -22,6 +22,9 @@
 */
 typedef enum
 {
+	PMSIGNAL_RECOVERY_STARTED,	/* recovery has started */
+	PMSIGNAL_RECOVERY_CONSISTENT, /* recovery has reached consistent state */
+	PMSIGNAL_RECOVERY_COMPLETED, /* recovery has completed */
 	PMSIGNAL_PASSWORD_CHANGE,	/* pg_auth file has changed */
 	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
 	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */