diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 9af551d576..4bc7e776b0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.330 2009/02/07 10:49:36 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.331 2009/02/18 15:58:40 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -36,6 +36,7 @@ #include "catalog/pg_control.h" #include "catalog/pg_type.h" #include "funcapi.h" +#include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" @@ -47,6 +48,7 @@ #include "storage/smgr.h" #include "storage/spin.h" #include "utils/builtins.h" +#include "utils/flatfiles.h" #include "utils/guc.h" #include "utils/ps_status.h" #include "pg_trace.h" @@ -119,12 +121,27 @@ CheckpointStatsData CheckpointStats; */ TimeLineID ThisTimeLineID = 0; -/* Are we doing recovery from XLOG? */ +/* + * Are we doing recovery from XLOG? + * + * This is only ever true in the startup process, even if the system is still + * in recovery. Prior to 8.4, all activity during recovery were carried out + * by Startup process. This local variable continues to be used in functions + * that need to act differently when called from a redo function (e.g skip + * WAL logging). To check whether the system is in recovery regardless of what + * process you're running in, use RecoveryInProgress(). + */ bool InRecovery = false; /* Are we recovering using offline XLOG archives? */ static bool InArchiveRecovery = false; +/* + * Local copy of SharedRecoveryInProgress variable. 
True actually means "not + * known, need to check the shared state" + */ +static bool LocalRecoveryInProgress = true; + /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; @@ -133,7 +150,6 @@ static char *recoveryRestoreCommand = NULL; static bool recoveryTarget = false; static bool recoveryTargetExact = false; static bool recoveryTargetInclusive = true; -static bool recoveryLogRestartpoints = false; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; static TimestampTz recoveryLastXTime = 0; @@ -242,9 +258,8 @@ static XLogRecPtr RedoRecPtr; * ControlFileLock: must be held to read/update control file or create * new log file. * - * CheckpointLock: must be held to do a checkpoint (ensures only one - * checkpointer at a time; currently, with all checkpoints done by the - * bgwriter, this is just pro forma). + * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures + * only one checkpointer at a time) * *---------- */ @@ -313,6 +328,25 @@ typedef struct XLogCtlData int XLogCacheBlck; /* highest allocated xlog buffer index */ TimeLineID ThisTimeLineID; + /* + * SharedRecoveryInProgress indicates if we're still in crash or archive + * recovery. It's checked by RecoveryInProgress(). + */ + bool SharedRecoveryInProgress; + + /* + * During recovery, we keep a copy of the latest checkpoint record + * here. Used by the background writer when it wants to create + * a restartpoint. + * + * Protected by info_lck. 
+ */ + XLogRecPtr lastCheckPointRecPtr; + CheckPoint lastCheckPoint; + + /* end+1 of the last record replayed (or being replayed) */ + XLogRecPtr replayEndRecPtr; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -387,9 +421,21 @@ static XLogRecPtr ReadRecPtr; /* start of last record read */ static XLogRecPtr EndRecPtr; /* end+1 of last record read */ static XLogRecord *nextRecord = NULL; static TimeLineID lastPageTLI = 0; +static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */ +static bool updateMinRecoveryPoint = true; static bool InRedo = false; +/* + * Flag set by interrupt handlers for later service in the redo loop. + */ +static volatile sig_atomic_t shutdown_requested = false; +/* + * Flag set when executing a restore command, to tell SIGTERM signal handler + * that it's safe to just proc_exit(0). + */ +static volatile sig_atomic_t in_restore_command = false; + static void XLogArchiveNotify(const char *xlog); static void XLogArchiveNotifySeg(uint32 log, uint32 seg); @@ -420,6 +466,7 @@ static void PreallocXlogFiles(XLogRecPtr endptr); static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr); static void ValidateXLOGDirectoryStructure(void); static void CleanupBackupHistory(void); +static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode); static bool ValidXLOGHeader(XLogPageHeader hdr, int emode); static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt); @@ -484,6 +531,10 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + /* cross-check on whether we should be here or not */ + if (RecoveryInProgress()) + elog(FATAL, "cannot make new WAL entries during recovery"); + /* info's high bits are reserved for use by me */ if (info & XLR_INFO_MASK) elog(PANIC, "invalid xlog info mask %02X", info); @@ -1717,6 +1768,63 
@@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN) SpinLockRelease(&xlogctl->info_lck); } +/* + * Advance minRecoveryPoint in control file. + * + * If we crash during recovery, we must reach this point again before the + * database is consistent. + * + * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint + * is only updated if it's not already greater than or equal to 'lsn'. + */ +static void +UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) +{ + /* Quick check using our local copy of the variable */ + if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint))) + return; + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* update local copy */ + minRecoveryPoint = ControlFile->minRecoveryPoint; + + /* + * An invalid minRecoveryPoint means that we need to recover all the WAL, + * ie. crash recovery. Don't update the control file in that case. + */ + if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0) + updateMinRecoveryPoint = false; + else if (force || XLByteLT(minRecoveryPoint, lsn)) + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + XLogRecPtr newMinRecoveryPoint; + + /* + * To avoid having to update the control file too often, we update it + * all the way to the last record being replayed, even though 'lsn' + * would suffice for correctness. + */ + SpinLockAcquire(&xlogctl->info_lck); + newMinRecoveryPoint = xlogctl->replayEndRecPtr; + SpinLockRelease(&xlogctl->info_lck); + + /* update control file */ + if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint)) + { + ControlFile->minRecoveryPoint = newMinRecoveryPoint; + UpdateControlFile(); + minRecoveryPoint = newMinRecoveryPoint; + + ereport(DEBUG2, + (errmsg("updated min recovery point to %X/%X", + minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff))); + } + } + LWLockRelease(ControlFileLock); +} + /* * Ensure that all XLOG data through the given position is flushed to disk. 
* @@ -1729,9 +1837,15 @@ XLogFlush(XLogRecPtr record) XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; - /* Disabled during REDO */ - if (InRedo) + /* + * During REDO, we don't try to flush the WAL, but update minRecoveryPoint + * instead. + */ + if (RecoveryInProgress()) + { + UpdateMinRecoveryPoint(record, false); return; + } /* Quick exit if already known flushed */ if (XLByteLE(record, LogwrtResult.Flush)) @@ -1818,9 +1932,9 @@ XLogFlush(XLogRecPtr record) * the bad page is encountered again during recovery then we would be * unable to restart the database at all! (This scenario has actually * happened in the field several times with 7.1 releases. Note that we - * cannot get here while InRedo is true, but if the bad page is brought in - * and marked dirty during recovery then CreateCheckPoint will try to - * flush it at the end of recovery.) + * cannot get here while RecoveryInProgress(), but if the bad page is + * brought in and marked dirty during recovery then if a checkpoint were + * performed at the end of recovery it will try to flush it. 
* * The current approach is to ERROR under normal conditions, but only * WARNING during recovery, so that the system can be brought up even if @@ -1857,6 +1971,10 @@ XLogBackgroundFlush(void) XLogRecPtr WriteRqstPtr; bool flexible = true; + /* XLOG doesn't need flushing during recovery */ + if (RecoveryInProgress()) + return; + /* read LogwrtResult and update local state */ { /* use volatile pointer to prevent code rearrangement */ @@ -1928,6 +2046,10 @@ XLogAsyncCommitFlush(void) /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; + /* There's no asynchronously committed transactions during recovery */ + if (RecoveryInProgress()) + return; + SpinLockAcquire(&xlogctl->info_lck); WriteRqstPtr = xlogctl->asyncCommitLSN; SpinLockRelease(&xlogctl->info_lck); @@ -1944,6 +2066,10 @@ XLogAsyncCommitFlush(void) bool XLogNeedsFlush(XLogRecPtr record) { + /* XLOG doesn't need flushing during recovery */ + if (RecoveryInProgress()) + return false; + /* Quick exit if already known flushed */ if (XLByteLE(record, LogwrtResult.Flush)) return false; @@ -2618,10 +2744,23 @@ RestoreArchivedFile(char *path, const char *xlogfname, (errmsg_internal("executing restore command \"%s\"", xlogRestoreCmd))); + /* + * Set in_restore_command to tell the signal handler that we should exit + * right away on SIGTERM. We know that we're in a safe point to do that. + * Check if we had already received the signal, so that we don't miss a + * shutdown request received just before this. + */ + in_restore_command = true; + if (shutdown_requested) + proc_exit(0); + /* * Copy xlog from archival storage to XLOGDIR */ rc = system(xlogRestoreCmd); + + in_restore_command = false; + if (rc == 0) { /* @@ -2674,14 +2813,24 @@ RestoreArchivedFile(char *path, const char *xlogfname, * assume that recovery is complete and start up the database!) 
It's * essential to abort on child SIGINT and SIGQUIT, because per spec * system() ignores SIGINT and SIGQUIT while waiting; if we see one of - * those it's a good bet we should have gotten it too. Aborting on other - * signals such as SIGTERM seems a good idea as well. + * those it's a good bet we should have gotten it too. + * + * On SIGTERM, assume we have received a fast shutdown request, and exit + * cleanly. It's pure chance whether we receive the SIGTERM first, or the + * child process. If we receive it first, the signal handler will call + * proc_exit(0), otherwise we do it here. If we or the child process + * received SIGTERM for any other reason than a fast shutdown request, + * postmaster will perform an immediate shutdown when it sees us exiting + * unexpectedly. * * Per the Single Unix Spec, shells report exit status > 128 when a called * command died on a signal. Also, 126 and 127 are used to report * problems such as an unfindable command; treat those as fatal errors * too. */ + if (WTERMSIG(rc) == SIGTERM) + proc_exit(0); + signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; ereport(signaled ? 
FATAL : DEBUG2, @@ -4584,18 +4733,6 @@ readRecoveryCommandFile(void) ereport(LOG, (errmsg("recovery_target_inclusive = %s", tok2))); } - else if (strcmp(tok1, "log_restartpoints") == 0) - { - /* - * does nothing if a recovery_target is not also set - */ - if (!parse_bool(tok2, &recoveryLogRestartpoints)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"log_restartpoints\" requires a Boolean value"))); - ereport(LOG, - (errmsg("log_restartpoints = %s", tok2))); - } else ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", @@ -4877,7 +5014,7 @@ StartupXLOG(void) XLogRecPtr RecPtr, LastRec, checkPointLoc, - minRecoveryLoc, + backupStopLoc, EndOfLog; uint32 endLogId; uint32 endLogSeg; @@ -4885,6 +5022,8 @@ StartupXLOG(void) uint32 freespace; TransactionId oldestActiveXID; + XLogCtl->SharedRecoveryInProgress = true; + /* * Read control file and check XLOG status looks valid. * @@ -4964,7 +5103,7 @@ StartupXLOG(void) recoveryTargetTLI, ControlFile->checkPointCopy.ThisTimeLineID))); - if (read_backup_label(&checkPointLoc, &minRecoveryLoc)) + if (read_backup_label(&checkPointLoc, &backupStopLoc)) { /* * When a backup_label file is present, we want to roll forward from @@ -5102,11 +5241,23 @@ StartupXLOG(void) ControlFile->prevCheckPoint = ControlFile->checkPoint; ControlFile->checkPoint = checkPointLoc; ControlFile->checkPointCopy = checkPoint; - if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0) - ControlFile->minRecoveryPoint = minRecoveryLoc; + if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0) + { + if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc)) + ControlFile->minRecoveryPoint = backupStopLoc; + } ControlFile->time = (pg_time_t) time(NULL); + /* No need to hold ControlFileLock yet, we aren't up far enough */ UpdateControlFile(); + /* update our local copy of minRecoveryPoint */ + minRecoveryPoint = ControlFile->minRecoveryPoint; + + /* + * Reset pgstat data, because it may be invalid 
after recovery. + */ + pgstat_reset_all(); + /* * If there was a backup label file, it's done its job and the info * has now been propagated into pg_control. We must get rid of the @@ -5151,12 +5302,41 @@ StartupXLOG(void) { bool recoveryContinue = true; bool recoveryApply = true; + bool reachedMinRecoveryPoint = false; ErrorContextCallback errcontext; + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + /* Update shared replayEndRecPtr */ + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->replayEndRecPtr = ReadRecPtr; + SpinLockRelease(&xlogctl->info_lck); InRedo = true; - ereport(LOG, - (errmsg("redo starts at %X/%X", - ReadRecPtr.xlogid, ReadRecPtr.xrecoff))); + + if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0) + ereport(LOG, + (errmsg("redo starts at %X/%X", + ReadRecPtr.xlogid, ReadRecPtr.xrecoff))); + else + ereport(LOG, + (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X", + ReadRecPtr.xlogid, ReadRecPtr.xrecoff, + minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff))); + + /* + * Let postmaster know we've started redo now, so that it can + * launch bgwriter to perform restartpoints. We don't bother + * during crash recovery as restartpoints can only be performed + * during archive recovery. And we'd like to keep crash recovery + * simple, to avoid introducing bugs that could prevent you + * from recovering after a crash. + * + * After this point, we can no longer assume that we're the only + * process in addition to postmaster! + */ + if (InArchiveRecovery && IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); /* * main redo apply loop @@ -5182,6 +5362,30 @@ StartupXLOG(void) } #endif + /* + * Check if we were requested to exit without finishing + * recovery. + */ + if (shutdown_requested) + proc_exit(0); + + /* + * Have we reached our safe starting point? If so, we can + * tell postmaster that the database is consistent now. 
+ */ + if (!reachedMinRecoveryPoint && + XLByteLE(minRecoveryPoint, EndRecPtr)) + { + reachedMinRecoveryPoint = true; + if (InArchiveRecovery) + { + ereport(LOG, + (errmsg("consistent recovery state reached"))); + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); + } + } + /* * Have we reached our recovery target? */ @@ -5207,6 +5411,15 @@ StartupXLOG(void) TransactionIdAdvance(ShmemVariableCache->nextXid); } + /* + * Update shared replayEndRecPtr before replaying this + * record, so that XLogFlush will update minRecoveryPoint + * correctly. + */ + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->replayEndRecPtr = EndRecPtr; + SpinLockRelease(&xlogctl->info_lck); + RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record); /* Pop the error context stack */ @@ -5250,14 +5463,14 @@ StartupXLOG(void) * Complain if we did not roll forward far enough to render the backup * dump consistent. */ - if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint)) + if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint)) { if (reachedStopPoint) /* stopped because of stop request */ ereport(FATAL, - (errmsg("requested recovery stop point is before end time of backup dump"))); + (errmsg("requested recovery stop point is before consistent recovery point"))); else /* ran off end of WAL */ ereport(FATAL, - (errmsg("WAL ends before end time of backup dump"))); + (errmsg("WAL ends before consistent recovery point"))); } /* @@ -5352,6 +5565,12 @@ StartupXLOG(void) /* Pre-scan prepared transactions to find out the range of XIDs present */ oldestActiveXID = PrescanPreparedTransactions(); + /* + * Allow writing WAL for us, so that we can create a checkpoint record. + * But not yet for other backends! + */ + LocalRecoveryInProgress = false; + if (InRecovery) { int rmid; @@ -5371,11 +5590,6 @@ StartupXLOG(void) */ XLogCheckInvalidPages(); - /* - * Reset pgstat data, because it may be invalid after recovery. 
- */ - pgstat_reset_all(); - /* * Perform a checkpoint to update all our recovery activity to disk. * @@ -5398,12 +5612,14 @@ StartupXLOG(void) */ InRecovery = false; + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_IN_PRODUCTION; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); /* start the archive_timeout timer running */ - XLogCtl->Write.lastSegSwitchTime = ControlFile->time; + XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL); /* initialize shared-memory copy of latest checkpoint XID/epoch */ XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch; @@ -5438,6 +5654,45 @@ StartupXLOG(void) readRecordBuf = NULL; readRecordBufSize = 0; } + + /* + * All done. Allow others to write WAL. + */ + XLogCtl->SharedRecoveryInProgress = false; +} + +/* + * Is the system still in recovery? + * + * As a side-effect, we initialize the local TimeLineID and RedoRecPtr + * variables the first time we see that recovery is finished. + */ +bool +RecoveryInProgress(void) +{ + /* + * We check shared state each time only until we leave recovery mode. + * We can't re-enter recovery, so we rely on the local state variable + * after that. + */ + if (!LocalRecoveryInProgress) + return false; + else + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress; + + /* + * Initialize TimeLineID and RedoRecPtr the first time we see that + * recovery is finished. 
+ */ + if (!LocalRecoveryInProgress) + InitXLOGAccess(); + + return LocalRecoveryInProgress; + } } /* @@ -5569,6 +5824,8 @@ InitXLOGAccess(void) { /* ThisTimeLineID doesn't change so we need no lock to copy it */ ThisTimeLineID = XLogCtl->ThisTimeLineID; + Assert(ThisTimeLineID != 0); + /* Use GetRedoRecPtr to copy the RedoRecPtr safely */ (void) GetRedoRecPtr(); } @@ -5680,7 +5937,10 @@ ShutdownXLOG(int code, Datum arg) ereport(LOG, (errmsg("shutting down"))); - CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + if (RecoveryInProgress()) + CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + else + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); ShutdownCLOG(); ShutdownSUBTRANS(); ShutdownMultiXact(); @@ -5693,9 +5953,20 @@ ShutdownXLOG(int code, Datum arg) * Log start of a checkpoint. */ static void -LogCheckpointStart(int flags) +LogCheckpointStart(int flags, bool restartpoint) { - elog(LOG, "checkpoint starting:%s%s%s%s%s%s", + char *msg; + + /* + * XXX: This is hopelessly untranslatable. We could call gettext_noop + * for the main message, but what about all the flags? + */ + if (restartpoint) + msg = "restartpoint starting:%s%s%s%s%s%s"; + else + msg = "checkpoint starting:%s%s%s%s%s%s"; + + elog(LOG, msg, (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", @@ -5708,7 +5979,7 @@ LogCheckpointStart(int flags) * Log end of a checkpoint. 
*/ static void -LogCheckpointEnd(void) +LogCheckpointEnd(bool restartpoint) { long write_secs, sync_secs, @@ -5731,17 +6002,26 @@ LogCheckpointEnd(void) CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); - elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " - "%d transaction log file(s) added, %d removed, %d recycled; " - "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", - CheckpointStats.ckpt_bufs_written, - (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, - CheckpointStats.ckpt_segs_added, - CheckpointStats.ckpt_segs_removed, - CheckpointStats.ckpt_segs_recycled, - write_secs, write_usecs / 1000, - sync_secs, sync_usecs / 1000, - total_secs, total_usecs / 1000); + if (restartpoint) + elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + write_secs, write_usecs / 1000, + sync_secs, sync_usecs / 1000, + total_secs, total_usecs / 1000); + else + elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " + "%d transaction log file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, + write_secs, write_usecs / 1000, + sync_secs, sync_usecs / 1000, + total_secs, total_usecs / 1000); } /* @@ -5772,13 +6052,33 @@ CreateCheckPoint(int flags) TransactionId *inCommitXids; int nInCommit; + /* shouldn't happen */ + if (RecoveryInProgress()) + elog(ERROR, "can't create a checkpoint during recovery"); + /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. 
- * (This is just pro forma, since in the present system structure there is - * only one process that is allowed to issue checkpoints at any given - * time.) + * During normal operation, bgwriter is the only process that creates + * checkpoints, but at the end of archive recovery, the bgwriter can be + * busy creating a restartpoint while the startup process tries to perform + * the startup checkpoint. */ - LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); + if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE)) + { + Assert(InRecovery); + + /* + * A restartpoint is in progress. Wait until it finishes. This can + * cause an extra restartpoint to be performed, but that's OK because + * we're just about to perform a checkpoint anyway. Flushing the + * buffers in this restartpoint can take some time, but that time is + * saved from the upcoming checkpoint so the net effect is zero. + */ + ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint"))); + RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT); + + LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); + } /* * Prepare to accumulate statistics. @@ -5797,9 +6097,11 @@ CreateCheckPoint(int flags) if (shutdown) { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_SHUTDOWNING; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); } /* @@ -5903,7 +6205,7 @@ CreateCheckPoint(int flags) * to log anything if we decided to skip the checkpoint. */ if (log_checkpoints) - LogCheckpointStart(flags); + LogCheckpointStart(flags, false); TRACE_POSTGRESQL_CHECKPOINT_START(flags); @@ -6070,7 +6372,7 @@ CreateCheckPoint(int flags) /* All real work is done, but log before releasing lock. 
*/ if (log_checkpoints) - LogCheckpointEnd(); + LogCheckpointEnd(false); TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written, NBuffers, CheckpointStats.ckpt_segs_added, @@ -6098,32 +6400,17 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) } /* - * Set a recovery restart point if appropriate - * - * This is similar to CreateCheckPoint, but is used during WAL recovery - * to establish a point from which recovery can roll forward without - * replaying the entire recovery log. This function is called each time - * a checkpoint record is read from XLOG; it must determine whether a - * restartpoint is needed or not. + * This is used during WAL recovery to establish a point from which recovery + * can roll forward without replaying the entire recovery log. This function + * is called each time a checkpoint record is read from XLOG. It is stored + * in shared memory, so that it can be used as a restartpoint later on. */ static void RecoveryRestartPoint(const CheckPoint *checkPoint) { - int elapsed_secs; int rmid; - - /* - * Do nothing if the elapsed time since the last restartpoint is less than - * half of checkpoint_timeout. (We use a value less than - * checkpoint_timeout so that variations in the timing of checkpoints on - * the master, or speed of transmission of WAL segments to a slave, won't - * make the slave skip a restartpoint once it's synced with the master.) - * Checking true elapsed time keeps us from doing restartpoints too often - * while rapidly scanning large amounts of WAL. - */ - elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time; - if (elapsed_secs < CheckPointTimeout / 2) - return; + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; /* * Is it safe to checkpoint? 
We must ask each of the resource managers @@ -6145,28 +6432,128 @@ RecoveryRestartPoint(const CheckPoint *checkPoint) } /* - * OK, force data out to disk + * Copy the checkpoint record to shared memory, so that bgwriter can + * use it the next time it wants to perform a restartpoint. */ - CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE); + SpinLockAcquire(&xlogctl->info_lck); + XLogCtl->lastCheckPointRecPtr = ReadRecPtr; + memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint)); + SpinLockRelease(&xlogctl->info_lck); +} + +/* + * This is similar to CreateCheckPoint, but is used during WAL recovery + * to establish a point from which recovery can roll forward without + * replaying the entire recovery log. + * + * Returns true if a new restartpoint was established. We can only establish + * a restartpoint if we have replayed a checkpoint record since the last + * restartpoint. + */ +bool +CreateRestartPoint(int flags) +{ + XLogRecPtr lastCheckPointRecPtr; + CheckPoint lastCheckPoint; + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; /* - * Update pg_control so that any subsequent crash will restart from this - * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint - * record itself. + * Acquire CheckpointLock to ensure only one restartpoint or checkpoint + * happens at a time. */ + LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); + + /* Get a local copy of the last checkpoint record. */ + SpinLockAcquire(&xlogctl->info_lck); + lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr; + memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint)); + SpinLockRelease(&xlogctl->info_lck); + + /* + * Check that we're still in recovery mode. It's ok if we exit recovery + * mode after this check, the restart point is valid anyway. 
+ */ + if (!RecoveryInProgress()) + { + ereport(DEBUG2, + (errmsg("skipping restartpoint, recovery has already ended"))); + LWLockRelease(CheckpointLock); + return false; + } + + /* + * If the last checkpoint record we've replayed is already our last + * restartpoint, we can't perform a new restart point. We still update + * minRecoveryPoint in that case, so that if this is a shutdown restart + * point, we won't start up earlier than before. That's not strictly + * necessary, but when we get hot standby capability, it would be rather + * weird if the database opened up for read-only connections at a + * point-in-time before the last shutdown. Such time travel is still + * possible in case of immediate shutdown, though. + * + * We don't explicitly advance minRecoveryPoint when we do create a + * restartpoint. It's assumed that flushing the buffers will do that + * as a side-effect. + */ + if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) || + XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo)) + { + XLogRecPtr InvalidXLogRecPtr = {0, 0}; + ereport(DEBUG2, + (errmsg("skipping restartpoint, already performed at %X/%X", + lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff))); + + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + LWLockRelease(CheckpointLock); + return false; + } + + if (log_checkpoints) + { + /* + * Prepare to accumulate statistics. 
+ */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + LogCheckpointStart(flags, true); + } + + CheckPointGuts(lastCheckPoint.redo, flags); + + /* + * Update pg_control, using current time + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->prevCheckPoint = ControlFile->checkPoint; - ControlFile->checkPoint = ReadRecPtr; - ControlFile->checkPointCopy = *checkPoint; + ControlFile->checkPoint = lastCheckPointRecPtr; + ControlFile->checkPointCopy = lastCheckPoint; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); - ereport((recoveryLogRestartpoints ? LOG : DEBUG2), + /* + * Currently, there is no need to truncate pg_subtrans during recovery. + * If we did do that, we will need to have called StartupSUBTRANS() + * already and then TruncateSUBTRANS() would go here. + */ + + /* All real work is done, but log before releasing lock. */ + if (log_checkpoints) + LogCheckpointEnd(true); + + ereport((log_checkpoints ? LOG : DEBUG2), (errmsg("recovery restart point at %X/%X", - checkPoint->redo.xlogid, checkPoint->redo.xrecoff))); + lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff))); + if (recoveryLastXTime) - ereport((recoveryLogRestartpoints ? LOG : DEBUG2), - (errmsg("last completed transaction was at log time %s", - timestamptz_to_str(recoveryLastXTime)))); + ereport((log_checkpoints ? LOG : DEBUG2), + (errmsg("last completed transaction was at log time %s", + timestamptz_to_str(recoveryLastXTime)))); + + LWLockRelease(CheckpointLock); + return true; } /* @@ -6232,6 +6619,9 @@ RequestXLogSwitch(void) /* * XLOG resource manager's routines + * + * Definitions of info values are in include/catalog/pg_control.h, though + * not all records types are related to control file processing. 
*/ void xlog_redo(XLogRecPtr lsn, XLogRecord *record) @@ -6278,9 +6668,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) (int) checkPoint.ThisTimeLineID)) ereport(PANIC, (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", - checkPoint.ThisTimeLineID, ThisTimeLineID))); - /* Following WAL records should be run with new TLI */ - ThisTimeLineID = checkPoint.ThisTimeLineID; + checkPoint.ThisTimeLineID, ThisTimeLineID))); + /* Following WAL records should be run with new TLI */ + ThisTimeLineID = checkPoint.ThisTimeLineID; } RecoveryRestartPoint(&checkPoint); @@ -7221,3 +7611,92 @@ CancelBackup(void) } } +/* ------------------------------------------------------ + * Startup Process main entry point and signal handlers + * ------------------------------------------------------ + */ + +/* + * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster. + * + * Some backend has bought the farm, + * so we need to stop what we're doing and exit. + */ +static void +startupproc_quickdie(SIGNAL_ARGS) +{ + PG_SETMASK(&BlockSig); + + /* + * DO NOT proc_exit() -- we're here because shared memory may be + * corrupted, so we don't want to try to clean up our transaction. Just + * nail the windows shut and get out of town. + * + * Note we do exit(2) not exit(0). This is to force the postmaster into a + * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random + * backend. This is necessary precisely because we don't clean up our + * shared memory state. + */ + exit(2); +} + + +/* SIGTERM: set flag to abort redo and exit */ +static void +StartupProcShutdownHandler(SIGNAL_ARGS) +{ + if (in_restore_command) + proc_exit(0); + else + shutdown_requested = true; +} + +/* Main entry point for startup process */ +void +StartupProcessMain(void) +{ + /* + * If possible, make this process a group leader, so that the postmaster + * can signal any child processes too. 
+ */ +#ifdef HAVE_SETSID + if (setsid() < 0) + elog(FATAL, "setsid() failed: %m"); +#endif + + /* + * Properly accept or ignore signals the postmaster might send us + */ + pqsignal(SIGHUP, SIG_IGN); /* ignore config file updates */ + pqsignal(SIGINT, SIG_IGN); /* ignore query cancel */ + pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */ + pqsignal(SIGQUIT, startupproc_quickdie); /* hard crash time */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, SIG_IGN); + pqsignal(SIGUSR2, SIG_IGN); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + pqsignal(SIGTTIN, SIG_DFL); + pqsignal(SIGTTOU, SIG_DFL); + pqsignal(SIGCONT, SIG_DFL); + pqsignal(SIGWINCH, SIG_DFL); + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + StartupXLOG(); + + BuildFlatFiles(false); + + /* Let postmaster know that startup is finished */ + SendPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED); + + /* exit normally */ + proc_exit(0); +} diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 19aab42554..5a0f852b6f 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.249 2009/01/22 20:16:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.250 2009/02/18 15:58:41 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -37,7 +37,6 @@ #include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" -#include "utils/flatfiles.h" #include "utils/fmgroids.h" #include "utils/memutils.h" #include "utils/ps_status.h" @@ -416,14 +415,12 @@ AuxiliaryProcessMain(int argc, char *argv[]) proc_exit(1); /* should never return */ case 
StartupProcess: - bootstrap_signals(); - StartupXLOG(); - BuildFlatFiles(false); - proc_exit(0); /* startup done */ + /* don't set signals, startup process has its own agenda */ + StartupProcessMain(); + proc_exit(1); /* should never return */ case BgWriterProcess: /* don't set signals, bgwriter has its own agenda */ - InitXLOGAccess(); BackgroundWriterMain(); proc_exit(1); /* should never return */ diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index a7377f0280..4909fbe37a 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.55 2009/01/01 17:23:46 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.56 2009/02/18 15:58:41 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -49,6 +49,7 @@ #include #include "access/xlog_internal.h" +#include "catalog/pg_control.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" @@ -423,9 +424,19 @@ BackgroundWriterMain(void) */ if (do_checkpoint) { + bool ckpt_performed = false; + bool do_restartpoint; + /* use volatile pointer to prevent code rearrangement */ volatile BgWriterShmemStruct *bgs = BgWriterShmem; + /* + * Check if we should perform a checkpoint or a restartpoint. + * As a side-effect, RecoveryInProgress() initializes + * TimeLineID if it's not set yet. + */ + do_restartpoint = RecoveryInProgress(); + /* * Atomically fetch the request flags to figure out what kind of a * checkpoint we should perform, and increase the started-counter @@ -444,7 +455,8 @@ BackgroundWriterMain(void) * implementation will not generate warnings caused by * CheckPointTimeout < CheckPointWarning. 
*/ - if ((flags & CHECKPOINT_CAUSE_XLOG) && + if (!do_restartpoint && + (flags & CHECKPOINT_CAUSE_XLOG) && elapsed_secs < CheckPointWarning) ereport(LOG, (errmsg("checkpoints are occurring too frequently (%d seconds apart)", @@ -455,14 +467,21 @@ BackgroundWriterMain(void) * Initialize bgwriter-private variables used during checkpoint. */ ckpt_active = true; - ckpt_start_recptr = GetInsertRecPtr(); + if (!do_restartpoint) + ckpt_start_recptr = GetInsertRecPtr(); ckpt_start_time = now; ckpt_cached_elapsed = 0; /* * Do the checkpoint. */ - CreateCheckPoint(flags); + if (!do_restartpoint) + { + CreateCheckPoint(flags); + ckpt_performed = true; + } + else + ckpt_performed = CreateRestartPoint(flags); /* * After any checkpoint, close all smgr files. This is so we @@ -477,14 +496,27 @@ BackgroundWriterMain(void) bgs->ckpt_done = bgs->ckpt_started; SpinLockRelease(&bgs->ckpt_lck); - ckpt_active = false; + if (ckpt_performed) + { + /* + * Note we record the checkpoint start time not end time as + * last_checkpoint_time. This is so that time-driven + * checkpoints happen at a predictable spacing. + */ + last_checkpoint_time = now; + } + else + { + /* + * We were not able to perform the restartpoint (checkpoints + * throw an ERROR in case of error). Most likely because we + * have not received any new checkpoint WAL records since the + * last restartpoint. Try again in 15 s. + */ + last_checkpoint_time = now - CheckPointTimeout + 15; + } - /* - * Note we record the checkpoint start time not end time as - * last_checkpoint_time. This is so that time-driven checkpoints - * happen at a predictable spacing. 
- */ - last_checkpoint_time = now; + ckpt_active = false; } else BgBufferSync(); @@ -507,7 +539,7 @@ CheckArchiveTimeout(void) pg_time_t now; pg_time_t last_time; - if (XLogArchiveTimeout <= 0) + if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) return; now = (pg_time_t) time(NULL); @@ -714,16 +746,19 @@ IsCheckpointOnSchedule(double progress) * However, it's good enough for our purposes, we're only calculating an * estimate anyway. */ - recptr = GetInsertRecPtr(); - elapsed_xlogs = - (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + - ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / - CheckPointSegments; - - if (progress < elapsed_xlogs) + if (!RecoveryInProgress()) { - ckpt_cached_elapsed = elapsed_xlogs; - return false; + recptr = GetInsertRecPtr(); + elapsed_xlogs = + (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + + ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / + CheckPointSegments; + + if (progress < elapsed_xlogs) + { + ckpt_cached_elapsed = elapsed_xlogs; + return false; + } } /* diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 0d0d23f53a..49ee57c77e 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.570 2009/01/04 22:19:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.571 2009/02/18 15:58:41 heikki Exp $ * * NOTES * @@ -225,11 +225,38 @@ static pid_t StartupPID = 0, static int Shutdown = NoShutdown; static bool FatalError = false; /* T if recovering from backend crash */ +static bool RecoveryError = false; /* T if recovery failed */ + +/* State of WAL redo */ +#define NoRecovery 0 +#define RecoveryStarted 1 +#define RecoveryConsistent 2 +#define RecoveryCompleted 3 + +static int RecoveryStatus = NoRecovery; /* * 
We use a simple state machine to control startup, shutdown, and * crash recovery (which is rather like shutdown followed by startup). * + * After doing all the postmaster initialization work, we enter PM_STARTUP + * state and the startup process is launched. The startup process begins by + * reading the control file and other preliminary initialization steps. When + * it's ready to start WAL redo, it signals postmaster, and we switch to + * PM_RECOVERY phase. The background writer is launched, while the startup + * process continues applying WAL. + * + * After reaching a consistent point in WAL redo, startup process signals + * us again, and we switch to PM_RECOVERY_CONSISTENT phase. There's currently + * no difference between PM_RECOVERY and PM_RECOVERY_CONSISTENT, but we + * could start accepting connections to perform read-only queries at this + * point, if we had the infrastructure to do that. + * + * When the WAL redo is finished, the startup process signals us the third + * time, and we switch to PM_RUN state. The startup process can also skip the + * recovery and consistent recovery phases altogether, as it will during + * normal startup when there's no recovery to be done, for example. + * * Normal child backends can only be launched when we are in PM_RUN state. * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.) * In other states we handle connection requests by launching "dead_end" @@ -245,15 +272,19 @@ static bool FatalError = false; /* T if recovering from backend crash */ * * Notice that this state variable does not distinguish *why* we entered * states later than PM_RUN --- Shutdown and FatalError must be consulted - * to find that out. FatalError is never true in PM_RUN state, nor in - * PM_SHUTDOWN states (because we don't enter those states when trying to - * recover from a crash). It can be true in PM_STARTUP state, because we - * don't clear it until we've successfully recovered. + * to find that out. 
FatalError is never true in PM_RECOVERY_* or PM_RUN + * states, nor in PM_SHUTDOWN states (because we don't enter those states + * when trying to recover from a crash). It can be true in PM_STARTUP state, + * because we don't clear it until we've successfully started WAL redo. + * Similarly, RecoveryError means that we have crashed during recovery, and + * should not try to restart. */ typedef enum { PM_INIT, /* postmaster starting */ PM_STARTUP, /* waiting for startup subprocess */ + PM_RECOVERY, /* in recovery mode */ + PM_RECOVERY_CONSISTENT, /* consistent recovery mode */ PM_RUN, /* normal "database is alive" state */ PM_WAIT_BACKUP, /* waiting for online backup mode to end */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ @@ -307,6 +338,7 @@ static void pmdie(SIGNAL_ARGS); static void reaper(SIGNAL_ARGS); static void sigusr1_handler(SIGNAL_ARGS); static void dummy_handler(SIGNAL_ARGS); +static void CheckRecoverySignals(void); static void CleanupBackend(int pid, int exitstatus); static void HandleChildCrash(int pid, int exitstatus, const char *procname); static void LogChildExit(int lev, const char *procname, @@ -1302,7 +1334,9 @@ ServerLoop(void) * state that prevents it, start one. It doesn't matter if this * fails, we'll just try again later. 
*/ - if (BgWriterPID == 0 && pmState == PM_RUN) + if (BgWriterPID == 0 && + (pmState == PM_RUN || pmState == PM_RECOVERY || + pmState == PM_RECOVERY_CONSISTENT)) BgWriterPID = StartBackgroundWriter(); /* @@ -1752,7 +1786,10 @@ canAcceptConnections(void) return CAC_WAITBACKUP; /* allow superusers only */ if (Shutdown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ - if (pmState == PM_STARTUP && !FatalError) + if (!FatalError && + (pmState == PM_STARTUP || + pmState == PM_RECOVERY || + pmState == PM_RECOVERY_CONSISTENT)) return CAC_STARTUP; /* normal startup */ return CAC_RECOVERY; /* else must be crash recovery */ } @@ -1982,7 +2019,7 @@ pmdie(SIGNAL_ARGS) ereport(LOG, (errmsg("received smart shutdown request"))); - if (pmState == PM_RUN) + if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT) { /* autovacuum workers are told to shut down immediately */ SignalAutovacWorkers(SIGTERM); @@ -2019,7 +2056,14 @@ pmdie(SIGNAL_ARGS) if (StartupPID != 0) signal_child(StartupPID, SIGTERM); - if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP) + if (pmState == PM_RECOVERY) + { + /* only bgwriter is active in this state */ + pmState = PM_WAIT_BACKENDS; + } + if (pmState == PM_RUN || + pmState == PM_WAIT_BACKUP || + pmState == PM_RECOVERY_CONSISTENT) { ereport(LOG, (errmsg("aborting any active transactions"))); @@ -2116,10 +2160,22 @@ reaper(SIGNAL_ARGS) if (pid == StartupPID) { StartupPID = 0; - Assert(pmState == PM_STARTUP); - /* FATAL exit of startup is treated as catastrophic */ - if (!EXIT_STATUS_0(exitstatus)) + /* + * Check if we've received a signal from the startup process + * first. This can change pmState. If the startup process sends + * a signal and exits immediately after that, we might not have + * processed the signal yet. We need to know if it completed + * recovery before it exited. 
+ */ + CheckRecoverySignals(); + + /* + * Unexpected exit of startup process (including FATAL exit) + * during PM_STARTUP is treated as catastrophic. There are no + * other processes running yet. + */ + if (pmState == PM_STARTUP) { LogChildExit(LOG, _("startup process"), pid, exitstatus); @@ -2127,60 +2183,30 @@ reaper(SIGNAL_ARGS) (errmsg("aborting startup due to startup process failure"))); ExitPostmaster(1); } - /* - * Startup succeeded - we are done with system startup or - * recovery. + * Any unexpected exit (including FATAL exit) of the startup + * process is treated as a crash, except that we don't want + * to reinitialize. */ - FatalError = false; - - /* - * Go to shutdown mode if a shutdown request was pending. - */ - if (Shutdown > NoShutdown) + if (!EXIT_STATUS_0(exitstatus)) { - pmState = PM_WAIT_BACKENDS; - /* PostmasterStateMachine logic does the rest */ + RecoveryError = true; + HandleChildCrash(pid, exitstatus, + _("startup process")); continue; } - /* - * Otherwise, commence normal operations. + * Startup process exited normally, but didn't finish recovery. + * This can happen if someone other than postmaster kills the + * startup process with SIGTERM. Treat it like a crash. */ - pmState = PM_RUN; - - /* - * Load the flat authorization file into postmaster's cache. The - * startup process has recomputed this from the database contents, - * so we wait till it finishes before loading it. - */ - load_role(); - - /* - * Crank up the background writer. It doesn't matter if this - * fails, we'll just try again later. - */ - Assert(BgWriterPID == 0); - BgWriterPID = StartBackgroundWriter(); - - /* - * Likewise, start other special children as needed. In a restart - * situation, some of them may be alive already.
- */ - if (WalWriterPID == 0) - WalWriterPID = StartWalWriter(); - if (AutoVacuumingActive() && AutoVacPID == 0) - AutoVacPID = StartAutoVacLauncher(); - if (XLogArchivingActive() && PgArchPID == 0) - PgArchPID = pgarch_start(); - if (PgStatPID == 0) - PgStatPID = pgstat_start(); - - /* at this point we are really open for business */ - ereport(LOG, - (errmsg("database system is ready to accept connections"))); - - continue; + if (pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT) + { + RecoveryError = true; + HandleChildCrash(pid, exitstatus, + _("startup process")); + continue; + } } /* @@ -2443,6 +2469,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) } } + /* Take care of the startup process too */ + if (pid == StartupPID) + StartupPID = 0; + else if (StartupPID != 0 && !FatalError) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) StartupPID))); + signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + /* Take care of the bgwriter too */ if (pid == BgWriterPID) BgWriterPID = 0; @@ -2514,7 +2552,9 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) FatalError = true; /* We now transit into a state of waiting for children to die */ - if (pmState == PM_RUN || + if (pmState == PM_RECOVERY || + pmState == PM_RECOVERY_CONSISTENT || + pmState == PM_RUN || pmState == PM_WAIT_BACKUP || pmState == PM_SHUTDOWN) pmState = PM_WAIT_BACKENDS; @@ -2582,6 +2622,127 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus) static void PostmasterStateMachine(void) { + /* Startup states */ + + if (pmState == PM_STARTUP && RecoveryStatus > NoRecovery) + { + /* WAL redo has started. We're out of reinitialization. */ + FatalError = false; + + /* + * Go to shutdown mode if a shutdown request was pending.
+ */ + if (Shutdown > NoShutdown) + { + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + } + else + { + /* + * Crank up the background writer. It doesn't matter if this + * fails, we'll just try again later. + */ + Assert(BgWriterPID == 0); + BgWriterPID = StartBackgroundWriter(); + + pmState = PM_RECOVERY; + } + } + if (pmState == PM_RECOVERY && RecoveryStatus >= RecoveryConsistent) + { + /* + * Go to shutdown mode if a shutdown request was pending. + */ + if (Shutdown > NoShutdown) + { + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + } + else + { + /* + * Startup process has signalled that WAL redo reached a consistent + * point; advance to the consistent recovery state. + */ + pmState = PM_RECOVERY_CONSISTENT; + + /* + * Load the flat authorization file into postmaster's cache. The + * startup process won't have recomputed this from the database yet, + * so it may change following recovery. + */ + load_role(); + + /* + * Likewise, start other special children as needed. + */ + Assert(PgStatPID == 0); + PgStatPID = pgstat_start(); + + /* XXX at this point we could accept read-only connections */ + ereport(DEBUG1, + (errmsg("database system is in consistent recovery mode"))); + } + } + if ((pmState == PM_RECOVERY || + pmState == PM_RECOVERY_CONSISTENT || + pmState == PM_STARTUP) && + RecoveryStatus == RecoveryCompleted) + { + /* + * Startup succeeded. + * + * Go to shutdown mode if a shutdown request was pending. + */ + if (Shutdown > NoShutdown) + { + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + } + else + { + /* + * Otherwise, commence normal operations. + */ + pmState = PM_RUN; + + /* + * Load the flat authorization file into postmaster's cache. The + * startup process has recomputed this from the database contents, + * so we wait till it finishes before loading it.
+ */ + load_role(); + + /* + * Crank up the background writer, if we didn't do that already + * when we entered consistent recovery phase. It doesn't matter + * if this fails, we'll just try again later. + */ + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); + + /* + * Likewise, start other special children as needed. In a restart + * situation, some of them may be alive already. + */ + if (WalWriterPID == 0) + WalWriterPID = StartWalWriter(); + if (AutoVacuumingActive() && AutoVacPID == 0) + AutoVacPID = StartAutoVacLauncher(); + if (XLogArchivingActive() && PgArchPID == 0) + PgArchPID = pgarch_start(); + if (PgStatPID == 0) + PgStatPID = pgstat_start(); + + /* at this point we are really open for business */ + ereport(LOG, + (errmsg("database system is ready to accept connections"))); + } + } + + /* Shutdown states */ + if (pmState == PM_WAIT_BACKUP) { /* @@ -2722,6 +2883,15 @@ PostmasterStateMachine(void) } } + /* + * If recovery failed, wait for all non-syslogger children to exit, + * and then exit postmaster. We don't try to reinitialize when recovery + * fails, because more than likely it will just fail again and we will + * keep trying forever. + */ + if (RecoveryError && pmState == PM_NO_CHILDREN) + ExitPostmaster(1); + /* * If we need to recover from a crash, wait for all non-syslogger * children to exit, then reset shmem and StartupDataBase. 
@@ -2734,6 +2904,8 @@ PostmasterStateMachine(void) shmem_exit(1); reset_shared(PostPortNumber); + RecoveryStatus = NoRecovery; + StartupPID = StartupDataBase(); Assert(StartupPID != 0); pmState = PM_STARTUP; @@ -3837,6 +4009,37 @@ ExitPostmaster(int status) proc_exit(status); } +/* + * common code used in sigusr1_handler() and reaper() to handle + * recovery-related signals from startup process + */ +static void +CheckRecoverySignals(void) +{ + bool changed = false; + + if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED)) + { + Assert(pmState == PM_STARTUP); + + RecoveryStatus = RecoveryStarted; + changed = true; + } + if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT)) + { + RecoveryStatus = RecoveryConsistent; + changed = true; + } + if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED)) + { + RecoveryStatus = RecoveryCompleted; + changed = true; + } + + if (changed) + PostmasterStateMachine(); +} + /* * sigusr1_handler - handle signal conditions from child processes */ @@ -3847,6 +4050,8 @@ sigusr1_handler(SIGNAL_ARGS) PG_SETMASK(&BlockSig); + CheckRecoverySignals(); + if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE)) { /* diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index 696e5e8c30..62b2eccec7 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.15 2008/11/06 20:51:14 tgl Exp $ +$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.16 2009/02/18 15:58:41 heikki Exp $ Notes About Shared Buffer Access Rules ====================================== @@ -268,3 +268,8 @@ out (and anyone else who flushes buffer contents to disk must do so too). This ensures that the page image transferred to disk is reasonably consistent. We might miss a hint-bit update or two but that isn't a problem, for the same reasons mentioned under buffer access rules. 
+ +As of 8.4, background writer starts during recovery mode when there is +some form of potentially extended recovery to perform. It performs an +identical service to normal processing, except that checkpoints it +writes are technically restartpoints. diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index dc4cb467f7..7f2cfdaea0 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/init/postinit.c,v 1.187 2009/01/01 17:23:51 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/init/postinit.c,v 1.188 2009/02/18 15:58:41 heikki Exp $ * * *------------------------------------------------------------------------- @@ -324,7 +324,7 @@ InitCommunication(void) * If you're wondering why this is separate from InitPostgres at all: * the critical distinction is that this stuff has to happen before we can * run XLOG-related initialization, which is done before InitPostgres --- in - * fact, for cases such as checkpoint creation processes, InitPostgres may + * fact, for cases such as the background writer process, InitPostgres may * never be done at all. 
*/ void diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 99b0509311..ca1d236154 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.90 2009/01/20 18:59:37 heikki Exp $ + * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.91 2009/02/18 15:58:41 heikki Exp $ */ #ifndef XLOG_H #define XLOG_H @@ -199,6 +199,8 @@ extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup); extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); +extern bool RecoveryInProgress(void); + extern void UpdateControlFile(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); @@ -207,9 +209,12 @@ extern void StartupXLOG(void); extern void ShutdownXLOG(int code, Datum arg); extern void InitXLOGAccess(void); extern void CreateCheckPoint(int flags); +extern bool CreateRestartPoint(int flags); extern void XLogPutNextOid(Oid nextOid); extern XLogRecPtr GetRedoRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void); extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch); +extern void StartupProcessMain(void); + #endif /* XLOG_H */ diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index 6d6facc0f6..6d2b827c01 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/pmsignal.h,v 1.21 2009/01/01 17:24:01 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/pmsignal.h,v 1.22 2009/02/18 15:58:41 heikki Exp $ * *------------------------------------------------------------------------- */ 
@@ -22,6 +22,9 @@ */ typedef enum { + PMSIGNAL_RECOVERY_STARTED, /* recovery has started */ + PMSIGNAL_RECOVERY_CONSISTENT, /* recovery has reached consistent state */ + PMSIGNAL_RECOVERY_COMPLETED, /* recovery has completed */ PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */ PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */