diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index a3331bdef6..0e49ba3217 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1,4 +1,4 @@ - + Server Configuration @@ -1413,7 +1413,7 @@ SET ENABLE_SEQSCAN TO OFF; - + wal_buffers (integer) @@ -1438,7 +1438,27 @@ SET ENABLE_SEQSCAN TO OFF; - + + + wal_writer_delay (integer) + + wal_writer_delay configuration parameter + + + + Specifies the delay between activity rounds for the WAL writer. + In each round the writer will flush WAL to disk. It then sleeps for + wal_writer_delay milliseconds, and repeats. The default + value is 200 milliseconds (200ms). Note that on many + systems, the effective resolution of sleep delays is 10 milliseconds; + setting wal_writer_delay to a value that is not a multiple + of 10 might have the same results as setting it to the next higher + multiple of 10. This parameter can only be set in the + postgresql.conf file or on the server command line. + + + + commit_delay (integer) @@ -1521,7 +1541,7 @@ SET ENABLE_SEQSCAN TO OFF; - Specifies the target length of checkpoints, as a fraction of + Specifies the target length of checkpoints, as a fraction of the checkpoint interval. The default is 0.5. 
This parameter can only be set in the postgresql.conf diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 15c9f310a6..25789ddaa6 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.274 2007/06/30 19:12:01 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.275 2007/07/24 04:54:08 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -484,7 +484,6 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) uint32 len, write_len; unsigned i; - XLogwrtRqst LogwrtRqst; bool updrqst; bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); @@ -643,43 +642,6 @@ begin:; START_CRIT_SECTION(); - /* update LogwrtResult before doing cache fill check */ - { - /* use volatile pointer to prevent code rearrangement */ - volatile XLogCtlData *xlogctl = XLogCtl; - - SpinLockAcquire(&xlogctl->info_lck); - LogwrtRqst = xlogctl->LogwrtRqst; - LogwrtResult = xlogctl->LogwrtResult; - SpinLockRelease(&xlogctl->info_lck); - } - - /* - * If cache is half filled then try to acquire write lock and do - * XLogWrite. Ignore any fractional blocks in performing this check. - */ - LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % XLOG_BLCKSZ; - if (LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid || - (LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff + - XLogCtl->XLogCacheByte / 2)) - { - if (LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE)) - { - /* - * Since the amount of data we write here is completely optional - * anyway, tell XLogWrite it can be "flexible" and stop at a - * convenient boundary. 
This allows writes triggered by this - * mechanism to synchronize with the cache boundaries, so that in - * a long transaction we'll basically dump alternating halves of - * the buffer array. - */ - LogwrtResult = XLogCtl->Write.LogwrtResult; - if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write)) - XLogWrite(LogwrtRqst, true, false); - LWLockRelease(WALWriteLock); - } - } - /* Now wait to get insert lock */ LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); @@ -1800,6 +1762,85 @@ XLogFlush(XLogRecPtr record) LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff); } +/* + * Flush xlog, but without specifying exactly where to flush to. + * + * We normally flush only completed blocks; but if there is nothing to do on + * that basis, we check for unflushed async commits in the current incomplete + * block, and flush through the latest one of those. Thus, if async commits + * are not being used, we will flush complete blocks only. We can guarantee + * that async commits reach disk after at most three cycles; normally only + * one or two. (We allow XLogWrite to write "flexibly", meaning it can stop + * at the end of the buffer ring; this makes a difference only with very high + * load or long wal_writer_delay, but imposes one extra cycle for the worst + * case for async commits.) + * + * This routine is invoked periodically by the background walwriter process. 
+ */ +void +XLogBackgroundFlush(void) +{ + XLogRecPtr WriteRqstPtr; + bool flexible = true; + + /* read LogwrtResult and update local state */ + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + LogwrtResult = xlogctl->LogwrtResult; + WriteRqstPtr = xlogctl->LogwrtRqst.Write; + SpinLockRelease(&xlogctl->info_lck); + } + + /* back off to last completed page boundary */ + WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ; + +#ifdef NOT_YET /* async commit patch is still to come */ + /* if we have already flushed that far, consider async commit records */ + if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush)) + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->async_commit_lck); + WriteRqstPtr = xlogctl->asyncCommitLSN; + SpinLockRelease(&xlogctl->async_commit_lck); + flexible = false; /* ensure it all gets written */ + } +#endif + + /* Done if already known flushed */ + if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush)) + return; + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X", + WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff, + LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff, + LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff); +#endif + + START_CRIT_SECTION(); + + /* now wait for the write lock */ + LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + LogwrtResult = XLogCtl->Write.LogwrtResult; + if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush)) + { + XLogwrtRqst WriteRqst; + + WriteRqst.Write = WriteRqstPtr; + WriteRqst.Flush = WriteRqstPtr; + XLogWrite(WriteRqst, flexible, false); + } + LWLockRelease(WALWriteLock); + + END_CRIT_SECTION(); +} + /* * Test whether XLOG data has been flushed up to (at least) the given position. 
* diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 78eb6797db..3ffff2a2cc 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.234 2007/06/28 00:02:37 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.235 2007/07/24 04:54:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -30,6 +30,7 @@ #include "miscadmin.h" #include "nodes/makefuncs.h" #include "postmaster/bgwriter.h" +#include "postmaster/walwriter.h" #include "storage/freespace.h" #include "storage/ipc.h" #include "storage/proc.h" @@ -195,7 +196,7 @@ static IndexList *ILHead = NULL; * AuxiliaryProcessMain * * The main entry point for auxiliary processes, such as the bgwriter, - * bootstrapper and the shared memory checker code. + * walwriter, bootstrapper and the shared memory checker code. * * This code is here just because of historical reasons. */ @@ -331,6 +332,9 @@ AuxiliaryProcessMain(int argc, char *argv[]) case BgWriterProcess: statmsg = "writer process"; break; + case WalWriterProcess: + statmsg = "wal writer process"; + break; default: statmsg = "??? 
process"; break; @@ -419,6 +423,12 @@ AuxiliaryProcessMain(int argc, char *argv[]) InitXLOGAccess(); BackgroundWriterMain(); proc_exit(1); /* should never return */ + + case WalWriterProcess: + /* don't set signals, walwriter has its own agenda */ + InitXLOGAccess(); + WalWriterMain(); + proc_exit(1); /* should never return */ default: elog(PANIC, "unrecognized process type: %d", auxType); diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index a49e0e393b..7ccba285f2 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -4,7 +4,7 @@ # Makefile for src/backend/postmaster # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/postmaster/Makefile,v 1.22 2007/01/20 17:16:12 petere Exp $ +# $PostgreSQL: pgsql/src/backend/postmaster/Makefile,v 1.23 2007/07/24 04:54:09 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,8 +12,8 @@ subdir = src/backend/postmaster top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = bgwriter.o autovacuum.o pgarch.o pgstat.o postmaster.o syslogger.o \ - fork_process.o +OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \ + syslogger.o walwriter.o all: SUBSYS.o diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 7a1270b014..f1f9effae7 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.534 2007/07/23 10:16:54 mha Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.535 2007/07/24 04:54:09 tgl Exp $ * * NOTES * @@ -136,7 +136,7 @@ typedef struct bkend { pid_t pid; /* process id of backend */ long cancel_key; /* cancel key for cancels for this backend */ - bool is_autovacuum; /* is it an autovacuum process */ + bool is_autovacuum; /* is it an autovacuum process? 
*/ } Backend; static Dllist *BackendList; @@ -144,9 +144,9 @@ static Dllist *BackendList; #ifdef EXEC_BACKEND /* * Number of entries in the backend table. Twice the number of backends, - * plus four other subprocesses (stats, bgwriter, autovac, logger). + * plus five other subprocesses (stats, bgwriter, walwriter, autovac, logger). */ -#define NUM_BACKENDARRAY_ELEMS (2*MaxBackends + 4) +#define NUM_BACKENDARRAY_ELEMS (2*MaxBackends + 5) static Backend *ShmemBackendArray; #endif @@ -201,6 +201,7 @@ char *bonjour_name; /* PIDs of special child processes; 0 when not running */ static pid_t StartupPID = 0, BgWriterPID = 0, + WalWriterPID = 0, AutoVacPID = 0, PgArchPID = 0, PgStatPID = 0, @@ -221,7 +222,7 @@ bool ClientAuthInProgress = false; /* T during new-client bool redirection_done = false; /* received START_AUTOVAC_LAUNCHER signal */ -static bool start_autovac_launcher = false; +static volatile sig_atomic_t start_autovac_launcher = false; /* * State for assigning random salts and cancel keys. @@ -365,6 +366,7 @@ static void ShmemBackendArrayRemove(pid_t pid); #define StartupDataBase() StartChildProcess(StartupProcess) #define StartBackgroundWriter() StartChildProcess(BgWriterProcess) +#define StartWalWriter() StartChildProcess(WalWriterProcess) /* Macros to check exit status of a child process */ #define EXIT_STATUS_0(st) ((st) == 0) @@ -909,8 +911,9 @@ PostmasterMain(int argc, char *argv[]) * * CAUTION: when changing this list, check for side-effects on the signal * handling setup of child processes. See tcop/postgres.c, - * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/autovacuum.c, - * postmaster/pgarch.c, postmaster/pgstat.c, and postmaster/syslogger.c. + * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, + * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, and + * postmaster/syslogger.c. 
*/ pqinitmask(); PG_SETMASK(&BlockSig); @@ -1244,6 +1247,15 @@ ServerLoop(void) signal_child(BgWriterPID, SIGUSR2); } + /* + * Likewise, if we have lost the walwriter process, try to start a + * new one. We don't need walwriter to complete a shutdown, so + * don't start it if shutdown already initiated. + */ + if (WalWriterPID == 0 && + StartupPID == 0 && !FatalError && Shutdown == NoShutdown) + WalWriterPID = StartWalWriter(); + /* If we have lost the autovacuum launcher, try to start a new one */ if (AutoVacPID == 0 && (AutoVacuumingActive() || start_autovac_launcher) && @@ -1251,7 +1263,7 @@ ServerLoop(void) { AutoVacPID = StartAutoVacLauncher(); if (AutoVacPID != 0) - start_autovac_launcher = false; /* signal successfully processed */ + start_autovac_launcher = false; /* signal processed */ } /* If we have lost the archiver, try to start a new one */ @@ -1842,6 +1854,8 @@ SIGHUP_handler(SIGNAL_ARGS) SignalChildren(SIGHUP); if (BgWriterPID != 0) signal_child(BgWriterPID, SIGHUP); + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGHUP); if (AutoVacPID != 0) signal_child(AutoVacPID, SIGHUP); if (PgArchPID != 0) @@ -1901,8 +1915,11 @@ pmdie(SIGNAL_ARGS) /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); + /* and the walwriter too */ + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGTERM); - if (DLGetHead(BackendList) || AutoVacPID != 0) + if (DLGetHead(BackendList) || AutoVacPID != 0 || WalWriterPID != 0) break; /* let reaper() handle this */ /* @@ -1938,7 +1955,7 @@ pmdie(SIGNAL_ARGS) ereport(LOG, (errmsg("received fast shutdown request"))); - if (DLGetHead(BackendList) || AutoVacPID != 0) + if (DLGetHead(BackendList) || AutoVacPID != 0 || WalWriterPID != 0) { if (!FatalError) { @@ -1947,6 +1964,8 @@ pmdie(SIGNAL_ARGS) SignalChildren(SIGTERM); if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGTERM); /* reaper() does the rest */ } break; @@ -1957,6 
+1976,7 @@ pmdie(SIGNAL_ARGS) * * Note: if we previously got SIGTERM then we may send SIGUSR2 to * the bgwriter a second time here. This should be harmless. + * Ditto for the signals to the other special children. */ if (StartupPID != 0) { @@ -1993,6 +2013,8 @@ pmdie(SIGNAL_ARGS) signal_child(StartupPID, SIGQUIT); if (BgWriterPID != 0) signal_child(BgWriterPID, SIGQUIT); + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGQUIT); if (AutoVacPID != 0) signal_child(AutoVacPID, SIGQUIT); if (PgArchPID != 0) @@ -2091,13 +2113,14 @@ reaper(SIGNAL_ARGS) /* * Go to shutdown mode if a shutdown request was pending. - * Otherwise, try to start the archiver, stats collector and - * autovacuum launcher. + * Otherwise, try to start the other special children. */ if (Shutdown > NoShutdown && BgWriterPID != 0) signal_child(BgWriterPID, SIGUSR2); else if (Shutdown == NoShutdown) { + if (WalWriterPID == 0) + WalWriterPID = StartWalWriter(); if (XLogArchivingActive() && PgArchPID == 0) PgArchPID = pgarch_start(); if (PgStatPID == 0) @@ -2121,7 +2144,8 @@ reaper(SIGNAL_ARGS) BgWriterPID = 0; if (EXIT_STATUS_0(exitstatus) && Shutdown > NoShutdown && !FatalError && - !DLGetHead(BackendList) && AutoVacPID == 0) + !DLGetHead(BackendList) && + WalWriterPID == 0 && AutoVacPID == 0) { /* * Normal postmaster exit is here: we've seen normal exit of @@ -2150,7 +2174,8 @@ reaper(SIGNAL_ARGS) * required will happen on next postmaster start. */ if (Shutdown > NoShutdown && - !DLGetHead(BackendList) && AutoVacPID == 0) + !DLGetHead(BackendList) && + WalWriterPID == 0 && AutoVacPID == 0) { ereport(LOG, (errmsg("abnormal database system shutdown"))); @@ -2161,6 +2186,20 @@ reaper(SIGNAL_ARGS) continue; } + /* + * Was it the wal writer? Normal exit can be ignored; we'll + * start a new one at the next iteration of the postmaster's main loop, + * if necessary. Any other exit condition is treated as a crash. 
+ */ + if (WalWriterPID != 0 && pid == WalWriterPID) + { + WalWriterPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("wal writer process")); + continue; + } + /* * Was it the autovacuum launcher? Normal exit can be ignored; we'll * start a new one at the next iteration of the postmaster's main loop, @@ -2233,7 +2272,8 @@ reaper(SIGNAL_ARGS) * StartupDataBase. (We can ignore the archiver and stats processes * here since they are not connected to shmem.) */ - if (DLGetHead(BackendList) || StartupPID != 0 || BgWriterPID != 0 || + if (DLGetHead(BackendList) || StartupPID != 0 || + BgWriterPID != 0 || WalWriterPID != 0 || AutoVacPID != 0) goto reaper_done; ereport(LOG, @@ -2249,7 +2289,8 @@ reaper(SIGNAL_ARGS) if (Shutdown > NoShutdown) { - if (DLGetHead(BackendList) || StartupPID != 0 || AutoVacPID != 0) + if (DLGetHead(BackendList) || StartupPID != 0 || AutoVacPID != 0 || + WalWriterPID != 0) goto reaper_done; /* Start the bgwriter if not running */ if (BgWriterPID == 0) @@ -2315,7 +2356,8 @@ CleanupBackend(int pid, } /* - * HandleChildCrash -- cleanup after failed backend, bgwriter, or autovacuum. + * HandleChildCrash -- cleanup after failed backend, bgwriter, walwriter, + * or autovacuum. * * The objectives here are to clean up our local state about the child * process, and to signal all other remaining children to quickdie. @@ -2390,6 +2432,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT)); } + /* Take care of the walwriter too */ + if (pid == WalWriterPID) + WalWriterPID = 0; + else if (WalWriterPID != 0 && !FatalError) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) WalWriterPID))); + signal_child(WalWriterPID, (SendStop ? 
SIGSTOP : SIGQUIT)); + } + /* Take care of the autovacuum launcher too */ if (pid == AutoVacPID) AutoVacPID = 0; @@ -3622,9 +3676,11 @@ sigusr1_handler(SIGNAL_ARGS) start_autovac_launcher = true; } - /* The autovacuum launcher wants us to start a worker process. */ if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER)) + { + /* The autovacuum launcher wants us to start a worker process. */ StartAutovacuumWorker(); + } PG_SETMASK(&UnBlockSig); @@ -3805,6 +3861,10 @@ StartChildProcess(AuxProcType type) ereport(LOG, (errmsg("could not fork background writer process: %m"))); break; + case WalWriterProcess: + ereport(LOG, + (errmsg("could not fork wal writer process: %m"))); + break; default: ereport(LOG, (errmsg("could not fork process: %m"))); diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c new file mode 100644 index 0000000000..b4d594661a --- /dev/null +++ b/src/backend/postmaster/walwriter.c @@ -0,0 +1,311 @@ +/*------------------------------------------------------------------------- + * + * walwriter.c + * + * The WAL writer background process is new as of Postgres 8.3. It attempts + * to keep regular backends from having to write out (and fsync) WAL pages. + * Also, it guarantees that transaction commit records that weren't synced + * to disk immediately upon commit (ie, were "asynchronously committed") + * will reach disk within a knowable time --- which, as it happens, is at + * most three times the wal_writer_delay cycle time. + * + * Note that as with the bgwriter for shared buffers, regular backends are + * still empowered to issue WAL writes and fsyncs when the walwriter doesn't + * keep up. + * + * Because the walwriter's cycle is directly linked to the maximum delay + * before async-commit transactions are guaranteed committed, it's probably + * unwise to load additional functionality onto it. 
For instance, if you've + * got a yen to create xlog segments further in advance, that'd be better done + * in bgwriter than in walwriter. + * + * The walwriter is started by the postmaster as soon as the startup subprocess + * finishes. It remains alive until the postmaster commands it to terminate. + * Normal termination is by SIGTERM, which instructs the walwriter to exit(0). + * Emergency termination is by SIGQUIT; like any backend, the walwriter will + * simply abort and exit on SIGQUIT. + * + * If the walwriter exits unexpectedly, the postmaster treats that the same + * as a backend crash: shared memory may be corrupted, so remaining backends + * should be killed by SIGQUIT and then a recovery cycle started. + * + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/postmaster/walwriter.c,v 1.1 2007/07/24 04:54:09 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <sys/time.h> +#include <time.h> +#include <unistd.h> + +#include "access/xlog.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "postmaster/walwriter.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pmsignal.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner.h" + + +/* + * GUC parameters + */ +int WalWriterDelay = 200; + +/* + * Flags set by interrupt handlers for later service in the main loop. 
+ */ +static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t shutdown_requested = false; + +/* Signal handlers */ +static void wal_quickdie(SIGNAL_ARGS); +static void WalSigHupHandler(SIGNAL_ARGS); +static void WalShutdownHandler(SIGNAL_ARGS); + + +/* + * Main entry point for walwriter process + * + * This is invoked from AuxiliaryProcessMain, which has already created the basic + * execution environment, but not enabled signals yet. + */ +void +WalWriterMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext walwriter_context; + + /* + * If possible, make this process a group leader, so that the postmaster + * can signal any child processes too. (walwriter probably never has + * any child processes, but for consistency we make all postmaster + * child processes do this.) + */ +#ifdef HAVE_SETSID + if (setsid() < 0) + elog(FATAL, "setsid() failed: %m"); +#endif + + /* + * Properly accept or ignore signals the postmaster might send us + * + * We have no particular use for SIGINT at the moment, but seems + * reasonable to treat like SIGTERM. 
+ */ + pqsignal(SIGHUP, WalSigHupHandler); /* set flag to read config file */ + pqsignal(SIGINT, WalShutdownHandler); /* request shutdown */ + pqsignal(SIGTERM, WalShutdownHandler); /* request shutdown */ + pqsignal(SIGQUIT, wal_quickdie); /* hard crash time */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, SIG_IGN); /* reserve for sinval */ + pqsignal(SIGUSR2, SIG_IGN); /* not used */ + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + pqsignal(SIGTTIN, SIG_DFL); + pqsignal(SIGTTOU, SIG_DFL); + pqsignal(SIGCONT, SIG_DFL); + pqsignal(SIGWINCH, SIG_DFL); + + /* We allow SIGQUIT (quickdie) at all times */ +#ifdef HAVE_SIGPROCMASK + sigdelset(&BlockSig, SIGQUIT); +#else + BlockSig &= ~(sigmask(SIGQUIT)); +#endif + + /* + * Create a resource owner to keep track of our resources (not clear + * that we need this, but may as well have one). + */ + CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer"); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. + */ + walwriter_context = AllocSetContextCreate(TopMemoryContext, + "Wal Writer", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + MemoryContextSwitchTo(walwriter_context); + + /* + * If an exception is encountered, processing resumes here. + * + * This code is heavily based on bgwriter.c, q.v. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * These operations are really just a minimal subset of + * AbortTransaction(). 
We don't have very many resources to worry + * about in walwriter, but we do have LWLocks, and perhaps buffers? + */ + LWLockReleaseAll(); + AbortBufferIO(); + UnlockBuffers(); + /* buffer pins are released here: */ + ResourceOwnerRelease(CurrentResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, true); + /* we needn't bother with the other ResourceOwnerRelease phases */ + AtEOXact_Buffers(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(walwriter_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(walwriter_context); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep at least 1 second after any error. A write error is likely + * to be repeated, and we don't want to be filling the error logs as + * fast as we can. + */ + pg_usleep(1000000L); + + /* + * Close all open files after any error. This is helpful on Windows, + * where holding deleted files open causes various strange errors. + * It's not clear we need it elsewhere, but shouldn't hurt. + */ + smgrcloseall(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + /* + * Loop forever + */ + for (;;) + { + long udelay; + + /* + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. + */ + if (!PostmasterIsAlive(true)) + exit(1); + + /* + * Process any requests or signals received recently. + */ + if (got_SIGHUP) + { + got_SIGHUP = false; + ProcessConfigFile(PGC_SIGHUP); + } + if (shutdown_requested) + { + /* Normal exit from the walwriter is here */ + proc_exit(0); /* done */ + } + + /* + * Do what we're here for... 
+ */ + XLogBackgroundFlush(); + + /* + * Delay until time to do something more, but fall out of delay + * reasonably quickly if signaled. + */ + udelay = WalWriterDelay * 1000L; + while (udelay > 999999L) + { + if (got_SIGHUP || shutdown_requested) + break; + pg_usleep(1000000L); + udelay -= 1000000L; + } + if (!(got_SIGHUP || shutdown_requested)) + pg_usleep(udelay); + } +} + + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ + +/* + * wal_quickdie() occurs when signalled SIGQUIT by the postmaster. + * + * Some backend has bought the farm, + * so we need to stop what we're doing and exit. + */ +static void +wal_quickdie(SIGNAL_ARGS) +{ + PG_SETMASK(&BlockSig); + + /* + * DO NOT proc_exit() -- we're here because shared memory may be + * corrupted, so we don't want to try to clean up our transaction. Just + * nail the windows shut and get out of town. + * + * Note we do exit(2) not exit(0). This is to force the postmaster into a + * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random + * backend. This is necessary precisely because we don't clean up our + * shared memory state. + */ + exit(2); +} + +/* SIGHUP: set flag to re-read config file at next convenient time */ +static void +WalSigHupHandler(SIGNAL_ARGS) +{ + got_SIGHUP = true; +} + +/* SIGTERM: set flag to exit normally */ +static void +WalShutdownHandler(SIGNAL_ARGS) +{ + shutdown_requested = true; +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 06915017e6..b2d0ea9cae 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -10,7 +10,7 @@ * Written by Peter Eisentraut . 
* * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.406 2007/07/24 01:53:56 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.407 2007/07/24 04:54:09 tgl Exp $ * *-------------------------------------------------------------------- */ @@ -54,6 +54,7 @@ #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" #include "postmaster/syslogger.h" +#include "postmaster/walwriter.h" #include "storage/fd.h" #include "storage/freespace.h" #include "tcop/tcopprot.h" @@ -1509,6 +1510,16 @@ static struct config_int ConfigureNamesInt[] = 8, 4, INT_MAX, NULL, NULL }, + { + {"wal_writer_delay", PGC_SIGHUP, WAL_SETTINGS, + gettext_noop("WAL writer sleep time between WAL flushes."), + NULL, + GUC_UNIT_MS + }, + &WalWriterDelay, + 200, 1, 10000, NULL, NULL + }, + { {"commit_delay", PGC_USERSET, WAL_CHECKPOINTS, gettext_noop("Sets the delay in microseconds between transaction commit and " diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 51c83ade0a..8bfad997ff 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -159,6 +159,8 @@ #full_page_writes = on # recover from partial page writes #wal_buffers = 64kB # min 32kB # (change requires restart) +#wal_writer_delay = 200ms # range 1-10000, in milliseconds + #commit_delay = 0 # range 0-100000, in microseconds #commit_siblings = 5 # range 1-1000 diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 1b4fecdb96..adc99a6eb0 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.80 2007/06/30 19:12:02 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.81 2007/07/24 04:54:09 tgl Exp $ */ #ifndef XLOG_H #define 
XLOG_H @@ -196,6 +196,7 @@ extern CheckpointStatsData CheckpointStats; extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); extern void XLogFlush(XLogRecPtr RecPtr); +extern void XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index bbde68ea1b..d75626c8d2 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.46 2007/03/07 13:35:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.47 2007/07/24 04:54:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -69,7 +69,8 @@ typedef enum CheckerProcess, BootstrapProcess, StartupProcess, - BgWriterProcess + BgWriterProcess, + WalWriterProcess } AuxProcType; #endif /* BOOTSTRAP_H */ diff --git a/src/include/postmaster/walwriter.h b/src/include/postmaster/walwriter.h new file mode 100644 index 0000000000..3cefe9ad7b --- /dev/null +++ b/src/include/postmaster/walwriter.h @@ -0,0 +1,20 @@ +/*------------------------------------------------------------------------- + * + * walwriter.h + * Exports from postmaster/walwriter.c. + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * $PostgreSQL: pgsql/src/include/postmaster/walwriter.h,v 1.1 2007/07/24 04:54:09 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef _WALWRITER_H +#define _WALWRITER_H + +/* GUC options */ +extern int WalWriterDelay; + +extern void WalWriterMain(void); + +#endif /* _WALWRITER_H */