Split work of bgwriter between 2 processes: bgwriter and checkpointer.

bgwriter is now a much less important process, responsible for page
cleaning duties only. checkpointer is now responsible for checkpoints
and so has a key role in shutdown. Later patches will correct doc
references to the now old idea that bgwriter performs checkpoints.
Has a beneficial effect on performance at high write rates, but this is
mainly a refactoring to more easily allow changes for power reduction, by
simplifying the previously tortuous code that was required to allow page
cleaning and checkpointing to time-slice in the same process.

Patch by me, Review by Dickson Guedes
This commit is contained in:
Simon Riggs 2011-11-01 17:14:47 +00:00
parent 589adb86ee
commit 806a2aee37
11 changed files with 124 additions and 987 deletions

View File

@ -315,6 +315,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
case BgWriterProcess:
statmsg = "writer process";
break;
case CheckpointerProcess:
statmsg = "checkpointer process";
break;
case WalWriterProcess:
statmsg = "wal writer process";
break;
@ -415,6 +418,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
BackgroundWriterMain();
proc_exit(1); /* should never return */
case CheckpointerProcess:
/* don't set signals, checkpointer has its own agenda */
CheckpointerMain();
proc_exit(1); /* should never return */
case WalWriterProcess:
/* don't set signals, walwriter has its own agenda */
InitXLOGAccess();

View File

@ -13,6 +13,6 @@ top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
syslogger.o walwriter.o
syslogger.o walwriter.o checkpointer.o
include $(top_srcdir)/src/backend/common.mk

File diff suppressed because it is too large Load Diff

View File

@ -208,6 +208,7 @@ char *output_config_variable = NULL;
/* PIDs of special child processes; 0 when not running */
static pid_t StartupPID = 0,
BgWriterPID = 0,
CheckpointerPID = 0,
WalWriterPID = 0,
WalReceiverPID = 0,
AutoVacPID = 0,
@ -279,7 +280,7 @@ typedef enum
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
PM_WAIT_READONLY, /* waiting for read only backends to exit */
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
PM_SHUTDOWN, /* waiting for bgwriter to do shutdown ckpt */
PM_SHUTDOWN, /* waiting for checkpointer to do shutdown ckpt */
PM_SHUTDOWN_2, /* waiting for archiver and walsenders to
* finish */
PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */
@ -465,6 +466,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
#define StartupDataBase() StartChildProcess(StartupProcess)
#define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
#define StartCheckpointer() StartChildProcess(CheckpointerProcess)
#define StartWalWriter() StartChildProcess(WalWriterProcess)
#define StartWalReceiver() StartChildProcess(WalReceiverProcess)
@ -1028,8 +1030,8 @@ PostmasterMain(int argc, char *argv[])
* CAUTION: when changing this list, check for side-effects on the signal
* handling setup of child processes. See tcop/postgres.c,
* bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c,
* postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, and
* postmaster/syslogger.c.
* postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c,
* postmaster/syslogger.c and postmaster/checkpointer.c
*/
pqinitmask();
PG_SETMASK(&BlockSig);
@ -1366,10 +1368,14 @@ ServerLoop(void)
* state that prevents it, start one. It doesn't matter if this
* fails, we'll just try again later.
*/
if (BgWriterPID == 0 &&
(pmState == PM_RUN || pmState == PM_RECOVERY ||
pmState == PM_HOT_STANDBY))
BgWriterPID = StartBackgroundWriter();
if (pmState == PM_RUN || pmState == PM_RECOVERY ||
pmState == PM_HOT_STANDBY)
{
if (BgWriterPID == 0)
BgWriterPID = StartBackgroundWriter();
if (CheckpointerPID == 0)
CheckpointerPID = StartCheckpointer();
}
/*
* Likewise, if we have lost the walwriter process, try to start a new
@ -2047,6 +2053,8 @@ SIGHUP_handler(SIGNAL_ARGS)
signal_child(StartupPID, SIGHUP);
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGHUP);
if (CheckpointerPID != 0)
signal_child(CheckpointerPID, SIGHUP);
if (WalWriterPID != 0)
signal_child(WalWriterPID, SIGHUP);
if (WalReceiverPID != 0)
@ -2119,6 +2127,8 @@ pmdie(SIGNAL_ARGS)
/* and the walwriter too */
if (WalWriterPID != 0)
signal_child(WalWriterPID, SIGTERM);
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGTERM);
/*
* If we're in recovery, we can't kill the startup process
@ -2159,9 +2169,11 @@ pmdie(SIGNAL_ARGS)
signal_child(StartupPID, SIGTERM);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, SIGTERM);
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGTERM);
if (pmState == PM_RECOVERY)
{
/* only bgwriter is active in this state */
/* only checkpointer is active in this state */
pmState = PM_WAIT_BACKENDS;
}
else if (pmState == PM_RUN ||
@ -2206,6 +2218,8 @@ pmdie(SIGNAL_ARGS)
signal_child(StartupPID, SIGQUIT);
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGQUIT);
if (CheckpointerPID != 0)
signal_child(CheckpointerPID, SIGQUIT);
if (WalWriterPID != 0)
signal_child(WalWriterPID, SIGQUIT);
if (WalReceiverPID != 0)
@ -2336,12 +2350,14 @@ reaper(SIGNAL_ARGS)
}
/*
* Crank up the background writer, if we didn't do that already
* Crank up background tasks, if we didn't do that already
* when we entered consistent recovery state. It doesn't matter
* if this fails, we'll just try again later.
*/
if (BgWriterPID == 0)
BgWriterPID = StartBackgroundWriter();
if (CheckpointerPID == 0)
CheckpointerPID = StartCheckpointer();
/*
* Likewise, start other special children as needed. In a restart
@ -2369,10 +2385,22 @@ reaper(SIGNAL_ARGS)
if (pid == BgWriterPID)
{
BgWriterPID = 0;
if (!EXIT_STATUS_0(exitstatus))
HandleChildCrash(pid, exitstatus,
_("background writer process"));
continue;
}
/*
* Was it the checkpointer?
*/
if (pid == CheckpointerPID)
{
CheckpointerPID = 0;
if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN)
{
/*
* OK, we saw normal exit of the bgwriter after it's been told
* OK, we saw normal exit of the checkpointer after it's been told
* to shut down. We expect that it wrote a shutdown
* checkpoint. (If for some reason it didn't, recovery will
* occur on next postmaster start.)
@ -2409,11 +2437,11 @@ reaper(SIGNAL_ARGS)
else
{
/*
* Any unexpected exit of the bgwriter (including FATAL exit)
* Any unexpected exit of the checkpointer (including FATAL exit)
* is treated as a crash.
*/
HandleChildCrash(pid, exitstatus,
_("background writer process"));
_("checkpointer process"));
}
continue;
@ -2597,8 +2625,8 @@ CleanupBackend(int pid,
}
/*
* HandleChildCrash -- cleanup after failed backend, bgwriter, walwriter,
* or autovacuum.
* HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
* walwriter or autovacuum.
*
* The objectives here are to clean up our local state about the child
* process, and to signal all other remaining children to quickdie.
@ -2691,6 +2719,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
}
/* Take care of the checkpointer too */
if (pid == CheckpointerPID)
CheckpointerPID = 0;
else if (CheckpointerPID != 0 && !FatalError)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
(SendStop ? "SIGSTOP" : "SIGQUIT"),
(int) CheckpointerPID)));
signal_child(CheckpointerPID, (SendStop ? SIGSTOP : SIGQUIT));
}
/* Take care of the walwriter too */
if (pid == WalWriterPID)
WalWriterPID = 0;
@ -2887,9 +2927,10 @@ PostmasterStateMachine(void)
{
/*
* PM_WAIT_BACKENDS state ends when we have no regular backends
* (including autovac workers) and no walwriter or autovac launcher.
* If we are doing crash recovery then we expect the bgwriter to exit
* too, otherwise not. The archiver, stats, and syslogger processes
* (including autovac workers) and no walwriter, autovac launcher
* or bgwriter. If we are doing crash recovery then we expect the
* checkpointer to exit as well, otherwise not.
* The archiver, stats, and syslogger processes
* are disregarded since they are not connected to shared memory; we
* also disregard dead_end children here. Walsenders are also
* disregarded, they will be terminated later after writing the
@ -2898,7 +2939,8 @@ PostmasterStateMachine(void)
if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
StartupPID == 0 &&
WalReceiverPID == 0 &&
(BgWriterPID == 0 || !FatalError) &&
BgWriterPID == 0 &&
(CheckpointerPID == 0 || !FatalError) &&
WalWriterPID == 0 &&
AutoVacPID == 0)
{
@ -2920,22 +2962,22 @@ PostmasterStateMachine(void)
/*
* If we get here, we are proceeding with normal shutdown. All
* the regular children are gone, and it's time to tell the
* bgwriter to do a shutdown checkpoint.
* checkpointer to do a shutdown checkpoint.
*/
Assert(Shutdown > NoShutdown);
/* Start the bgwriter if not running */
if (BgWriterPID == 0)
BgWriterPID = StartBackgroundWriter();
/* Start the checkpointer if not running */
if (CheckpointerPID == 0)
CheckpointerPID = StartCheckpointer();
/* And tell it to shut down */
if (BgWriterPID != 0)
if (CheckpointerPID != 0)
{
signal_child(BgWriterPID, SIGUSR2);
signal_child(CheckpointerPID, SIGUSR2);
pmState = PM_SHUTDOWN;
}
else
{
/*
* If we failed to fork a bgwriter, just shut down. Any
* If we failed to fork a checkpointer, just shut down. Any
* required cleanup will happen at next restart. We set
* FatalError so that an "abnormal shutdown" message gets
* logged when we exit.
@ -2994,6 +3036,7 @@ PostmasterStateMachine(void)
Assert(StartupPID == 0);
Assert(WalReceiverPID == 0);
Assert(BgWriterPID == 0);
Assert(CheckpointerPID == 0);
Assert(WalWriterPID == 0);
Assert(AutoVacPID == 0);
/* syslogger is not considered here */
@ -4173,6 +4216,8 @@ sigusr1_handler(SIGNAL_ARGS)
*/
Assert(BgWriterPID == 0);
BgWriterPID = StartBackgroundWriter();
Assert(CheckpointerPID == 0);
CheckpointerPID = StartCheckpointer();
pmState = PM_RECOVERY;
}
@ -4459,6 +4504,10 @@ StartChildProcess(AuxProcType type)
ereport(LOG,
(errmsg("could not fork background writer process: %m")));
break;
case CheckpointerProcess:
ereport(LOG,
(errmsg("could not fork checkpointer process: %m")));
break;
case WalWriterProcess:
ereport(LOG,
(errmsg("could not fork WAL writer process: %m")));

View File

@ -1278,11 +1278,9 @@ BufferSync(int flags)
break;
/*
* Perform normal bgwriter duties and sleep to throttle our
* I/O rate.
* Sleep to throttle our I/O rate.
*/
CheckpointWriteDelay(flags,
(double) num_written / num_to_write);
CheckpointWriteDelay(flags, (double) num_written / num_to_write);
}
}

View File

@ -38,7 +38,7 @@
/*
* Special values for the segno arg to RememberFsyncRequest.
*
* Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
* Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
* fsync request from the queue if an identical, subsequent request is found.
* See comments there before making changes here.
*/
@ -77,7 +77,7 @@
* Inactive segments are those that once contained data but are currently
* not needed because of an mdtruncate() operation. The reason for leaving
* them present at size zero, rather than unlinking them, is that other
* backends and/or the bgwriter might be holding open file references to
* backends and/or the checkpointer might be holding open file references to
* such segments. If the relation expands again after mdtruncate(), such
* that a deactivated segment becomes active again, it is important that
* such file references still be valid --- else data might get written
@ -111,7 +111,7 @@ static MemoryContext MdCxt; /* context for all md.c allocations */
/*
* In some contexts (currently, standalone backends and the bgwriter process)
* In some contexts (currently, standalone backends and the checkpointer process)
* we keep track of pending fsync operations: we need to remember all relation
* segments that have been written since the last checkpoint, so that we can
* fsync them down to disk before completing the next checkpoint. This hash
@ -123,7 +123,7 @@ static MemoryContext MdCxt; /* context for all md.c allocations */
* a hash table, because we don't expect there to be any duplicate requests.
*
* (Regular backends do not track pending operations locally, but forward
* them to the bgwriter.)
* them to the checkpointer.)
*/
typedef struct
{
@ -194,7 +194,7 @@ mdinit(void)
* Create pending-operations hashtable if we need it. Currently, we need
* it if we are standalone (not under a postmaster) OR if we are a
* bootstrap-mode subprocess of a postmaster (that is, a startup or
* bgwriter process).
* checkpointer process).
*/
if (!IsUnderPostmaster || IsBootstrapProcessingMode())
{
@ -214,10 +214,10 @@ mdinit(void)
}
/*
* In archive recovery, we rely on bgwriter to do fsyncs, but we will have
* In archive recovery, we rely on checkpointer to do fsyncs, but we will have
* already created the pendingOpsTable during initialization of the startup
* process. Calling this function drops the local pendingOpsTable so that
* subsequent requests will be forwarded to bgwriter.
* subsequent requests will be forwarded to checkpointer.
*/
void
SetForwardFsyncRequests(void)
@ -765,9 +765,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
* NOTE: this assumption could only be wrong if another backend has
* truncated the relation. We rely on higher code levels to handle that
* scenario by closing and re-opening the md fd, which is handled via
* relcache flush. (Since the bgwriter doesn't participate in relcache
* relcache flush. (Since the checkpointer doesn't participate in relcache
* flush, it could have segment chain entries for inactive segments;
* that's OK because the bgwriter never needs to compute relation size.)
* that's OK because the checkpointer never needs to compute relation size.)
*/
while (v->mdfd_chain != NULL)
{
@ -957,7 +957,7 @@ mdsync(void)
elog(ERROR, "cannot sync without a pendingOpsTable");
/*
* If we are in the bgwriter, the sync had better include all fsync
* If we are in the checkpointer, the sync had better include all fsync
* requests that were queued by backends up to this point. The tightest
* race condition that could occur is that a buffer that must be written
* and fsync'd for the checkpoint could have been dumped by a backend just
@ -1033,7 +1033,7 @@ mdsync(void)
int failures;
/*
* If in bgwriter, we want to absorb pending requests every so
* If in checkpointer, we want to absorb pending requests every so
* often to prevent overflow of the fsync request queue. It is
* unspecified whether newly-added entries will be visited by
* hash_seq_search, but we don't care since we don't need to
@ -1070,9 +1070,9 @@ mdsync(void)
* say "but an unreferenced SMgrRelation is still a leak!" Not
* really, because the only case in which a checkpoint is done
* by a process that isn't about to shut down is in the
* bgwriter, and it will periodically do smgrcloseall(). This
* checkpointer, and it will periodically do smgrcloseall(). This
* fact justifies our not closing the reln in the success path
* either, which is a good thing since in non-bgwriter cases
* either, which is a good thing since in non-checkpointer cases
* we couldn't safely do that.) Furthermore, in many cases
* the relation will have been dirtied through this same smgr
* relation, and so we can save a file open/close cycle.
@ -1301,7 +1301,7 @@ register_unlink(RelFileNodeBackend rnode)
else
{
/*
* Notify the bgwriter about it. If we fail to queue the request
* Notify the checkpointer about it. If we fail to queue the request
* message, we have to sleep and try again, because we can't simply
* delete the file now. Ugly, but hopefully won't happen often.
*
@ -1315,10 +1315,10 @@ register_unlink(RelFileNodeBackend rnode)
}
/*
* RememberFsyncRequest() -- callback from bgwriter side of fsync request
* RememberFsyncRequest() -- callback from checkpointer side of fsync request
*
* We stuff most fsync requests into the local hash table for execution
* during the bgwriter's next checkpoint. UNLINK requests go into a
* during the checkpointer's next checkpoint. UNLINK requests go into a
* separate linked list, however, because they get processed separately.
*
* The range of possible segment numbers is way less than the range of
@ -1460,20 +1460,20 @@ ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
else if (IsUnderPostmaster)
{
/*
* Notify the bgwriter about it. If we fail to queue the revoke
* Notify the checkpointer about it. If we fail to queue the revoke
* message, we have to sleep and try again ... ugly, but hopefully
* won't happen often.
*
* XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
* error would leave the no-longer-used file still present on disk,
* which would be bad, so I'm inclined to assume that the bgwriter
* which would be bad, so I'm inclined to assume that the checkpointer
* will always empty the queue soon.
*/
while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
pg_usleep(10000L); /* 10 msec seems a good number */
/*
* Note we don't wait for the bgwriter to actually absorb the revoke
* Note we don't wait for the checkpointer to actually absorb the revoke
* message; see mdsync() for the implications.
*/
}

View File

@ -256,7 +256,7 @@ typedef struct RmgrData
extern const RmgrData RmgrTable[];
/*
* Exported to support xlog switching from bgwriter
* Exported to support xlog switching from checkpointer
*/
extern pg_time_t GetLastSegSwitchTime(void);
extern XLogRecPtr RequestXLogSwitch(void);

View File

@ -22,6 +22,7 @@ typedef enum
BootstrapProcess,
StartupProcess,
BgWriterProcess,
CheckpointerProcess,
WalWriterProcess,
WalReceiverProcess,

View File

@ -23,6 +23,7 @@ extern int CheckPointWarning;
extern double CheckPointCompletionTarget;
extern void BackgroundWriterMain(void);
extern void CheckpointerMain(void);
extern void RequestCheckpoint(int flags);
extern void CheckpointWriteDelay(int flags, double progress);

View File

@ -190,11 +190,11 @@ extern PROC_HDR *ProcGlobal;
* We set aside some extra PGPROC structures for auxiliary processes,
* ie things that aren't full-fledged backends but need shmem access.
*
* Background writer and WAL writer run during normal operation. Startup
* process and WAL receiver also consume 2 slots, but WAL writer is
* launched only after startup has exited, so we only need 3 slots.
* Background writer, checkpointer and WAL writer run during normal operation.
* Startup process and WAL receiver also consume 2 slots, but WAL writer is
* launched only after startup has exited, so we only need 4 slots.
*/
#define NUM_AUXILIARY_PROCS 3
#define NUM_AUXILIARY_PROCS 4
/* configurable options */

View File

@ -19,7 +19,7 @@
/*
* Reasons for signalling a Postgres child process (a backend or an auxiliary
* process, like bgwriter). We can cope with concurrent signals for different
* process, like checkpointer). We can cope with concurrent signals for different
* reasons. However, if the same reason is signaled multiple times in quick
* succession, the process is likely to observe only one notification of it.
* This is okay for the present uses.