diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 4fe08df350..f9b839c3da 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -315,6 +315,9 @@ AuxiliaryProcessMain(int argc, char *argv[]) case BgWriterProcess: statmsg = "writer process"; break; + case CheckpointerProcess: + statmsg = "checkpointer process"; + break; case WalWriterProcess: statmsg = "wal writer process"; break; @@ -415,6 +418,11 @@ AuxiliaryProcessMain(int argc, char *argv[]) BackgroundWriterMain(); proc_exit(1); /* should never return */ + case CheckpointerProcess: + /* don't set signals, checkpointer has its own agenda */ + CheckpointerMain(); + proc_exit(1); /* should never return */ + case WalWriterProcess: /* don't set signals, walwriter has its own agenda */ InitXLOGAccess(); diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 0767e97435..e7414d20de 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -13,6 +13,6 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \ - syslogger.o walwriter.o + syslogger.o walwriter.o checkpointer.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 2d0b63987e..2841cdfc81 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -10,20 +10,13 @@ * still empowered to issue writes if the bgwriter fails to maintain enough * clean shared buffers. * - * The bgwriter is also charged with handling all checkpoints. It will - * automatically dispatch a checkpoint after a certain amount of time has - * elapsed since the last one, and it can be signaled to perform requested - * checkpoints as well. 
(The GUC parameter that mandates a checkpoint every - * so many WAL segments is implemented by having backends signal the bgwriter - * when they fill WAL segments; the bgwriter itself doesn't watch for the - * condition.) + * As of Postgres 9.2 the bgwriter no longer handles checkpoints. * * The bgwriter is started by the postmaster as soon as the startup subprocess * finishes, or as soon as recovery begins if we are doing archive recovery. * It remains alive until the postmaster commands it to terminate. - * Normal termination is by SIGUSR2, which instructs the bgwriter to execute - * a shutdown checkpoint and then exit(0). (All backends must be stopped - * before SIGUSR2 is issued!) Emergency termination is by SIGQUIT; like any + * Normal termination is by SIGTERM, which instructs the bgwriter to exit(0). + * Emergency termination is by SIGQUIT; like any * backend, the bgwriter will simply abort and exit on SIGQUIT. * * If the bgwriter exits unexpectedly, the postmaster treats that the same @@ -54,7 +47,6 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" -#include "replication/syncrep.h" #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/lwlock.h" @@ -67,96 +59,15 @@ #include "utils/resowner.h" -/*---------- - * Shared memory area for communication between bgwriter and backends - * - * The ckpt counters allow backends to watch for completion of a checkpoint - * request they send. Here's how it works: - * * At start of a checkpoint, bgwriter reads (and clears) the request flags - * and increments ckpt_started, while holding ckpt_lck. - * * On completion of a checkpoint, bgwriter sets ckpt_done to - * equal ckpt_started. - * * On failure of a checkpoint, bgwriter increments ckpt_failed - * and sets ckpt_done to equal ckpt_started. - * - * The algorithm for backends is: - * 1. Record current values of ckpt_failed and ckpt_started, and - * set request flags, while holding ckpt_lck. - * 2.
Send signal to request checkpoint. - * 3. Sleep until ckpt_started changes. Now you know a checkpoint has - * begun since you started this algorithm (although *not* that it was - * specifically initiated by your signal), and that it is using your flags. - * 4. Record new value of ckpt_started. - * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo - * arithmetic here in case counters wrap around.) Now you know a - * checkpoint has started and completed, but not whether it was - * successful. - * 6. If ckpt_failed is different from the originally saved value, - * assume request failed; otherwise it was definitely successful. - * - * ckpt_flags holds the OR of the checkpoint request flags sent by all - * requesting backends since the last checkpoint start. The flags are - * chosen so that OR'ing is the correct way to combine multiple requests. - * - * num_backend_writes is used to count the number of buffer writes performed - * by non-bgwriter processes. This counter should be wide enough that it - * can't overflow during a single bgwriter cycle. num_backend_fsync - * counts the subset of those writes that also had to do their own fsync, - * because the background writer failed to absorb their request. - * - * The requests array holds fsync requests sent by backends and not yet - * absorbed by the bgwriter. - * - * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and - * the requests fields are protected by BgWriterCommLock. 
- *---------- - */ -typedef struct -{ - RelFileNodeBackend rnode; - ForkNumber forknum; - BlockNumber segno; /* see md.c for special values */ - /* might add a real request-type field later; not needed yet */ -} BgWriterRequest; - -typedef struct -{ - pid_t bgwriter_pid; /* PID of bgwriter (0 if not started) */ - - slock_t ckpt_lck; /* protects all the ckpt_* fields */ - - int ckpt_started; /* advances when checkpoint starts */ - int ckpt_done; /* advances when checkpoint done */ - int ckpt_failed; /* advances when checkpoint fails */ - - int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ - - uint32 num_backend_writes; /* counts non-bgwriter buffer writes */ - uint32 num_backend_fsync; /* counts non-bgwriter fsync calls */ - - int num_requests; /* current # of requests */ - int max_requests; /* allocated array size */ - BgWriterRequest requests[1]; /* VARIABLE LENGTH ARRAY */ -} BgWriterShmemStruct; - -static BgWriterShmemStruct *BgWriterShmem; - -/* interval for calling AbsorbFsyncRequests in CheckpointWriteDelay */ -#define WRITES_PER_ABSORB 1000 - /* * GUC parameters */ int BgWriterDelay = 200; -int CheckPointTimeout = 300; -int CheckPointWarning = 30; -double CheckPointCompletionTarget = 0.5; /* * Flags set by interrupt handlers for later service in the main loop. 
*/ static volatile sig_atomic_t got_SIGHUP = false; -static volatile sig_atomic_t checkpoint_requested = false; static volatile sig_atomic_t shutdown_requested = false; /* @@ -164,29 +75,14 @@ static volatile sig_atomic_t shutdown_requested = false; */ static bool am_bg_writer = false; -static bool ckpt_active = false; - -/* these values are valid when ckpt_active is true: */ -static pg_time_t ckpt_start_time; -static XLogRecPtr ckpt_start_recptr; -static double ckpt_cached_elapsed; - -static pg_time_t last_checkpoint_time; -static pg_time_t last_xlog_switch_time; - /* Prototypes for private functions */ -static void CheckArchiveTimeout(void); static void BgWriterNap(void); -static bool IsCheckpointOnSchedule(double progress); -static bool ImmediateCheckpointRequested(void); -static bool CompactBgwriterRequestQueue(void); /* Signal handlers */ static void bg_quickdie(SIGNAL_ARGS); static void BgSigHupHandler(SIGNAL_ARGS); -static void ReqCheckpointHandler(SIGNAL_ARGS); static void ReqShutdownHandler(SIGNAL_ARGS); @@ -202,7 +98,6 @@ BackgroundWriterMain(void) sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; - BgWriterShmem->bgwriter_pid = MyProcPid; am_bg_writer = true; /* @@ -228,13 +123,13 @@ BackgroundWriterMain(void) * process to participate in ProcSignal signalling. 
*/ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ - pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ - pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ + pqsignal(SIGINT, SIG_IGN); /* as of 9.2 no longer requests checkpoint */ + pqsignal(SIGTERM, ReqShutdownHandler); /* shutdown */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); - pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */ - pqsignal(SIGUSR2, ReqShutdownHandler); /* request shutdown */ + pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */ + pqsignal(SIGUSR2, SIG_IGN); /* as of 9.2 ignored; shutdown is requested via SIGTERM */ /* * Reset some signals that are accepted by postmaster but not here @@ -248,11 +143,6 @@ BackgroundWriterMain(void) /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); - /* - * Initialize so that first time-driven event happens at the correct time. - */ - last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); - /* * Create a resource owner to keep track of our resources (currently only * buffer pins). @@ -305,20 +195,6 @@ BackgroundWriterMain(void) AtEOXact_Files(); AtEOXact_HashTables(false); - /* Warn any waiting backends that the checkpoint failed. */ - if (ckpt_active) - { - /* use volatile pointer to prevent code rearrangement */ - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - - SpinLockAcquire(&bgs->ckpt_lck); - bgs->ckpt_failed++; - bgs->ckpt_done = bgs->ckpt_started; - SpinLockRelease(&bgs->ckpt_lck); - - ckpt_active = false; - } - /* * Now return to normal top-level context and clear ErrorContext for * next time. @@ -361,19 +237,11 @@ BackgroundWriterMain(void) if (RecoveryInProgress()) ThisTimeLineID = GetRecoveryTargetTLI(); - /* Do this once before starting the loop, then just at SIGHUP time.
*/ - SyncRepUpdateSyncStandbysDefined(); - /* * Loop forever */ for (;;) { - bool do_checkpoint = false; - int flags = 0; - pg_time_t now; - int elapsed_secs; - /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. @@ -381,23 +249,11 @@ BackgroundWriterMain(void) if (!PostmasterIsAlive()) exit(1); - /* - * Process any requests or signals received recently. - */ - AbsorbFsyncRequests(); - if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* update global shmem state for sync rep */ - SyncRepUpdateSyncStandbysDefined(); - } - if (checkpoint_requested) - { - checkpoint_requested = false; - do_checkpoint = true; - BgWriterStats.m_requested_checkpoints++; } if (shutdown_requested) { @@ -406,203 +262,20 @@ BackgroundWriterMain(void) * control back to the sigsetjmp block above */ ExitOnAnyError = true; - /* Close down the database */ - ShutdownXLOG(0, 0); /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* - * Force a checkpoint if too much time has elapsed since the last one. - * Note that we count a timed checkpoint in stats only when this - * occurs without an external request, but we set the CAUSE_TIME flag - * bit even if there is also an external request. + * Do one cycle of dirty-buffer writing. */ - now = (pg_time_t) time(NULL); - elapsed_secs = now - last_checkpoint_time; - if (elapsed_secs >= CheckPointTimeout) - { - if (!do_checkpoint) - BgWriterStats.m_timed_checkpoints++; - do_checkpoint = true; - flags |= CHECKPOINT_CAUSE_TIME; - } - - /* - * Do a checkpoint if requested, otherwise do one cycle of - * dirty-buffer writing. - */ - if (do_checkpoint) - { - bool ckpt_performed = false; - bool do_restartpoint; - - /* use volatile pointer to prevent code rearrangement */ - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - - /* - * Check if we should perform a checkpoint or a restartpoint. 
As a - * side-effect, RecoveryInProgress() initializes TimeLineID if - * it's not set yet. - */ - do_restartpoint = RecoveryInProgress(); - - /* - * Atomically fetch the request flags to figure out what kind of a - * checkpoint we should perform, and increase the started-counter - * to acknowledge that we've started a new checkpoint. - */ - SpinLockAcquire(&bgs->ckpt_lck); - flags |= bgs->ckpt_flags; - bgs->ckpt_flags = 0; - bgs->ckpt_started++; - SpinLockRelease(&bgs->ckpt_lck); - - /* - * The end-of-recovery checkpoint is a real checkpoint that's - * performed while we're still in recovery. - */ - if (flags & CHECKPOINT_END_OF_RECOVERY) - do_restartpoint = false; - - /* - * We will warn if (a) too soon since last checkpoint (whatever - * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag - * since the last checkpoint start. Note in particular that this - * implementation will not generate warnings caused by - * CheckPointTimeout < CheckPointWarning. - */ - if (!do_restartpoint && - (flags & CHECKPOINT_CAUSE_XLOG) && - elapsed_secs < CheckPointWarning) - ereport(LOG, - (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", - "checkpoints are occurring too frequently (%d seconds apart)", - elapsed_secs, - elapsed_secs), - errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); - - /* - * Initialize bgwriter-private variables used during checkpoint. - */ - ckpt_active = true; - if (!do_restartpoint) - ckpt_start_recptr = GetInsertRecPtr(); - ckpt_start_time = now; - ckpt_cached_elapsed = 0; - - /* - * Do the checkpoint. - */ - if (!do_restartpoint) - { - CreateCheckPoint(flags); - ckpt_performed = true; - } - else - ckpt_performed = CreateRestartPoint(flags); - - /* - * After any checkpoint, close all smgr files. This is so we - * won't hang onto smgr references to deleted files indefinitely. - */ - smgrcloseall(); - - /* - * Indicate checkpoint completion to any waiting backends. 
- */ - SpinLockAcquire(&bgs->ckpt_lck); - bgs->ckpt_done = bgs->ckpt_started; - SpinLockRelease(&bgs->ckpt_lck); - - if (ckpt_performed) - { - /* - * Note we record the checkpoint start time not end time as - * last_checkpoint_time. This is so that time-driven - * checkpoints happen at a predictable spacing. - */ - last_checkpoint_time = now; - } - else - { - /* - * We were not able to perform the restartpoint (checkpoints - * throw an ERROR in case of error). Most likely because we - * have not received any new checkpoint WAL records since the - * last restartpoint. Try again in 15 s. - */ - last_checkpoint_time = now - CheckPointTimeout + 15; - } - - ckpt_active = false; - } - else - BgBufferSync(); - - /* Check for archive_timeout and switch xlog files if necessary. */ - CheckArchiveTimeout(); + BgBufferSync(); /* Nap for the configured time. */ BgWriterNap(); } } -/* - * CheckArchiveTimeout -- check for archive_timeout and switch xlog files - * - * This will switch to a new WAL file and force an archive file write - * if any activity is recorded in the current WAL file, including just - * a single checkpoint record. - */ -static void -CheckArchiveTimeout(void) -{ - pg_time_t now; - pg_time_t last_time; - - if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) - return; - - now = (pg_time_t) time(NULL); - - /* First we do a quick check using possibly-stale local state. */ - if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout) - return; - - /* - * Update local state ... note that last_xlog_switch_time is the last time - * a switch was performed *or requested*. 
- */ - last_time = GetLastSegSwitchTime(); - - last_xlog_switch_time = Max(last_xlog_switch_time, last_time); - - /* Now we can do the real check */ - if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) - { - XLogRecPtr switchpoint; - - /* OK, it's time to switch */ - switchpoint = RequestXLogSwitch(); - - /* - * If the returned pointer points exactly to a segment boundary, - * assume nothing happened. - */ - if ((switchpoint.xrecoff % XLogSegSize) != 0) - ereport(DEBUG1, - (errmsg("transaction log switch forced (archive_timeout=%d)", - XLogArchiveTimeout))); - - /* - * Update state in any case, so we don't retry constantly when the - * system is idle. - */ - last_xlog_switch_time = now; - } -} - /* * BgWriterNap -- Nap for the configured time or until a signal is received. */ @@ -624,185 +297,24 @@ BgWriterNap(void) * respond reasonably promptly when someone signals us, break down the * sleep into 1-second increments, and check for interrupts after each * nap. - * - * We absorb pending requests after each short sleep. */ - if (bgwriter_lru_maxpages > 0 || ckpt_active) + if (bgwriter_lru_maxpages > 0) udelay = BgWriterDelay * 1000L; - else if (XLogArchiveTimeout > 0) - udelay = 1000000L; /* One second */ else udelay = 10000000L; /* Ten seconds */ while (udelay > 999999L) { - if (got_SIGHUP || shutdown_requested || - (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)) + if (got_SIGHUP || shutdown_requested) break; pg_usleep(1000000L); - AbsorbFsyncRequests(); udelay -= 1000000L; } - if (!(got_SIGHUP || shutdown_requested || - (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))) + if (!(got_SIGHUP || shutdown_requested)) pg_usleep(udelay); } -/* - * Returns true if an immediate checkpoint request is pending. (Note that - * this does not check the *current* checkpoint's IMMEDIATE flag, but whether - * there is one pending behind it.) 
- */ -static bool -ImmediateCheckpointRequested(void) -{ - if (checkpoint_requested) - { - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - - /* - * We don't need to acquire the ckpt_lck in this case because we're - * only looking at a single flag bit. - */ - if (bgs->ckpt_flags & CHECKPOINT_IMMEDIATE) - return true; - } - return false; -} - -/* - * CheckpointWriteDelay -- yield control to bgwriter during a checkpoint - * - * This function is called after each page write performed by BufferSync(). - * It is responsible for keeping the bgwriter's normal activities in - * progress during a long checkpoint, and for throttling BufferSync()'s - * write rate to hit checkpoint_completion_target. - * - * The checkpoint request flags should be passed in; currently the only one - * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. - * - * 'progress' is an estimate of how much of the work has been done, as a - * fraction between 0.0 meaning none, and 1.0 meaning all done. - */ -void -CheckpointWriteDelay(int flags, double progress) -{ - static int absorb_counter = WRITES_PER_ABSORB; - - /* Do nothing if checkpoint is being executed by non-bgwriter process */ - if (!am_bg_writer) - return; - - /* - * Perform the usual bgwriter duties and take a nap, unless we're behind - * schedule, in which case we just try to catch up as quickly as possible. 
- */ - if (!(flags & CHECKPOINT_IMMEDIATE) && - !shutdown_requested && - !ImmediateCheckpointRequested() && - IsCheckpointOnSchedule(progress)) - { - if (got_SIGHUP) - { - got_SIGHUP = false; - ProcessConfigFile(PGC_SIGHUP); - /* update global shmem state for sync rep */ - SyncRepUpdateSyncStandbysDefined(); - } - - AbsorbFsyncRequests(); - absorb_counter = WRITES_PER_ABSORB; - - BgBufferSync(); - CheckArchiveTimeout(); - BgWriterNap(); - } - else if (--absorb_counter <= 0) - { - /* - * Absorb pending fsync requests after each WRITES_PER_ABSORB write - * operations even when we don't sleep, to prevent overflow of the - * fsync request queue. - */ - AbsorbFsyncRequests(); - absorb_counter = WRITES_PER_ABSORB; - } -} - -/* - * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint - * in time? - * - * Compares the current progress against the time/segments elapsed since last - * checkpoint, and returns true if the progress we've made this far is greater - * than the elapsed time/segments. - */ -static bool -IsCheckpointOnSchedule(double progress) -{ - XLogRecPtr recptr; - struct timeval now; - double elapsed_xlogs, - elapsed_time; - - Assert(ckpt_active); - - /* Scale progress according to checkpoint_completion_target. */ - progress *= CheckPointCompletionTarget; - - /* - * Check against the cached value first. Only do the more expensive - * calculations once we reach the target previously calculated. Since - * neither time or WAL insert pointer moves backwards, a freshly - * calculated value can only be greater than or equal to the cached value. - */ - if (progress < ckpt_cached_elapsed) - return false; - - /* - * Check progress against WAL segments written and checkpoint_segments. - * - * We compare the current WAL insert location against the location - * computed before calling CreateCheckPoint. 
The code in XLogInsert that - * actually triggers a checkpoint when checkpoint_segments is exceeded - * compares against RedoRecptr, so this is not completely accurate. - * However, it's good enough for our purposes, we're only calculating an - * estimate anyway. - */ - if (!RecoveryInProgress()) - { - recptr = GetInsertRecPtr(); - elapsed_xlogs = - (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + - ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / - CheckPointSegments; - - if (progress < elapsed_xlogs) - { - ckpt_cached_elapsed = elapsed_xlogs; - return false; - } - } - - /* - * Check progress against time elapsed and checkpoint_timeout. - */ - gettimeofday(&now, NULL); - elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) + - now.tv_usec / 1000000.0) / CheckPointTimeout; - - if (progress < elapsed_time) - { - ckpt_cached_elapsed = elapsed_time; - return false; - } - - /* It looks like we're on schedule. */ - return true; -} - - /* -------------------------------- * signal handler routines * -------------------------------- @@ -847,441 +359,9 @@ BgSigHupHandler(SIGNAL_ARGS) got_SIGHUP = true; } -/* SIGINT: set flag to run a normal checkpoint right away */ -static void -ReqCheckpointHandler(SIGNAL_ARGS) -{ - checkpoint_requested = true; -} - -/* SIGUSR2: set flag to run a shutdown checkpoint and exit */ +/* SIGTERM: set flag to exit (the bgwriter no longer runs a shutdown checkpoint) */ static void ReqShutdownHandler(SIGNAL_ARGS) { shutdown_requested = true; } - - -/* -------------------------------- - * communication with backends - * -------------------------------- - */ - -/* - * BgWriterShmemSize - * Compute space needed for bgwriter-related shared memory - */ -Size -BgWriterShmemSize(void) -{ - Size size; - - /* - * Currently, the size of the requests[] array is arbitrarily set equal to - * NBuffers. This may prove too large or small ...
- */ - size = offsetof(BgWriterShmemStruct, requests); - size = add_size(size, mul_size(NBuffers, sizeof(BgWriterRequest))); - - return size; -} - -/* - * BgWriterShmemInit - * Allocate and initialize bgwriter-related shared memory - */ -void -BgWriterShmemInit(void) -{ - bool found; - - BgWriterShmem = (BgWriterShmemStruct *) - ShmemInitStruct("Background Writer Data", - BgWriterShmemSize(), - &found); - - if (!found) - { - /* First time through, so initialize */ - MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct)); - SpinLockInit(&BgWriterShmem->ckpt_lck); - BgWriterShmem->max_requests = NBuffers; - } -} - -/* - * RequestCheckpoint - * Called in backend processes to request a checkpoint - * - * flags is a bitwise OR of the following: - * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. - * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. - * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, - * ignoring checkpoint_completion_target parameter. - * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured - * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or - * CHECKPOINT_END_OF_RECOVERY). - * CHECKPOINT_WAIT: wait for completion before returning (otherwise, - * just signal bgwriter to do it, and return). - * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. - * (This affects logging, and in particular enables CheckPointWarning.) - */ -void -RequestCheckpoint(int flags) -{ - /* use volatile pointer to prevent code rearrangement */ - volatile BgWriterShmemStruct *bgs = BgWriterShmem; - int ntries; - int old_failed, - old_started; - - /* - * If in a standalone backend, just do it ourselves. - */ - if (!IsPostmasterEnvironment) - { - /* - * There's no point in doing slow checkpoints in a standalone backend, - * because there's no other backends the checkpoint could disrupt. - */ - CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); - - /* - * After any checkpoint, close all smgr files. 
This is so we won't - * hang onto smgr references to deleted files indefinitely. - */ - smgrcloseall(); - - return; - } - - /* - * Atomically set the request flags, and take a snapshot of the counters. - * When we see ckpt_started > old_started, we know the flags we set here - * have been seen by bgwriter. - * - * Note that we OR the flags with any existing flags, to avoid overriding - * a "stronger" request by another backend. The flag senses must be - * chosen to make this work! - */ - SpinLockAcquire(&bgs->ckpt_lck); - - old_failed = bgs->ckpt_failed; - old_started = bgs->ckpt_started; - bgs->ckpt_flags |= flags; - - SpinLockRelease(&bgs->ckpt_lck); - - /* - * Send signal to request checkpoint. It's possible that the bgwriter - * hasn't started yet, or is in process of restarting, so we will retry a - * few times if needed. Also, if not told to wait for the checkpoint to - * occur, we consider failure to send the signal to be nonfatal and merely - * LOG it. - */ - for (ntries = 0;; ntries++) - { - if (BgWriterShmem->bgwriter_pid == 0) - { - if (ntries >= 20) /* max wait 2.0 sec */ - { - elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, - "could not request checkpoint because bgwriter not running"); - break; - } - } - else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0) - { - if (ntries >= 20) /* max wait 2.0 sec */ - { - elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, - "could not signal for checkpoint: %m"); - break; - } - } - else - break; /* signal sent successfully */ - - CHECK_FOR_INTERRUPTS(); - pg_usleep(100000L); /* wait 0.1 sec, then retry */ - } - - /* - * If requested, wait for completion. We detect completion according to - * the algorithm given above. - */ - if (flags & CHECKPOINT_WAIT) - { - int new_started, - new_failed; - - /* Wait for a new checkpoint to start. 
*/ - for (;;) - { - SpinLockAcquire(&bgs->ckpt_lck); - new_started = bgs->ckpt_started; - SpinLockRelease(&bgs->ckpt_lck); - - if (new_started != old_started) - break; - - CHECK_FOR_INTERRUPTS(); - pg_usleep(100000L); - } - - /* - * We are waiting for ckpt_done >= new_started, in a modulo sense. - */ - for (;;) - { - int new_done; - - SpinLockAcquire(&bgs->ckpt_lck); - new_done = bgs->ckpt_done; - new_failed = bgs->ckpt_failed; - SpinLockRelease(&bgs->ckpt_lck); - - if (new_done - new_started >= 0) - break; - - CHECK_FOR_INTERRUPTS(); - pg_usleep(100000L); - } - - if (new_failed != old_failed) - ereport(ERROR, - (errmsg("checkpoint request failed"), - errhint("Consult recent messages in the server log for details."))); - } -} - -/* - * ForwardFsyncRequest - * Forward a file-fsync request from a backend to the bgwriter - * - * Whenever a backend is compelled to write directly to a relation - * (which should be seldom, if the bgwriter is getting its job done), - * the backend calls this routine to pass over knowledge that the relation - * is dirty and must be fsync'd before next checkpoint. We also use this - * opportunity to count such writes for statistical purposes. - * - * segno specifies which segment (not block!) of the relation needs to be - * fsync'd. (Since the valid range is much less than BlockNumber, we can - * use high values for special flags; that's all internal to md.c, which - * see for details.) - * - * To avoid holding the lock for longer than necessary, we normally write - * to the requests[] queue without checking for duplicates. The bgwriter - * will have to eliminate dups internally anyway. However, if we discover - * that the queue is full, we make a pass over the entire queue to compact - * it. This is somewhat expensive, but the alternative is for the backend - * to perform its own fsync, which is far more expensive in practice. 
It - * is theoretically possible a backend fsync might still be necessary, if - * the queue is full and contains no duplicate entries. In that case, we - * let the backend know by returning false. - */ -bool -ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, - BlockNumber segno) -{ - BgWriterRequest *request; - - if (!IsUnderPostmaster) - return false; /* probably shouldn't even get here */ - - if (am_bg_writer) - elog(ERROR, "ForwardFsyncRequest must not be called in bgwriter"); - - LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE); - - /* Count all backend writes regardless of if they fit in the queue */ - BgWriterShmem->num_backend_writes++; - - /* - * If the background writer isn't running or the request queue is full, - * the backend will have to perform its own fsync request. But before - * forcing that to happen, we can try to compact the background writer - * request queue. - */ - if (BgWriterShmem->bgwriter_pid == 0 || - (BgWriterShmem->num_requests >= BgWriterShmem->max_requests - && !CompactBgwriterRequestQueue())) - { - /* - * Count the subset of writes where backends have to do their own - * fsync - */ - BgWriterShmem->num_backend_fsync++; - LWLockRelease(BgWriterCommLock); - return false; - } - request = &BgWriterShmem->requests[BgWriterShmem->num_requests++]; - request->rnode = rnode; - request->forknum = forknum; - request->segno = segno; - LWLockRelease(BgWriterCommLock); - return true; -} - -/* - * CompactBgwriterRequestQueue - * Remove duplicates from the request queue to avoid backend fsyncs. - * - * Although a full fsync request queue is not common, it can lead to severe - * performance problems when it does happen. So far, this situation has - * only been observed to occur when the system is under heavy write load, - * and especially during the "sync" phase of a checkpoint. Without this - * logic, each backend begins doing an fsync for every block written, which - * gets very expensive and can slow down the whole system. 
- * - * Trying to do this every time the queue is full could lose if there - * aren't any removable entries. But should be vanishingly rare in - * practice: there's one queue entry per shared buffer. - */ -static bool -CompactBgwriterRequestQueue() -{ - struct BgWriterSlotMapping - { - BgWriterRequest request; - int slot; - }; - - int n, - preserve_count; - int num_skipped = 0; - HASHCTL ctl; - HTAB *htab; - bool *skip_slot; - - /* must hold BgWriterCommLock in exclusive mode */ - Assert(LWLockHeldByMe(BgWriterCommLock)); - - /* Initialize temporary hash table */ - MemSet(&ctl, 0, sizeof(ctl)); - ctl.keysize = sizeof(BgWriterRequest); - ctl.entrysize = sizeof(struct BgWriterSlotMapping); - ctl.hash = tag_hash; - htab = hash_create("CompactBgwriterRequestQueue", - BgWriterShmem->num_requests, - &ctl, - HASH_ELEM | HASH_FUNCTION); - - /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests); - - /* - * The basic idea here is that a request can be skipped if it's followed - * by a later, identical request. It might seem more sensible to work - * backwards from the end of the queue and check whether a request is - * *preceded* by an earlier, identical request, in the hopes of doing less - * copying. But that might change the semantics, if there's an - * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so - * we do it this way. It would be possible to be even smarter if we made - * the code below understand the specific semantics of such requests (it - * could blow away preceding entries that would end up being canceled - * anyhow), but it's not clear that the extra complexity would buy us - * anything. 
- */ - for (n = 0; n < BgWriterShmem->num_requests; ++n) - { - BgWriterRequest *request; - struct BgWriterSlotMapping *slotmap; - bool found; - - request = &BgWriterShmem->requests[n]; - slotmap = hash_search(htab, request, HASH_ENTER, &found); - if (found) - { - skip_slot[slotmap->slot] = true; - ++num_skipped; - } - slotmap->slot = n; - } - - /* Done with the hash table. */ - hash_destroy(htab); - - /* If no duplicates, we're out of luck. */ - if (!num_skipped) - { - pfree(skip_slot); - return false; - } - - /* We found some duplicates; remove them. */ - for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n) - { - if (skip_slot[n]) - continue; - BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n]; - } - ereport(DEBUG1, - (errmsg("compacted fsync request queue from %d entries to %d entries", - BgWriterShmem->num_requests, preserve_count))); - BgWriterShmem->num_requests = preserve_count; - - /* Cleanup. */ - pfree(skip_slot); - return true; -} - -/* - * AbsorbFsyncRequests - * Retrieve queued fsync requests and pass them to local smgr. - * - * This is exported because it must be called during CreateCheckPoint; - * we have to be sure we have accepted all pending requests just before - * we start fsync'ing. Since CreateCheckPoint sometimes runs in - * non-bgwriter processes, do nothing if not bgwriter. - */ -void -AbsorbFsyncRequests(void) -{ - BgWriterRequest *requests = NULL; - BgWriterRequest *request; - int n; - - if (!am_bg_writer) - return; - - /* - * We have to PANIC if we fail to absorb all the pending requests (eg, - * because our hashtable runs out of memory). This is because the system - * cannot run safely if we are unable to fsync what we have been told to - * fsync. Fortunately, the hashtable is so small that the problem is - * quite unlikely to arise in practice. - */ - START_CRIT_SECTION(); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array. 
- */ - LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE); - - /* Transfer write count into pending pgstats message */ - BgWriterStats.m_buf_written_backend += BgWriterShmem->num_backend_writes; - BgWriterStats.m_buf_fsync_backend += BgWriterShmem->num_backend_fsync; - - BgWriterShmem->num_backend_writes = 0; - BgWriterShmem->num_backend_fsync = 0; - - n = BgWriterShmem->num_requests; - if (n > 0) - { - requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest)); - memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest)); - } - BgWriterShmem->num_requests = 0; - - LWLockRelease(BgWriterCommLock); - - for (request = requests; n > 0; request++, n--) - RememberFsyncRequest(request->rnode, request->forknum, request->segno); - - if (requests) - pfree(requests); - - END_CRIT_SECTION(); -} diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 4bb519d46a..6758083bdd 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -208,6 +208,7 @@ char *output_config_variable = NULL; /* PIDs of special child processes; 0 when not running */ static pid_t StartupPID = 0, BgWriterPID = 0, + CheckpointerPID = 0, WalWriterPID = 0, WalReceiverPID = 0, AutoVacPID = 0, @@ -279,7 +280,7 @@ typedef enum PM_WAIT_BACKUP, /* waiting for online backup mode to end */ PM_WAIT_READONLY, /* waiting for read only backends to exit */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ - PM_SHUTDOWN, /* waiting for bgwriter to do shutdown ckpt */ + PM_SHUTDOWN, /* waiting for checkpointer to do shutdown ckpt */ PM_SHUTDOWN_2, /* waiting for archiver and walsenders to * finish */ PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */ @@ -465,6 +466,7 @@ static void ShmemBackendArrayRemove(Backend *bn); #define StartupDataBase() StartChildProcess(StartupProcess) #define StartBackgroundWriter() StartChildProcess(BgWriterProcess) +#define StartCheckpointer() StartChildProcess(CheckpointerProcess) 
#define StartWalWriter() StartChildProcess(WalWriterProcess) #define StartWalReceiver() StartChildProcess(WalReceiverProcess) @@ -1028,8 +1030,8 @@ PostmasterMain(int argc, char *argv[]) * CAUTION: when changing this list, check for side-effects on the signal * handling setup of child processes. See tcop/postgres.c, * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, - * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, and - * postmaster/syslogger.c. + * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, + * postmaster/syslogger.c and postmaster/checkpointer.c */ pqinitmask(); PG_SETMASK(&BlockSig); @@ -1366,10 +1368,14 @@ ServerLoop(void) * state that prevents it, start one. It doesn't matter if this * fails, we'll just try again later. */ - if (BgWriterPID == 0 && - (pmState == PM_RUN || pmState == PM_RECOVERY || - pmState == PM_HOT_STANDBY)) - BgWriterPID = StartBackgroundWriter(); + if (pmState == PM_RUN || pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY) + { + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); + } /* * Likewise, if we have lost the walwriter process, try to start a new @@ -2047,6 +2053,8 @@ SIGHUP_handler(SIGNAL_ARGS) signal_child(StartupPID, SIGHUP); if (BgWriterPID != 0) signal_child(BgWriterPID, SIGHUP); + if (CheckpointerPID != 0) + signal_child(CheckpointerPID, SIGHUP); if (WalWriterPID != 0) signal_child(WalWriterPID, SIGHUP); if (WalReceiverPID != 0) @@ -2119,6 +2127,8 @@ pmdie(SIGNAL_ARGS) /* and the walwriter too */ if (WalWriterPID != 0) signal_child(WalWriterPID, SIGTERM); + if (BgWriterPID != 0) + signal_child(BgWriterPID, SIGTERM); /* * If we're in recovery, we can't kill the startup process @@ -2159,9 +2169,11 @@ pmdie(SIGNAL_ARGS) signal_child(StartupPID, SIGTERM); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGTERM); + if (BgWriterPID != 0) + signal_child(BgWriterPID, SIGTERM); 
if (pmState == PM_RECOVERY) { - /* only bgwriter is active in this state */ + /* only checkpointer is active in this state */ pmState = PM_WAIT_BACKENDS; } else if (pmState == PM_RUN || @@ -2206,6 +2218,8 @@ pmdie(SIGNAL_ARGS) signal_child(StartupPID, SIGQUIT); if (BgWriterPID != 0) signal_child(BgWriterPID, SIGQUIT); + if (CheckpointerPID != 0) + signal_child(CheckpointerPID, SIGQUIT); if (WalWriterPID != 0) signal_child(WalWriterPID, SIGQUIT); if (WalReceiverPID != 0) @@ -2336,12 +2350,14 @@ reaper(SIGNAL_ARGS) } /* - * Crank up the background writer, if we didn't do that already + * Crank up background tasks, if we didn't do that already * when we entered consistent recovery state. It doesn't matter * if this fails, we'll just try again later. */ if (BgWriterPID == 0) BgWriterPID = StartBackgroundWriter(); + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); /* * Likewise, start other special children as needed. In a restart @@ -2369,10 +2385,22 @@ reaper(SIGNAL_ARGS) if (pid == BgWriterPID) { BgWriterPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("background writer process")); + continue; + } + + /* + * Was it the checkpointer? + */ + if (pid == CheckpointerPID) + { + CheckpointerPID = 0; if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN) { /* - * OK, we saw normal exit of the bgwriter after it's been told + * OK, we saw normal exit of the checkpointer after it's been told * to shut down. We expect that it wrote a shutdown * checkpoint. (If for some reason it didn't, recovery will * occur on next postmaster start.) @@ -2409,11 +2437,11 @@ reaper(SIGNAL_ARGS) else { /* - * Any unexpected exit of the bgwriter (including FATAL exit) + * Any unexpected exit of the checkpointer (including FATAL exit) * is treated as a crash. 
*/ HandleChildCrash(pid, exitstatus, - _("background writer process")); + _("checkpointer process")); } continue; @@ -2597,8 +2625,8 @@ CleanupBackend(int pid, } /* - * HandleChildCrash -- cleanup after failed backend, bgwriter, walwriter, - * or autovacuum. + * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer, + * walwriter or autovacuum. * * The objectives here are to clean up our local state about the child * process, and to signal all other remaining children to quickdie. @@ -2691,6 +2719,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT)); } + /* Take care of the checkpointer too */ + if (pid == CheckpointerPID) + CheckpointerPID = 0; + else if (CheckpointerPID != 0 && !FatalError) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) CheckpointerPID))); + signal_child(CheckpointerPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + /* Take care of the walwriter too */ if (pid == WalWriterPID) WalWriterPID = 0; @@ -2887,9 +2927,10 @@ PostmasterStateMachine(void) { /* * PM_WAIT_BACKENDS state ends when we have no regular backends - * (including autovac workers) and no walwriter or autovac launcher. - * If we are doing crash recovery then we expect the bgwriter to exit - * too, otherwise not. The archiver, stats, and syslogger processes + * (including autovac workers) and no walwriter, autovac launcher + * or bgwriter. If we are doing crash recovery then we expect the + * checkpointer to exit as well, otherwise not. + * The archiver, stats, and syslogger processes * are disregarded since they are not connected to shared memory; we * also disregard dead_end children here. 
Walsenders are also * disregarded, they will be terminated later after writing the @@ -2898,7 +2939,8 @@ PostmasterStateMachine(void) if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 && StartupPID == 0 && WalReceiverPID == 0 && - (BgWriterPID == 0 || !FatalError) && + BgWriterPID == 0 && + (CheckpointerPID == 0 || !FatalError) && WalWriterPID == 0 && AutoVacPID == 0) { @@ -2920,22 +2962,22 @@ PostmasterStateMachine(void) /* * If we get here, we are proceeding with normal shutdown. All * the regular children are gone, and it's time to tell the - * bgwriter to do a shutdown checkpoint. + * checkpointer to do a shutdown checkpoint. */ Assert(Shutdown > NoShutdown); - /* Start the bgwriter if not running */ - if (BgWriterPID == 0) - BgWriterPID = StartBackgroundWriter(); + /* Start the checkpointer if not running */ + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); /* And tell it to shut down */ - if (BgWriterPID != 0) + if (CheckpointerPID != 0) { - signal_child(BgWriterPID, SIGUSR2); + signal_child(CheckpointerPID, SIGUSR2); pmState = PM_SHUTDOWN; } else { /* - * If we failed to fork a bgwriter, just shut down. Any + * If we failed to fork a checkpointer, just shut down. Any * required cleanup will happen at next restart. We set * FatalError so that an "abnormal shutdown" message gets * logged when we exit. 
@@ -2994,6 +3036,7 @@ PostmasterStateMachine(void) Assert(StartupPID == 0); Assert(WalReceiverPID == 0); Assert(BgWriterPID == 0); + Assert(CheckpointerPID == 0); Assert(WalWriterPID == 0); Assert(AutoVacPID == 0); /* syslogger is not considered here */ @@ -4173,6 +4216,8 @@ sigusr1_handler(SIGNAL_ARGS) */ Assert(BgWriterPID == 0); BgWriterPID = StartBackgroundWriter(); + Assert(CheckpointerPID == 0); + CheckpointerPID = StartCheckpointer(); pmState = PM_RECOVERY; } @@ -4459,6 +4504,10 @@ StartChildProcess(AuxProcType type) ereport(LOG, (errmsg("could not fork background writer process: %m"))); break; + case CheckpointerProcess: + ereport(LOG, + (errmsg("could not fork checkpointer process: %m"))); + break; case WalWriterProcess: ereport(LOG, (errmsg("could not fork WAL writer process: %m"))); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f47f1b66b1..e59af33e72 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1278,11 +1278,9 @@ BufferSync(int flags) break; /* - * Perform normal bgwriter duties and sleep to throttle our - * I/O rate. + * Sleep to throttle our I/O rate. */ - CheckpointWriteDelay(flags, - (double) num_written / num_to_write); + CheckpointWriteDelay(flags, (double) num_written / num_to_write); } } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 3015885117..a761369d39 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -38,7 +38,7 @@ /* * Special values for the segno arg to RememberFsyncRequest. * - * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an + * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an * fsync request from the queue if an identical, subsequent request is found. * See comments there before making changes here. 
*/ @@ -77,7 +77,7 @@ * Inactive segments are those that once contained data but are currently * not needed because of an mdtruncate() operation. The reason for leaving * them present at size zero, rather than unlinking them, is that other - * backends and/or the bgwriter might be holding open file references to + * backends and/or the checkpointer might be holding open file references to * such segments. If the relation expands again after mdtruncate(), such * that a deactivated segment becomes active again, it is important that * such file references still be valid --- else data might get written @@ -111,7 +111,7 @@ static MemoryContext MdCxt; /* context for all md.c allocations */ /* - * In some contexts (currently, standalone backends and the bgwriter process) + * In some contexts (currently, standalone backends and the checkpointer process) * we keep track of pending fsync operations: we need to remember all relation * segments that have been written since the last checkpoint, so that we can * fsync them down to disk before completing the next checkpoint. This hash @@ -123,7 +123,7 @@ static MemoryContext MdCxt; /* context for all md.c allocations */ * a hash table, because we don't expect there to be any duplicate requests. * * (Regular backends do not track pending operations locally, but forward - * them to the bgwriter.) + * them to the checkpointer.) */ typedef struct { @@ -194,7 +194,7 @@ mdinit(void) * Create pending-operations hashtable if we need it. Currently, we need * it if we are standalone (not under a postmaster) OR if we are a * bootstrap-mode subprocess of a postmaster (that is, a startup or - * bgwriter process). + * checkpointer process). 
*/ if (!IsUnderPostmaster || IsBootstrapProcessingMode()) { @@ -214,10 +214,10 @@ mdinit(void) } /* - * In archive recovery, we rely on bgwriter to do fsyncs, but we will have + * In archive recovery, we rely on checkpointer to do fsyncs, but we will have * already created the pendingOpsTable during initialization of the startup * process. Calling this function drops the local pendingOpsTable so that - * subsequent requests will be forwarded to bgwriter. + * subsequent requests will be forwarded to checkpointer. */ void SetForwardFsyncRequests(void) @@ -765,9 +765,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * NOTE: this assumption could only be wrong if another backend has * truncated the relation. We rely on higher code levels to handle that * scenario by closing and re-opening the md fd, which is handled via - * relcache flush. (Since the bgwriter doesn't participate in relcache + * relcache flush. (Since the checkpointer doesn't participate in relcache * flush, it could have segment chain entries for inactive segments; - * that's OK because the bgwriter never needs to compute relation size.) + * that's OK because the checkpointer never needs to compute relation size.) */ while (v->mdfd_chain != NULL) { @@ -957,7 +957,7 @@ mdsync(void) elog(ERROR, "cannot sync without a pendingOpsTable"); /* - * If we are in the bgwriter, the sync had better include all fsync + * If we are in the checkpointer, the sync had better include all fsync * requests that were queued by backends up to this point. The tightest * race condition that could occur is that a buffer that must be written * and fsync'd for the checkpoint could have been dumped by a backend just @@ -1033,7 +1033,7 @@ mdsync(void) int failures; /* - * If in bgwriter, we want to absorb pending requests every so + * If in checkpointer, we want to absorb pending requests every so * often to prevent overflow of the fsync request queue. 
It is * unspecified whether newly-added entries will be visited by * hash_seq_search, but we don't care since we don't need to @@ -1070,9 +1070,9 @@ mdsync(void) * say "but an unreferenced SMgrRelation is still a leak!" Not * really, because the only case in which a checkpoint is done * by a process that isn't about to shut down is in the - * bgwriter, and it will periodically do smgrcloseall(). This + * checkpointer, and it will periodically do smgrcloseall(). This * fact justifies our not closing the reln in the success path - * either, which is a good thing since in non-bgwriter cases + * either, which is a good thing since in non-checkpointer cases * we couldn't safely do that.) Furthermore, in many cases * the relation will have been dirtied through this same smgr * relation, and so we can save a file open/close cycle. @@ -1301,7 +1301,7 @@ register_unlink(RelFileNodeBackend rnode) else { /* - * Notify the bgwriter about it. If we fail to queue the request + * Notify the checkpointer about it. If we fail to queue the request * message, we have to sleep and try again, because we can't simply * delete the file now. Ugly, but hopefully won't happen often. * @@ -1315,10 +1315,10 @@ register_unlink(RelFileNodeBackend rnode) } /* - * RememberFsyncRequest() -- callback from bgwriter side of fsync request + * RememberFsyncRequest() -- callback from checkpointer side of fsync request * * We stuff most fsync requests into the local hash table for execution - * during the bgwriter's next checkpoint. UNLINK requests go into a + * during the checkpointer's next checkpoint. UNLINK requests go into a * separate linked list, however, because they get processed separately. * * The range of possible segment numbers is way less than the range of @@ -1460,20 +1460,20 @@ ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum) else if (IsUnderPostmaster) { /* - * Notify the bgwriter about it. If we fail to queue the revoke + * Notify the checkpointer about it. 
If we fail to queue the revoke * message, we have to sleep and try again ... ugly, but hopefully * won't happen often. * * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an * error would leave the no-longer-used file still present on disk, - * which would be bad, so I'm inclined to assume that the bgwriter + * which would be bad, so I'm inclined to assume that the checkpointer * will always empty the queue soon. */ while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC)) pg_usleep(10000L); /* 10 msec seems a good number */ /* - * Note we don't wait for the bgwriter to actually absorb the revoke + * Note we don't wait for the checkpointer to actually absorb the revoke * message; see mdsync() for the implications. */ } diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 4eaa243948..cb43879773 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -256,7 +256,7 @@ typedef struct RmgrData extern const RmgrData RmgrTable[]; /* - * Exported to support xlog switching from bgwriter + * Exported to support xlog switching from checkpointer */ extern pg_time_t GetLastSegSwitchTime(void); extern XLogRecPtr RequestXLogSwitch(void); diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index cee9bd1fa4..6153b7a0a2 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -22,6 +22,7 @@ typedef enum BootstrapProcess, StartupProcess, BgWriterProcess, + CheckpointerProcess, WalWriterProcess, WalReceiverProcess, diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index eaf2206f5e..c05901e929 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -23,6 +23,7 @@ extern int CheckPointWarning; extern double CheckPointCompletionTarget; extern void BackgroundWriterMain(void); +extern void CheckpointerMain(void); extern void RequestCheckpoint(int flags); extern 
void CheckpointWriteDelay(int flags, double progress); diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 46ec625e08..6e798b1b2d 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -190,11 +190,11 @@ extern PROC_HDR *ProcGlobal; * We set aside some extra PGPROC structures for auxiliary processes, * ie things that aren't full-fledged backends but need shmem access. * - * Background writer and WAL writer run during normal operation. Startup - * process and WAL receiver also consume 2 slots, but WAL writer is - * launched only after startup has exited, so we only need 3 slots. + * Background writer, checkpointer and WAL writer run during normal operation. + * Startup process and WAL receiver also consume 2 slots, but WAL writer is + * launched only after startup has exited, so we only need 4 slots. */ -#define NUM_AUXILIARY_PROCS 3 +#define NUM_AUXILIARY_PROCS 4 /* configurable options */ diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 2a27e0b7ed..d5afe01778 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -19,7 +19,7 @@ /* * Reasons for signalling a Postgres child process (a backend or an auxiliary - * process, like bgwriter). We can cope with concurrent signals for different + * process, like checkpointer). We can cope with concurrent signals for different * reasons. However, if the same reason is signaled multiple times in quick * succession, the process is likely to observe only one notification of it. * This is okay for the present uses.