/*------------------------------------------------------------------------- * * parallel.c * * Parallel support for pg_dump and pg_restore * * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/bin/pg_dump/parallel.c * *------------------------------------------------------------------------- */ /* * Parallel operation works like this: * * The original, leader process calls ParallelBackupStart(), which forks off * the desired number of worker processes, which each enter WaitForCommands(). * * The leader process dispatches an individual work item to one of the worker * processes in DispatchJobForTocEntry(). We send a command string such as * "DUMP 1234" or "RESTORE 1234", where 1234 is the TocEntry ID. * The worker process receives and decodes the command and passes it to the * routine pointed to by AH->WorkerJobDumpPtr or AH->WorkerJobRestorePtr, * which are routines of the current archive format. That routine performs * the required action (dump or restore) and returns an integer status code. * This is passed back to the leader where we pass it to the * ParallelCompletionPtr callback function that was passed to * DispatchJobForTocEntry(). The callback function does state updating * for the leader control logic in pg_backup_archiver.c. * * In principle additional archive-format-specific information might be needed * in commands or worker status responses, but so far that hasn't proved * necessary, since workers have full copies of the ArchiveHandle/TocEntry * data structures. Remember that we have forked off the workers only after * we have read in the catalog. That's why our worker processes can also * access the catalog information. (In the Windows case, the workers are * threads in the same process. To avoid problems, they work with cloned * copies of the Archive data structure; see RunWorker().) 
* * In the leader process, the workerStatus field for each worker has one of * the following values: * WRKR_NOT_STARTED: we've not yet forked this worker * WRKR_IDLE: it's waiting for a command * WRKR_WORKING: it's working on a command * WRKR_TERMINATED: process ended * The pstate->te[] entry for each worker is valid when it's in WRKR_WORKING * state, and must be NULL in other states. */ #include "postgres_fe.h" #ifndef WIN32 #include #include #include #include #include #endif #include "fe_utils/string_utils.h" #include "parallel.h" #include "pg_backup_utils.h" #include "port/pg_bswap.h" /* Mnemonic macros for indexing the fd array returned by pipe(2) */ #define PIPE_READ 0 #define PIPE_WRITE 1 #define NO_SLOT (-1) /* Failure result for GetIdleWorker() */ /* Worker process statuses */ typedef enum { WRKR_NOT_STARTED = 0, WRKR_IDLE, WRKR_WORKING, WRKR_TERMINATED, } T_WorkerStatus; #define WORKER_IS_RUNNING(workerStatus) \ ((workerStatus) == WRKR_IDLE || (workerStatus) == WRKR_WORKING) /* * Private per-parallel-worker state (typedef for this is in parallel.h). * * Much of this is valid only in the leader process (or, on Windows, should * be touched only by the leader thread). But the AH field should be touched * only by workers. The pipe descriptors are valid everywhere. */ struct ParallelSlot { T_WorkerStatus workerStatus; /* see enum above */ /* These fields are valid if workerStatus == WRKR_WORKING: */ ParallelCompletionPtr callback; /* function to call on completion */ void *callback_data; /* passthrough data for it */ ArchiveHandle *AH; /* Archive data worker is using */ int pipeRead; /* leader's end of the pipes */ int pipeWrite; int pipeRevRead; /* child's end of the pipes */ int pipeRevWrite; /* Child process/thread identity info: */ #ifdef WIN32 uintptr_t hThread; unsigned int threadId; #else pid_t pid; #endif }; #ifdef WIN32 /* * Structure to hold info passed by _beginthreadex() to the function it calls * via its single allowed argument. 
*/
typedef struct
{
	ArchiveHandle *AH;			/* leader database connection */
	ParallelSlot *slot;			/* this worker's parallel slot */
} WorkerInfo;

/*
 * Windows implementation of pipe access: pgpipe() is a function defined
 * elsewhere in this file, and reads/writes map onto recv()/send().
 */
static int	pgpipe(int handles[2]);
#define piperead(a,b,c)		recv(a,b,c,0)
#define pipewrite(a,b,c)	send(a,b,c,0)

#else							/* !WIN32 */

/* Non-Windows implementation of pipe access: plain pipe(2)/read(2)/write(2) */
#define pgpipe(a)			pipe(a)
#define piperead(a,b,c)		read(a,b,c)
#define pipewrite(a,b,c)	write(a,b,c)

#endif							/* WIN32 */

/*
 * State info for archive_close_connection() shutdown callback.
 *
 * pstate is non-NULL only while a parallel operation is set up (it is set
 * by ParallelBackupStart() and cleared by ParallelBackupEnd()); AHX is
 * recorded by on_exit_close_archive().
 */
typedef struct ShutdownInformation
{
	ParallelState *pstate;
	Archive    *AHX;
} ShutdownInformation;

static ShutdownInformation shutdown_info;

/*
 * State info for signal handling.
 * We assume signal_info initializes to zeroes.
 *
 * On Unix, myAH is the leader DB connection in the leader process, and the
 * worker's own connection in worker processes.  On Windows, we have only one
 * instance of signal_info, so myAH is the leader connection and the worker
 * connections must be dug out of pstate->parallelSlot[].
 */
typedef struct DumpSignalInformation
{
	ArchiveHandle *myAH;		/* database connection to issue cancel for */
	ParallelState *pstate;		/* parallel state, if any */
	bool		handler_set;	/* signal handler set up in this process? */
#ifndef WIN32
	bool		am_worker;		/* am I a worker process? */
#endif
} DumpSignalInformation;

/* Read from the cancel handlers, hence declared volatile */
static volatile DumpSignalInformation signal_info;

#ifdef WIN32
/* Protects signal_info and the slots' AH fields against the console handler */
static CRITICAL_SECTION signal_info_lock;
#endif

/*
 * Write a simple string to stderr --- must be safe in a signal handler.
 * We ignore the write() result since there's not much we could do about it.
 * Certain compilers make that harder than it ought to be.
*/
#define write_stderr(str) \
	do { \
		const char *str_ = (str); \
		int		rc_; \
		rc_ = write(fileno(stderr), str_, strlen(str_)); \
		(void) rc_; \
	} while (0)


#ifdef WIN32
/* file-scope variables */
static DWORD tls_index;			/* TLS slot allocated by
								 * init_parallel_dump_utils() */

/* globally visible variables (needed by exit_nicely) */
bool		parallel_init_done = false;
DWORD		mainThreadId;
#endif							/* WIN32 */

/* Local function prototypes */
static ParallelSlot *GetMyPSlot(ParallelState *pstate);
static void archive_close_connection(int code, void *arg);
static void ShutdownWorkersHard(ParallelState *pstate);
static void WaitForTerminatingWorkers(ParallelState *pstate);
static void set_cancel_handler(void);
static void set_cancel_pstate(ParallelState *pstate);
static void set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH);
static void RunWorker(ArchiveHandle *AH, ParallelSlot *slot);
static int	GetIdleWorker(ParallelState *pstate);
static bool HasEveryWorkerTerminated(ParallelState *pstate);
static void lockTableForWorker(ArchiveHandle *AH, TocEntry *te);
static void WaitForCommands(ArchiveHandle *AH, int pipefd[2]);
static bool ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate,
							bool do_wait);
static char *getMessageFromLeader(int pipefd[2]);
static void sendMessageToLeader(int pipefd[2], const char *str);
static int	select_loop(int maxFd, fd_set *workerset);
static char *getMessageFromWorker(ParallelState *pstate,
								  bool do_wait, int *worker);
static void sendMessageToWorker(ParallelState *pstate,
								int worker, const char *str);
static char *readMessageFromPipe(int fd);

/*
 * True if the string msg begins with the string prefix.  Note that the
 * prefix argument is evaluated twice.
 */
#define messageStartsWith(msg, prefix) \
	(strncmp(msg, prefix, strlen(prefix)) == 0)


/*
 * Initialize parallel dump support --- should be called early in process
 * startup.  (Currently, this is called whether or not we intend parallel
 * activity.)
*/ void init_parallel_dump_utils(void) { #ifdef WIN32 if (!parallel_init_done) { WSADATA wsaData; int err; /* Prepare for threaded operation */ tls_index = TlsAlloc(); mainThreadId = GetCurrentThreadId(); /* Initialize socket access */ err = WSAStartup(MAKEWORD(2, 2), &wsaData); if (err != 0) pg_fatal("%s() failed: error code %d", "WSAStartup", err); parallel_init_done = true; } #endif } /* * Find the ParallelSlot for the current worker process or thread. * * Returns NULL if no matching slot is found (this implies we're the leader). */ static ParallelSlot * GetMyPSlot(ParallelState *pstate) { int i; for (i = 0; i < pstate->numWorkers; i++) { #ifdef WIN32 if (pstate->parallelSlot[i].threadId == GetCurrentThreadId()) #else if (pstate->parallelSlot[i].pid == getpid()) #endif return &(pstate->parallelSlot[i]); } return NULL; } /* * A thread-local version of getLocalPQExpBuffer(). * * Non-reentrant but reduces memory leakage: we'll consume one buffer per * thread, which is much better than one per fmtId/fmtQualifiedId call. */ #ifdef WIN32 static PQExpBuffer getThreadLocalPQExpBuffer(void) { /* * The Tls code goes awry if we use a static var, so we provide for both * static and auto, and omit any use of the static var when using Tls. We * rely on TlsGetValue() to return 0 if the value is not yet set. */ static PQExpBuffer s_id_return = NULL; PQExpBuffer id_return; if (parallel_init_done) id_return = (PQExpBuffer) TlsGetValue(tls_index); else id_return = s_id_return; if (id_return) /* first time through? */ { /* same buffer, just wipe contents */ resetPQExpBuffer(id_return); } else { /* new buffer */ id_return = createPQExpBuffer(); if (parallel_init_done) TlsSetValue(tls_index, id_return); else s_id_return = id_return; } return id_return; } #endif /* WIN32 */ /* * pg_dump and pg_restore call this to register the cleanup handler * as soon as they've created the ArchiveHandle. 
*/ void on_exit_close_archive(Archive *AHX) { shutdown_info.AHX = AHX; on_exit_nicely(archive_close_connection, &shutdown_info); } /* * on_exit_nicely handler for shutting down database connections and * worker processes cleanly. */ static void archive_close_connection(int code, void *arg) { ShutdownInformation *si = (ShutdownInformation *) arg; if (si->pstate) { /* In parallel mode, must figure out who we are */ ParallelSlot *slot = GetMyPSlot(si->pstate); if (!slot) { /* * We're the leader. Forcibly shut down workers, then close our * own database connection, if any. */ ShutdownWorkersHard(si->pstate); if (si->AHX) DisconnectDatabase(si->AHX); } else { /* * We're a worker. Shut down our own DB connection if any. On * Windows, we also have to close our communication sockets, to * emulate what will happen on Unix when the worker process exits. * (Without this, if this is a premature exit, the leader would * fail to detect it because there would be no EOF condition on * the other end of the pipe.) */ if (slot->AH) DisconnectDatabase(&(slot->AH->public)); #ifdef WIN32 closesocket(slot->pipeRevRead); closesocket(slot->pipeRevWrite); #endif } } else { /* Non-parallel operation: just kill the leader DB connection */ if (si->AHX) DisconnectDatabase(si->AHX); } } /* * Forcibly shut down any remaining workers, waiting for them to finish. * * Note that we don't expect to come here during normal exit (the workers * should be long gone, and the ParallelState too). We're only here in a * pg_fatal() situation, so intervening to cancel active commands is * appropriate. */ static void ShutdownWorkersHard(ParallelState *pstate) { int i; /* * Close our write end of the sockets so that any workers waiting for * commands know they can exit. (Note: some of the pipeWrite fields might * still be zero, if we failed to initialize all the workers. Hence, just * ignore errors here.) 
*/ for (i = 0; i < pstate->numWorkers; i++) closesocket(pstate->parallelSlot[i].pipeWrite); /* * Force early termination of any commands currently in progress. */ #ifndef WIN32 /* On non-Windows, send SIGTERM to each worker process. */ for (i = 0; i < pstate->numWorkers; i++) { pid_t pid = pstate->parallelSlot[i].pid; if (pid != 0) kill(pid, SIGTERM); } #else /* * On Windows, send query cancels directly to the workers' backends. Use * a critical section to ensure worker threads don't change state. */ EnterCriticalSection(&signal_info_lock); for (i = 0; i < pstate->numWorkers; i++) { ArchiveHandle *AH = pstate->parallelSlot[i].AH; char errbuf[1]; if (AH != NULL && AH->connCancel != NULL) (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf)); } LeaveCriticalSection(&signal_info_lock); #endif /* Now wait for them to terminate. */ WaitForTerminatingWorkers(pstate); } /* * Wait for all workers to terminate. */ static void WaitForTerminatingWorkers(ParallelState *pstate) { while (!HasEveryWorkerTerminated(pstate)) { ParallelSlot *slot = NULL; int j; #ifndef WIN32 /* On non-Windows, use wait() to wait for next worker to end */ int status; pid_t pid = wait(&status); /* Find dead worker's slot, and clear the PID field */ for (j = 0; j < pstate->numWorkers; j++) { slot = &(pstate->parallelSlot[j]); if (slot->pid == pid) { slot->pid = 0; break; } } #else /* WIN32 */ /* On Windows, we must use WaitForMultipleObjects() */ HANDLE *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers); int nrun = 0; DWORD ret; uintptr_t hThread; for (j = 0; j < pstate->numWorkers; j++) { if (WORKER_IS_RUNNING(pstate->parallelSlot[j].workerStatus)) { lpHandles[nrun] = (HANDLE) pstate->parallelSlot[j].hThread; nrun++; } } ret = WaitForMultipleObjects(nrun, lpHandles, false, INFINITE); Assert(ret != WAIT_FAILED); hThread = (uintptr_t) lpHandles[ret - WAIT_OBJECT_0]; free(lpHandles); /* Find dead worker's slot, and clear the hThread field */ for (j = 0; j < pstate->numWorkers; j++) { slot = 
&(pstate->parallelSlot[j]); if (slot->hThread == hThread) { /* For cleanliness, close handles for dead threads */ CloseHandle((HANDLE) slot->hThread); slot->hThread = (uintptr_t) INVALID_HANDLE_VALUE; break; } } #endif /* WIN32 */ /* On all platforms, update workerStatus and te[] as well */ Assert(j < pstate->numWorkers); slot->workerStatus = WRKR_TERMINATED; pstate->te[j] = NULL; } } /* * Code for responding to cancel interrupts (SIGINT, control-C, etc) * * This doesn't quite belong in this module, but it needs access to the * ParallelState data, so there's not really a better place either. * * When we get a cancel interrupt, we could just die, but in pg_restore that * could leave a SQL command (e.g., CREATE INDEX on a large table) running * for a long time. Instead, we try to send a cancel request and then die. * pg_dump probably doesn't really need this, but we might as well use it * there too. Note that sending the cancel directly from the signal handler * is safe because PQcancel() is written to make it so. * * In parallel operation on Unix, each process is responsible for canceling * its own connection (this must be so because nobody else has access to it). * Furthermore, the leader process should attempt to forward its signal to * each child. In simple manual use of pg_dump/pg_restore, forwarding isn't * needed because typing control-C at the console would deliver SIGINT to * every member of the terminal process group --- but in other scenarios it * might be that only the leader gets signaled. * * On Windows, the cancel handler runs in a separate thread, because that's * how SetConsoleCtrlHandler works. We make it stop worker threads, send * cancels on all active connections, and then return FALSE, which will allow * the process to die. For safety's sake, we use a critical section to * protect the PGcancel structures against being changed while the signal * thread runs. 
*/

#ifndef WIN32

/*
 * Signal handler (Unix only)
 *
 * Installed for SIGINT, SIGTERM and SIGQUIT by set_cancel_handler().
 * Forwards SIGTERM to any workers, cancels our own query if possible, and
 * terminates the process with _exit(1); it never returns.
 */
static void
sigTermHandler(SIGNAL_ARGS)
{
	int			i;
	char		errbuf[1];		/* PQcancel's error text is discarded */

	/*
	 * Some platforms allow delivery of new signals to interrupt an active
	 * signal handler.  That could muck up our attempt to send PQcancel, so
	 * disable the signals that set_cancel_handler enabled.
	 */
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, SIG_IGN);
	pqsignal(SIGQUIT, SIG_IGN);

	/*
	 * If we're in the leader, forward signal to all workers.  (It seems best
	 * to do this before PQcancel; killing the leader transaction will result
	 * in invalid-snapshot errors from active workers, which maybe we can
	 * quiet by killing workers first.)  Ignore any errors.
	 */
	if (signal_info.pstate != NULL)
	{
		for (i = 0; i < signal_info.pstate->numWorkers; i++)
		{
			pid_t		pid = signal_info.pstate->parallelSlot[i].pid;

			/* pid is zero for slots with no live worker process */
			if (pid != 0)
				kill(pid, SIGTERM);
		}
	}

	/*
	 * Send QueryCancel if we have a connection to send to.  Ignore errors,
	 * there's not much we can do about them anyway.
	 */
	if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
		(void) PQcancel(signal_info.myAH->connCancel, errbuf, sizeof(errbuf));

	/*
	 * Report we're quitting, using nothing more complicated than write(2).
	 * When in parallel operation, only the leader process should do this.
	 */
	if (!signal_info.am_worker)
	{
		if (progname)
		{
			write_stderr(progname);
			write_stderr(": ");
		}
		write_stderr("terminated by user\n");
	}

	/*
	 * And die, using _exit() not exit() because the latter will invoke atexit
	 * handlers that can fail if we interrupted related code.
	 */
	_exit(1);
}

/*
 * Enable cancel interrupt handler, if not already done.
 *
 * (Unix variant.)  Routes SIGINT, SIGTERM and SIGQUIT to sigTermHandler().
 */
static void
set_cancel_handler(void)
{
	/*
	 * When forking, signal_info.handler_set will propagate into the new
	 * process, but that's fine because the signal handler state does too.
	 */
	if (!signal_info.handler_set)
	{
		signal_info.handler_set = true;

		pqsignal(SIGINT, sigTermHandler);
		pqsignal(SIGTERM, sigTermHandler);
		pqsignal(SIGQUIT, sigTermHandler);
	}
}

#else							/* WIN32 */

/*
 * Console interrupt handler --- runs in a newly-started thread.
 *
 * After stopping other threads and sending cancel requests on all open
 * connections, we return FALSE which will allow the default ExitProcess()
 * action to be taken.
 *
 * Only CTRL_C_EVENT and CTRL_BREAK_EVENT trigger the cancel logic; for any
 * other event type we just return FALSE.
 */
static BOOL WINAPI
consoleHandler(DWORD dwCtrlType)
{
	int			i;
	char		errbuf[1];		/* PQcancel's error text is discarded */

	if (dwCtrlType == CTRL_C_EVENT ||
		dwCtrlType == CTRL_BREAK_EVENT)
	{
		/* Critical section prevents changing data we look at here */
		EnterCriticalSection(&signal_info_lock);

		/*
		 * If in parallel mode, stop worker threads and send QueryCancel to
		 * their connected backends.  The main point of stopping the worker
		 * threads is to keep them from reporting the query cancels as errors,
		 * which would clutter the user's screen.  We needn't stop the leader
		 * thread since it won't be doing much anyway.  Do this before
		 * canceling the main transaction, else we might get invalid-snapshot
		 * errors reported before we can stop the workers.  Ignore errors,
		 * there's not much we can do about them anyway.
		 */
		if (signal_info.pstate != NULL)
		{
			for (i = 0; i < signal_info.pstate->numWorkers; i++)
			{
				ParallelSlot *slot = &(signal_info.pstate->parallelSlot[i]);
				ArchiveHandle *AH = slot->AH;
				HANDLE		hThread = (HANDLE) slot->hThread;

				/*
				 * Using TerminateThread here may leave some resources leaked,
				 * but it doesn't matter since we're about to end the whole
				 * process.
				 */
				if (hThread != INVALID_HANDLE_VALUE)
					TerminateThread(hThread, 0);

				if (AH != NULL && AH->connCancel != NULL)
					(void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
			}
		}

		/*
		 * Send QueryCancel to leader connection, if enabled.  Ignore errors,
		 * there's not much we can do about them anyway.
		 */
		if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
			(void) PQcancel(signal_info.myAH->connCancel,
							errbuf, sizeof(errbuf));

		LeaveCriticalSection(&signal_info_lock);

		/*
		 * Report we're quitting, using nothing more complicated than
		 * write(2).  (We might be able to get away with using pg_log_*()
		 * here, but since we terminated other threads uncleanly above, it
		 * seems better to assume as little as possible.)
		 */
		if (progname)
		{
			write_stderr(progname);
			write_stderr(": ");
		}
		write_stderr("terminated by user\n");
	}

	/* Always return FALSE to allow signal handling to continue */
	return FALSE;
}

/*
 * Enable cancel interrupt handler, if not already done.
 *
 * (Windows variant.)  Besides registering consoleHandler(), the first call
 * also initializes signal_info_lock.
 */
static void
set_cancel_handler(void)
{
	if (!signal_info.handler_set)
	{
		signal_info.handler_set = true;

		InitializeCriticalSection(&signal_info_lock);

		SetConsoleCtrlHandler(consoleHandler, TRUE);
	}
}

#endif							/* WIN32 */


/*
 * set_archive_cancel_info
 *
 * Fill AH->connCancel with cancellation info for the specified database
 * connection; or clear it if conn is NULL.
 *
 * Any previously stored cancel object is freed.  As a side effect this
 * installs the cancel handler on first use and records AH in
 * signal_info.myAH (on Windows, only when called from the main thread).
 */
void
set_archive_cancel_info(ArchiveHandle *AH, PGconn *conn)
{
	PGcancel   *oldConnCancel;

	/*
	 * Activate the interrupt handler if we didn't yet in this process.  On
	 * Windows, this also initializes signal_info_lock; therefore it's
	 * important that this happen at least once before we fork off any
	 * threads.
	 */
	set_cancel_handler();

	/*
	 * On Unix, we assume that storing a pointer value is atomic with respect
	 * to any possible signal interrupt.  On Windows, use a critical section.
	 */

#ifdef WIN32
	EnterCriticalSection(&signal_info_lock);
#endif

	/* Free the old one if we have one */
	oldConnCancel = AH->connCancel;
	/* be sure interrupt handler doesn't use pointer while freeing */
	AH->connCancel = NULL;

	if (oldConnCancel != NULL)
		PQfreeCancel(oldConnCancel);

	/* Set the new one if specified */
	if (conn)
		AH->connCancel = PQgetCancel(conn);

	/*
	 * On Unix, there's only ever one active ArchiveHandle per process, so we
	 * can just set signal_info.myAH unconditionally.  On Windows, do that
	 * only in the main thread; worker threads have to make sure their
	 * ArchiveHandle appears in the pstate data, which is dealt with in
	 * RunWorker().
	 */
#ifndef WIN32
	signal_info.myAH = AH;
#else
	if (mainThreadId == GetCurrentThreadId())
		signal_info.myAH = AH;
#endif

#ifdef WIN32
	LeaveCriticalSection(&signal_info_lock);
#endif
}

/*
 * set_cancel_pstate
 *
 * Set signal_info.pstate to point to the specified ParallelState, if any.
 * We need this mainly to have an interlock against Windows signal thread.
 */
static void
set_cancel_pstate(ParallelState *pstate)
{
#ifdef WIN32
	EnterCriticalSection(&signal_info_lock);
#endif

	signal_info.pstate = pstate;

#ifdef WIN32
	LeaveCriticalSection(&signal_info_lock);
#endif
}

/*
 * set_cancel_slot_archive
 *
 * Set ParallelSlot's AH field to point to the specified archive, if any.
 * We need this mainly to have an interlock against Windows signal thread.
 */
static void
set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH)
{
#ifdef WIN32
	EnterCriticalSection(&signal_info_lock);
#endif

	slot->AH = AH;

#ifdef WIN32
	LeaveCriticalSection(&signal_info_lock);
#endif
}


/*
 * This function is called by both Unix and Windows variants to set up
 * and run a worker process.  Caller should exit the process (or thread)
 * upon return.
*/ static void RunWorker(ArchiveHandle *AH, ParallelSlot *slot) { int pipefd[2]; /* fetch child ends of pipes */ pipefd[PIPE_READ] = slot->pipeRevRead; pipefd[PIPE_WRITE] = slot->pipeRevWrite; /* * Clone the archive so that we have our own state to work with, and in * particular our own database connection. * * We clone on Unix as well as Windows, even though technically we don't * need to because fork() gives us a copy in our own address space * already. But CloneArchive resets the state information and also clones * the database connection which both seem kinda helpful. */ AH = CloneArchive(AH); /* Remember cloned archive where signal handler can find it */ set_cancel_slot_archive(slot, AH); /* * Call the setup worker function that's defined in the ArchiveHandle. */ (AH->SetupWorkerPtr) ((Archive *) AH); /* * Execute commands until done. */ WaitForCommands(AH, pipefd); /* * Disconnect from database and clean up. */ set_cancel_slot_archive(slot, NULL); DisconnectDatabase(&(AH->public)); DeCloneArchive(AH); } /* * Thread base function for Windows */ #ifdef WIN32 static unsigned __stdcall init_spawned_worker_win32(WorkerInfo *wi) { ArchiveHandle *AH = wi->AH; ParallelSlot *slot = wi->slot; /* Don't need WorkerInfo anymore */ free(wi); /* Run the worker ... */ RunWorker(AH, slot); /* Exit the thread */ _endthreadex(0); return 0; } #endif /* WIN32 */ /* * This function starts a parallel dump or restore by spawning off the worker * processes. For Windows, it creates a number of threads; on Unix the * workers are created with fork(). 
*/ ParallelState * ParallelBackupStart(ArchiveHandle *AH) { ParallelState *pstate; int i; Assert(AH->public.numWorkers > 0); pstate = (ParallelState *) pg_malloc(sizeof(ParallelState)); pstate->numWorkers = AH->public.numWorkers; pstate->te = NULL; pstate->parallelSlot = NULL; if (AH->public.numWorkers == 1) return pstate; /* Create status arrays, being sure to initialize all fields to 0 */ pstate->te = (TocEntry **) pg_malloc0(pstate->numWorkers * sizeof(TocEntry *)); pstate->parallelSlot = (ParallelSlot *) pg_malloc0(pstate->numWorkers * sizeof(ParallelSlot)); #ifdef WIN32 /* Make fmtId() and fmtQualifiedId() use thread-local storage */ getLocalPQExpBuffer = getThreadLocalPQExpBuffer; #endif /* * Set the pstate in shutdown_info, to tell the exit handler that it must * clean up workers as well as the main database connection. But we don't * set this in signal_info yet, because we don't want child processes to * inherit non-NULL signal_info.pstate. */ shutdown_info.pstate = pstate; /* * Temporarily disable query cancellation on the leader connection. This * ensures that child processes won't inherit valid AH->connCancel * settings and thus won't try to issue cancels against the leader's * connection. No harm is done if we fail while it's disabled, because * the leader connection is idle at this point anyway. 
*/ set_archive_cancel_info(AH, NULL); /* Ensure stdio state is quiesced before forking */ fflush(NULL); /* Create desired number of workers */ for (i = 0; i < pstate->numWorkers; i++) { #ifdef WIN32 WorkerInfo *wi; uintptr_t handle; #else pid_t pid; #endif ParallelSlot *slot = &(pstate->parallelSlot[i]); int pipeMW[2], pipeWM[2]; /* Create communication pipes for this worker */ if (pgpipe(pipeMW) < 0 || pgpipe(pipeWM) < 0) pg_fatal("could not create communication channels: %m"); /* leader's ends of the pipes */ slot->pipeRead = pipeWM[PIPE_READ]; slot->pipeWrite = pipeMW[PIPE_WRITE]; /* child's ends of the pipes */ slot->pipeRevRead = pipeMW[PIPE_READ]; slot->pipeRevWrite = pipeWM[PIPE_WRITE]; #ifdef WIN32 /* Create transient structure to pass args to worker function */ wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo)); wi->AH = AH; wi->slot = slot; handle = _beginthreadex(NULL, 0, (void *) &init_spawned_worker_win32, wi, 0, &(slot->threadId)); slot->hThread = handle; slot->workerStatus = WRKR_IDLE; #else /* !WIN32 */ pid = fork(); if (pid == 0) { /* we are the worker */ int j; /* this is needed for GetMyPSlot() */ slot->pid = getpid(); /* instruct signal handler that we're in a worker now */ signal_info.am_worker = true; /* close read end of Worker -> Leader */ closesocket(pipeWM[PIPE_READ]); /* close write end of Leader -> Worker */ closesocket(pipeMW[PIPE_WRITE]); /* * Close all inherited fds for communication of the leader with * previously-forked workers. */ for (j = 0; j < i; j++) { closesocket(pstate->parallelSlot[j].pipeRead); closesocket(pstate->parallelSlot[j].pipeWrite); } /* Run the worker ... 
*/ RunWorker(AH, slot); /* We can just exit(0) when done */ exit(0); } else if (pid < 0) { /* fork failed */ pg_fatal("could not create worker process: %m"); } /* In Leader after successful fork */ slot->pid = pid; slot->workerStatus = WRKR_IDLE; /* close read end of Leader -> Worker */ closesocket(pipeMW[PIPE_READ]); /* close write end of Worker -> Leader */ closesocket(pipeWM[PIPE_WRITE]); #endif /* WIN32 */ } /* * Having forked off the workers, disable SIGPIPE so that leader isn't * killed if it tries to send a command to a dead worker. We don't want * the workers to inherit this setting, though. */ #ifndef WIN32 pqsignal(SIGPIPE, SIG_IGN); #endif /* * Re-establish query cancellation on the leader connection. */ set_archive_cancel_info(AH, AH->connection); /* * Tell the cancel signal handler to forward signals to worker processes, * too. (As with query cancel, we did not need this earlier because the * workers have not yet been given anything to do; if we die before this * point, any already-started workers will see EOF and quit promptly.) */ set_cancel_pstate(pstate); return pstate; } /* * Close down a parallel dump or restore. */ void ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate) { int i; /* No work if non-parallel */ if (pstate->numWorkers == 1) return; /* There should not be any unfinished jobs */ Assert(IsEveryWorkerIdle(pstate)); /* Close the sockets so that the workers know they can exit */ for (i = 0; i < pstate->numWorkers; i++) { closesocket(pstate->parallelSlot[i].pipeRead); closesocket(pstate->parallelSlot[i].pipeWrite); } /* Wait for them to exit */ WaitForTerminatingWorkers(pstate); /* * Unlink pstate from shutdown_info, so the exit handler will not try to * use it; and likewise unlink from signal_info. 
*/ shutdown_info.pstate = NULL; set_cancel_pstate(NULL); /* Release state (mere neatnik-ism, since we're about to terminate) */ free(pstate->te); free(pstate->parallelSlot); free(pstate); } /* * These next four functions handle construction and parsing of the command * strings and response strings for parallel workers. * * Currently, these can be the same regardless of which archive format we are * processing. In future, we might want to let format modules override these * functions to add format-specific data to a command or response. */ /* * buildWorkerCommand: format a command string to send to a worker. * * The string is built in the caller-supplied buffer of size buflen. */ static void buildWorkerCommand(ArchiveHandle *AH, TocEntry *te, T_Action act, char *buf, int buflen) { if (act == ACT_DUMP) snprintf(buf, buflen, "DUMP %d", te->dumpId); else if (act == ACT_RESTORE) snprintf(buf, buflen, "RESTORE %d", te->dumpId); else Assert(false); } /* * parseWorkerCommand: interpret a command string in a worker. */ static void parseWorkerCommand(ArchiveHandle *AH, TocEntry **te, T_Action *act, const char *msg) { DumpId dumpId; int nBytes; if (messageStartsWith(msg, "DUMP ")) { *act = ACT_DUMP; sscanf(msg, "DUMP %d%n", &dumpId, &nBytes); Assert(nBytes == strlen(msg)); *te = getTocEntryByDumpId(AH, dumpId); Assert(*te != NULL); } else if (messageStartsWith(msg, "RESTORE ")) { *act = ACT_RESTORE; sscanf(msg, "RESTORE %d%n", &dumpId, &nBytes); Assert(nBytes == strlen(msg)); *te = getTocEntryByDumpId(AH, dumpId); Assert(*te != NULL); } else pg_fatal("unrecognized command received from leader: \"%s\"", msg); } /* * buildWorkerResponse: format a response string to send to the leader. * * The string is built in the caller-supplied buffer of size buflen. */ static void buildWorkerResponse(ArchiveHandle *AH, TocEntry *te, T_Action act, int status, char *buf, int buflen) { snprintf(buf, buflen, "OK %d %d %d", te->dumpId, status, status == WORKER_IGNORED_ERRORS ? 
AH->public.n_errors : 0); } /* * parseWorkerResponse: parse the status message returned by a worker. * * Returns the integer status code, and may update fields of AH and/or te. */ static int parseWorkerResponse(ArchiveHandle *AH, TocEntry *te, const char *msg) { DumpId dumpId; int nBytes, n_errors; int status = 0; if (messageStartsWith(msg, "OK ")) { sscanf(msg, "OK %d %d %d%n", &dumpId, &status, &n_errors, &nBytes); Assert(dumpId == te->dumpId); Assert(nBytes == strlen(msg)); AH->public.n_errors += n_errors; } else pg_fatal("invalid message received from worker: \"%s\"", msg); return status; } /* * Dispatch a job to some free worker. * * te is the TocEntry to be processed, act is the action to be taken on it. * callback is the function to call on completion of the job. * * If no worker is currently available, this will block, and previously * registered callback functions may be called. */ void DispatchJobForTocEntry(ArchiveHandle *AH, ParallelState *pstate, TocEntry *te, T_Action act, ParallelCompletionPtr callback, void *callback_data) { int worker; char buf[256]; /* Get a worker, waiting if none are idle */ while ((worker = GetIdleWorker(pstate)) == NO_SLOT) WaitForWorkers(AH, pstate, WFW_ONE_IDLE); /* Construct and send command string */ buildWorkerCommand(AH, te, act, buf, sizeof(buf)); sendMessageToWorker(pstate, worker, buf); /* Remember worker is busy, and which TocEntry it's working on */ pstate->parallelSlot[worker].workerStatus = WRKR_WORKING; pstate->parallelSlot[worker].callback = callback; pstate->parallelSlot[worker].callback_data = callback_data; pstate->te[worker] = te; } /* * Find an idle worker and return its slot number. * Return NO_SLOT if none are idle. */ static int GetIdleWorker(ParallelState *pstate) { int i; for (i = 0; i < pstate->numWorkers; i++) { if (pstate->parallelSlot[i].workerStatus == WRKR_IDLE) return i; } return NO_SLOT; } /* * Return true iff no worker is running. 
*/
static bool
HasEveryWorkerTerminated(ParallelState *pstate)
{
	int			slotno = 0;

	/* A single idle or busy worker is enough to answer "no" */
	while (slotno < pstate->numWorkers)
	{
		T_WorkerStatus state = pstate->parallelSlot[slotno].workerStatus;

		if (WORKER_IS_RUNNING(state))
			return false;
		slotno++;
	}

	return true;
}

/*
 * Report whether all workers are currently idle.
 *
 * Returns false as soon as a slot in any state other than WRKR_IDLE is
 * found, and true if the scan completes without finding one.
 */
bool
IsEveryWorkerIdle(ParallelState *pstate)
{
	int			nworkers = pstate->numWorkers;
	int			w;

	for (w = 0; w < nworkers; w++)
	{
		if (pstate->parallelSlot[w].workerStatus == WRKR_IDLE)
			continue;			/* this one is fine, keep looking */
		return false;
	}

	return true;
}

/*
 * Acquire lock on a table to be dumped by a worker process.
 *
 * The leader process is already holding an ACCESS SHARE lock. Ordinarily
 * it's no problem for a worker to get one too, but if anything else besides
 * pg_dump is running, there's a possible deadlock:
 *
 * 1) Leader dumps the schema and locks all tables in ACCESS SHARE mode.
 * 2) Another process requests an ACCESS EXCLUSIVE lock (which is not granted
 *	  because the leader holds a conflicting ACCESS SHARE lock).
 * 3) A worker process also requests an ACCESS SHARE lock to read the table.
 *	  The worker is enqueued behind the ACCESS EXCLUSIVE lock request.
 * 4) Now we have a deadlock, since the leader is effectively waiting for
 *	  the worker. The server cannot detect that, however.
 *
 * To prevent an infinite wait, prior to touching a table in a worker, request
 * a lock in ACCESS SHARE mode but with NOWAIT. If we don't get the lock,
 * then we know that somebody else has requested an ACCESS EXCLUSIVE lock and
 * so we have a deadlock. We must fail the backup in that case.
*/
static void
lockTableForWorker(ArchiveHandle *AH, TocEntry *te)
{
	PQExpBuffer lockStmt;
	const char *relName;
	PGresult   *lockRes;

	/* BLOBS entries have no table to lock */
	if (strcmp(te->desc, "BLOBS") == 0)
		return;

	lockStmt = createPQExpBuffer();
	relName = fmtQualifiedId(te->namespace, te->tag);

	/* NOWAIT makes the server fail the request instead of queueing it */
	appendPQExpBuffer(lockStmt,
					  "LOCK TABLE %s IN ACCESS SHARE MODE NOWAIT",
					  relName);

	lockRes = PQexec(AH->connection, lockStmt->data);

	if (lockRes == NULL || PQresultStatus(lockRes) != PGRES_COMMAND_OK)
		pg_fatal("could not obtain lock on relation \"%s\"\n"
				 "This usually means that someone requested an ACCESS EXCLUSIVE lock "
				 "on the table after the pg_dump parent process had gotten the "
				 "initial ACCESS SHARE lock on the table.", relName);

	PQclear(lockRes);
	destroyPQExpBuffer(lockStmt);
}

/*
 * WaitForCommands: main routine for a worker process.
 *
 * Keeps reading commands from the leader, executing each one and sending
 * back a status response, until the leader's end of the pipe reports EOF.
 * pipefd holds the worker's read and write descriptors.
 */
static void
WaitForCommands(ArchiveHandle *AH, int pipefd[2])
{
	char	   *cmd;
	TocEntry   *entry;
	T_Action	action;
	int			rc = 0;
	char		reply[256];

	/* A NULL message means EOF on the pipe, which ends the loop */
	while ((cmd = getMessageFromLeader(pipefd)) != NULL)
	{
		/* Work out what we were asked to do, and on which TOC entry */
		parseWorkerCommand(AH, &entry, &action, cmd);

		switch (action)
		{
			case ACT_DUMP:
				/* Take the table lock in this worker's own session first */
				lockTableForWorker(AH, entry);
				rc = (AH->WorkerJobDumpPtr) (AH, entry);
				break;

			case ACT_RESTORE:
				rc = (AH->WorkerJobRestorePtr) (AH, entry);
				break;

			default:
				Assert(false);
				break;
		}

		/* Report the outcome back to the leader */
		buildWorkerResponse(AH, entry, action, rc, reply, sizeof(reply));
		sendMessageToLeader(pipefd, reply);

		/* The command string was pg_malloc'd for us; release it here */
		free(cmd);
	}
}

/*
 * Check for status messages from workers.
 *
 * If do_wait is true, wait to get a status message; otherwise, just return
 * immediately if there is none available.
* * When we get a status message, we pass the status code to the callback * function that was specified to DispatchJobForTocEntry, then reset the * worker status to IDLE. * * Returns true if we collected a status message, else false. * * XXX is it worth checking for more than one status message per call? * It seems somewhat unlikely that multiple workers would finish at exactly * the same time. */ static bool ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate, bool do_wait) { int worker; char *msg; /* Try to collect a status message */ msg = getMessageFromWorker(pstate, do_wait, &worker); if (!msg) { /* If do_wait is true, we must have detected EOF on some socket */ if (do_wait) pg_fatal("a worker process died unexpectedly"); return false; } /* Process it and update our idea of the worker's status */ if (messageStartsWith(msg, "OK ")) { ParallelSlot *slot = &pstate->parallelSlot[worker]; TocEntry *te = pstate->te[worker]; int status; status = parseWorkerResponse(AH, te, msg); slot->callback(AH, te, status, slot->callback_data); slot->workerStatus = WRKR_IDLE; pstate->te[worker] = NULL; } else pg_fatal("invalid message received from worker: \"%s\"", msg); /* Free the string returned from getMessageFromWorker */ free(msg); return true; } /* * Check for status results from workers, waiting if necessary. * * Available wait modes are: * WFW_NO_WAIT: reap any available status, but don't block * WFW_GOT_STATUS: wait for at least one more worker to finish * WFW_ONE_IDLE: wait for at least one worker to be idle * WFW_ALL_IDLE: wait for all workers to be idle * * Any received results are passed to the callback specified to * DispatchJobForTocEntry. * * This function is executed in the leader process. */ void WaitForWorkers(ArchiveHandle *AH, ParallelState *pstate, WFW_WaitOption mode) { bool do_wait = false; /* * In GOT_STATUS mode, always block waiting for a message, since we can't * return till we get something. 
In other modes, we don't block the first * time through the loop. */ if (mode == WFW_GOT_STATUS) { /* Assert that caller knows what it's doing */ Assert(!IsEveryWorkerIdle(pstate)); do_wait = true; } for (;;) { /* * Check for status messages, even if we don't need to block. We do * not try very hard to reap all available messages, though, since * there's unlikely to be more than one. */ if (ListenToWorkers(AH, pstate, do_wait)) { /* * If we got a message, we are done by definition for GOT_STATUS * mode, and we can also be certain that there's at least one idle * worker. So we're done in all but ALL_IDLE mode. */ if (mode != WFW_ALL_IDLE) return; } /* Check whether we must wait for new status messages */ switch (mode) { case WFW_NO_WAIT: return; /* never wait */ case WFW_GOT_STATUS: Assert(false); /* can't get here, because we waited */ break; case WFW_ONE_IDLE: if (GetIdleWorker(pstate) != NO_SLOT) return; break; case WFW_ALL_IDLE: if (IsEveryWorkerIdle(pstate)) return; break; } /* Loop back, and this time wait for something to happen */ do_wait = true; } } /* * Read one command message from the leader, blocking if necessary * until one is available, and return it as a malloc'd string. * On EOF, return NULL. * * This function is executed in worker processes. */ static char * getMessageFromLeader(int pipefd[2]) { return readMessageFromPipe(pipefd[PIPE_READ]); } /* * Send a status message to the leader. * * This function is executed in worker processes. */ static void sendMessageToLeader(int pipefd[2], const char *str) { int len = strlen(str) + 1; if (pipewrite(pipefd[PIPE_WRITE], str, len) != len) pg_fatal("could not write to the communication channel: %m"); } /* * Wait until some descriptor in "workerset" becomes readable. * Returns -1 on error, else the number of readable descriptors. 
*/
static int
select_loop(int maxFd, fd_set *workerset)
{
	int			i;
	/* Keep a pristine copy of the caller's set to restore before each call */
	fd_set		saveSet = *workerset;

	for (;;)
	{
		*workerset = saveSet;
		/* NULL timeout: block until a descriptor is readable or an error */
		i = select(maxFd + 1, workerset, NULL, NULL, NULL);

		/* An interrupted call is simply retried; anything else ends the loop */
#ifndef WIN32
		if (i < 0 && errno == EINTR)
			continue;
#else
		if (i == SOCKET_ERROR && WSAGetLastError() == WSAEINTR)
			continue;
#endif
		break;
	}

	return i;
}


/*
 * Check for messages from worker processes.
 *
 * If a message is available, return it as a malloc'd string, and put the
 * index of the sending worker in *worker.
 *
 * If nothing is available, wait if "do_wait" is true, else return NULL.
 *
 * If we detect EOF on any socket, we'll return NULL. It's not great that
 * that's hard to distinguish from the no-data-available case, but for now
 * our one caller is okay with that.
 *
 * Note that *worker is set only when a readable descriptor was found; it is
 * left untouched when we return NULL because nothing was ready.
 *
 * This function is executed in the leader process.
 */
static char *
getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
{
	int			i;
	fd_set		workerset;
	int			maxFd = -1;
	/* Zero timeout, so that the non-waiting select() call merely polls */
	struct timeval nowait = {0, 0};

	/* construct bitmap of socket descriptors for select() */
	FD_ZERO(&workerset);
	for (i = 0; i < pstate->numWorkers; i++)
	{
		/* Only workers that are idle or working are listened to */
		if (!WORKER_IS_RUNNING(pstate->parallelSlot[i].workerStatus))
			continue;
		FD_SET(pstate->parallelSlot[i].pipeRead, &workerset);
		if (pstate->parallelSlot[i].pipeRead > maxFd)
			maxFd = pstate->parallelSlot[i].pipeRead;
	}

	if (do_wait)
	{
		i = select_loop(maxFd, &workerset);
		/* With no timeout, select cannot report "nothing ready" */
		Assert(i != 0);
	}
	else
	{
		/* Poll once; zero ready descriptors means there is no message */
		if ((i = select(maxFd + 1, &workerset, NULL, NULL, &nowait)) == 0)
			return NULL;
	}

	if (i < 0)
		pg_fatal("%s() failed: %m", "select");

	/* Return the message from the first (lowest-numbered) ready worker */
	for (i = 0; i < pstate->numWorkers; i++)
	{
		char	   *msg;

		if (!WORKER_IS_RUNNING(pstate->parallelSlot[i].workerStatus))
			continue;
		if (!FD_ISSET(pstate->parallelSlot[i].pipeRead, &workerset))
			continue;

		/*
		 * Read the message if any. If the socket is ready because of EOF,
		 * we'll return NULL instead (and the socket will stay ready, so the
		 * condition will persist).
		 *
		 * Note: because this is a blocking read, we'll wait if only part of
		 * the message is available. Waiting a long time would be bad, but
		 * since worker status messages are short and are always sent in one
		 * operation, it shouldn't be a problem in practice.
		 */
		msg = readMessageFromPipe(pstate->parallelSlot[i].pipeRead);
		*worker = i;
		return msg;
	}
	/* select() reported a ready descriptor, so the loop must have found it */
	Assert(false);
	return NULL;
}

/*
 * Send a command message to the specified worker process.
 *
 * "worker" is the index of the worker's slot in pstate->parallelSlot.  The
 * string is written together with its terminating NUL; a short or failed
 * write is fatal.
 *
 * This function is executed in the leader process.
 */
static void
sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
{
	/* +1 so that the terminating NUL, which delimits messages, is sent too */
	int			len = strlen(str) + 1;

	if (pipewrite(pstate->parallelSlot[worker].pipeWrite, str, len) != len)
	{
		pg_fatal("could not write to the communication channel: %m");
	}
}

/*
 * Read one message from the specified pipe (fd), blocking if necessary
 * until one is available, and return it as a malloc'd string.
 * On EOF, return NULL.
 *
 * A read error is treated the same way as EOF: the partial message is
 * discarded and NULL is returned.
 *
 * A "message" on the channel is just a null-terminated string.
 */
static char *
readMessageFromPipe(int fd)
{
	char	   *msg;
	int			msgsize,
				bufsize;
	int			ret;

	/*
	 * In theory, if we let piperead() read multiple bytes, it might give us
	 * back fragments of multiple messages. (That can't actually occur, since
	 * neither leader nor workers send more than one message without waiting
	 * for a reply, but we don't wish to assume that here.) For simplicity,
	 * read a byte at a time until we get the terminating '\0'. This method
	 * is a bit inefficient, but since this is only used for relatively short
	 * command and status strings, it shouldn't matter.
	 */
	bufsize = 64;				/* could be any number */
	msg = (char *) pg_malloc(bufsize);
	msgsize = 0;
	for (;;)
	{
		/* There is always room for the byte we are about to read */
		Assert(msgsize < bufsize);
		ret = piperead(fd, msg + msgsize, 1);
		if (ret <= 0)
			break;				/* error or connection closure */

		Assert(ret == 1);

		if (msg[msgsize] == '\0')
			return msg;			/* collected whole message */

		msgsize++;
		if (msgsize == bufsize) /* enlarge buffer if needed */
		{
			bufsize += 16;		/* could be any number */
			msg = (char *) pg_realloc(msg, bufsize);
		}
	}

	/* Read failed or other end has closed the connection; drop partial data */
	pg_free(msg);
	return NULL;
}

#ifdef WIN32

/*
 * This is a replacement version of pipe(2) for Windows which allows the pipe
 * handles to be used in select().
 *
 * Reads and writes on the pipe must go through piperead()/pipewrite().
 *
 * For consistency with Unix we declare the returned handles as "int".
 * This is okay even on WIN64 because system handles are not more than
 * 32 bits wide, but we do have to do some casting.
 *
 * The "pipe" is a TCP connection over the loopback interface: handles[1]
 * is the connecting socket and handles[0] is the accepted one.
 *
 * Returns 0 on success.  On failure, logs an error, closes any sockets
 * created so far, and returns -1.
 */
static int
pgpipe(int handles[2])
{
	pgsocket	s,
				tmp_sock;
	struct sockaddr_in serv_addr;
	int			len = sizeof(serv_addr);

	/* We have to use the Unix socket invalid file descriptor value here. */
	handles[0] = handles[1] = -1;

	/*
	 * setup listen socket
	 */
	if ((s = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
	{
		pg_log_error("pgpipe: could not create socket: error code %d",
					 WSAGetLastError());
		return -1;
	}

	/* Bind to the loopback address, requesting port 0 */
	memset(&serv_addr, 0, sizeof(serv_addr));
	serv_addr.sin_family = AF_INET;
	serv_addr.sin_port = pg_hton16(0);
	serv_addr.sin_addr.s_addr = pg_hton32(INADDR_LOOPBACK);
	if (bind(s, (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
	{
		pg_log_error("pgpipe: could not bind: error code %d",
					 WSAGetLastError());
		closesocket(s);
		return -1;
	}
	if (listen(s, 1) == SOCKET_ERROR)
	{
		pg_log_error("pgpipe: could not listen: error code %d",
					 WSAGetLastError());
		closesocket(s);
		return -1;
	}
	/* Fetch the socket's bound address; it is the target of connect() below */
	if (getsockname(s, (SOCKADDR *) &serv_addr, &len) == SOCKET_ERROR)
	{
		pg_log_error("pgpipe: %s() failed: error code %d", "getsockname",
					 WSAGetLastError());
		closesocket(s);
		return -1;
	}

	/*
	 * setup pipe handles
	 */
	if ((tmp_sock = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
	{
		pg_log_error("pgpipe: could not create second socket: error code %d",
					 WSAGetLastError());
		closesocket(s);
		return -1;
	}
	handles[1] = (int) tmp_sock;

	/* Connect the second socket to the listener ... */
	if (connect(handles[1], (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
	{
		pg_log_error("pgpipe: could not connect socket: error code %d",
					 WSAGetLastError());
		closesocket(handles[1]);
		handles[1] = -1;
		closesocket(s);
		return -1;
	}
	/* ... and accept that connection to obtain the other end */
	if ((tmp_sock = accept(s, (SOCKADDR *) &serv_addr, &len)) == PGINVALID_SOCKET)
	{
		pg_log_error("pgpipe: could not accept connection: error code %d",
					 WSAGetLastError());
		closesocket(handles[1]);
		handles[1] = -1;
		closesocket(s);
		return -1;
	}
	handles[0] = (int) tmp_sock;
	/* The listening socket is no longer needed once the pair is connected */
	closesocket(s);
	return 0;
}

#endif							/* WIN32 */