From 9e257a181cc1dc5e19eb5d770ce09cc98f470f5f Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Sun, 24 Mar 2013 11:27:20 -0400 Subject: [PATCH] Add parallel pg_dump option. New infrastructure is added which creates a set number of workers (threads on Windows, forked processes on Unix). Jobs are then handed out to these workers by the master process as needed. pg_restore is adjusted to use this new infrastructure in place of the old setup which created a new worker for each step on the fly. Parallel dumps acquire a snapshot clone in order to stay consistent, if available. The parallel option is selected by the -j / --jobs command line parameter of pg_dump. Joachim Wieland, lightly editorialized by Andrew Dunstan. --- doc/src/sgml/backup.sgml | 18 + doc/src/sgml/perform.sgml | 9 + doc/src/sgml/ref/pg_dump.sgml | 89 +- src/bin/pg_dump/Makefile | 2 +- src/bin/pg_dump/compress_io.c | 10 + src/bin/pg_dump/dumputils.c | 86 +- src/bin/pg_dump/dumputils.h | 13 +- src/bin/pg_dump/parallel.c | 1293 +++++++++++++++++++++++++ src/bin/pg_dump/parallel.h | 85 ++ src/bin/pg_dump/pg_backup.h | 11 +- src/bin/pg_dump/pg_backup_archiver.c | 747 ++++++-------- src/bin/pg_dump/pg_backup_archiver.h | 35 +- src/bin/pg_dump/pg_backup_custom.c | 88 +- src/bin/pg_dump/pg_backup_db.c | 20 +- src/bin/pg_dump/pg_backup_directory.c | 264 ++++- src/bin/pg_dump/pg_backup_tar.c | 8 +- src/bin/pg_dump/pg_dump.c | 691 +++++++------ src/bin/pg_dump/pg_dump.h | 3 + src/bin/pg_dump/pg_dump_sort.c | 92 +- src/bin/pg_dump/pg_dumpall.c | 20 +- src/bin/pg_dump/pg_restore.c | 17 +- src/tools/msvc/Mkvcbuild.pm | 5 + 22 files changed, 2776 insertions(+), 830 deletions(-) create mode 100644 src/bin/pg_dump/parallel.c create mode 100644 src/bin/pg_dump/parallel.h diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index c4215bed98..e444b1cde3 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -310,6 +310,24 @@ pg_restore -d dbname + + Use <application>pg_dump</>'s parallel 
dump feature. + + To speed up the dump of a large database, you can use + pg_dump's parallel mode. This will dump + multiple tables at the same time. You can control the degree of + parallelism with the -j parameter. Parallel dumps + are only supported for the "directory" archive format. + + +pg_dump -j num -F d -f out.dir dbname + + + You can use pg_restore -j to restore a dump in parallel. + This will work for any archive of either the "custom" or the "directory" + archive mode, whether or not it has been created with pg_dump -j. + + diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 1e7544afeb..34eace35b6 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1433,6 +1433,15 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; base backup. + + + Experiment with the parallel dump and restore modes of both + pg_dump and pg_restore and find the + optimal number of concurrent jobs to use. Dumping and restoring in + parallel by means of the + Consider whether the whole dump should be restored as a single diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index 6d0f214d42..0186ce0938 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -73,10 +73,12 @@ PostgreSQL documentation transfer mechanism. pg_dump can be used to backup an entire database, then pg_restore can be used to examine the archive and/or select which parts of the - database are to be restored. The most flexible output file format is - the custom format (). It allows - for selection and reordering of all archived items, and is compressed - by default. + database are to be restored. The most flexible output file formats are + the custom format () and the + directory format(). They allow + for selection and reordering of all archived items, support parallel + restoration, and are compressed by default. The directory + format is the only format that supports parallel dumps. 
@@ -251,7 +253,8 @@ PostgreSQL documentation can read. A directory format archive can be manipulated with standard Unix tools; for example, files in an uncompressed archive can be compressed with the gzip tool. - This format is compressed by default. + This format is compressed by default and also supports parallel + dumps. @@ -285,6 +288,62 @@ PostgreSQL documentation + + + + + + Run the dump in parallel by dumping njobs + tables simultaneously. This option reduces the time of the dump but it also + increases the load on the database server. You can only use this option with the + directory output format because this is the only output format where multiple processes + can write their data at the same time. + + + pg_dump will open njobs + + 1 connections to the database, so make sure your + setting is high enough to accommodate all connections. + + + Requesting exclusive locks on database objects while running a parallel dump could + cause the dump to fail. The reason is that the pg_dump master process + requests shared locks on the objects that the worker processes are going to dump later + in order to + make sure that nobody deletes them and makes them go away while the dump is running. + If another client then requests an exclusive lock on a table, that lock will not be + granted but will be queued waiting for the shared lock of the master process to be + released. Consequently any other access to the table will not be granted either and + will queue after the exclusive lock request. This includes the worker process trying + to dump the table. Without any precautions this would be a classic deadlock situation. + To detect this conflict, the pg_dump worker process requests another + shared lock using the NOWAIT option. If the worker process is not granted + this shared lock, somebody else must have requested an exclusive lock in the meantime + and there is no way to continue with the dump, so pg_dump has no choice + but to abort the dump. 
+ + + For a consistent backup, the database server needs to support synchronized snapshots, + a feature that was introduced in PostgreSQL 9.2. With this + feature, database clients can ensure they see the same dataset even though they use + different connections. pg_dump -j uses multiple database + connections; it connects to the database once with the master process and + once again for each worker job. Without the synchronized snapshot feature, the + different worker jobs wouldn't be guaranteed to see the same data in each connection, + which could lead to an inconsistent backup. + + + If you want to run a parallel dump of a pre-9.2 server, you need to make sure that the + database content doesn't change between the time the master connects to the + database and the time the last worker job has connected to the database. The easiest way to + do this is to halt any data modifying processes (DDL and DML) accessing the database + before starting the backup. You also need to specify the + parameter when running + pg_dump -j against a pre-9.2 PostgreSQL + server. + + + + @@ -690,6 +749,17 @@ PostgreSQL documentation + + + + + This option allows running pg_dump -j against a pre-9.2 + server, see the documentation of the parameter + for more details. 
+ + + + @@ -1082,6 +1152,15 @@ CREATE DATABASE foo WITH TEMPLATE template0; + + To dump a database into a directory-format archive in parallel with + 5 worker jobs: + + +$ pg_dump -Fd mydb -j 5 -f dumpdir + + + To reload an archive file into a (freshly created) database named newdb: diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile index a6ab39d347..6336edc65b 100644 --- a/src/bin/pg_dump/Makefile +++ b/src/bin/pg_dump/Makefile @@ -19,7 +19,7 @@ include $(top_builddir)/src/Makefile.global override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) OBJS= pg_backup_archiver.o pg_backup_db.o pg_backup_custom.o \ - pg_backup_null.o pg_backup_tar.o \ + pg_backup_null.o pg_backup_tar.o parallel.o \ pg_backup_directory.o dumputils.o compress_io.o $(WIN32RES) KEYWRDOBJS = keywords.o kwlookup.o diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c index 768b923ae5..0308f66c49 100644 --- a/src/bin/pg_dump/compress_io.c +++ b/src/bin/pg_dump/compress_io.c @@ -54,6 +54,7 @@ #include "compress_io.h" #include "dumputils.h" +#include "parallel.h" /*---------------------- * Compressor API @@ -182,6 +183,9 @@ size_t WriteDataToArchive(ArchiveHandle *AH, CompressorState *cs, const void *data, size_t dLen) { + /* Are we aborting? */ + checkAborting(AH); + switch (cs->comprAlg) { case COMPR_ALG_LIBZ: @@ -351,6 +355,9 @@ ReadDataFromArchiveZlib(ArchiveHandle *AH, ReadFunc readF) /* no minimal chunk size for zlib */ while ((cnt = readF(AH, &buf, &buflen))) { + /* Are we aborting? */ + checkAborting(AH); + zp->next_in = (void *) buf; zp->avail_in = cnt; @@ -411,6 +418,9 @@ ReadDataFromArchiveNone(ArchiveHandle *AH, ReadFunc readF) while ((cnt = readF(AH, &buf, &buflen))) { + /* Are we aborting? 
*/ + checkAborting(AH); + ahwrite(buf, 1, cnt, AH); } diff --git a/src/bin/pg_dump/dumputils.c b/src/bin/pg_dump/dumputils.c index 0a09882f5d..7322f1a825 100644 --- a/src/bin/pg_dump/dumputils.c +++ b/src/bin/pg_dump/dumputils.c @@ -38,6 +38,7 @@ static struct } on_exit_nicely_list[MAX_ON_EXIT_NICELY]; static int on_exit_nicely_index; +void (*on_exit_msg_func) (const char *modulename, const char *fmt, va_list ap) = vwrite_msg; #define supports_grant_options(version) ((version) >= 70400) @@ -48,11 +49,21 @@ static bool parseAclItem(const char *item, const char *type, static char *copyAclUserName(PQExpBuffer output, char *input); static void AddAcl(PQExpBuffer aclbuf, const char *keyword, const char *subname); +static PQExpBuffer getThreadLocalPQExpBuffer(void); #ifdef WIN32 +static void shutdown_parallel_dump_utils(int code, void *unused); static bool parallel_init_done = false; static DWORD tls_index; static DWORD mainThreadId; + +static void +shutdown_parallel_dump_utils(int code, void *unused) +{ + /* Call the cleanup function only from the main thread */ + if (mainThreadId == GetCurrentThreadId()) + WSACleanup(); +} #endif void @@ -61,23 +72,29 @@ init_parallel_dump_utils(void) #ifdef WIN32 if (!parallel_init_done) { + WSADATA wsaData; + int err; + tls_index = TlsAlloc(); - parallel_init_done = true; mainThreadId = GetCurrentThreadId(); + err = WSAStartup(MAKEWORD(2, 2), &wsaData); + if (err != 0) + { + fprintf(stderr, _("WSAStartup failed: %d\n"), err); + exit_nicely(1); + } + on_exit_nicely(shutdown_parallel_dump_utils, NULL); + parallel_init_done = true; } #endif } /* - * Quotes input string if it's not a legitimate SQL identifier as-is. - * - * Note that the returned string must be used before calling fmtId again, - * since we re-use the same return buffer each time. Non-reentrant but - * reduces memory leakage. (On Windows the memory leakage will be one buffer - * per thread, which is at least better than one per call). 
+ * Non-reentrant but reduces memory leakage. (On Windows the memory leakage + * will be one buffer per thread, which is at least better than one per call). */ -const char * -fmtId(const char *rawid) +static PQExpBuffer +getThreadLocalPQExpBuffer(void) { /* * The Tls code goes awry if we use a static var, so we provide for both @@ -86,9 +103,6 @@ fmtId(const char *rawid) static PQExpBuffer s_id_return = NULL; PQExpBuffer id_return; - const char *cp; - bool need_quotes = false; - #ifdef WIN32 if (parallel_init_done) id_return = (PQExpBuffer) TlsGetValue(tls_index); /* 0 when not set */ @@ -118,6 +132,23 @@ fmtId(const char *rawid) } + return id_return; +} + +/* + * Quotes input string if it's not a legitimate SQL identifier as-is. + * + * Note that the returned string must be used before calling fmtId again, + * since we re-use the same return buffer each time. + */ +const char * +fmtId(const char *rawid) +{ + PQExpBuffer id_return = getThreadLocalPQExpBuffer(); + + const char *cp; + bool need_quotes = false; + /* * These checks need to match the identifier production in scan.l. Don't * use islower() etc. @@ -185,6 +216,35 @@ fmtId(const char *rawid) return id_return->data; } +/* + * fmtQualifiedId - convert a qualified name to the proper format for + * the source database. + * + * Like fmtId, use the result before calling again. + * + * Since we call fmtId and it also uses getThreadLocalPQExpBuffer() we cannot + * use it until we're finished with calling fmtId(). 
+ */ +const char * +fmtQualifiedId(int remoteVersion, const char *schema, const char *id) +{ + PQExpBuffer id_return; + PQExpBuffer lcl_pqexp = createPQExpBuffer(); + + /* Suppress schema name if fetching from pre-7.3 DB */ + if (remoteVersion >= 70300 && schema && *schema) + { + appendPQExpBuffer(lcl_pqexp, "%s.", fmtId(schema)); + } + appendPQExpBuffer(lcl_pqexp, "%s", fmtId(id)); + + id_return = getThreadLocalPQExpBuffer(); + + appendPQExpBuffer(id_return, "%s", lcl_pqexp->data); + destroyPQExpBuffer(lcl_pqexp); + + return id_return->data; +} /* * Convert a string value to an SQL string literal and append it to @@ -1315,7 +1375,7 @@ exit_horribly(const char *modulename, const char *fmt,...) va_list ap; va_start(ap, fmt); - vwrite_msg(modulename, fmt, ap); + on_exit_msg_func(modulename, fmt, ap); va_end(ap); exit_nicely(1); diff --git a/src/bin/pg_dump/dumputils.h b/src/bin/pg_dump/dumputils.h index a4b351d03c..90da787c5c 100644 --- a/src/bin/pg_dump/dumputils.h +++ b/src/bin/pg_dump/dumputils.h @@ -29,14 +29,14 @@ typedef enum /* bits returned by set_dump_section */ typedef struct SimpleStringListCell { - struct SimpleStringListCell *next; - char val[1]; /* VARIABLE LENGTH FIELD */ + struct SimpleStringListCell *next; + char val[1]; /* VARIABLE LENGTH FIELD */ } SimpleStringListCell; typedef struct SimpleStringList { - SimpleStringListCell *head; - SimpleStringListCell *tail; + SimpleStringListCell *head; + SimpleStringListCell *tail; } SimpleStringList; @@ -47,6 +47,8 @@ extern const char *progname; extern void init_parallel_dump_utils(void); extern const char *fmtId(const char *identifier); +extern const char *fmtQualifiedId(int remoteVersion, + const char *schema, const char *id); extern void appendStringLiteral(PQExpBuffer buf, const char *str, int encoding, bool std_strings); extern void appendStringLiteralConn(PQExpBuffer buf, const char *str, @@ -85,11 +87,12 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0))); extern void exit_horribly(const char 
*modulename, const char *fmt,...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3), noreturn)); +extern void (*on_exit_msg_func) (const char *modulename, const char *fmt, va_list ap) + __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0))); extern void on_exit_nicely(on_exit_nicely_callback function, void *arg); extern void exit_nicely(int code) __attribute__((noreturn)); extern void simple_string_list_append(SimpleStringList *list, const char *val); extern bool simple_string_list_member(SimpleStringList *list, const char *val); - #endif /* DUMPUTILS_H */ diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c new file mode 100644 index 0000000000..dedf4311b9 --- /dev/null +++ b/src/bin/pg_dump/parallel.c @@ -0,0 +1,1293 @@ +/*------------------------------------------------------------------------- + * + * parallel.c + * + * Parallel support for the pg_dump archiver + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * The author is not responsible for loss or damages that may + * result from its use. + * + * IDENTIFICATION + * src/bin/pg_dump/parallel.c + * + *------------------------------------------------------------------------- + */ + +#include "pg_backup_db.h" + +#include "dumputils.h" +#include "parallel.h" + +#ifndef WIN32 +#include +#include +#include "signal.h" +#include +#include +#endif + +#define PIPE_READ 0 +#define PIPE_WRITE 1 + +/* file-scope variables */ +#ifdef WIN32 +static unsigned int tMasterThreadId = 0; +static HANDLE termEvent = INVALID_HANDLE_VALUE; +static int pgpipe(int handles[2]); +static int piperead(int s, char *buf, int len); + +/* + * Structure to hold info passed by _beginthreadex() to the function it calls + * via its single allowed argument. 
+ */ +typedef struct +{ + ArchiveHandle *AH; + RestoreOptions *ropt; + int worker; + int pipeRead; + int pipeWrite; +} WorkerInfo; + +#define pipewrite(a,b,c) send(a,b,c,0) +#else +/* + * aborting is only ever used in the master, the workers are fine with just + * wantAbort. + */ +static bool aborting = false; +static volatile sig_atomic_t wantAbort = 0; + +#define pgpipe(a) pipe(a) +#define piperead(a,b,c) read(a,b,c) +#define pipewrite(a,b,c) write(a,b,c) +#endif + +typedef struct ShutdownInformation +{ + ParallelState *pstate; + Archive *AHX; +} ShutdownInformation; + +static ShutdownInformation shutdown_info; + +static const char *modulename = gettext_noop("parallel archiver"); + +static ParallelSlot *GetMyPSlot(ParallelState *pstate); +static void +parallel_exit_msg_func(const char *modulename, + const char *fmt, va_list ap) +__attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0))); +static void +parallel_msg_master(ParallelSlot *slot, const char *modulename, + const char *fmt, va_list ap) +__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 0))); +static void archive_close_connection(int code, void *arg); +static void ShutdownWorkersHard(ParallelState *pstate); +static void WaitForTerminatingWorkers(ParallelState *pstate); + +#ifndef WIN32 +static void sigTermHandler(int signum); +#endif +static void SetupWorker(ArchiveHandle *AH, int pipefd[2], int worker, + RestoreOptions *ropt); +static bool HasEveryWorkerTerminated(ParallelState *pstate); + +static void lockTableNoWait(ArchiveHandle *AH, TocEntry *te); +static void WaitForCommands(ArchiveHandle *AH, int pipefd[2]); +static char *getMessageFromMaster(int pipefd[2]); +static void sendMessageToMaster(int pipefd[2], const char *str); +static int select_loop(int maxFd, fd_set *workerset); +static char *getMessageFromWorker(ParallelState *pstate, + bool do_wait, int *worker); +static void sendMessageToWorker(ParallelState *pstate, + int worker, const char *str); +static char *readMessageFromPipe(int fd); + +#define 
messageStartsWith(msg, prefix) \ + (strncmp(msg, prefix, strlen(prefix)) == 0) +#define messageEquals(msg, pattern) \ + (strcmp(msg, pattern) == 0) + +static ParallelSlot * +GetMyPSlot(ParallelState *pstate) +{ + int i; + + for (i = 0; i < pstate->numWorkers; i++) +#ifdef WIN32 + if (pstate->parallelSlot[i].threadId == GetCurrentThreadId()) +#else + if (pstate->parallelSlot[i].pid == getpid()) +#endif + return &(pstate->parallelSlot[i]); + + return NULL; +} + +/* + * This is the function that will be called from exit_horribly() to print the + * error message. If the worker process does exit_horribly(), we forward its + * last words to the master process. The master process then does + * exit_horribly() with this error message itself and prints it normally. + * After printing the message, exit_horribly() on the master will shut down + * the remaining worker processes. + */ +static void +parallel_exit_msg_func(const char *modulename, const char *fmt, va_list ap) +{ + ParallelState *pstate = shutdown_info.pstate; + ParallelSlot *slot; + + Assert(pstate); + + slot = GetMyPSlot(pstate); + + if (!slot) + /* We're the parent, just write the message out */ + vwrite_msg(modulename, fmt, ap); + else + /* If we're a worker process, send the msg to the master process */ + parallel_msg_master(slot, modulename, fmt, ap); +} + +/* Sends the error message from the worker to the master process */ +static void +parallel_msg_master(ParallelSlot *slot, const char *modulename, + const char *fmt, va_list ap) +{ + char buf[512]; + int pipefd[2]; + + pipefd[PIPE_READ] = slot->pipeRevRead; + pipefd[PIPE_WRITE] = slot->pipeRevWrite; + + strcpy(buf, "ERROR "); + vsnprintf(buf + strlen("ERROR "), + sizeof(buf) - strlen("ERROR "), fmt, ap); + + sendMessageToMaster(pipefd, buf); +} + +/* + * pg_dump and pg_restore register the Archive pointer for the exit handler + * (called from exit_horribly). This function mainly exists so that we can + * keep shutdown_info in file scope only. 
+ */ +void +on_exit_close_archive(Archive *AHX) +{ + shutdown_info.AHX = AHX; + on_exit_nicely(archive_close_connection, &shutdown_info); +} + +/* + * This function can close archives in both the parallel and non-parallel + * case. + */ +static void +archive_close_connection(int code, void *arg) +{ + ShutdownInformation *si = (ShutdownInformation *) arg; + + if (si->pstate) + { + ParallelSlot *slot = GetMyPSlot(si->pstate); + + if (!slot) + { + /* + * We're the master: We have already printed out the message + * passed to exit_horribly() either from the master itself or from + * a worker process. Now we need to close our own database + * connection (only open during parallel dump but not restore) and + * shut down the remaining workers. + */ + DisconnectDatabase(si->AHX); +#ifndef WIN32 + + /* + * Setting aborting to true switches to best-effort-mode + * (send/receive but ignore errors) in communicating with our + * workers. + */ + aborting = true; +#endif + ShutdownWorkersHard(si->pstate); + } + else if (slot->args->AH) + DisconnectDatabase(&(slot->args->AH->public)); + } + else if (si->AHX) + DisconnectDatabase(si->AHX); +} + +/* + * If we have one worker that terminates for some reason, we'd like the other + * threads to terminate as well (and not finish with their 70 GB table dump + * first...). Now in UNIX we can just kill these processes, and let the signal + * handler set wantAbort to 1. In Windows we set a termEvent and this serves + * as the signal for everyone to terminate. + */ +void +checkAborting(ArchiveHandle *AH) +{ +#ifdef WIN32 + if (WaitForSingleObject(termEvent, 0) == WAIT_OBJECT_0) +#else + if (wantAbort) +#endif + exit_horribly(modulename, "worker is terminating\n"); +} + +/* + * Shut down any remaining workers, this has an implicit do_wait == true. + * + * The fastest way we can make the workers terminate gracefully is when + * they are listening for new commands and we just tell them to terminate. 
+ */ +static void +ShutdownWorkersHard(ParallelState *pstate) +{ +#ifndef WIN32 + int i; + + signal(SIGPIPE, SIG_IGN); + + /* + * Close our write end of the sockets so that the workers know they can + * exit. + */ + for (i = 0; i < pstate->numWorkers; i++) + closesocket(pstate->parallelSlot[i].pipeWrite); + + for (i = 0; i < pstate->numWorkers; i++) + kill(pstate->parallelSlot[i].pid, SIGTERM); +#else + /* The workers monitor this event via checkAborting(). */ + SetEvent(termEvent); +#endif + + WaitForTerminatingWorkers(pstate); +} + +/* + * Wait for the termination of the processes using the OS-specific method. + */ +static void +WaitForTerminatingWorkers(ParallelState *pstate) +{ + while (!HasEveryWorkerTerminated(pstate)) + { + ParallelSlot *slot = NULL; + int j; + +#ifndef WIN32 + int status; + pid_t pid = wait(&status); + + for (j = 0; j < pstate->numWorkers; j++) + if (pstate->parallelSlot[j].pid == pid) + slot = &(pstate->parallelSlot[j]); +#else + uintptr_t hThread; + DWORD ret; + uintptr_t *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers); + int nrun = 0; + + for (j = 0; j < pstate->numWorkers; j++) + if (pstate->parallelSlot[j].workerStatus != WRKR_TERMINATED) + { + lpHandles[nrun] = pstate->parallelSlot[j].hThread; + nrun++; + } + ret = WaitForMultipleObjects(nrun, (HANDLE *) lpHandles, false, INFINITE); + Assert(ret != WAIT_FAILED); + hThread = lpHandles[ret - WAIT_OBJECT_0]; + + for (j = 0; j < pstate->numWorkers; j++) + if (pstate->parallelSlot[j].hThread == hThread) + slot = &(pstate->parallelSlot[j]); + + free(lpHandles); +#endif + Assert(slot); + + slot->workerStatus = WRKR_TERMINATED; + } + Assert(HasEveryWorkerTerminated(pstate)); +} + +#ifndef WIN32 +/* Signal handling (UNIX only) */ +static void +sigTermHandler(int signum) +{ + wantAbort = 1; +} +#endif + +/* + * This function is called by both UNIX and Windows variants to set up a + * worker process. 
+ */ +static void +SetupWorker(ArchiveHandle *AH, int pipefd[2], int worker, + RestoreOptions *ropt) +{ + /* + * Call the setup worker function that's defined in the ArchiveHandle. + * + * We get the raw connection only for the reason that we can close it + * properly when we shut down. This happens only that way when it is + * brought down because of an error. + */ + (AH->SetupWorkerPtr) ((Archive *) AH, ropt); + + Assert(AH->connection != NULL); + + WaitForCommands(AH, pipefd); + + closesocket(pipefd[PIPE_READ]); + closesocket(pipefd[PIPE_WRITE]); +} + +#ifdef WIN32 +static unsigned __stdcall +init_spawned_worker_win32(WorkerInfo *wi) +{ + ArchiveHandle *AH; + int pipefd[2] = {wi->pipeRead, wi->pipeWrite}; + int worker = wi->worker; + RestoreOptions *ropt = wi->ropt; + + AH = CloneArchive(wi->AH); + + free(wi); + SetupWorker(AH, pipefd, worker, ropt); + + DeCloneArchive(AH); + _endthreadex(0); + return 0; +} +#endif + +/* + * This function starts the parallel dump or restore by spawning off the + * worker processes in both Unix and Windows. For Windows, it creates a number + * of threads while it does a fork() on Unix. + */ +ParallelState * +ParallelBackupStart(ArchiveHandle *AH, RestoreOptions *ropt) +{ + ParallelState *pstate; + int i; + const size_t slotSize = AH->public.numWorkers * sizeof(ParallelSlot); + + Assert(AH->public.numWorkers > 0); + + /* Ensure stdio state is quiesced before forking */ + fflush(NULL); + + pstate = (ParallelState *) pg_malloc(sizeof(ParallelState)); + + pstate->numWorkers = AH->public.numWorkers; + pstate->parallelSlot = NULL; + + if (AH->public.numWorkers == 1) + return pstate; + + pstate->parallelSlot = (ParallelSlot *) pg_malloc(slotSize); + memset((void *) pstate->parallelSlot, 0, slotSize); + + /* + * Set the pstate in the shutdown_info. The exit handler uses pstate if + * set and falls back to AHX otherwise. 
+ */ + shutdown_info.pstate = pstate; + on_exit_msg_func = parallel_exit_msg_func; + +#ifdef WIN32 + tMasterThreadId = GetCurrentThreadId(); + termEvent = CreateEvent(NULL, true, false, NULL); /* unnamed on purpose: a named event would be shared with any other pg_dump/pg_restore running in the same session, letting one process abort another's workers */ +#else + signal(SIGTERM, sigTermHandler); + signal(SIGINT, sigTermHandler); + signal(SIGQUIT, sigTermHandler); +#endif + + for (i = 0; i < pstate->numWorkers; i++) + { +#ifdef WIN32 + WorkerInfo *wi; + uintptr_t handle; +#else + pid_t pid; +#endif + int pipeMW[2], + pipeWM[2]; + + if (pgpipe(pipeMW) < 0 || pgpipe(pipeWM) < 0) + exit_horribly(modulename, + "Cannot create communication channels: %s\n", + strerror(errno)); + + pstate->parallelSlot[i].workerStatus = WRKR_IDLE; + pstate->parallelSlot[i].args = (ParallelArgs *) pg_malloc(sizeof(ParallelArgs)); + pstate->parallelSlot[i].args->AH = NULL; + pstate->parallelSlot[i].args->te = NULL; +#ifdef WIN32 + /* Allocate a new structure for every worker */ + wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo)); + + wi->ropt = ropt; + wi->worker = i; + wi->AH = AH; + wi->pipeRead = pstate->parallelSlot[i].pipeRevRead = pipeMW[PIPE_READ]; + wi->pipeWrite = pstate->parallelSlot[i].pipeRevWrite = pipeWM[PIPE_WRITE]; + + handle = _beginthreadex(NULL, 0, (void *) &init_spawned_worker_win32, + wi, 0, &(pstate->parallelSlot[i].threadId)); + pstate->parallelSlot[i].hThread = handle; +#else + pid = fork(); + if (pid == 0) + { + /* we are the worker */ + int j; + int pipefd[2] = {pipeMW[PIPE_READ], pipeWM[PIPE_WRITE]}; + + /* + * Store the fds for the reverse communication in pstate. Actually + * we only use this in case of an error and don't use pstate + * otherwise in the worker process. On Windows we write to the + * global pstate, in Unix we write to our process-local copy but + * that's also where we'd retrieve this information back from. 
+ */ + pstate->parallelSlot[i].pipeRevRead = pipefd[PIPE_READ]; + pstate->parallelSlot[i].pipeRevWrite = pipefd[PIPE_WRITE]; + pstate->parallelSlot[i].pid = getpid(); + + /* + * Call CloneArchive on Unix as well even though technically we + * don't need to because fork() gives us a copy in our own address + * space already. But CloneArchive resets the state information + * and also clones the database connection (for parallel dump) + * which both seem kinda helpful. + */ + pstate->parallelSlot[i].args->AH = CloneArchive(AH); + + /* close read end of Worker -> Master */ + closesocket(pipeWM[PIPE_READ]); + /* close write end of Master -> Worker */ + closesocket(pipeMW[PIPE_WRITE]); + + /* + * Close all inherited fds for communication of the master with + * the other workers. + */ + for (j = 0; j < i; j++) + { + closesocket(pstate->parallelSlot[j].pipeRead); + closesocket(pstate->parallelSlot[j].pipeWrite); + } + + SetupWorker(pstate->parallelSlot[i].args->AH, pipefd, i, ropt); + + exit(0); + } + else if (pid < 0) + /* fork failed */ + exit_horribly(modulename, + "could not create worker process: %s\n", + strerror(errno)); + + /* we are the Master, pid > 0 here */ + Assert(pid > 0); + + /* close read end of Master -> Worker */ + closesocket(pipeMW[PIPE_READ]); + /* close write end of Worker -> Master */ + closesocket(pipeWM[PIPE_WRITE]); + + pstate->parallelSlot[i].pid = pid; +#endif + + pstate->parallelSlot[i].pipeRead = pipeWM[PIPE_READ]; + pstate->parallelSlot[i].pipeWrite = pipeMW[PIPE_WRITE]; + } + + return pstate; +} + +/* + * Tell all of our workers to terminate. + * + * Pretty straightforward routine, first we tell everyone to terminate, then + * we listen to the workers' replies and finally close the sockets that we + * have used for communication. 
+ */ +void +ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate) +{ + int i; + + if (pstate->numWorkers == 1) + return; + + Assert(IsEveryWorkerIdle(pstate)); + + /* close the sockets so that the workers know they can exit */ + for (i = 0; i < pstate->numWorkers; i++) + { + closesocket(pstate->parallelSlot[i].pipeRead); + closesocket(pstate->parallelSlot[i].pipeWrite); + } + WaitForTerminatingWorkers(pstate); + + /* + * Remove the pstate again, so the exit handler in the parent will now + * again fall back to closing AH->connection (if connected). + */ + shutdown_info.pstate = NULL; + + free(pstate->parallelSlot); + free(pstate); +} + + +/* + * The sequence is the following (for dump, similar for restore): + * + * The master process starts the parallel backup in ParallelBackupStart, this + * forks the worker processes which enter WaitForCommands(). + * + * The master process dispatches an individual work item to one of the worker + * processes in DispatchJobForTocEntry(). It calls + * AH->MasterStartParallelItemPtr, a routine of the output format. This + * function's arguments are the parent's archive handle AH (containing the full + * catalog information), the TocEntry that the worker should work on and a + * T_Action act indicating whether this is a backup or a restore item. The + * function then converts the TocEntry assignment into a string that is then + * sent over to the worker process. In the simplest case that would be + * something like "DUMP 1234", with 1234 being the TocEntry id. + * + * The worker receives the message in the routine pointed to by + * WorkerJobDumpPtr or WorkerJobRestorePtr. These are also pointers to + * corresponding routines of the respective output format, e.g. + * _WorkerJobDumpDirectory(). + * + * Remember that we have forked off the workers only after we have read in the + * catalog. That's why our worker processes can also access the catalog + * information. 
Now they re-translate the textual representation to a TocEntry + * on their side and do the required action (restore or dump). + * + * The result is again a textual string that is sent back to the master and is + * interpreted by AH->MasterEndParallelItemPtr. This function can update state + * or catalog information on the master's side, depending on the reply from + * the worker process. In the end it returns status which is 0 for successful + * execution. + * + * --------------------------------------------------------------------- + * Master Worker + * + * enters WaitForCommands() + * DispatchJobForTocEntry(...te...) + * + * [ Worker is IDLE ] + * + * arg = (MasterStartParallelItemPtr)() + * send: DUMP arg + * receive: DUMP arg + * str = (WorkerJobDumpPtr)(arg) + * [ Worker is WORKING ] ... gets te from arg ... + * ... dump te ... + * send: OK DUMP info + * + * In ListenToWorkers(): + * + * [ Worker is FINISHED ] + * receive: OK DUMP info + * status = (MasterEndParallelItemPtr)(info) + * + * In ReapWorkerStatus(&ptr): + * *ptr = status; + * [ Worker is IDLE ] + * --------------------------------------------------------------------- + */ +void +DispatchJobForTocEntry(ArchiveHandle *AH, ParallelState *pstate, TocEntry *te, + T_Action act) +{ + int worker; + char *arg; + + /* our caller makes sure that at least one worker is idle */ + Assert(GetIdleWorker(pstate) != NO_SLOT); + worker = GetIdleWorker(pstate); + Assert(worker != NO_SLOT); + + arg = (AH->MasterStartParallelItemPtr) (AH, te, act); + + sendMessageToWorker(pstate, worker, arg); + + pstate->parallelSlot[worker].workerStatus = WRKR_WORKING; + pstate->parallelSlot[worker].args->te = te; +} + +/* + * Find the first free parallel slot (if any). 
+ */
+int
+GetIdleWorker(ParallelState *pstate)
+{
+	int			slotno = 0;
+
+	/* scan in slot order; the lowest-numbered idle slot wins */
+	while (slotno < pstate->numWorkers)
+	{
+		if (pstate->parallelSlot[slotno].workerStatus == WRKR_IDLE)
+			return slotno;
+		slotno++;
+	}
+	return NO_SLOT;
+}
+
+/*
+ * Report whether all workers have reached WRKR_TERMINATED; a single worker
+ * in any other state makes the answer false.
+ */
+static bool
+HasEveryWorkerTerminated(ParallelState *pstate)
+{
+	int			slotno;
+	bool		allTerminated = true;
+
+	for (slotno = 0; slotno < pstate->numWorkers; slotno++)
+	{
+		if (pstate->parallelSlot[slotno].workerStatus != WRKR_TERMINATED)
+		{
+			allTerminated = false;
+			break;
+		}
+	}
+	return allTerminated;
+}
+
+/*
+ * Report whether all workers are currently WRKR_IDLE; a single worker in any
+ * other state makes the answer false.
+ */
+bool
+IsEveryWorkerIdle(ParallelState *pstate)
+{
+	int			slotno;
+	bool		allIdle = true;
+
+	for (slotno = 0; slotno < pstate->numWorkers; slotno++)
+	{
+		if (pstate->parallelSlot[slotno].workerStatus != WRKR_IDLE)
+		{
+			allIdle = false;
+			break;
+		}
+	}
+	return allIdle;
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * One danger of the parallel backup is a possible deadlock:
+ *
+ * 1) Master dumps the schema and locks all tables in ACCESS SHARE mode.
+ * 2) Another process requests an ACCESS EXCLUSIVE lock (which is not granted
+ *    because the master holds a conflicting ACCESS SHARE lock).
+ * 3) The worker process also requests an ACCESS SHARE lock to read the table.
+ *    The worker is not granted that lock but is enqueued behind the ACCESS
+ *    EXCLUSIVE lock request.
+ * ---------------------------------------------------------------------
+ *
+ * Now what we do here is to just request a lock in ACCESS SHARE but with
+ * NOWAIT in the worker prior to touching the table. If we don't get the lock,
+ * then we know that somebody else has requested an ACCESS EXCLUSIVE lock and
+ * are good to just fail the whole backup because we have detected a deadlock.
+ */
+static void
+lockTableNoWait(ArchiveHandle *AH, TocEntry *te)
+{
+	Archive    *AHX = (Archive *) AH;
+	const char *qualId;
+	PQExpBuffer query = createPQExpBuffer();
+	PGresult   *res;
+
+	Assert(AH->format == archDirectory);
+	Assert(strcmp(te->desc, "BLOBS") != 0);
+
+	/*
+	 * Look up the schema-qualified name of the table. The OID is unsigned,
+	 * so it has to be formatted with %u; %d would emit a negative number for
+	 * OIDs above 2^31.
+	 */
+	appendPQExpBuffer(query,
+					  "SELECT pg_namespace.nspname,"
+					  " pg_class.relname "
+					  " FROM pg_class "
+					  " JOIN pg_namespace on pg_namespace.oid = relnamespace "
+					  " WHERE pg_class.oid = %u", te->catalogId.oid);
+
+	res = PQexec(AH->connection, query->data);
+
+	if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
+		exit_horribly(modulename,
+					  "could not get relation name for oid %u: %s\n",
+					  te->catalogId.oid, PQerrorMessage(AH->connection));
+
+	/*
+	 * The relation may have vanished; without a row PQgetvalue() would return
+	 * NULL and we would hand that to fmtQualifiedId().
+	 */
+	if (PQntuples(res) != 1)
+		exit_horribly(modulename,
+					  "could not find relation with oid %u\n",
+					  te->catalogId.oid);
+
+	resetPQExpBuffer(query);
+
+	qualId = fmtQualifiedId(AHX->remoteVersion,
+							PQgetvalue(res, 0, 0),
+							PQgetvalue(res, 0, 1));
+
+	appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE NOWAIT",
+					  qualId);
+	PQclear(res);
+
+	res = PQexec(AH->connection, query->data);
+
+	/* NOWAIT makes the LOCK fail immediately instead of queueing */
+	if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
+		exit_horribly(modulename,
+					  "could not obtain lock on relation \"%s\". This "
+					  "usually means that someone requested an ACCESS EXCLUSIVE lock "
+					  "on the table after the pg_dump parent process has gotten the "
+					  "initial ACCESS SHARE lock on the table.\n", qualId);
+
+	PQclear(res);
+	destroyPQExpBuffer(query);
+}
+
+/*
+ * That's the main routine for the worker.
+ * When it starts up it enters this routine and waits for commands from the
+ * master process. After having processed a command it comes back to here to
+ * wait for the next command. Finally, when the master closes the
+ * communication channel, no further message can be read; the worker then
+ * closes its database connection and returns.
+ */
+static void
+WaitForCommands(ArchiveHandle *AH, int pipefd[2])
+{
+	char	   *command;
+	DumpId		dumpId;
+	int			nBytes;
+	char	   *str = NULL;
+	TocEntry   *te;
+
+	for (;;)
+	{
+		/* NULL means the master closed the channel (or the read failed) */
+		if (!(command = getMessageFromMaster(pipefd)))
+		{
+			PQfinish(AH->connection);
+			AH->connection = NULL;
+			return;
+		}
+
+		if (messageStartsWith(command, "DUMP "))
+		{
+			Assert(AH->format == archDirectory);
+			sscanf(command + strlen("DUMP "), "%d%n", &dumpId, &nBytes);
+			Assert(nBytes == strlen(command) - strlen("DUMP "));
+
+			te = getTocEntryByDumpId(AH, dumpId);
+			Assert(te != NULL);
+
+			/*
+			 * Lock the table but with NOWAIT. Note that the parent is already
+			 * holding a lock. If we cannot acquire another ACCESS SHARE MODE
+			 * lock, then somebody else has requested an exclusive lock in the
+			 * meantime.  lockTableNoWait dies in this case to prevent a
+			 * deadlock.
+			 */
+			if (strcmp(te->desc, "BLOBS") != 0)
+				lockTableNoWait(AH, te);
+
+			/*
+			 * The message we return here has been pg_malloc()ed and we are
+			 * responsible for free()ing it.
+			 */
+			str = (AH->WorkerJobDumpPtr) (AH, te);
+			Assert(AH->connection != NULL);
+			sendMessageToMaster(pipefd, str);
+			free(str);
+		}
+		else if (messageStartsWith(command, "RESTORE "))
+		{
+			Assert(AH->format == archDirectory || AH->format == archCustom);
+			Assert(AH->connection != NULL);
+
+			sscanf(command + strlen("RESTORE "), "%d%n", &dumpId, &nBytes);
+			Assert(nBytes == strlen(command) - strlen("RESTORE "));
+
+			te = getTocEntryByDumpId(AH, dumpId);
+			Assert(te != NULL);
+
+			/*
+			 * The message we return here has been pg_malloc()ed and we are
+			 * responsible for free()ing it.
+			 */
+			str = (AH->WorkerJobRestorePtr) (AH, te);
+			Assert(AH->connection != NULL);
+			sendMessageToMaster(pipefd, str);
+			free(str);
+		}
+		else
+			exit_horribly(modulename,
+						  "Unknown command on communication channel: %s\n",
+						  command);
+
+		/*
+		 * The command buffer was allocated by readMessageFromPipe(); release
+		 * it here, otherwise we leak one buffer per processed work item.
+		 */
+		free(command);
+	}
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * Note the status change:
+ *
+ * DispatchJobForTocEntry		WRKR_IDLE -> WRKR_WORKING
+ * ListenToWorkers				WRKR_WORKING -> WRKR_FINISHED / WRKR_TERMINATED
+ * ReapWorkerStatus				WRKR_FINISHED -> WRKR_IDLE
+ * ---------------------------------------------------------------------
+ *
+ * Just calling ReapWorkerStatus() when all workers are working might or might
+ * not give you an idle worker because you need to call ListenToWorkers() in
+ * between and only thereafter ReapWorkerStatus(). This is necessary in order
+ * to get and deal with the status (=result) of the worker's execution.
+ *
+ * With do_wait == false the function returns immediately if no message is
+ * pending; with do_wait == true a missing message is treated as a dead
+ * worker and is fatal.
+ */
+void
+ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate, bool do_wait)
+{
+	int			worker;
+	char	   *msg;
+
+	msg = getMessageFromWorker(pstate, do_wait, &worker);
+
+	if (!msg)
+	{
+		if (do_wait)
+			exit_horribly(modulename, "A worker process died unexpectedly\n");
+		return;
+	}
+
+	if (messageStartsWith(msg, "OK "))
+	{
+		char	   *statusString;
+		TocEntry   *te;
+
+		pstate->parallelSlot[worker].workerStatus = WRKR_FINISHED;
+		/* te was stored in the slot by DispatchJobForTocEntry() */
+		te = pstate->parallelSlot[worker].args->te;
+		if (messageStartsWith(msg, "OK RESTORE "))
+		{
+			statusString = msg + strlen("OK RESTORE ");
+			pstate->parallelSlot[worker].status =
+				(AH->MasterEndParallelItemPtr)
+				(AH, te, statusString, ACT_RESTORE);
+		}
+		else if (messageStartsWith(msg, "OK DUMP "))
+		{
+			statusString = msg + strlen("OK DUMP ");
+			pstate->parallelSlot[worker].status =
+				(AH->MasterEndParallelItemPtr)
+				(AH, te, statusString, ACT_DUMP);
+		}
+		else
+			exit_horribly(modulename,
+						  "Invalid message received from worker: %s\n", msg);
+	}
+	else if (messageStartsWith(msg, "ERROR "))
+	{
+		Assert(AH->format == archDirectory || AH->format == archCustom);
+		pstate->parallelSlot[worker].workerStatus = WRKR_TERMINATED;
+		exit_horribly(modulename, "%s", msg + strlen("ERROR "));
+	}
+	else
+		exit_horribly(modulename, "Invalid message received from worker: %s\n", msg);
+
+	/* both Unix and Win32 return pg_malloc()ed space, so we free it */
+	free(msg);
+}
+
+/*
+ * This function is executed in the master process.
+ *
+ * This function is used to collect the result of a worker that has finished
+ * its work item. If a worker is in the WRKR_FINISHED state, its status is
+ * stored in *status, the worker is marked WRKR_IDLE again and its slot number
+ * is returned. If no worker has finished, NO_SLOT is returned and *status is
+ * left untouched.
+ */
+int
+ReapWorkerStatus(ParallelState *pstate, int *status)
+{
+	int			i;
+
+	for (i = 0; i < pstate->numWorkers; i++)
+	{
+		if (pstate->parallelSlot[i].workerStatus == WRKR_FINISHED)
+		{
+			*status = pstate->parallelSlot[i].status;
+			pstate->parallelSlot[i].status = 0;
+			pstate->parallelSlot[i].workerStatus = WRKR_IDLE;
+			return i;
+		}
+	}
+	return NO_SLOT;
+}
+
+/*
+ * This function is executed in the master process.
+ *
+ * It looks for an idle worker process and only returns if there is one.
+ * A non-zero status reported by any finished worker is fatal.
+ */
+void
+EnsureIdleWorker(ArchiveHandle *AH, ParallelState *pstate)
+{
+	int			ret_worker;
+	int			work_status;
+
+	for (;;)
+	{
+		int			nTerm = 0;
+
+		/* collect every finished worker; each one becomes idle again */
+		while ((ret_worker = ReapWorkerStatus(pstate, &work_status)) != NO_SLOT)
+		{
+			if (work_status != 0)
+				exit_horribly(modulename, "Error processing a parallel work item.\n");
+
+			nTerm++;
+		}
+
+		/*
+		 * We need to make sure that we have an idle worker before dispatching
+		 * the next item. If nTerm > 0 we already have that (quick check).
+		 */
+		if (nTerm > 0)
+			return;
+
+		/* explicit check for an idle worker */
+		if (GetIdleWorker(pstate) != NO_SLOT)
+			return;
+
+		/*
+		 * If we have no idle worker, read the result of one or more workers
+		 * and loop the loop to call ReapWorkerStatus() on them
+		 */
+		ListenToWorkers(AH, pstate, true);
+	}
+}
+
+/*
+ * This function is executed in the master process.
+ *
+ * It waits until every worker has finished its work item and is idle again.
+ */
+void
+EnsureWorkersFinished(ArchiveHandle *AH, ParallelState *pstate)
+{
+	int			work_status;
+
+	/* nothing to wait for without a parallel state or with a single worker */
+	if (!pstate || pstate->numWorkers == 1)
+		return;
+
+	/* Waiting for the remaining worker processes to finish */
+	while (!IsEveryWorkerIdle(pstate))
+	{
+		if (ReapWorkerStatus(pstate, &work_status) == NO_SLOT)
+			ListenToWorkers(AH, pstate, true);
+		else if (work_status != 0)
+			exit_horribly(modulename,
+						  "Error processing a parallel work item\n");
+	}
+}
+
+/*
+ * This function is executed in the worker process.
+ *
+ * It returns the next message on the communication channel, blocking until it
+ * becomes available. NULL is returned if the channel was closed or the read
+ * failed.
+ */
+static char *
+getMessageFromMaster(int pipefd[2])
+{
+	return readMessageFromPipe(pipefd[PIPE_READ]);
+}
+
+/*
+ * This function is executed in the worker process.
+ *
+ * It sends a message to the master on the communication channel. The
+ * terminating NUL byte is sent too; it is the message delimiter that
+ * readMessageFromPipe() looks for.
+ */
+static void
+sendMessageToMaster(int pipefd[2], const char *str)
+{
+	int			len = strlen(str) + 1;
+
+	if (pipewrite(pipefd[PIPE_WRITE], str, len) != len)
+		exit_horribly(modulename,
+					  "Error writing to the communication channel: %s\n",
+					  strerror(errno));
+}
+
+/*
+ * A select loop that repeats calling select until a descriptor in the read
+ * set becomes readable. On Windows we have to check for the termination event
+ * from time to time, on Unix we can just block forever.
+ *
+ * Returns the result of the last select() call; *workerset holds the
+ * descriptors reported by that call.
+ */
+static int
+select_loop(int maxFd, fd_set *workerset)
+{
+	int			i;
+	fd_set		saveSet = *workerset;
+
+#ifdef WIN32
+	/* should always be the master */
+	Assert(tMasterThreadId == GetCurrentThreadId());
+
+	for (;;)
+	{
+		/*
+		 * sleep a quarter of a second before checking if we should terminate.
+		 */
+		struct timeval tv = {0, 250000};
+
+		/* select() modifies the set, so start from the saved copy each time */
+		*workerset = saveSet;
+		i = select(maxFd + 1, workerset, NULL, NULL, &tv);
+
+		if (i == SOCKET_ERROR && WSAGetLastError() == WSAEINTR)
+			continue;
+		if (i)
+			break;
+	}
+
+#else							/* UNIX */
+
+	for (;;)
+	{
+		/* select() modifies the set, so start from the saved copy each time */
+		*workerset = saveSet;
+		i = select(maxFd + 1, workerset, NULL, NULL, NULL);
+
+		/*
+		 * If we Ctrl-C the master process, it's likely that we interrupt
+		 * select() here. The signal handler will set wantAbort == true and
+		 * the shutdown journey starts from here. Note that we'll come back
+		 * here later when we tell all workers to terminate and read their
+		 * responses. But then we have aborting set to true.
+		 */
+		if (wantAbort && !aborting)
+			exit_horribly(modulename, "terminated by user\n");
+
+		if (i < 0 && errno == EINTR)
+			continue;
+		break;
+	}
+
+#endif
+
+	return i;
+}
+
+
+/*
+ * This function is executed in the master process.
+ *
+ * It returns the next message from the worker on the communication channel,
+ * optionally blocking (do_wait) until it becomes available.
+ *
+ * The id of the worker is returned in *worker.
+ */
+static char *
+getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
+{
+	int			i;
+	fd_set		workerset;
+	int			maxFd = -1;
+	struct timeval nowait = {0, 0};
+
+	FD_ZERO(&workerset);
+
+	/* watch the read end of every worker that has not terminated */
+	for (i = 0; i < pstate->numWorkers; i++)
+	{
+		if (pstate->parallelSlot[i].workerStatus == WRKR_TERMINATED)
+			continue;
+		FD_SET(pstate->parallelSlot[i].pipeRead, &workerset);
+		/* actually WIN32 ignores the first parameter to select()... */
+		if (pstate->parallelSlot[i].pipeRead > maxFd)
+			maxFd = pstate->parallelSlot[i].pipeRead;
+	}
+
+	if (do_wait)
+	{
+		i = select_loop(maxFd, &workerset);
+		Assert(i != 0);
+	}
+	else
+	{
+		/* zero timeout: just poll */
+		if ((i = select(maxFd + 1, &workerset, NULL, NULL, &nowait)) == 0)
+			return NULL;
+	}
+
+	if (i < 0)
+		exit_horribly(modulename, "Error in ListenToWorkers(): %s\n", strerror(errno));
+
+	/* deliver the message of the first readable worker only */
+	for (i = 0; i < pstate->numWorkers; i++)
+	{
+		char	   *msg;
+
+		if (!FD_ISSET(pstate->parallelSlot[i].pipeRead, &workerset))
+			continue;
+
+		msg = readMessageFromPipe(pstate->parallelSlot[i].pipeRead);
+		*worker = i;
+		return msg;
+	}
+	Assert(false);
+	return NULL;
+}
+
+/*
+ * This function is executed in the master process.
+ *
+ * It sends a message to a certain worker on the communication channel.
+ */
+static void
+sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
+{
+	int			len = strlen(str) + 1;
+
+	if (pipewrite(pstate->parallelSlot[worker].pipeWrite, str, len) != len)
+	{
+		/*
+		 * If we're already aborting anyway, don't care if we succeed or not.
+		 * The child might have gone already.
+		 */
+#ifndef WIN32
+		if (!aborting)
+#endif
+			exit_horribly(modulename,
+						  "Error writing to the communication channel: %s\n",
+						  strerror(errno));
+	}
+}
+
+/*
+ * The underlying function to read a message from the communication channel
+ * (fd). It blocks until a complete message has been read.
+ *
+ * Returns a malloc'ed, NUL-terminated message that the caller must free(),
+ * or NULL if the other side closed the channel or the read failed.
+ */
+static char *
+readMessageFromPipe(int fd)
+{
+	char	   *msg;
+	int			msgsize,
+				bufsize;
+	int			ret;
+
+	/*
+	 * The problem here is that we need to deal with several possibilities: we
+	 * could receive only a partial message or several messages at once. The
+	 * caller expects us to return exactly one message however.
+	 *
+	 * We could either read in as much as we can and keep track of what we
+	 * delivered back to the caller or we just read byte by byte. Once we see
+	 * (char) 0, we know that it's the message's end. This would be quite
+	 * inefficient for more data but since we are reading only on the command
+	 * channel, the performance loss does not seem worth the trouble of
+	 * keeping internal states for different file descriptors.
+	 */
+	bufsize = 64;				/* could be any number */
+	msg = (char *) pg_malloc(bufsize);
+
+	msgsize = 0;
+	for (;;)
+	{
+		/* there is always room for the byte we are about to read */
+		Assert(msgsize < bufsize);
+		ret = piperead(fd, msg + msgsize, 1);
+
+		/* worker has closed the connection or another error happened */
+		if (ret <= 0)
+		{
+			/* don't leak the partially filled buffer */
+			free(msg);
+			return NULL;
+		}
+
+		Assert(ret == 1);
+
+		if (msg[msgsize] == '\0')
+			return msg;
+
+		msgsize++;
+		if (msgsize == bufsize)
+		{
+			char	   *bigger;
+
+			/* could be any number */
+			bufsize += 16;
+
+			/*
+			 * realloc() returns NULL on failure; check that instead of
+			 * writing through a NULL pointer on the next iteration.
+			 */
+			bigger = (char *) realloc(msg, bufsize);
+			if (bigger == NULL)
+				exit_horribly(modulename, "out of memory\n");
+			msg = bigger;
+		}
+	}
+}
+
+#ifdef WIN32
+/*
+ * This is a replacement version of pipe for Win32 which allows returned
+ * handles to be used in select(). Note that read/write calls must be replaced
+ * with recv/send.
+ *
+ * On success 0 is returned, handles[0] is the accepted (read) side and
+ * handles[1] the connected (write) side of a loopback TCP connection. On
+ * failure -1 is returned, every socket created here has been closed and both
+ * handles are INVALID_SOCKET.
+ */
+static int
+pgpipe(int handles[2])
+{
+	SOCKET		s;
+	struct sockaddr_in serv_addr;
+	int			len = sizeof(serv_addr);
+
+	handles[0] = handles[1] = INVALID_SOCKET;
+
+	if ((s = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET)
+	{
+		write_msg(modulename, "pgpipe could not create socket: %d\n",
+				  WSAGetLastError());
+		return -1;
+	}
+
+	/* bind to an ephemeral port on the loopback interface */
+	memset((void *) &serv_addr, 0, sizeof(serv_addr));
+	serv_addr.sin_family = AF_INET;
+	serv_addr.sin_port = htons(0);
+	serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+	if (bind(s, (SOCKADDR *) & serv_addr, len) == SOCKET_ERROR)
+	{
+		write_msg(modulename, "pgpipe could not bind: %d\n",
+				  WSAGetLastError());
+		closesocket(s);
+		return -1;
+	}
+	if (listen(s, 1) == SOCKET_ERROR)
+	{
+		write_msg(modulename, "pgpipe could not listen: %d\n",
+				  WSAGetLastError());
+		closesocket(s);
+		return -1;
+	}
+	/* find out which port was assigned, so that we can connect to it */
+	if (getsockname(s, (SOCKADDR *) & serv_addr, &len) == SOCKET_ERROR)
+	{
+		write_msg(modulename, "pgpipe could not getsockname: %d\n",
+				  WSAGetLastError());
+		closesocket(s);
+		return -1;
+	}
+	if ((handles[1] = socket(PF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET)
+	{
+		write_msg(modulename, "pgpipe could not create socket 2: %d\n",
+				  WSAGetLastError());
+		closesocket(s);
+		return -1;
+	}
+
+	if (connect(handles[1], (SOCKADDR *) & serv_addr, len) == SOCKET_ERROR)
+	{
+		write_msg(modulename, "pgpipe could not connect socket: %d\n",
+				  WSAGetLastError());
+		/* the client socket was created above; don't leak it */
+		closesocket(handles[1]);
+		handles[1] = INVALID_SOCKET;
+		closesocket(s);
+		return -1;
+	}
+	if ((handles[0] = accept(s, (SOCKADDR *) & serv_addr, &len)) == INVALID_SOCKET)
+	{
+		write_msg(modulename, "pgpipe could not accept socket: %d\n",
+				  WSAGetLastError());
+		closesocket(handles[1]);
+		handles[1] = INVALID_SOCKET;
+		closesocket(s);
+		return -1;
+	}
+	/* the listening socket has served its purpose */
+	closesocket(s);
+	return 0;
+}
+
+/*
+ * recv()-based replacement for read() on the sockets created by pgpipe().
+ * A reset connection is reported as EOF (0), like a closed pipe.
+ */
+static int
+piperead(int s, char *buf, int len)
+{
+	int			ret = recv(s, buf, len, 0);
+
+	if (ret < 0 && WSAGetLastError() == WSAECONNRESET)
+		/* EOF on the pipe! (win32 socket based implementation) */
+		ret = 0;
+	return ret;
+}
+
+#endif
diff --git a/src/bin/pg_dump/parallel.h b/src/bin/pg_dump/parallel.h
new file mode 100644
index 0000000000..858b2a09d2
--- /dev/null
+++ b/src/bin/pg_dump/parallel.h
@@ -0,0 +1,85 @@
+/*-------------------------------------------------------------------------
+ *
+ * parallel.h
+ *
+ *	Parallel support header file for the pg_dump archiver
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *	The author is not responsible for loss or damages that may
+ *	result from its use.
+ *
+ * IDENTIFICATION
+ *		src/bin/pg_dump/parallel.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "pg_backup_db.h"
+
+struct _archiveHandle;
+struct _tocEntry;
+
+/*
+ * State of a worker as tracked by the master:
+ *
+ * WRKR_TERMINATED	the worker reported an error; its channel is no longer read
+ * WRKR_IDLE		available for DispatchJobForTocEntry()
+ * WRKR_WORKING		a work item has been sent, no reply seen yet
+ * WRKR_FINISHED	a reply was received by ListenToWorkers() but has not yet
+ *					been collected by ReapWorkerStatus()
+ */
+typedef enum
+{
+	WRKR_TERMINATED = 0,
+	WRKR_IDLE,
+	WRKR_WORKING,
+	WRKR_FINISHED
+} T_WorkerStatus;
+
+/* Kind of work item handed to a worker (no trailing comma: not valid C89) */
+typedef enum T_Action
+{
+	ACT_DUMP,
+	ACT_RESTORE
+} T_Action;
+
+/* Arguments needed for a worker process */
+typedef struct ParallelArgs
+{
+	struct _archiveHandle *AH;
+	struct _tocEntry *te;		/* item the worker is currently working on */
+} ParallelArgs;
+
+/* State for each parallel activity slot */
+typedef struct ParallelSlot
+{
+	ParallelArgs *args;
+	T_WorkerStatus workerStatus;
+	int			status;			/* result of the last finished work item */
+	int			pipeRead;		/* master reads the worker's replies here */
+	int			pipeWrite;		/* master writes commands for the worker here */
+	int			pipeRevRead;	/* presumably the worker-side ends of the two */
+	int			pipeRevWrite;	/* channels -- TODO confirm in parallel.c */
+#ifdef WIN32
+	uintptr_t	hThread;
+	unsigned int threadId;
+#else
+	pid_t		pid;
+#endif
+} ParallelSlot;
+
+#define NO_SLOT (-1)
+
+typedef struct ParallelState
+{
+	int			numWorkers;
+	ParallelSlot *parallelSlot;
+} ParallelState;
+
+extern int	GetIdleWorker(ParallelState *pstate);
+extern bool IsEveryWorkerIdle(ParallelState *pstate);
+extern void ListenToWorkers(struct _archiveHandle * AH, ParallelState *pstate, bool do_wait);
+extern int	ReapWorkerStatus(ParallelState *pstate, int *status);
+extern void EnsureIdleWorker(struct _archiveHandle * AH, ParallelState *pstate);
+extern void EnsureWorkersFinished(struct _archiveHandle * AH, ParallelState *pstate);
+
+extern ParallelState *ParallelBackupStart(struct _archiveHandle * AH,
+					RestoreOptions *ropt);
+extern void DispatchJobForTocEntry(struct _archiveHandle * AH,
+					   ParallelState *pstate,
+					   struct _tocEntry * te, T_Action act);
+extern void ParallelBackupEnd(struct _archiveHandle * AH, ParallelState *pstate);
+
+extern void checkAborting(struct _archiveHandle * AH);
diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index 473670ddd3..b456f95969 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -82,9 +82,14 @@ struct Archive
int minRemoteVersion; /* allowable range */ int maxRemoteVersion; + int numWorkers; /* number of parallel processes */ + char *sync_snapshot_id; /* sync snapshot id for parallel + * operation */ + /* info needed for string escaping */ int encoding; /* libpq code for client_encoding */ bool std_strings; /* standard_conforming_strings */ + char *use_role; /* Issue SET ROLE to this */ /* error handling */ bool exit_on_error; /* whether to exit on SQL errors... */ @@ -142,11 +147,12 @@ typedef struct _restoreOptions int suppressDumpWarnings; /* Suppress output of WARNING entries * to stderr */ bool single_txn; - int number_of_jobs; bool *idWanted; /* array showing which dump IDs to emit */ } RestoreOptions; +typedef void (*SetupWorkerPtr) (Archive *AH, RestoreOptions *ropt); + /* * Main archiver interface. */ @@ -189,7 +195,8 @@ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt); /* Create a new archive */ extern Archive *CreateArchive(const char *FileSpec, const ArchiveFormat fmt, - const int compression, ArchiveMode mode); + const int compression, ArchiveMode mode, + SetupWorkerPtr setupDumpWorker); /* The --list option */ extern void PrintTOCSummary(Archive *AH, RestoreOptions *ropt); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 19d12788d9..3c2671bb2d 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -22,8 +22,10 @@ #include "pg_backup_db.h" #include "dumputils.h" +#include "parallel.h" #include +#include #include #include #include @@ -35,72 +37,6 @@ #include "libpq/libpq-fs.h" -/* - * Special exit values from worker children. We reserve 0 for normal - * success; 1 and other small values should be interpreted as crashes. - */ -#define WORKER_CREATE_DONE 10 -#define WORKER_INHIBIT_DATA 11 -#define WORKER_IGNORED_ERRORS 12 - -/* - * Unix uses exit to return result from worker child, so function is void. 
- * Windows thread result comes via function return. - */ -#ifndef WIN32 -#define parallel_restore_result void -#else -#define parallel_restore_result DWORD -#endif - -/* IDs for worker children are either PIDs or thread handles */ -#ifndef WIN32 -#define thandle pid_t -#else -#define thandle HANDLE -#endif - -typedef struct ParallelStateEntry -{ -#ifdef WIN32 - unsigned int threadId; -#else - pid_t pid; -#endif - ArchiveHandle *AH; -} ParallelStateEntry; - -typedef struct ParallelState -{ - int numWorkers; - ParallelStateEntry *pse; -} ParallelState; - -/* Arguments needed for a worker child */ -typedef struct _restore_args -{ - ArchiveHandle *AH; - TocEntry *te; - ParallelStateEntry *pse; -} RestoreArgs; - -/* State for each parallel activity slot */ -typedef struct _parallel_slot -{ - thandle child_id; - RestoreArgs *args; -} ParallelSlot; - -typedef struct ShutdownInformation -{ - ParallelState *pstate; - Archive *AHX; -} ShutdownInformation; - -static ShutdownInformation shutdown_info; - -#define NO_SLOT (-1) - #define TEXT_DUMP_HEADER "--\n-- PostgreSQL database dump\n--\n\n" #define TEXT_DUMPALL_HEADER "--\n-- PostgreSQL database cluster dump\n--\n\n" @@ -116,7 +52,7 @@ static const char *modulename = gettext_noop("archiver"); static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt, - const int compression, ArchiveMode mode); + const int compression, ArchiveMode mode, SetupWorkerPtr setupWorkerPtr); static void _getObjectDescription(PQExpBuffer buf, TocEntry *te, ArchiveHandle *AH); static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool isData, bool acl_pass); @@ -136,7 +72,6 @@ static bool _tocEntryIsACL(TocEntry *te); static void _disableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt); static void _enableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt); static void buildTocEntryArrays(ArchiveHandle *AH); -static TocEntry 
*getTocEntryByDumpId(ArchiveHandle *AH, DumpId id); static void _moveBefore(ArchiveHandle *AH, TocEntry *pos, TocEntry *te); static int _discoverArchiveFormat(ArchiveHandle *AH); @@ -149,21 +84,19 @@ static void RestoreOutput(ArchiveHandle *AH, OutputContext savedContext); static int restore_toc_entry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool is_parallel); -static void restore_toc_entries_parallel(ArchiveHandle *AH); -static thandle spawn_restore(RestoreArgs *args); -static thandle reap_child(ParallelSlot *slots, int n_slots, int *work_status); -static bool work_in_progress(ParallelSlot *slots, int n_slots); -static int get_next_slot(ParallelSlot *slots, int n_slots); +static void restore_toc_entries_prefork(ArchiveHandle *AH); +static void restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate, + TocEntry *pending_list); +static void restore_toc_entries_postfork(ArchiveHandle *AH, TocEntry *pending_list); static void par_list_header_init(TocEntry *l); static void par_list_append(TocEntry *l, TocEntry *te); static void par_list_remove(TocEntry *te); static TocEntry *get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, - ParallelSlot *slots, int n_slots); -static parallel_restore_result parallel_restore(RestoreArgs *args); + ParallelState *pstate); static void mark_work_done(ArchiveHandle *AH, TocEntry *ready_list, - thandle worker, int status, - ParallelSlot *slots, int n_slots); + int worker, int status, + ParallelState *pstate); static void fix_dependencies(ArchiveHandle *AH); static bool has_lock_conflicts(TocEntry *te1, TocEntry *te2); static void repoint_table_dependencies(ArchiveHandle *AH); @@ -172,14 +105,6 @@ static void reduce_dependencies(ArchiveHandle *AH, TocEntry *te, TocEntry *ready_list); static void mark_create_done(ArchiveHandle *AH, TocEntry *te); static void inhibit_data_for_failed_table(ArchiveHandle *AH, TocEntry *te); -static ArchiveHandle *CloneArchive(ArchiveHandle *AH); -static void 
DeCloneArchive(ArchiveHandle *AH); - -static void setProcessIdentifier(ParallelStateEntry *pse, ArchiveHandle *AH); -static void unsetProcessIdentifier(ParallelStateEntry *pse); -static ParallelStateEntry *GetMyPSEntry(ParallelState *pstate); -static void archive_close_connection(int code, void *arg); - /* * Wrapper functions. @@ -189,15 +114,28 @@ static void archive_close_connection(int code, void *arg); * */ +/* + * The dump worker setup needs lots of knowledge of the internals of pg_dump, + * so It's defined in pg_dump.c and passed into OpenArchive. The restore worker + * setup doesn't need to know anything much, so it's defined here. + */ +static void +setupRestoreWorker(Archive *AHX, RestoreOptions *ropt) +{ + ArchiveHandle *AH = (ArchiveHandle *) AHX; + + (AH->ReopenPtr) (AH); +} + /* Create a new archive */ /* Public */ Archive * CreateArchive(const char *FileSpec, const ArchiveFormat fmt, - const int compression, ArchiveMode mode) + const int compression, ArchiveMode mode, SetupWorkerPtr setupDumpWorker) { - ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression, mode); + ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression, mode, setupDumpWorker); return (Archive *) AH; } @@ -207,7 +145,7 @@ CreateArchive(const char *FileSpec, const ArchiveFormat fmt, Archive * OpenArchive(const char *FileSpec, const ArchiveFormat fmt) { - ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, archModeRead); + ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, archModeRead, setupRestoreWorker); return (Archive *) AH; } @@ -311,7 +249,7 @@ RestoreArchive(Archive *AHX) /* * If we're going to do parallel restore, there are some restrictions. */ - parallel_mode = (ropt->number_of_jobs > 1 && ropt->useDB); + parallel_mode = (AH->public.numWorkers > 1 && ropt->useDB); if (parallel_mode) { /* We haven't got round to making this work for all archive formats */ @@ -499,7 +437,25 @@ RestoreArchive(Archive *AHX) * In parallel mode, turn control over to the parallel-restore logic. 
*/ if (parallel_mode) - restore_toc_entries_parallel(AH); + { + ParallelState *pstate; + TocEntry pending_list; + + par_list_header_init(&pending_list); + + /* This runs PRE_DATA items and then disconnects from the database */ + restore_toc_entries_prefork(AH); + Assert(AH->connection == NULL); + + /* ParallelBackupStart() will actually fork the processes */ + pstate = ParallelBackupStart(AH, ropt); + restore_toc_entries_parallel(AH, pstate, &pending_list); + ParallelBackupEnd(AH, pstate); + + /* reconnect the master and see if we missed something */ + restore_toc_entries_postfork(AH, &pending_list); + Assert(AH->connection != NULL); + } else { for (te = AH->toc->next; te != AH->toc; te = te->next) @@ -558,7 +514,7 @@ static int restore_toc_entry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool is_parallel) { - int retval = 0; + int status = WORKER_OK; teReqs reqs; bool defnDumped; @@ -611,7 +567,7 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, if (ropt->noDataForFailedTables) { if (is_parallel) - retval = WORKER_INHIBIT_DATA; + status = WORKER_INHIBIT_DATA; else inhibit_data_for_failed_table(AH, te); } @@ -626,7 +582,7 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, * just set the return value. 
*/ if (is_parallel) - retval = WORKER_CREATE_DONE; + status = WORKER_CREATE_DONE; else mark_create_done(AH, te); } @@ -744,7 +700,10 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, } } - return retval; + if (AH->public.n_errors > 0 && status == WORKER_OK) + status = WORKER_IGNORED_ERRORS; + + return status; } /* @@ -1634,7 +1593,7 @@ buildTocEntryArrays(ArchiveHandle *AH) } } -static TocEntry * +TocEntry * getTocEntryByDumpId(ArchiveHandle *AH, DumpId id) { /* build index arrays if we didn't already */ @@ -2018,7 +1977,7 @@ _discoverArchiveFormat(ArchiveHandle *AH) */ static ArchiveHandle * _allocAH(const char *FileSpec, const ArchiveFormat fmt, - const int compression, ArchiveMode mode) + const int compression, ArchiveMode mode, SetupWorkerPtr setupWorkerPtr) { ArchiveHandle *AH; @@ -2100,6 +2059,8 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt, } #endif + AH->SetupWorkerPtr = setupWorkerPtr; + if (fmt == archUnknown) AH->format = _discoverArchiveFormat(AH); else @@ -2132,50 +2093,66 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt, return AH; } - void -WriteDataChunks(ArchiveHandle *AH) +WriteDataChunks(ArchiveHandle *AH, ParallelState *pstate) { TocEntry *te; - StartDataPtr startPtr; - EndDataPtr endPtr; for (te = AH->toc->next; te != AH->toc; te = te->next) { - if (te->dataDumper != NULL && (te->reqs & REQ_DATA) != 0) + if (!te->dataDumper) + continue; + + if ((te->reqs & REQ_DATA) == 0) + continue; + + if (pstate && pstate->numWorkers > 1) { - AH->currToc = te; - /* printf("Writing data for %d (%x)\n", te->id, te); */ - - if (strcmp(te->desc, "BLOBS") == 0) - { - startPtr = AH->StartBlobsPtr; - endPtr = AH->EndBlobsPtr; - } - else - { - startPtr = AH->StartDataPtr; - endPtr = AH->EndDataPtr; - } - - if (startPtr != NULL) - (*startPtr) (AH, te); - /* - * printf("Dumper arg for %d is %x\n", te->id, te->dataDumperArg); + * If we are in a parallel backup, then we are always the master + * process. 
*/ - - /* - * The user-provided DataDumper routine needs to call - * AH->WriteData - */ - (*te->dataDumper) ((Archive *) AH, te->dataDumperArg); - - if (endPtr != NULL) - (*endPtr) (AH, te); - AH->currToc = NULL; + EnsureIdleWorker(AH, pstate); + Assert(GetIdleWorker(pstate) != NO_SLOT); + DispatchJobForTocEntry(AH, pstate, te, ACT_DUMP); } + else + WriteDataChunksForTocEntry(AH, te); } + EnsureWorkersFinished(AH, pstate); +} + +void +WriteDataChunksForTocEntry(ArchiveHandle *AH, TocEntry *te) +{ + StartDataPtr startPtr; + EndDataPtr endPtr; + + AH->currToc = te; + + if (strcmp(te->desc, "BLOBS") == 0) + { + startPtr = AH->StartBlobsPtr; + endPtr = AH->EndBlobsPtr; + } + else + { + startPtr = AH->StartDataPtr; + endPtr = AH->EndDataPtr; + } + + if (startPtr != NULL) + (*startPtr) (AH, te); + + /* + * The user-provided DataDumper routine needs to call AH->WriteData + */ + (*te->dataDumper) ((Archive *) AH, te->dataDumperArg); + + if (endPtr != NULL) + (*endPtr) (AH, te); + + AH->currToc = NULL; } void @@ -2911,7 +2888,7 @@ _getObjectDescription(PQExpBuffer buf, TocEntry *te, ArchiveHandle *AH) const char *type = te->desc; /* Use ALTER TABLE for views and sequences */ - if (strcmp(type, "VIEW") == 0 || strcmp(type, "SEQUENCE") == 0|| + if (strcmp(type, "VIEW") == 0 || strcmp(type, "SEQUENCE") == 0 || strcmp(type, "MATERIALIZED VIEW") == 0) type = "TABLE"; @@ -3404,67 +3381,6 @@ dumpTimestamp(ArchiveHandle *AH, const char *msg, time_t tim) ahprintf(AH, "-- %s %s\n\n", msg, buf); } -static void -setProcessIdentifier(ParallelStateEntry *pse, ArchiveHandle *AH) -{ -#ifdef WIN32 - pse->threadId = GetCurrentThreadId(); -#else - pse->pid = getpid(); -#endif - pse->AH = AH; -} - -static void -unsetProcessIdentifier(ParallelStateEntry *pse) -{ -#ifdef WIN32 - pse->threadId = 0; -#else - pse->pid = 0; -#endif - pse->AH = NULL; -} - -static ParallelStateEntry * -GetMyPSEntry(ParallelState *pstate) -{ - int i; - - for (i = 0; i < pstate->numWorkers; i++) -#ifdef WIN32 - if 
(pstate->pse[i].threadId == GetCurrentThreadId()) -#else - if (pstate->pse[i].pid == getpid()) -#endif - return &(pstate->pse[i]); - - return NULL; -} - -static void -archive_close_connection(int code, void *arg) -{ - ShutdownInformation *si = (ShutdownInformation *) arg; - - if (si->pstate) - { - ParallelStateEntry *entry = GetMyPSEntry(si->pstate); - - if (entry != NULL && entry->AH) - DisconnectDatabase(&(entry->AH->public)); - } - else if (si->AHX) - DisconnectDatabase(si->AHX); -} - -void -on_exit_close_archive(Archive *AHX) -{ - shutdown_info.AHX = AHX; - on_exit_nicely(archive_close_connection, &shutdown_info); -} - /* * Main engine for parallel restore. * @@ -3477,30 +3393,13 @@ on_exit_close_archive(Archive *AHX) * RestoreArchive). */ static void -restore_toc_entries_parallel(ArchiveHandle *AH) +restore_toc_entries_prefork(ArchiveHandle *AH) { RestoreOptions *ropt = AH->ropt; - int n_slots = ropt->number_of_jobs; - ParallelSlot *slots; - int work_status; - int next_slot; bool skipped_some; - TocEntry pending_list; - TocEntry ready_list; TocEntry *next_work_item; - thandle ret_child; - TocEntry *te; - ParallelState *pstate; - int i; - ahlog(AH, 2, "entering restore_toc_entries_parallel\n"); - - slots = (ParallelSlot *) pg_malloc0(n_slots * sizeof(ParallelSlot)); - pstate = (ParallelState *) pg_malloc(sizeof(ParallelState)); - pstate->pse = (ParallelStateEntry *) pg_malloc0(n_slots * sizeof(ParallelStateEntry)); - pstate->numWorkers = ropt->number_of_jobs; - for (i = 0; i < pstate->numWorkers; i++) - unsetProcessIdentifier(&(pstate->pse[i])); + ahlog(AH, 2, "entering restore_toc_entries_prefork\n"); /* Adjust dependency information */ fix_dependencies(AH); @@ -3509,7 +3408,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH) * Do all the early stuff in a single connection in the parent. There's no * great point in running it in parallel, in fact it will actually run * faster in a single connection because we avoid all the connection and - * setup overhead. 
Also, pre-9.2 pg_dump versions were not very good + * setup overhead. Also, pre-9.2 pg_dump versions were not very good * about showing all the dependencies of SECTION_PRE_DATA items, so we do * not risk trying to process them out-of-order. * @@ -3561,12 +3460,6 @@ restore_toc_entries_parallel(ArchiveHandle *AH) */ DisconnectDatabase(&AH->public); - /* - * Set the pstate in the shutdown_info. The exit handler uses pstate if - * set and falls back to AHX otherwise. - */ - shutdown_info.pstate = pstate; - /* blow away any transient state from the old connection */ if (AH->currUser) free(AH->currUser); @@ -3578,17 +3471,42 @@ restore_toc_entries_parallel(ArchiveHandle *AH) free(AH->currTablespace); AH->currTablespace = NULL; AH->currWithOids = -1; +} + +/* + * Main engine for parallel restore. + * + * Work is done in three phases. + * First we process all SECTION_PRE_DATA tocEntries, in a single connection, + * just as for a standard restore. This is done in restore_toc_entries_prefork(). + * Second we process the remaining non-ACL steps in parallel worker children + * (threads on Windows, processes on Unix), these fork off and set up their + * connections before we call restore_toc_entries_parallel(). + * Finally we process all the ACL entries in a single connection (that happens + * back in RestoreArchive). + */ +static void +restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate, + TocEntry *pending_list) +{ + int work_status; + bool skipped_some; + TocEntry ready_list; + TocEntry *next_work_item; + int ret_child; + + ahlog(AH, 2, "entering restore_toc_entries_parallel\n"); /* - * Initialize the lists of pending and ready items. After this setup, the - * pending list is everything that needs to be done but is blocked by one - * or more dependencies, while the ready list contains items that have no - * remaining dependencies. Note: we don't yet filter out entries that - * aren't going to be restored. 
They might participate in dependency + * Initialize the lists of ready items, the list for pending items has + * already been initialized in the caller. After this setup, the pending + * list is everything that needs to be done but is blocked by one or more + * dependencies, while the ready list contains items that have no + * remaining dependencies. Note: we don't yet filter out entries that + * aren't going to be restored. They might participate in dependency * chains connecting entries that should be restored, so we treat them as * live until we actually process them. */ - par_list_header_init(&pending_list); par_list_header_init(&ready_list); skipped_some = false; for (next_work_item = AH->toc->next; next_work_item != AH->toc; next_work_item = next_work_item->next) @@ -3613,7 +3531,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH) } if (next_work_item->depCount > 0) - par_list_append(&pending_list, next_work_item); + par_list_append(pending_list, next_work_item); else par_list_append(&ready_list, next_work_item); } @@ -3627,9 +3545,8 @@ restore_toc_entries_parallel(ArchiveHandle *AH) ahlog(AH, 1, "entering main parallel loop\n"); - while ((next_work_item = get_next_work_item(AH, &ready_list, - slots, n_slots)) != NULL || - work_in_progress(slots, n_slots)) + while ((next_work_item = get_next_work_item(AH, &ready_list, pstate)) != NULL || + !IsEveryWorkerIdle(pstate)) { if (next_work_item != NULL) { @@ -3647,62 +3564,72 @@ restore_toc_entries_parallel(ArchiveHandle *AH) continue; } - if ((next_slot = get_next_slot(slots, n_slots)) != NO_SLOT) - { - /* There is work still to do and a worker slot available */ - thandle child; - RestoreArgs *args; + ahlog(AH, 1, "launching item %d %s %s\n", + next_work_item->dumpId, + next_work_item->desc, next_work_item->tag); - ahlog(AH, 1, "launching item %d %s %s\n", - next_work_item->dumpId, - next_work_item->desc, next_work_item->tag); + par_list_remove(next_work_item); - par_list_remove(next_work_item); - - /* this memory 
is dealloced in mark_work_done() */ - args = pg_malloc(sizeof(RestoreArgs)); - args->AH = CloneArchive(AH); - args->te = next_work_item; - args->pse = &pstate->pse[next_slot]; - - /* run the step in a worker child */ - child = spawn_restore(args); - - slots[next_slot].child_id = child; - slots[next_slot].args = args; - - continue; - } - } - - /* - * If we get here there must be work being done. Either there is no - * work available to schedule (and work_in_progress returned true) or - * there are no slots available. So we wait for a worker to finish, - * and process the result. - */ - ret_child = reap_child(slots, n_slots, &work_status); - - if (WIFEXITED(work_status)) - { - mark_work_done(AH, &ready_list, - ret_child, WEXITSTATUS(work_status), - slots, n_slots); + Assert(GetIdleWorker(pstate) != NO_SLOT); + DispatchJobForTocEntry(AH, pstate, next_work_item, ACT_RESTORE); } else + /* at least one child is working and we have nothing ready. */ + Assert(!IsEveryWorkerIdle(pstate)); + + for (;;) { - exit_horribly(modulename, "worker process crashed: status %d\n", - work_status); + int nTerm = 0; + + /* + * In order to reduce dependencies as soon as possible and + * especially to reap the status of workers who are working on + * items that pending items depend on, we do a non-blocking check + * for ended workers first. + * + * However, if we do not have any other work items currently that + * workers can work on, we do not busy-loop here but instead + * really wait for at least one worker to terminate. Hence we call + * ListenToWorkers(..., ..., do_wait = true) in this case. + */ + ListenToWorkers(AH, pstate, !next_work_item); + + while ((ret_child = ReapWorkerStatus(pstate, &work_status)) != NO_SLOT) + { + nTerm++; + mark_work_done(AH, &ready_list, ret_child, work_status, pstate); + } + + /* + * We need to make sure that we have an idle worker before + * re-running the loop. If nTerm > 0 we already have that (quick + * check). 
+ */ + if (nTerm > 0) + break; + + /* if nobody terminated, explicitly check for an idle worker */ + if (GetIdleWorker(pstate) != NO_SLOT) + break; + + /* + * If we have no idle worker, read the result of one or more + * workers and loop the loop to call ReapWorkerStatus() on them. + */ + ListenToWorkers(AH, pstate, true); } } ahlog(AH, 1, "finished main parallel loop\n"); +} - /* - * Remove the pstate again, so the exit handler will now fall back to - * closing AH->connection again. - */ - shutdown_info.pstate = NULL; +static void +restore_toc_entries_postfork(ArchiveHandle *AH, TocEntry *pending_list) +{ + RestoreOptions *ropt = AH->ropt; + TocEntry *te; + + ahlog(AH, 2, "entering restore_toc_entries_postfork\n"); /* * Now reconnect the single parent connection. @@ -3718,7 +3645,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH) * dependencies, or some other pathological condition. If so, do it in the * single parent connection. */ - for (te = pending_list.par_next; te != &pending_list; te = te->par_next) + for (te = pending_list->par_next; te != pending_list; te = te->par_next) { ahlog(AH, 1, "processing missed item %d %s %s\n", te->dumpId, te->desc, te->tag); @@ -3728,121 +3655,6 @@ restore_toc_entries_parallel(ArchiveHandle *AH) /* The ACLs will be handled back in RestoreArchive. 
*/ } -/* - * create a worker child to perform a restore step in parallel - */ -static thandle -spawn_restore(RestoreArgs *args) -{ - thandle child; - - /* Ensure stdio state is quiesced before forking */ - fflush(NULL); - -#ifndef WIN32 - child = fork(); - if (child == 0) - { - /* in child process */ - parallel_restore(args); - exit_horribly(modulename, - "parallel_restore should not return\n"); - } - else if (child < 0) - { - /* fork failed */ - exit_horribly(modulename, - "could not create worker process: %s\n", - strerror(errno)); - } -#else - child = (HANDLE) _beginthreadex(NULL, 0, (void *) parallel_restore, - args, 0, NULL); - if (child == 0) - exit_horribly(modulename, - "could not create worker thread: %s\n", - strerror(errno)); -#endif - - return child; -} - -/* - * collect status from a completed worker child - */ -static thandle -reap_child(ParallelSlot *slots, int n_slots, int *work_status) -{ -#ifndef WIN32 - /* Unix is so much easier ... */ - return wait(work_status); -#else - static HANDLE *handles = NULL; - int hindex, - snum, - tnum; - thandle ret_child; - DWORD res; - - /* first time around only, make space for handles to listen on */ - if (handles == NULL) - handles = (HANDLE *) pg_malloc0(n_slots * sizeof(HANDLE)); - - /* set up list of handles to listen to */ - for (snum = 0, tnum = 0; snum < n_slots; snum++) - if (slots[snum].child_id != 0) - handles[tnum++] = slots[snum].child_id; - - /* wait for one to finish */ - hindex = WaitForMultipleObjects(tnum, handles, false, INFINITE); - - /* get handle of finished thread */ - ret_child = handles[hindex - WAIT_OBJECT_0]; - - /* get the result */ - GetExitCodeThread(ret_child, &res); - *work_status = res; - - /* dispose of handle to stop leaks */ - CloseHandle(ret_child); - - return ret_child; -#endif -} - -/* - * are we doing anything now? 
- */ -static bool -work_in_progress(ParallelSlot *slots, int n_slots) -{ - int i; - - for (i = 0; i < n_slots; i++) - { - if (slots[i].child_id != 0) - return true; - } - return false; -} - -/* - * find the first free parallel slot (if any). - */ -static int -get_next_slot(ParallelSlot *slots, int n_slots) -{ - int i; - - for (i = 0; i < n_slots; i++) - { - if (slots[i].child_id == 0) - return i; - } - return NO_SLOT; -} - - /* * Check if te1 has an exclusive lock requirement for an item that te2 also * requires, whether or not te2's requirement is for an exclusive lock. @@ -3916,7 +3728,7 @@ par_list_remove(TocEntry *te) */ static TocEntry * get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, - ParallelSlot *slots, int n_slots) + ParallelState *pstate) { bool pref_non_data = false; /* or get from AH->ropt */ TocEntry *data_te = NULL; @@ -3931,11 +3743,11 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, { int count = 0; - for (k = 0; k < n_slots; k++) - if (slots[k].args->te != NULL && - slots[k].args->te->section == SECTION_DATA) + for (k = 0; k < pstate->numWorkers; k++) + if (pstate->parallelSlot[k].args->te != NULL && + pstate->parallelSlot[k].args->te->section == SECTION_DATA) count++; - if (n_slots == 0 || count * 4 < n_slots) + if (pstate->numWorkers == 0 || count * 4 < pstate->numWorkers) pref_non_data = false; } @@ -3951,13 +3763,13 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, * that a currently running item also needs lock on, or vice versa. If * so, we don't want to schedule them together. 
*/ - for (i = 0; i < n_slots && !conflicts; i++) + for (i = 0; i < pstate->numWorkers && !conflicts; i++) { TocEntry *running_te; - if (slots[i].args == NULL) + if (pstate->parallelSlot[i].workerStatus != WRKR_WORKING) continue; - running_te = slots[i].args->te; + running_te = pstate->parallelSlot[i].args->te; if (has_lock_conflicts(te, running_te) || has_lock_conflicts(running_te, te)) @@ -3992,63 +3804,29 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, /* * Restore a single TOC item in parallel with others * - * this is the procedure run as a thread (Windows) or a - * separate process (everything else). + * this is run in the worker, i.e. in a thread (Windows) or a separate process + * (everything else). A worker process executes several such work items during + * a parallel backup or restore. Once we terminate here and report back that + * our work is finished, the master process will assign us a new work item. */ -static parallel_restore_result -parallel_restore(RestoreArgs *args) +int +parallel_restore(ParallelArgs * args) { ArchiveHandle *AH = args->AH; TocEntry *te = args->te; RestoreOptions *ropt = AH->ropt; - int retval; - - setProcessIdentifier(args->pse, AH); - - /* - * Close and reopen the input file so we have a private file pointer that - * doesn't stomp on anyone else's file pointer, if we're actually going to - * need to read from the file. Otherwise, just close it except on Windows, - * where it will possibly be needed by other threads. - * - * Note: on Windows, since we are using threads not processes, the reopen - * call *doesn't* close the original file pointer but just open a new one. 
- */ - if (te->section == SECTION_DATA) - (AH->ReopenPtr) (AH); -#ifndef WIN32 - else - (AH->ClosePtr) (AH); -#endif - - /* - * We need our own database connection, too - */ - ConnectDatabase((Archive *) AH, ropt->dbname, - ropt->pghost, ropt->pgport, ropt->username, - ropt->promptPassword); + int status; _doSetFixedOutputState(AH); + Assert(AH->connection != NULL); + + AH->public.n_errors = 0; + /* Restore the TOC item */ - retval = restore_toc_entry(AH, te, ropt, true); + status = restore_toc_entry(AH, te, ropt, true); - /* And clean up */ - DisconnectDatabase((Archive *) AH); - unsetProcessIdentifier(args->pse); - - /* If we reopened the file, we are done with it, so close it now */ - if (te->section == SECTION_DATA) - (AH->ClosePtr) (AH); - - if (retval == 0 && AH->public.n_errors) - retval = WORKER_IGNORED_ERRORS; - -#ifndef WIN32 - exit(retval); -#else - return retval; -#endif + return status; } @@ -4060,25 +3838,12 @@ parallel_restore(RestoreArgs *args) */ static void mark_work_done(ArchiveHandle *AH, TocEntry *ready_list, - thandle worker, int status, - ParallelSlot *slots, int n_slots) + int worker, int status, + ParallelState *pstate) { TocEntry *te = NULL; - int i; - for (i = 0; i < n_slots; i++) - { - if (slots[i].child_id == worker) - { - slots[i].child_id = 0; - te = slots[i].args->te; - DeCloneArchive(slots[i].args->AH); - free(slots[i].args); - slots[i].args = NULL; - - break; - } - } + te = pstate->parallelSlot[worker].args->te; if (te == NULL) exit_horribly(modulename, "could not find slot of finished worker\n"); @@ -4179,8 +3944,8 @@ fix_dependencies(ArchiveHandle *AH) /* * Count the incoming dependencies for each item. Also, it is possible * that the dependencies list items that are not in the archive at all - * (that should not happen in 9.2 and later, but is highly likely in - * older archives). Subtract such items from the depCounts. + * (that should not happen in 9.2 and later, but is highly likely in older + * archives). 
Subtract such items from the depCounts. */ for (te = AH->toc->next; te != AH->toc; te = te->next) { @@ -4377,16 +4142,13 @@ inhibit_data_for_failed_table(ArchiveHandle *AH, TocEntry *te) } } - /* * Clone and de-clone routines used in parallel restoration. * * Enough of the structure is cloned to ensure that there is no * conflict between different threads each with their own clone. - * - * These could be public, but no need at present. */ -static ArchiveHandle * +ArchiveHandle * CloneArchive(ArchiveHandle *AH) { ArchiveHandle *clone; @@ -4412,9 +4174,60 @@ CloneArchive(ArchiveHandle *AH) /* clone has its own error count, too */ clone->public.n_errors = 0; + /* + * Connect our new clone object to the database: In parallel restore the + * parent is already disconnected, because we can connect the worker + * processes independently to the database (no snapshot sync required). In + * parallel backup we clone the parent's existing connection. + */ + if (AH->mode == archModeRead) + { + RestoreOptions *ropt = AH->ropt; + + Assert(AH->connection == NULL); + /* this also sets clone->connection */ + ConnectDatabase((Archive *) clone, ropt->dbname, + ropt->pghost, ropt->pgport, ropt->username, + ropt->promptPassword); + } + else + { + char *dbname; + char *pghost; + char *pgport; + char *username; + const char *encname; + + Assert(AH->connection != NULL); + + /* + * Even though we are technically accessing the parent's database + * object here, these functions are fine to be called like that + * because all just return a pointer and do not actually send/receive + * any data to/from the database. 
+ */ + dbname = PQdb(AH->connection); + pghost = PQhost(AH->connection); + pgport = PQport(AH->connection); + username = PQuser(AH->connection); + encname = pg_encoding_to_char(AH->public.encoding); + + /* this also sets clone->connection */ + ConnectDatabase((Archive *) clone, dbname, pghost, pgport, username, TRI_NO); + + /* + * Set the same encoding, whatever we set here is what we got from + * pg_encoding_to_char(), so we really shouldn't run into an error + * setting that very same value. Also see the comment in + * SetupConnection(). + */ + PQsetClientEncoding(clone->connection, encname); + } + /* Let the format-specific code have a chance too */ (clone->ClonePtr) (clone); + Assert(clone->connection != NULL); return clone; } @@ -4423,7 +4236,7 @@ CloneArchive(ArchiveHandle *AH) * * Note: we assume any clone-local connection was already closed. */ -static void +void DeCloneArchive(ArchiveHandle *AH) { /* Clear format-specific state */ diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index 8859bd9776..2f9434efbc 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -100,8 +100,21 @@ typedef z_stream *z_streamp; #define K_OFFSET_POS_SET 2 #define K_OFFSET_NO_DATA 3 +/* + * Special exit values from worker children. We reserve 0 for normal + * success; 1 and other small values should be interpreted as crashes. 
+ */ +#define WORKER_OK 0 +#define WORKER_CREATE_DONE 10 +#define WORKER_INHIBIT_DATA 11 +#define WORKER_IGNORED_ERRORS 12 + struct _archiveHandle; struct _tocEntry; +struct _restoreList; +struct ParallelArgs; +struct ParallelState; +enum T_Action; typedef void (*ClosePtr) (struct _archiveHandle * AH); typedef void (*ReopenPtr) (struct _archiveHandle * AH); @@ -129,6 +142,13 @@ typedef void (*PrintTocDataPtr) (struct _archiveHandle * AH, struct _tocEntry * typedef void (*ClonePtr) (struct _archiveHandle * AH); typedef void (*DeClonePtr) (struct _archiveHandle * AH); +typedef char *(*WorkerJobRestorePtr) (struct _archiveHandle * AH, struct _tocEntry * te); +typedef char *(*WorkerJobDumpPtr) (struct _archiveHandle * AH, struct _tocEntry * te); +typedef char *(*MasterStartParallelItemPtr) (struct _archiveHandle * AH, struct _tocEntry * te, + enum T_Action act); +typedef int (*MasterEndParallelItemPtr) (struct _archiveHandle * AH, struct _tocEntry * te, + const char *str, enum T_Action act); + typedef size_t (*CustomOutPtr) (struct _archiveHandle * AH, const void *buf, size_t len); typedef enum @@ -227,6 +247,13 @@ typedef struct _archiveHandle StartBlobPtr StartBlobPtr; EndBlobPtr EndBlobPtr; + MasterStartParallelItemPtr MasterStartParallelItemPtr; + MasterEndParallelItemPtr MasterEndParallelItemPtr; + + SetupWorkerPtr SetupWorkerPtr; + WorkerJobDumpPtr WorkerJobDumpPtr; + WorkerJobRestorePtr WorkerJobRestorePtr; + ClonePtr ClonePtr; /* Clone format-specific fields */ DeClonePtr DeClonePtr; /* Clean up cloned fields */ @@ -236,6 +263,7 @@ typedef struct _archiveHandle char *archdbname; /* DB name *read* from archive */ enum trivalue promptPassword; char *savedPassword; /* password for ropt->username, if known */ + char *use_role; PGconn *connection; int connectToDB; /* Flag to indicate if direct DB connection is * required */ @@ -327,6 +355,7 @@ typedef struct _tocEntry int nLockDeps; /* number of such dependencies */ } TocEntry; +extern int parallel_restore(struct 
ParallelArgs * args); extern void on_exit_close_archive(Archive *AHX); extern void warn_or_exit_horribly(ArchiveHandle *AH, const char *modulename, const char *fmt,...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4))); @@ -337,9 +366,13 @@ extern void WriteHead(ArchiveHandle *AH); extern void ReadHead(ArchiveHandle *AH); extern void WriteToc(ArchiveHandle *AH); extern void ReadToc(ArchiveHandle *AH); -extern void WriteDataChunks(ArchiveHandle *AH); +extern void WriteDataChunks(ArchiveHandle *AH, struct ParallelState *pstate); +extern void WriteDataChunksForTocEntry(ArchiveHandle *AH, TocEntry *te); +extern ArchiveHandle *CloneArchive(ArchiveHandle *AH); +extern void DeCloneArchive(ArchiveHandle *AH); extern teReqs TocIDRequired(ArchiveHandle *AH, DumpId id); +TocEntry *getTocEntryByDumpId(ArchiveHandle *AH, DumpId id); extern bool checkSeek(FILE *fp); #define appendStringLiteralAHX(buf,str,AH) \ diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index 7081598baa..c2e94ca084 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -26,6 +26,7 @@ #include "compress_io.h" #include "dumputils.h" +#include "parallel.h" /*-------- * Routines in the format interface @@ -59,6 +60,10 @@ static void _LoadBlobs(ArchiveHandle *AH, bool drop); static void _Clone(ArchiveHandle *AH); static void _DeClone(ArchiveHandle *AH); +static char *_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act); +static int _MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act); +char *_WorkerJobRestoreCustom(ArchiveHandle *AH, TocEntry *te); + typedef struct { CompressorState *cs; @@ -127,6 +132,13 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) AH->ClonePtr = _Clone; AH->DeClonePtr = _DeClone; + AH->MasterStartParallelItemPtr = _MasterStartParallelItem; + AH->MasterEndParallelItemPtr = _MasterEndParallelItem; + + /* no parallel dump in the custom archive, only parallel restore */ + 
AH->WorkerJobDumpPtr = NULL; + AH->WorkerJobRestorePtr = _WorkerJobRestoreCustom; + /* Set up a private area. */ ctx = (lclContext *) pg_malloc0(sizeof(lclContext)); AH->formatData = (void *) ctx; @@ -698,7 +710,7 @@ _CloseArchive(ArchiveHandle *AH) tpos = ftello(AH->FH); WriteToc(AH); ctx->dataStart = _getFilePos(AH, ctx); - WriteDataChunks(AH); + WriteDataChunks(AH, NULL); /* * If possible, re-write the TOC in order to update the data offset @@ -796,6 +808,80 @@ _DeClone(ArchiveHandle *AH) free(ctx); } +/* + * This function is executed in the child of a parallel restore from the + * custom format archive and restores the actual data. + */ +char * +_WorkerJobRestoreCustom(ArchiveHandle *AH, TocEntry *te) +{ + /* + * short fixed-size string + some ID so far, this needs to be malloc'ed + * instead of static because we work with threads on windows + */ + const int buflen = 64; + char *buf = (char *) pg_malloc(buflen); + ParallelArgs pargs; + int status; + + pargs.AH = AH; + pargs.te = te; + + status = parallel_restore(&pargs); + + snprintf(buf, buflen, "OK RESTORE %d %d %d", te->dumpId, status, + status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0); + + return buf; +} + +/* + * This function is executed in the parent process. Depending on the desired + * action (dump or restore) it creates a string that is understood by the + * _WorkerJobDump / _WorkerJobRestore functions of the dump format. + */ +static char * +_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act) +{ + /* + * A static char is okay here, even on Windows because we call this + * function only from one process (the master). + */ + static char buf[64]; /* short fixed-size string + number */ + + /* no parallel dump in the custom archive format */ + Assert(act == ACT_RESTORE); + + snprintf(buf, sizeof(buf), "RESTORE %d", te->dumpId); + + return buf; +} + +/* + * This function is executed in the parent process. 
It analyzes the response of + * the _WorkerJobDump / _WorkerJobRestore functions of the dump format. + */ +static int +_MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act) +{ + DumpId dumpId; + int nBytes, + status, + n_errors; + + /* no parallel dump in the custom archive */ + Assert(act == ACT_RESTORE); + + sscanf(str, "%u %u %u%n", &dumpId, &status, &n_errors, &nBytes); + + Assert(nBytes == strlen(str)); + Assert(dumpId == te->dumpId); + + AH->public.n_errors += n_errors; + + return status; +} + /*-------------------------------------------------- * END OF FORMAT CALLBACKS *-------------------------------------------------- diff --git a/src/bin/pg_dump/pg_backup_db.c b/src/bin/pg_dump/pg_backup_db.c index 4c4f24f7d5..544d01a4dd 100644 --- a/src/bin/pg_dump/pg_backup_db.c +++ b/src/bin/pg_dump/pg_backup_db.c @@ -309,12 +309,30 @@ ConnectDatabase(Archive *AHX, PQsetNoticeProcessor(AH->connection, notice_processor, NULL); } +/* + * Close the connection to the database and also cancel off the query if we + * have one running. 
+ */ void DisconnectDatabase(Archive *AHX) { ArchiveHandle *AH = (ArchiveHandle *) AHX; + PGcancel *cancel; + char errbuf[1]; - PQfinish(AH->connection); /* noop if AH->connection is NULL */ + if (!AH->connection) + return; + + if (PQtransactionStatus(AH->connection) == PQTRANS_ACTIVE) + { + if ((cancel = PQgetCancel(AH->connection))) + { + PQcancel(cancel, errbuf, sizeof(errbuf)); + PQfreeCancel(cancel); + } + } + + PQfinish(AH->connection); AH->connection = NULL; } diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c index 5b71ebaeeb..66151f584b 100644 --- a/src/bin/pg_dump/pg_backup_directory.c +++ b/src/bin/pg_dump/pg_backup_directory.c @@ -35,6 +35,7 @@ #include "compress_io.h" #include "dumputils.h" +#include "parallel.h" #include #include @@ -50,6 +51,7 @@ typedef struct cfp *dataFH; /* currently open data file */ cfp *blobsTocFH; /* file handle for blobs.toc */ + ParallelState *pstate; /* for parallel backup / restore */ } lclContext; typedef struct @@ -70,6 +72,7 @@ static int _ReadByte(ArchiveHandle *); static size_t _WriteBuf(ArchiveHandle *AH, const void *buf, size_t len); static size_t _ReadBuf(ArchiveHandle *AH, void *buf, size_t len); static void _CloseArchive(ArchiveHandle *AH); +static void _ReopenArchive(ArchiveHandle *AH); static void _PrintTocData(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt); static void _WriteExtraToc(ArchiveHandle *AH, TocEntry *te); @@ -82,8 +85,17 @@ static void _EndBlob(ArchiveHandle *AH, TocEntry *te, Oid oid); static void _EndBlobs(ArchiveHandle *AH, TocEntry *te); static void _LoadBlobs(ArchiveHandle *AH, RestoreOptions *ropt); -static char *prependDirectory(ArchiveHandle *AH, const char *relativeFilename); +static void _Clone(ArchiveHandle *AH); +static void _DeClone(ArchiveHandle *AH); +static char *_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act); +static int _MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, + const char *str, T_Action act); 
+static char *_WorkerJobRestoreDirectory(ArchiveHandle *AH, TocEntry *te); +static char *_WorkerJobDumpDirectory(ArchiveHandle *AH, TocEntry *te); + +static void setFilePath(ArchiveHandle *AH, char *buf, + const char *relativeFilename); /* * Init routine required by ALL formats. This is a global routine @@ -110,7 +122,7 @@ InitArchiveFmt_Directory(ArchiveHandle *AH) AH->WriteBufPtr = _WriteBuf; AH->ReadBufPtr = _ReadBuf; AH->ClosePtr = _CloseArchive; - AH->ReopenPtr = NULL; + AH->ReopenPtr = _ReopenArchive; AH->PrintTocDataPtr = _PrintTocData; AH->ReadExtraTocPtr = _ReadExtraToc; AH->WriteExtraTocPtr = _WriteExtraToc; @@ -121,8 +133,14 @@ InitArchiveFmt_Directory(ArchiveHandle *AH) AH->EndBlobPtr = _EndBlob; AH->EndBlobsPtr = _EndBlobs; - AH->ClonePtr = NULL; - AH->DeClonePtr = NULL; + AH->ClonePtr = _Clone; + AH->DeClonePtr = _DeClone; + + AH->WorkerJobRestorePtr = _WorkerJobRestoreDirectory; + AH->WorkerJobDumpPtr = _WorkerJobDumpDirectory; + + AH->MasterStartParallelItemPtr = _MasterStartParallelItem; + AH->MasterEndParallelItemPtr = _MasterEndParallelItem; /* Set up our private context */ ctx = (lclContext *) pg_malloc0(sizeof(lclContext)); @@ -146,16 +164,41 @@ InitArchiveFmt_Directory(ArchiveHandle *AH) if (AH->mode == archModeWrite) { - if (mkdir(ctx->directory, 0700) < 0) + struct stat st; + bool is_empty = false; + + /* we accept an empty existing directory */ + if (stat(ctx->directory, &st) == 0 && S_ISDIR(st.st_mode)) + { + DIR *dir = opendir(ctx->directory); + + if (dir) + { + struct dirent *d; + + is_empty = true; + while ((d = readdir(dir))) + { + if (strcmp(d->d_name, ".") != 0 && strcmp(d->d_name, "..") != 0) + { + is_empty = false; + break; + } + } + closedir(dir); + } + } + + if (!is_empty && mkdir(ctx->directory, 0700) < 0) exit_horribly(modulename, "could not create directory \"%s\": %s\n", ctx->directory, strerror(errno)); } else { /* Read Mode */ - char *fname; + char fname[MAXPGPATH]; cfp *tocFH; - fname = prependDirectory(AH, "toc.dat"); + 
setFilePath(AH, fname, "toc.dat"); tocFH = cfopen_read(fname, PG_BINARY_R); if (tocFH == NULL) @@ -281,9 +324,9 @@ _StartData(ArchiveHandle *AH, TocEntry *te) { lclTocEntry *tctx = (lclTocEntry *) te->formatData; lclContext *ctx = (lclContext *) AH->formatData; - char *fname; + char fname[MAXPGPATH]; - fname = prependDirectory(AH, tctx->filename); + setFilePath(AH, fname, tctx->filename); ctx->dataFH = cfopen_write(fname, PG_BINARY_W, AH->compression); if (ctx->dataFH == NULL) @@ -308,6 +351,9 @@ _WriteData(ArchiveHandle *AH, const void *data, size_t dLen) if (dLen == 0) return 0; + /* Are we aborting? */ + checkAborting(AH); + return cfwrite(data, dLen, ctx->dataFH); } @@ -375,8 +421,9 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt) _LoadBlobs(AH, ropt); else { - char *fname = prependDirectory(AH, tctx->filename); + char fname[MAXPGPATH]; + setFilePath(AH, fname, tctx->filename); _PrintFileData(AH, fname, ropt); } } @@ -386,12 +433,12 @@ _LoadBlobs(ArchiveHandle *AH, RestoreOptions *ropt) { Oid oid; lclContext *ctx = (lclContext *) AH->formatData; - char *fname; + char fname[MAXPGPATH]; char line[MAXPGPATH]; StartRestoreBlobs(AH); - fname = prependDirectory(AH, "blobs.toc"); + setFilePath(AH, fname, "blobs.toc"); ctx->blobsTocFH = cfopen_read(fname, PG_BINARY_R); @@ -474,6 +521,9 @@ _WriteBuf(ArchiveHandle *AH, const void *buf, size_t len) lclContext *ctx = (lclContext *) AH->formatData; size_t res; + /* Are we aborting? 
*/ + checkAborting(AH); + res = cfwrite(buf, len, ctx->dataFH); if (res != len) exit_horribly(modulename, "could not write to output file: %s\n", @@ -518,7 +568,12 @@ _CloseArchive(ArchiveHandle *AH) if (AH->mode == archModeWrite) { cfp *tocFH; - char *fname = prependDirectory(AH, "toc.dat"); + char fname[MAXPGPATH]; + + setFilePath(AH, fname, "toc.dat"); + + /* this will actually fork the processes for a parallel backup */ + ctx->pstate = ParallelBackupStart(AH, NULL); /* The TOC is always created uncompressed */ tocFH = cfopen_write(fname, PG_BINARY_W, 0); @@ -539,11 +594,25 @@ _CloseArchive(ArchiveHandle *AH) if (cfclose(tocFH) != 0) exit_horribly(modulename, "could not close TOC file: %s\n", strerror(errno)); - WriteDataChunks(AH); + WriteDataChunks(AH, ctx->pstate); + + ParallelBackupEnd(AH, ctx->pstate); } AH->FH = NULL; } +/* + * Reopen the archive's file handle. + */ +static void +_ReopenArchive(ArchiveHandle *AH) +{ + /* + * Our TOC is in memory, our data files are opened by each child anyway as + * they are separate. We support reopening the archive by just doing + * nothing. + */ +} /* * BLOB support @@ -560,9 +629,9 @@ static void _StartBlobs(ArchiveHandle *AH, TocEntry *te) { lclContext *ctx = (lclContext *) AH->formatData; - char *fname; + char fname[MAXPGPATH]; - fname = prependDirectory(AH, "blobs.toc"); + setFilePath(AH, fname, "blobs.toc"); /* The blob TOC file is never compressed */ ctx->blobsTocFH = cfopen_write(fname, "ab", 0); @@ -627,12 +696,16 @@ _EndBlobs(ArchiveHandle *AH, TocEntry *te) ctx->blobsTocFH = NULL; } - -static char * -prependDirectory(ArchiveHandle *AH, const char *relativeFilename) +/* + * Gets a relative file name and prepends the output directory, writing the + * result to buf. The caller needs to make sure that buf is MAXPGPATH bytes + * big. Can't use a static char[MAXPGPATH] inside the function because we run + * multithreaded on Windows. 
+ */ +static void +setFilePath(ArchiveHandle *AH, char *buf, const char *relativeFilename) { lclContext *ctx = (lclContext *) AH->formatData; - static char buf[MAXPGPATH]; char *dname; dname = ctx->directory; @@ -643,6 +716,157 @@ prependDirectory(ArchiveHandle *AH, const char *relativeFilename) strcpy(buf, dname); strcat(buf, "/"); strcat(buf, relativeFilename); +} + +/* + * Clone format-specific fields during parallel restoration. + */ +static void +_Clone(ArchiveHandle *AH) +{ + lclContext *ctx = (lclContext *) AH->formatData; + + AH->formatData = (lclContext *) pg_malloc(sizeof(lclContext)); + memcpy(AH->formatData, ctx, sizeof(lclContext)); + ctx = (lclContext *) AH->formatData; + + /* + * Note: we do not make a local lo_buf because we expect at most one BLOBS + * entry per archive, so no parallelism is possible. Likewise, + * TOC-entry-local state isn't an issue because any one TOC entry is + * touched by just one worker child. + */ + + /* + * We also don't duplicate the ParallelState (pstate): the memcpy above + * copies the pointer as-is, and only the master process ever writes to it. + */ +} + +static void +_DeClone(ArchiveHandle *AH) +{ + lclContext *ctx = (lclContext *) AH->formatData; + + free(ctx); /* releases the private copy allocated by _Clone() */ +} + +/* + * This function is executed in the parent process. Depending on the desired + * action (dump or restore) it creates a string that is understood by the + * _WorkerJobDumpDirectory / _WorkerJobRestoreDirectory functions below. + */ +static char * +_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act) +{ + /* + * A static char is okay here, even on Windows because we call this + * function only from one process (the master). + */ + static char buf[64]; + + if (act == ACT_DUMP) + snprintf(buf, sizeof(buf), "DUMP %d", te->dumpId); + else if (act == ACT_RESTORE) /* NOTE(review): any other act leaves buf as it was */ + snprintf(buf, sizeof(buf), "RESTORE %d", te->dumpId); return buf; } + +/* + * This function is executed in the child of a parallel backup for the + * directory archive and dumps the actual data. 
+ * + * We are currently returning only the DumpId so theoretically we could + * make this function return an int (or a DumpId). However, to + * facilitate further enhancements and because sooner or later we need to + * convert this to a string and send it via a message anyway, we stick with + * char *. It is parsed on the other side by the _MasterEndParallelItem() + * function of the respective dump format. + */ +static char * +_WorkerJobDumpDirectory(ArchiveHandle *AH, TocEntry *te) +{ + /* + * short fixed-size string + some ID so far, this needs to be malloc'ed + * instead of static because we work with threads on Windows + */ + const int buflen = 64; + char *buf = (char *) pg_malloc(buflen); + lclTocEntry *tctx = (lclTocEntry *) te->formatData; + + /* This should never happen */ + if (!tctx) + exit_horribly(modulename, "Error during backup\n"); + + /* + * WriteDataChunksForTocEntry() returns void. We either fail and die + * horribly or succeed... A failure will be detected by the parent when + * the child dies unexpectedly. + */ + WriteDataChunksForTocEntry(AH, te); + + snprintf(buf, buflen, "OK DUMP %d", te->dumpId); + + return buf; +} + +/* + * This function is executed in the child of a parallel restore from the + * directory archive and restores the actual data. + */ +static char * +_WorkerJobRestoreDirectory(ArchiveHandle *AH, TocEntry *te) +{ + /* + * short fixed-size string + some ID so far, this needs to be malloc'ed + * instead of static because we work with threads on Windows + */ + const int buflen = 64; + char *buf = (char *) pg_malloc(buflen); + ParallelArgs pargs; + int status; + + pargs.AH = AH; + pargs.te = te; + + status = parallel_restore(&pargs); + + snprintf(buf, buflen, "OK RESTORE %d %d %d", te->dumpId, status, + status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0); + + return buf; +} + +/* + * This function is executed in the parent process. 
It analyzes the response of + * the _WorkerJobDumpDirectory/_WorkerJobRestoreDirectory functions of the + * respective dump format. + */ +static int +_MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act) +{ + DumpId dumpId; + int nBytes, + n_errors; /* NOTE(review): not initialized; only set if the ACT_RESTORE sscanf matches */ + int status = 0; + + if (act == ACT_DUMP) + { + sscanf(str, "%u%n", &dumpId, &nBytes); /* NOTE(review): str must start at the dump ID, so the caller presumably strips the OK DUMP prefix -- confirm */ + + Assert(dumpId == te->dumpId); + Assert(nBytes == strlen(str)); + } + else if (act == ACT_RESTORE) + { + sscanf(str, "%u %u %u%n", &dumpId, &status, &n_errors, &nBytes); /* NOTE(review): return value unchecked; %u is paired with int targets */ + + Assert(dumpId == te->dumpId); + Assert(nBytes == strlen(str)); + + AH->public.n_errors += n_errors; /* fold the worker's ignored-error count into the master's total */ + } + + return status; +} diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index 03ae4f82bc..6465ac3e6c 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -158,6 +158,12 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) AH->ClonePtr = NULL; AH->DeClonePtr = NULL; + AH->MasterStartParallelItemPtr = NULL; + AH->MasterEndParallelItemPtr = NULL; + + AH->WorkerJobDumpPtr = NULL; + AH->WorkerJobRestorePtr = NULL; + /* * Set up some special context used in compressing data. 
*/ @@ -828,7 +834,7 @@ _CloseArchive(ArchiveHandle *AH) /* * Now send the data (tables & blobs) */ - WriteDataChunks(AH); + WriteDataChunks(AH, NULL); /* * Now this format wants to append a script which does a full restore diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 093be9e16d..b50e540622 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -135,6 +135,7 @@ static int disable_dollar_quoting = 0; static int dump_inserts = 0; static int column_inserts = 0; static int no_security_labels = 0; +static int no_synchronized_snapshots = 0; static int no_unlogged_table_data = 0; static int serializable_deferrable = 0; @@ -243,8 +244,6 @@ static Oid findLastBuiltinOid_V70(Archive *fout); static void selectSourceSchema(Archive *fout, const char *schemaName); static char *getFormattedTypeName(Archive *fout, Oid oid, OidOptions opts); static char *myFormatType(const char *typname, int32 typmod); -static const char *fmtQualifiedId(Archive *fout, - const char *schema, const char *id); static void getBlobs(Archive *fout); static void dumpBlob(Archive *fout, BlobInfo *binfo); static int dumpBlobs(Archive *fout, void *arg); @@ -262,8 +261,10 @@ static void binary_upgrade_extension_member(PQExpBuffer upgrade_buffer, DumpableObject *dobj, const char *objlabel); static const char *getAttrName(int attrnum, TableInfo *tblInfo); -static const char *fmtCopyColumnList(const TableInfo *ti); +static const char *fmtCopyColumnList(const TableInfo *ti, PQExpBuffer buffer); +static char *get_synchronized_snapshot(Archive *fout); static PGresult *ExecuteSqlQueryForSingleRow(Archive *fout, char *query); +static void setupDumpWorker(Archive *AHX, RestoreOptions *ropt); int @@ -284,6 +285,7 @@ main(int argc, char **argv) int numObjs; DumpableObject *boundaryObjs; int i; + int numWorkers = 1; enum trivalue prompt_password = TRI_DEFAULT; int compressLevel = -1; int plainText = 0; @@ -314,6 +316,7 @@ main(int argc, char **argv) {"format", 
required_argument, NULL, 'F'}, {"host", required_argument, NULL, 'h'}, {"ignore-version", no_argument, NULL, 'i'}, + {"jobs", 1, NULL, 'j'}, {"no-reconnect", no_argument, NULL, 'R'}, {"oids", no_argument, NULL, 'o'}, {"no-owner", no_argument, NULL, 'O'}, @@ -353,6 +356,7 @@ main(int argc, char **argv) {"serializable-deferrable", no_argument, &serializable_deferrable, 1}, {"use-set-session-authorization", no_argument, &use_setsessauth, 1}, {"no-security-labels", no_argument, &no_security_labels, 1}, + {"no-synchronized-snapshots", no_argument, &no_synchronized_snapshots, 1}, {"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1}, {NULL, 0, NULL, 0} @@ -360,6 +364,12 @@ main(int argc, char **argv) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_dump")); + /* + * Initialize what we need for parallel execution, especially for thread + * support on Windows. + */ + init_parallel_dump_utils(); + g_verbose = false; strcpy(g_comment_start, "-- "); @@ -390,7 +400,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "abcCd:E:f:F:h:iK:n:N:oOp:RsS:t:T:U:vwWxZ:", + while ((c = getopt_long(argc, argv, "abcCd:E:f:F:h:ij:K:n:N:oOp:RsS:t:T:U:vwWxZ:", long_options, &optindex)) != -1) { switch (c) @@ -435,6 +445,10 @@ main(int argc, char **argv) /* ignored, deprecated option */ break; + case 'j': /* number of dump jobs */ + numWorkers = atoi(optarg); + break; + case 'n': /* include schema(s) */ simple_string_list_append(&schema_include_patterns, optarg); include_everything = false; @@ -577,8 +591,25 @@ main(int argc, char **argv) compressLevel = 0; } + /* + * On Windows we can only have at most MAXIMUM_WAIT_OBJECTS (= 64 usually) + * parallel jobs because that's the maximum limit for the + * WaitForMultipleObjects() call. 
+ */ + if (numWorkers <= 0 +#ifdef WIN32 + || numWorkers > MAXIMUM_WAIT_OBJECTS +#endif + ) + exit_horribly(NULL, "%s: invalid number of parallel jobs\n", progname); + + /* Parallel backup only in the directory archive format so far */ + if (archiveFormat != archDirectory && numWorkers > 1) + exit_horribly(NULL, "parallel backup only supported by the directory format\n"); + /* Open the output file */ - fout = CreateArchive(filename, archiveFormat, compressLevel, archiveMode); + fout = CreateArchive(filename, archiveFormat, compressLevel, archiveMode, + setupDumpWorker); /* Register the cleanup hook */ on_exit_close_archive(fout); @@ -600,6 +631,8 @@ main(int argc, char **argv) fout->minRemoteVersion = 70000; fout->maxRemoteVersion = (my_version / 100) * 100 + 99; + fout->numWorkers = numWorkers; + /* * Open the database using the Archiver, so it knows about it. Errors mean * death. @@ -620,7 +653,8 @@ main(int argc, char **argv) */ if (fout->remoteVersion >= 90000) { - PGresult *res = ExecuteSqlQueryForSingleRow(fout, "SELECT pg_catalog.pg_is_in_recovery()"); + PGresult *res = ExecuteSqlQueryForSingleRow(fout, "SELECT pg_catalog.pg_is_in_recovery()"); + if (strcmp(PQgetvalue(res, 0, 0), "t") == 0) { /* @@ -632,32 +666,6 @@ main(int argc, char **argv) PQclear(res); } - /* - * Start transaction-snapshot mode transaction to dump consistent data. 
- */ - ExecuteSqlStatement(fout, "BEGIN"); - if (fout->remoteVersion >= 90100) - { - if (serializable_deferrable) - ExecuteSqlStatement(fout, - "SET TRANSACTION ISOLATION LEVEL " - "SERIALIZABLE, READ ONLY, DEFERRABLE"); - else - ExecuteSqlStatement(fout, - "SET TRANSACTION ISOLATION LEVEL " - "REPEATABLE READ, READ ONLY"); - } - else if (fout->remoteVersion >= 70400) - { - /* note: comma was not accepted in SET TRANSACTION before 8.0 */ - ExecuteSqlStatement(fout, - "SET TRANSACTION ISOLATION LEVEL " - "SERIALIZABLE READ ONLY"); - } - else - ExecuteSqlStatement(fout, - "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE"); - /* Select the appropriate subquery to convert user IDs to names */ if (fout->remoteVersion >= 80100) username_subquery = "SELECT rolname FROM pg_catalog.pg_roles WHERE oid ="; @@ -666,6 +674,14 @@ main(int argc, char **argv) else username_subquery = "SELECT usename FROM pg_user WHERE usesysid ="; + /* check the version for the synchronized snapshots feature */ + if (numWorkers > 1 && fout->remoteVersion < 90200 + && !no_synchronized_snapshots) + exit_horribly(NULL, + "No synchronized snapshots available in this server version.\n" + "Run with --no-synchronized-snapshots instead if you do not\n" + "need synchronized snapshots.\n"); + /* Find the last built-in OID, if needed */ if (fout->remoteVersion < 70300) { @@ -763,6 +779,10 @@ main(int argc, char **argv) else sortDumpableObjectsByTypeOid(dobjs, numObjs); + /* If we do a parallel dump, we want the largest tables to go first */ + if (archiveFormat == archDirectory && numWorkers > 1) + sortDataAndIndexObjectsBySize(dobjs, numObjs); + sortDumpableObjects(dobjs, numObjs, boundaryObjs[0].dumpId, boundaryObjs[1].dumpId); @@ -810,9 +830,9 @@ main(int argc, char **argv) SetArchiveRestoreOptions(fout, ropt); /* - * The archive's TOC entries are now marked as to which ones will - * actually be output, so we can set up their dependency lists properly. - * This isn't necessary for plain-text output, though. 
+ * The archive's TOC entries are now marked as to which ones will actually + * be output, so we can set up their dependency lists properly. This isn't + * necessary for plain-text output, though. */ if (!plainText) BuildArchiveDependencies(fout); @@ -844,6 +864,7 @@ help(const char *progname) printf(_(" -f, --file=FILENAME output file or directory name\n")); printf(_(" -F, --format=c|d|t|p output file format (custom, directory, tar,\n" " plain text (default))\n")); + printf(_(" -j, --jobs=NUM use this many parallel jobs to dump\n")); printf(_(" -v, --verbose verbose mode\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -Z, --compress=0-9 compression level for compressed formats\n")); @@ -873,6 +894,7 @@ help(const char *progname) printf(_(" --exclude-table-data=TABLE do NOT dump data for the named table(s)\n")); printf(_(" --inserts dump data as INSERT commands, rather than COPY\n")); printf(_(" --no-security-labels do not dump security label assignments\n")); + printf(_(" --no-synchronized-snapshots parallel processes should not use synchronized snapshots\n")); printf(_(" --no-tablespaces do not dump tablespace assignments\n")); printf(_(" --no-unlogged-table-data do not dump unlogged table data\n")); printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n")); @@ -902,7 +924,12 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) PGconn *conn = GetConnection(AH); const char *std_strings; - /* Set the client encoding if requested */ + /* + * Set the client encoding if requested. If dumpencoding == NULL then + * either it hasn't been requested or we're a cloned connection and then + * this has already been set in CloneArchive according to the original + * connection encoding. 
+ */ if (dumpencoding) { if (PQsetClientEncoding(conn, dumpencoding) < 0) @@ -919,6 +946,10 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) std_strings = PQparameterStatus(conn, "standard_conforming_strings"); AH->std_strings = (std_strings && strcmp(std_strings, "on") == 0); + /* No role passed in: reuse the one saved by the original connection */ + if (!use_role && AH->use_role) + use_role = AH->use_role; + /* Set the role if requested */ if (use_role && AH->remoteVersion >= 80100) { @@ -927,6 +958,10 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) appendPQExpBuffer(query, "SET ROLE %s", fmtId(use_role)); ExecuteSqlStatement(AH, query->data); destroyPQExpBuffer(query); + + /* save this for later use on parallel connections */ + if (!AH->use_role) + AH->use_role = strdup(use_role); } /* Set the datestyle to ISO to ensure the dump's portability */ @@ -965,6 +1000,68 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) */ if (quote_all_identifiers && AH->remoteVersion >= 90100) ExecuteSqlStatement(AH, "SET quote_all_identifiers = true"); + + /* + * Start transaction-snapshot mode transaction to dump consistent data. 
+ */ + ExecuteSqlStatement(AH, "BEGIN"); + if (AH->remoteVersion >= 90100) + { + if (serializable_deferrable) + ExecuteSqlStatement(AH, + "SET TRANSACTION ISOLATION LEVEL " + "SERIALIZABLE, READ ONLY, DEFERRABLE"); + else + ExecuteSqlStatement(AH, + "SET TRANSACTION ISOLATION LEVEL " + "REPEATABLE READ, READ ONLY"); + } + else if (AH->remoteVersion >= 70400) + { + /* note: comma was not accepted in SET TRANSACTION before 8.0 */ + ExecuteSqlStatement(AH, + "SET TRANSACTION ISOLATION LEVEL " + "SERIALIZABLE READ ONLY"); + } + else + ExecuteSqlStatement(AH, + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE"); + + /* Adopt the already-exported snapshot if there is one, else export ours */ + if (AH->numWorkers > 1 && AH->remoteVersion >= 90200 && !no_synchronized_snapshots) + { + if (AH->sync_snapshot_id) + { + PQExpBuffer query = createPQExpBuffer(); + + appendPQExpBuffer(query, "SET TRANSACTION SNAPSHOT "); + appendStringLiteralConn(query, AH->sync_snapshot_id, conn); + ExecuteSqlStatement(AH, query->data); + destroyPQExpBuffer(query); + } + else + AH->sync_snapshot_id = get_synchronized_snapshot(AH); + } +} + +static void +setupDumpWorker(Archive *AHX, RestoreOptions *ropt) +{ + setup_connection(AHX, NULL, NULL); +} + +static char * +get_synchronized_snapshot(Archive *fout) +{ + char *query = "SELECT pg_export_snapshot()"; + char *result; + PGresult *res; + + res = ExecuteSqlQueryForSingleRow(fout, query); + result = strdup(PQgetvalue(res, 0, 0)); + PQclear(res); + + return result; } static ArchiveFormat @@ -1080,7 +1177,7 @@ expand_table_name_patterns(Archive *fout, "SELECT c.oid" "\nFROM pg_catalog.pg_class c" "\n LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace" - "\nWHERE c.relkind in ('%c', '%c', '%c', '%c', '%c')\n", + "\nWHERE c.relkind in ('%c', '%c', '%c', '%c', '%c')\n", RELKIND_RELATION, RELKIND_SEQUENCE, RELKIND_VIEW, RELKIND_MATVIEW, RELKIND_FOREIGN_TABLE); processSQLNamePattern(GetConnection(fout), query, cell->val, true, @@ -1282,6 +1379,12 @@ dumpTableData_copy(Archive *fout, void *dcontext) const bool hasoids = tbinfo->hasoids; const bool oids = 
tdinfo->oids; PQExpBuffer q = createPQExpBuffer(); + + /* + * Note: can't use getThreadLocalPQExpBuffer() here, we're calling fmtId + * which uses it already. + */ + PQExpBuffer clistBuf = createPQExpBuffer(); PGconn *conn = GetConnection(fout); PGresult *res; int ret; @@ -1306,14 +1409,14 @@ dumpTableData_copy(Archive *fout, void *dcontext) * cases involving ADD COLUMN and inheritance.) */ if (fout->remoteVersion >= 70300) - column_list = fmtCopyColumnList(tbinfo); + column_list = fmtCopyColumnList(tbinfo, clistBuf); else column_list = ""; /* can't select columns in COPY */ if (oids && hasoids) { appendPQExpBuffer(q, "COPY %s %s WITH OIDS TO stdout;", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname), column_list); @@ -1331,7 +1434,7 @@ dumpTableData_copy(Archive *fout, void *dcontext) else appendPQExpBufferStr(q, "* "); appendPQExpBuffer(q, "FROM %s %s) TO stdout;", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname), tdinfo->filtercond); @@ -1339,13 +1442,14 @@ dumpTableData_copy(Archive *fout, void *dcontext) else { appendPQExpBuffer(q, "COPY %s %s TO stdout;", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname), column_list); } res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT); PQclear(res); + destroyPQExpBuffer(clistBuf); for (;;) { @@ -1464,7 +1568,7 @@ dumpTableData_insert(Archive *fout, void *dcontext) { appendPQExpBuffer(q, "DECLARE _pg_dump_cursor CURSOR FOR " "SELECT * FROM ONLY %s", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname)); } @@ -1472,7 +1576,7 @@ dumpTableData_insert(Archive *fout, void *dcontext) { appendPQExpBuffer(q, "DECLARE _pg_dump_cursor CURSOR FOR " "SELECT * FROM %s", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname)); } @@ -1604,6 +1708,7 @@ 
dumpTableData(Archive *fout, TableDataInfo *tdinfo) { TableInfo *tbinfo = tdinfo->tdtable; PQExpBuffer copyBuf = createPQExpBuffer(); + PQExpBuffer clistBuf = createPQExpBuffer(); DataDumperPtr dumpFn; char *copyStmt; @@ -1615,7 +1720,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo) appendPQExpBuffer(copyBuf, "COPY %s ", fmtId(tbinfo->dobj.name)); appendPQExpBuffer(copyBuf, "%s %sFROM stdin;\n", - fmtCopyColumnList(tbinfo), + fmtCopyColumnList(tbinfo, clistBuf), (tdinfo->oids && tbinfo->hasoids) ? "WITH OIDS " : ""); copyStmt = copyBuf->data; } @@ -1640,6 +1745,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo) dumpFn, tdinfo); destroyPQExpBuffer(copyBuf); + destroyPQExpBuffer(clistBuf); } /* @@ -1665,22 +1771,22 @@ refreshMatViewData(Archive *fout, TableDataInfo *tdinfo) fmtId(tbinfo->dobj.name)); ArchiveEntry(fout, - tdinfo->dobj.catId, /* catalog ID */ - tdinfo->dobj.dumpId, /* dump ID */ - tbinfo->dobj.name, /* Name */ - tbinfo->dobj.namespace->dobj.name, /* Namespace */ - NULL, /* Tablespace */ - tbinfo->rolname, /* Owner */ - false, /* with oids */ - "MATERIALIZED VIEW DATA", /* Desc */ - SECTION_POST_DATA, /* Section */ - q->data, /* Create */ - "", /* Del */ - NULL, /* Copy */ - tdinfo->dobj.dependencies, /* Deps */ - tdinfo->dobj.nDeps, /* # Deps */ - NULL, /* Dumper */ - NULL); /* Dumper Arg */ + tdinfo->dobj.catId, /* catalog ID */ + tdinfo->dobj.dumpId, /* dump ID */ + tbinfo->dobj.name, /* Name */ + tbinfo->dobj.namespace->dobj.name, /* Namespace */ + NULL, /* Tablespace */ + tbinfo->rolname, /* Owner */ + false, /* with oids */ + "MATERIALIZED VIEW DATA", /* Desc */ + SECTION_POST_DATA, /* Section */ + q->data, /* Create */ + "", /* Del */ + NULL, /* Copy */ + tdinfo->dobj.dependencies, /* Deps */ + tdinfo->dobj.nDeps, /* # Deps */ + NULL, /* Dumper */ + NULL); /* Dumper Arg */ destroyPQExpBuffer(q); } @@ -1790,12 +1896,12 @@ buildMatViewRefreshDependencies(Archive *fout) appendPQExpBuffer(query, "with recursive w as " "( " - "select 
d1.objid, d2.refobjid, c2.relkind as refrelkind " + "select d1.objid, d2.refobjid, c2.relkind as refrelkind " "from pg_depend d1 " "join pg_class c1 on c1.oid = d1.objid " "and c1.relkind = 'm' " "join pg_rewrite r1 on r1.ev_class = d1.objid " - "join pg_depend d2 on d2.classid = 'pg_rewrite'::regclass " + "join pg_depend d2 on d2.classid = 'pg_rewrite'::regclass " "and d2.objid = r1.oid " "and d2.refobjid <> d1.objid " "join pg_class c2 on c2.oid = d2.refobjid " @@ -1805,13 +1911,13 @@ buildMatViewRefreshDependencies(Archive *fout) "select w.objid, d3.refobjid, c3.relkind " "from w " "join pg_rewrite r3 on r3.ev_class = w.refobjid " - "join pg_depend d3 on d3.classid = 'pg_rewrite'::regclass " + "join pg_depend d3 on d3.classid = 'pg_rewrite'::regclass " "and d3.objid = r3.oid " "and d3.refobjid <> w.refobjid " "join pg_class c3 on c3.oid = d3.refobjid " "and c3.relkind in ('m','v') " ") " - "select 'pg_class'::regclass::oid as classid, objid, refobjid " + "select 'pg_class'::regclass::oid as classid, objid, refobjid " "from w " "where refrelkind = 'm'"); @@ -1827,10 +1933,10 @@ buildMatViewRefreshDependencies(Archive *fout) { CatalogId objId; CatalogId refobjId; - DumpableObject *dobj; - DumpableObject *refdobj; - TableInfo *tbinfo; - TableInfo *reftbinfo; + DumpableObject *dobj; + DumpableObject *refdobj; + TableInfo *tbinfo; + TableInfo *reftbinfo; objId.tableoid = atooid(PQgetvalue(res, i, i_classid)); objId.oid = atooid(PQgetvalue(res, i, i_objid)); @@ -3760,7 +3866,7 @@ getAggregates(Archive *fout, int *numAggs) appendPQExpBuffer(query, "SELECT tableoid, oid, proname AS aggname, " "pronamespace AS aggnamespace, " "pronargs, proargtypes, " - "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs," + "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs," "(%s proowner) AS rolname, " "proacl AS aggacl " "FROM pg_proc p " @@ -3953,7 +4059,7 @@ getFuncs(Archive *fout, int *numFuncs) "SELECT tableoid, oid, proname, prolang, " "pronargs, 
proargtypes, prorettype, proacl, " "pronamespace, " - "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs," + "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs," "(%s proowner) AS rolname " "FROM pg_proc p " "WHERE NOT proisagg AND (" @@ -4122,6 +4228,7 @@ getTables(Archive *fout, int *numTables) int i_reloptions; int i_toastreloptions; int i_reloftype; + int i_relpages; /* Make sure we are in proper schema */ selectSourceSchema(fout, "pg_catalog"); @@ -4160,7 +4267,8 @@ getTables(Archive *fout, int *numTables) "c.relhasindex, c.relhasrules, c.relhasoids, " "c.relfrozenxid, tc.oid AS toid, " "tc.relfrozenxid AS tfrozenxid, " - "c.relpersistence, pg_relation_is_scannable(c.oid) as isscannable, " + "c.relpersistence, pg_relation_is_scannable(c.oid) as isscannable, " + "c.relpages, " "CASE WHEN c.reloftype <> 0 THEN c.reloftype::pg_catalog.regtype ELSE NULL END AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4174,7 +4282,7 @@ getTables(Archive *fout, int *numTables) "d.objsubid = 0 AND " "d.refclassid = c.tableoid AND d.deptype = 'a') " "LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid) " - "WHERE c.relkind in ('%c', '%c', '%c', '%c', '%c', '%c') " + "WHERE c.relkind in ('%c', '%c', '%c', '%c', '%c', '%c') " "ORDER BY c.oid", username_subquery, RELKIND_SEQUENCE, @@ -4210,7 +4318,7 @@ getTables(Archive *fout, int *numTables) "d.objsubid = 0 AND " "d.refclassid = c.tableoid AND d.deptype = 'a') " "LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid) " - "WHERE c.relkind in ('%c', '%c', '%c', '%c', '%c', '%c') " + "WHERE c.relkind in ('%c', '%c', '%c', '%c', '%c', '%c') " "ORDER BY c.oid", username_subquery, RELKIND_SEQUENCE, @@ -4233,6 +4341,7 @@ getTables(Archive *fout, int *numTables) "c.relfrozenxid, tc.oid AS toid, " "tc.relfrozenxid AS tfrozenxid, " "'p' AS relpersistence, 't'::bool as isscannable, " + "c.relpages, " "CASE WHEN c.reloftype <> 0 THEN c.reloftype::pg_catalog.regtype ELSE NULL END AS 
reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4268,6 +4377,7 @@ getTables(Archive *fout, int *numTables) "c.relfrozenxid, tc.oid AS toid, " "tc.relfrozenxid AS tfrozenxid, " "'p' AS relpersistence, 't'::bool as isscannable, " + "c.relpages, " "NULL AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4303,6 +4413,7 @@ getTables(Archive *fout, int *numTables) "c.relfrozenxid, tc.oid AS toid, " "tc.relfrozenxid AS tfrozenxid, " "'p' AS relpersistence, 't'::bool as isscannable, " + "c.relpages, " "NULL AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4339,6 +4450,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, 't'::bool as isscannable, " + "relpages, " "NULL AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4374,6 +4486,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, 't'::bool as isscannable, " + "relpages, " "NULL AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4405,6 +4518,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, 't'::bool as isscannable, " + "relpages, " "NULL AS reloftype, " "NULL::oid AS owning_tab, " "NULL::int4 AS owning_col, " @@ -4431,6 +4545,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, 't'::bool as isscannable, " + "relpages, " "NULL AS reloftype, " "NULL::oid AS owning_tab, " "NULL::int4 AS owning_col, " @@ -4467,6 +4582,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, 't'::bool as isscannable, " + "0 AS relpages, " "NULL AS reloftype, " "NULL::oid AS owning_tab, " "NULL::int4 AS owning_col, " @@ -4515,6 +4631,7 @@ getTables(Archive *fout, int *numTables) i_toastfrozenxid = PQfnumber(res, "tfrozenxid"); i_relpersistence 
= PQfnumber(res, "relpersistence"); i_isscannable = PQfnumber(res, "isscannable"); + i_relpages = PQfnumber(res, "relpages"); i_owning_tab = PQfnumber(res, "owning_tab"); i_owning_col = PQfnumber(res, "owning_col"); i_reltablespace = PQfnumber(res, "reltablespace"); @@ -4557,6 +4674,7 @@ getTables(Archive *fout, int *numTables) tblinfo[i].hastriggers = (strcmp(PQgetvalue(res, i, i_relhastriggers), "t") == 0); tblinfo[i].hasoids = (strcmp(PQgetvalue(res, i, i_relhasoids), "t") == 0); tblinfo[i].isscannable = (strcmp(PQgetvalue(res, i, i_isscannable), "t") == 0); + tblinfo[i].relpages = atoi(PQgetvalue(res, i, i_relpages)); tblinfo[i].frozenxid = atooid(PQgetvalue(res, i, i_relfrozenxid)); tblinfo[i].toast_oid = atooid(PQgetvalue(res, i, i_toastoid)); tblinfo[i].toast_frozenxid = atooid(PQgetvalue(res, i, i_toastfrozenxid)); @@ -4606,7 +4724,7 @@ getTables(Archive *fout, int *numTables) resetPQExpBuffer(query); appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tblinfo[i].dobj.namespace->dobj.name, tblinfo[i].dobj.name)); ExecuteSqlStatement(fout, query->data); @@ -4745,7 +4863,8 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_conoid, i_condef, i_tablespace, - i_options; + i_options, + i_relpages; int ntups; for (i = 0; i < numTables; i++) @@ -4790,6 +4909,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " + "t.relpages, " "c.contype, c.conname, " "c.condeferrable, c.condeferred, " "c.tableoid AS contableoid, " @@ -4815,6 +4935,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " + "t.relpages, " "c.contype, c.conname, " "c.condeferrable, c.condeferred, " "c.tableoid AS contableoid, " @@ -4843,6 +4964,7 @@ 
getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " + "t.relpages, " "c.contype, c.conname, " "c.condeferrable, c.condeferred, " "c.tableoid AS contableoid, " @@ -4871,6 +4993,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " + "t.relpages, " "c.contype, c.conname, " "c.condeferrable, c.condeferred, " "c.tableoid AS contableoid, " @@ -4899,6 +5022,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, false AS indisclustered, " + "t.relpages, " "CASE WHEN i.indisprimary THEN 'p'::char " "ELSE '0'::char END AS contype, " "t.relname AS conname, " @@ -4925,6 +5049,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, false AS indisclustered, " + "t.relpages, " "CASE WHEN i.indisprimary THEN 'p'::char " "ELSE '0'::char END AS contype, " "t.relname AS conname, " @@ -4953,6 +5078,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_indnkeys = PQfnumber(res, "indnkeys"); i_indkey = PQfnumber(res, "indkey"); i_indisclustered = PQfnumber(res, "indisclustered"); + i_relpages = PQfnumber(res, "relpages"); i_contype = PQfnumber(res, "contype"); i_conname = PQfnumber(res, "conname"); i_condeferrable = PQfnumber(res, "condeferrable"); @@ -4995,6 +5121,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) parseOidArray(PQgetvalue(res, j, i_indkey), indxinfo[j].indkeys, INDEX_MAX_KEYS); indxinfo[j].indisclustered = (PQgetvalue(res, j, i_indisclustered)[0] == 't'); + indxinfo[j].relpages = atoi(PQgetvalue(res, j, i_relpages)); contype = *(PQgetvalue(res, j, i_contype)); if (contype == 'p' || contype == 'u' || 
contype == 'x') @@ -5345,7 +5472,7 @@ getRules(Archive *fout, int *numRules) * table. */ if ((ruleinfo[i].ruletable->relkind == RELKIND_VIEW || - ruleinfo[i].ruletable->relkind == RELKIND_MATVIEW) && + ruleinfo[i].ruletable->relkind == RELKIND_MATVIEW) && ruleinfo[i].ev_type == '1' && ruleinfo[i].is_instead) { addObjectDependency(&ruleinfo[i].ruletable->dobj, @@ -7733,11 +7860,11 @@ dumpExtension(Archive *fout, ExtensionInfo *extinfo) appendPQExpBuffer(q, "-- For binary upgrade, create an empty extension and insert objects into it\n"); /* - * We unconditionally create the extension, so we must drop it if it - * exists. This could happen if the user deleted 'plpgsql' and then - * readded it, causing its oid to be greater than FirstNormalObjectId. - * The FirstNormalObjectId test was kept to avoid repeatedly dropping - * and recreating extensions like 'plpgsql'. + * We unconditionally create the extension, so we must drop it if it + * exists. This could happen if the user deleted 'plpgsql' and then + * readded it, causing its oid to be greater than FirstNormalObjectId. + * The FirstNormalObjectId test was kept to avoid repeatedly dropping + * and recreating extensions like 'plpgsql'. 
*/ appendPQExpBuffer(q, "DROP EXTENSION IF EXISTS %s;\n", qextname); @@ -12138,7 +12265,7 @@ dumpDefaultACL(Archive *fout, DefaultACLInfo *daclinfo) default: /* shouldn't get here */ exit_horribly(NULL, - "unrecognized object type in default privileges: %d\n", + "unrecognized object type in default privileges: %d\n", (int) daclinfo->defaclobjtype); type = ""; /* keep compiler quiet */ } @@ -12611,7 +12738,7 @@ createViewAsClause(Archive *fout, TableInfo *tbinfo) { /* Beginning in 7.3, viewname is not unique; rely on OID */ appendPQExpBuffer(query, - "SELECT pg_catalog.pg_get_viewdef('%u'::pg_catalog.oid) AS viewdef", + "SELECT pg_catalog.pg_get_viewdef('%u'::pg_catalog.oid) AS viewdef", tbinfo->dobj.catId.oid); } else @@ -12641,7 +12768,7 @@ createViewAsClause(Archive *fout, TableInfo *tbinfo) tbinfo->dobj.name); /* Strip off the trailing semicolon so that other things may follow. */ - Assert(PQgetvalue(res, 0, 0)[len-1] == ';'); + Assert(PQgetvalue(res, 0, 0)[len - 1] == ';'); appendBinaryPQExpBuffer(result, PQgetvalue(res, 0, 0), len - 1); PQclear(res); @@ -12712,37 +12839,37 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) switch (tbinfo->relkind) { case (RELKIND_FOREIGN_TABLE): - { - PQExpBuffer query = createPQExpBuffer(); - PGresult *res; - int i_srvname; - int i_ftoptions; + { + PQExpBuffer query = createPQExpBuffer(); + PGresult *res; + int i_srvname; + int i_ftoptions; - reltypename = "FOREIGN TABLE"; + reltypename = "FOREIGN TABLE"; - /* retrieve name of foreign server and generic options */ - appendPQExpBuffer(query, - "SELECT fs.srvname, " - "pg_catalog.array_to_string(ARRAY(" - "SELECT pg_catalog.quote_ident(option_name) || " - "' ' || pg_catalog.quote_literal(option_value) " - "FROM pg_catalog.pg_options_to_table(ftoptions) " - "ORDER BY option_name" - "), E',\n ') AS ftoptions " - "FROM pg_catalog.pg_foreign_table ft " - "JOIN pg_catalog.pg_foreign_server fs " - "ON (fs.oid = ft.ftserver) " - "WHERE ft.ftrelid = '%u'", - tbinfo->dobj.catId.oid); 
- res = ExecuteSqlQueryForSingleRow(fout, query->data); - i_srvname = PQfnumber(res, "srvname"); - i_ftoptions = PQfnumber(res, "ftoptions"); - srvname = pg_strdup(PQgetvalue(res, 0, i_srvname)); - ftoptions = pg_strdup(PQgetvalue(res, 0, i_ftoptions)); - PQclear(res); - destroyPQExpBuffer(query); - break; - } + /* retrieve name of foreign server and generic options */ + appendPQExpBuffer(query, + "SELECT fs.srvname, " + "pg_catalog.array_to_string(ARRAY(" + "SELECT pg_catalog.quote_ident(option_name) || " + "' ' || pg_catalog.quote_literal(option_value) " + "FROM pg_catalog.pg_options_to_table(ftoptions) " + "ORDER BY option_name" + "), E',\n ') AS ftoptions " + "FROM pg_catalog.pg_foreign_table ft " + "JOIN pg_catalog.pg_foreign_server fs " + "ON (fs.oid = ft.ftserver) " + "WHERE ft.ftrelid = '%u'", + tbinfo->dobj.catId.oid); + res = ExecuteSqlQueryForSingleRow(fout, query->data); + i_srvname = PQfnumber(res, "srvname"); + i_ftoptions = PQfnumber(res, "ftoptions"); + srvname = pg_strdup(PQgetvalue(res, 0, i_srvname)); + ftoptions = pg_strdup(PQgetvalue(res, 0, i_ftoptions)); + PQclear(res); + destroyPQExpBuffer(query); + break; + } case (RELKIND_MATVIEW): reltypename = "MATERIALIZED VIEW"; srvname = NULL; @@ -12788,156 +12915,158 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) if (tbinfo->relkind != RELKIND_MATVIEW) { - /* Dump the attributes */ - actual_atts = 0; - for (j = 0; j < tbinfo->numatts; j++) - { - /* - * Normally, dump if it's locally defined in this table, and not - * dropped. But for binary upgrade, we'll dump all the columns, - * and then fix up the dropped and nonlocal cases below. - */ - if (shouldPrintColumn(tbinfo, j)) + /* Dump the attributes */ + actual_atts = 0; + for (j = 0; j < tbinfo->numatts; j++) { /* - * Default value --- suppress if to be printed separately. + * Normally, dump if it's locally defined in this table, and + * not dropped. 
But for binary upgrade, we'll dump all the + * columns, and then fix up the dropped and nonlocal cases + * below. */ - bool has_default = (tbinfo->attrdefs[j] != NULL && - !tbinfo->attrdefs[j]->separate); - - /* - * Not Null constraint --- suppress if inherited, except in - * binary-upgrade case where that won't work. - */ - bool has_notnull = (tbinfo->notnull[j] && - (!tbinfo->inhNotNull[j] || - binary_upgrade)); - - /* Skip column if fully defined by reloftype */ - if (tbinfo->reloftype && - !has_default && !has_notnull && !binary_upgrade) - continue; - - /* Format properly if not first attr */ - if (actual_atts == 0) - appendPQExpBuffer(q, " ("); - else - appendPQExpBuffer(q, ","); - appendPQExpBuffer(q, "\n "); - actual_atts++; - - /* Attribute name */ - appendPQExpBuffer(q, "%s", - fmtId(tbinfo->attnames[j])); - - if (tbinfo->attisdropped[j]) + if (shouldPrintColumn(tbinfo, j)) { /* - * ALTER TABLE DROP COLUMN clears pg_attribute.atttypid, - * so we will not have gotten a valid type name; insert - * INTEGER as a stopgap. We'll clean things up later. + * Default value --- suppress if to be printed separately. */ - appendPQExpBuffer(q, " INTEGER /* dummy */"); - /* Skip all the rest, too */ - continue; - } + bool has_default = (tbinfo->attrdefs[j] != NULL && + !tbinfo->attrdefs[j]->separate); - /* Attribute type */ - if (tbinfo->reloftype && !binary_upgrade) - { - appendPQExpBuffer(q, " WITH OPTIONS"); - } - else if (fout->remoteVersion >= 70100) - { - appendPQExpBuffer(q, " %s", - tbinfo->atttypnames[j]); - } - else - { - /* If no format_type, fake it */ - appendPQExpBuffer(q, " %s", - myFormatType(tbinfo->atttypnames[j], - tbinfo->atttypmod[j])); - } + /* + * Not Null constraint --- suppress if inherited, except + * in binary-upgrade case where that won't work. 
+ */ + bool has_notnull = (tbinfo->notnull[j] && + (!tbinfo->inhNotNull[j] || + binary_upgrade)); - /* Add collation if not default for the type */ - if (OidIsValid(tbinfo->attcollation[j])) - { - CollInfo *coll; + /* Skip column if fully defined by reloftype */ + if (tbinfo->reloftype && + !has_default && !has_notnull && !binary_upgrade) + continue; - coll = findCollationByOid(tbinfo->attcollation[j]); - if (coll) + /* Format properly if not first attr */ + if (actual_atts == 0) + appendPQExpBuffer(q, " ("); + else + appendPQExpBuffer(q, ","); + appendPQExpBuffer(q, "\n "); + actual_atts++; + + /* Attribute name */ + appendPQExpBuffer(q, "%s", + fmtId(tbinfo->attnames[j])); + + if (tbinfo->attisdropped[j]) { - /* always schema-qualify, don't try to be smart */ - appendPQExpBuffer(q, " COLLATE %s.", - fmtId(coll->dobj.namespace->dobj.name)); - appendPQExpBuffer(q, "%s", - fmtId(coll->dobj.name)); + /* + * ALTER TABLE DROP COLUMN clears + * pg_attribute.atttypid, so we will not have gotten a + * valid type name; insert INTEGER as a stopgap. We'll + * clean things up later. 
+ */ + appendPQExpBuffer(q, " INTEGER /* dummy */"); + /* Skip all the rest, too */ + continue; } + + /* Attribute type */ + if (tbinfo->reloftype && !binary_upgrade) + { + appendPQExpBuffer(q, " WITH OPTIONS"); + } + else if (fout->remoteVersion >= 70100) + { + appendPQExpBuffer(q, " %s", + tbinfo->atttypnames[j]); + } + else + { + /* If no format_type, fake it */ + appendPQExpBuffer(q, " %s", + myFormatType(tbinfo->atttypnames[j], + tbinfo->atttypmod[j])); + } + + /* Add collation if not default for the type */ + if (OidIsValid(tbinfo->attcollation[j])) + { + CollInfo *coll; + + coll = findCollationByOid(tbinfo->attcollation[j]); + if (coll) + { + /* always schema-qualify, don't try to be smart */ + appendPQExpBuffer(q, " COLLATE %s.", + fmtId(coll->dobj.namespace->dobj.name)); + appendPQExpBuffer(q, "%s", + fmtId(coll->dobj.name)); + } + } + + if (has_default) + appendPQExpBuffer(q, " DEFAULT %s", + tbinfo->attrdefs[j]->adef_expr); + + if (has_notnull) + appendPQExpBuffer(q, " NOT NULL"); } - - if (has_default) - appendPQExpBuffer(q, " DEFAULT %s", - tbinfo->attrdefs[j]->adef_expr); - - if (has_notnull) - appendPQExpBuffer(q, " NOT NULL"); } - } - /* - * Add non-inherited CHECK constraints, if any. - */ - for (j = 0; j < tbinfo->ncheck; j++) - { - ConstraintInfo *constr = &(tbinfo->checkexprs[j]); - - if (constr->separate || !constr->conislocal) - continue; - - if (actual_atts == 0) - appendPQExpBuffer(q, " (\n "); - else - appendPQExpBuffer(q, ",\n "); - - appendPQExpBuffer(q, "CONSTRAINT %s ", - fmtId(constr->dobj.name)); - appendPQExpBuffer(q, "%s", constr->condef); - - actual_atts++; - } - - if (actual_atts) - appendPQExpBuffer(q, "\n)"); - else if (!(tbinfo->reloftype && !binary_upgrade)) - { /* - * We must have a parenthesized attribute list, even though empty, - * when not using the OF TYPE syntax. + * Add non-inherited CHECK constraints, if any. 
*/ - appendPQExpBuffer(q, " (\n)"); - } - - if (numParents > 0 && !binary_upgrade) - { - appendPQExpBuffer(q, "\nINHERITS ("); - for (k = 0; k < numParents; k++) + for (j = 0; j < tbinfo->ncheck; j++) { - TableInfo *parentRel = parents[k]; + ConstraintInfo *constr = &(tbinfo->checkexprs[j]); - if (k > 0) - appendPQExpBuffer(q, ", "); - if (parentRel->dobj.namespace != tbinfo->dobj.namespace) - appendPQExpBuffer(q, "%s.", - fmtId(parentRel->dobj.namespace->dobj.name)); - appendPQExpBuffer(q, "%s", - fmtId(parentRel->dobj.name)); + if (constr->separate || !constr->conislocal) + continue; + + if (actual_atts == 0) + appendPQExpBuffer(q, " (\n "); + else + appendPQExpBuffer(q, ",\n "); + + appendPQExpBuffer(q, "CONSTRAINT %s ", + fmtId(constr->dobj.name)); + appendPQExpBuffer(q, "%s", constr->condef); + + actual_atts++; } - appendPQExpBuffer(q, ")"); - } - if (tbinfo->relkind == RELKIND_FOREIGN_TABLE) - appendPQExpBuffer(q, "\nSERVER %s", fmtId(srvname)); + if (actual_atts) + appendPQExpBuffer(q, "\n)"); + else if (!(tbinfo->reloftype && !binary_upgrade)) + { + /* + * We must have a parenthesized attribute list, even though + * empty, when not using the OF TYPE syntax. 
+ */ + appendPQExpBuffer(q, " (\n)"); + } + + if (numParents > 0 && !binary_upgrade) + { + appendPQExpBuffer(q, "\nINHERITS ("); + for (k = 0; k < numParents; k++) + { + TableInfo *parentRel = parents[k]; + + if (k > 0) + appendPQExpBuffer(q, ", "); + if (parentRel->dobj.namespace != tbinfo->dobj.namespace) + appendPQExpBuffer(q, "%s.", + fmtId(parentRel->dobj.namespace->dobj.name)); + appendPQExpBuffer(q, "%s", + fmtId(parentRel->dobj.name)); + } + appendPQExpBuffer(q, ")"); + } + + if (tbinfo->relkind == RELKIND_FOREIGN_TABLE) + appendPQExpBuffer(q, "\nSERVER %s", fmtId(srvname)); } if ((tbinfo->reloptions && strlen(tbinfo->reloptions) > 0) || @@ -13853,8 +13982,8 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) /* * If the sequence is owned by a table column, emit the ALTER for it as a - * separate TOC entry immediately following the sequence's own entry. - * It's OK to do this rather than using full sorting logic, because the + * separate TOC entry immediately following the sequence's own entry. It's + * OK to do this rather than using full sorting logic, because the * dependency that tells us it's owned will have forced the table to be * created first. We can't just include the ALTER in the TOC entry * because it will fail if we haven't reassigned the sequence owner to @@ -14763,7 +14892,7 @@ addBoundaryDependencies(DumpableObject **dobjs, int numObjs, * chains linking through objects that don't appear explicitly in the dump. * For example, a view will depend on its _RETURN rule while the _RETURN rule * will depend on other objects --- but the rule will not appear as a separate - * object in the dump. We need to adjust the view's dependencies to include + * object in the dump. We need to adjust the view's dependencies to include * whatever the rule depends on that is included in the dump. 
* * Just to make things more complicated, there are also "special" dependencies @@ -14851,7 +14980,7 @@ findDumpableDependencies(ArchiveHandle *AH, DumpableObject *dobj, { *allocDeps *= 2; *dependencies = (DumpId *) pg_realloc(*dependencies, - *allocDeps * sizeof(DumpId)); + *allocDeps * sizeof(DumpId)); } (*dependencies)[*nDeps] = depid; (*nDeps)++; @@ -14859,9 +14988,9 @@ findDumpableDependencies(ArchiveHandle *AH, DumpableObject *dobj, else { /* - * Object will not be dumped, so recursively consider its deps. - * We rely on the assumption that sortDumpableObjects already - * broke any dependency loops, else we might recurse infinitely. + * Object will not be dumped, so recursively consider its deps. We + * rely on the assumption that sortDumpableObjects already broke + * any dependency loops, else we might recurse infinitely. */ DumpableObject *otherdobj = findObjectByDumpId(depid); @@ -14884,22 +15013,21 @@ findDumpableDependencies(ArchiveHandle *AH, DumpableObject *dobj, * * Whenever the selected schema is not pg_catalog, be careful to qualify * references to system catalogs and types in our emitted commands! + * + * This function is called only from selectSourceSchemaOnAH and + * selectSourceSchema. 
*/ static void selectSourceSchema(Archive *fout, const char *schemaName) { - static char *curSchemaName = NULL; PQExpBuffer query; + /* This is checked by the callers already */ + Assert(schemaName != NULL && *schemaName != '\0'); + /* Not relevant if fetching from pre-7.3 DB */ if (fout->remoteVersion < 70300) return; - /* Ignore null schema names */ - if (schemaName == NULL || *schemaName == '\0') - return; - /* Optimize away repeated selection of same schema */ - if (curSchemaName && strcmp(curSchemaName, schemaName) == 0) - return; query = createPQExpBuffer(); appendPQExpBuffer(query, "SET search_path = %s", @@ -14910,9 +15038,6 @@ selectSourceSchema(Archive *fout, const char *schemaName) ExecuteSqlStatement(fout, query->data); destroyPQExpBuffer(query); - if (curSchemaName) - free(curSchemaName); - curSchemaName = pg_strdup(schemaName); } /* @@ -15049,34 +15174,6 @@ myFormatType(const char *typname, int32 typmod) return result; } -/* - * fmtQualifiedId - convert a qualified name to the proper format for - * the source database. - * - * Like fmtId, use the result before calling again. - */ -static const char * -fmtQualifiedId(Archive *fout, const char *schema, const char *id) -{ - static PQExpBuffer id_return = NULL; - - if (id_return) /* first time through? */ - resetPQExpBuffer(id_return); - else - id_return = createPQExpBuffer(); - - /* Suppress schema name if fetching from pre-7.3 DB */ - if (fout->remoteVersion >= 70300 && schema && *schema) - { - appendPQExpBuffer(id_return, "%s.", - fmtId(schema)); - } - appendPQExpBuffer(id_return, "%s", - fmtId(id)); - - return id_return->data; -} - /* * Return a column list clause for the given relation. * @@ -15084,37 +15181,31 @@ fmtQualifiedId(Archive *fout, const char *schema, const char *id) * "", not an invalid "()" column list. 
*/ static const char * -fmtCopyColumnList(const TableInfo *ti) +fmtCopyColumnList(const TableInfo *ti, PQExpBuffer buffer) { - static PQExpBuffer q = NULL; int numatts = ti->numatts; char **attnames = ti->attnames; bool *attisdropped = ti->attisdropped; bool needComma; int i; - if (q) /* first time through? */ - resetPQExpBuffer(q); - else - q = createPQExpBuffer(); - - appendPQExpBuffer(q, "("); + appendPQExpBuffer(buffer, "("); needComma = false; for (i = 0; i < numatts; i++) { if (attisdropped[i]) continue; if (needComma) - appendPQExpBuffer(q, ", "); - appendPQExpBuffer(q, "%s", fmtId(attnames[i])); + appendPQExpBuffer(buffer, ", "); + appendPQExpBuffer(buffer, "%s", fmtId(attnames[i])); needComma = true; } if (!needComma) return ""; /* no undropped columns */ - appendPQExpBuffer(q, ")"); - return q->data; + appendPQExpBuffer(buffer, ")"); + return buffer->data; } /* diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 01ec27b632..7970a359bd 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -252,6 +252,7 @@ typedef struct _tableInfo /* these two are set only if table is a sequence owned by a column: */ Oid owning_tab; /* OID of table owning sequence */ int owning_col; /* attr # of column owning sequence */ + int relpages; bool interesting; /* true if need to collect more data */ @@ -315,6 +316,7 @@ typedef struct _indxInfo bool indisclustered; /* if there is an associated constraint object, its dumpId: */ DumpId indexconstraint; + int relpages; /* relpages of the underlying table */ } IndxInfo; typedef struct _ruleInfo @@ -532,6 +534,7 @@ extern void sortDumpableObjects(DumpableObject **objs, int numObjs, DumpId preBoundaryId, DumpId postBoundaryId); extern void sortDumpableObjectsByTypeName(DumpableObject **objs, int numObjs); extern void sortDumpableObjectsByTypeOid(DumpableObject **objs, int numObjs); +extern void sortDataAndIndexObjectsBySize(DumpableObject **objs, int numObjs); /* * version specific routines diff 
--git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c
index 2c3d850f3d..8a6d36329d 100644
--- a/src/bin/pg_dump/pg_dump_sort.c
+++ b/src/bin/pg_dump/pg_dump_sort.c
@@ -143,6 +143,110 @@ static void repairDependencyLoop(DumpableObject **loop,
 static void describeDumpableObject(DumpableObject *obj,
 					   char *buf, int bufsize);
 
+static int	DOSizeCompare(const void *p1, const void *p2);
+
+/*
+ * Return the index of the first element of objs[] whose objType equals
+ * "type", or -1 if there is no such element.
+ */
+static int
+findFirstEqualType(DumpableObjectType type, DumpableObject **objs, int numObjs)
+{
+	int			i;
+
+	for (i = 0; i < numObjs; i++)
+		if (objs[i]->objType == type)
+			return i;
+	return -1;
+}
+
+/*
+ * Starting at index "start", return the index of the first element of objs[]
+ * whose objType differs from "type".  If every remaining element has that
+ * type, return numObjs, so that the result is always usable as the exclusive
+ * end of the run (run length = result - start).
+ */
+static int
+findFirstDifferentType(DumpableObjectType type, DumpableObject **objs, int numObjs, int start)
+{
+	int			i;
+
+	for (i = start; i < numObjs; i++)
+		if (objs[i]->objType != type)
+			return i;
+	return numObjs;
+}
+
+/*
+ * When we do a parallel dump, we want to start with the largest items first.
+ *
+ * Say we have the objects in this order:
+ * ....DDDDD....III....
+ *
+ * with D = Table data, I = Index, . = other object
+ *
+ * This sorting function now takes each of the D or I blocks and sorts them
+ * according to their size.
+ */
+void
+sortDataAndIndexObjectsBySize(DumpableObject **objs, int numObjs)
+{
+	int			startIdx,
+				endIdx;
+	void	   *startPtr;
+
+	if (numObjs <= 1)
+		return;
+
+	startIdx = findFirstEqualType(DO_TABLE_DATA, objs, numObjs);
+	if (startIdx >= 0)
+	{
+		endIdx = findFirstDifferentType(DO_TABLE_DATA, objs, numObjs, startIdx);
+		startPtr = objs + startIdx;
+		qsort(startPtr, endIdx - startIdx, sizeof(DumpableObject *),
+			  DOSizeCompare);
+	}
+
+	startIdx = findFirstEqualType(DO_INDEX, objs, numObjs);
+	if (startIdx >= 0)
+	{
+		endIdx = findFirstDifferentType(DO_INDEX, objs, numObjs, startIdx);
+		startPtr = objs + startIdx;
+		qsort(startPtr, endIdx - startIdx, sizeof(DumpableObject *),
+			  DOSizeCompare);
+	}
+}
+
+/*
+ * qsort comparator: order DumpableObject pointers by descending relpages.
+ * Objects that are neither table data nor indexes count as size 0.
+ */
+static int
+DOSizeCompare(const void *p1, const void *p2)
+{
+	DumpableObject *obj1 = *(DumpableObject **) p1;
+	DumpableObject *obj2 = *(DumpableObject **) p2;
+	int			obj1_size = 0;
+	int			obj2_size = 0;
+
+	if (obj1->objType == DO_TABLE_DATA)
+		obj1_size = ((TableDataInfo *) obj1)->tdtable->relpages;
+	if (obj1->objType == DO_INDEX)
+		obj1_size = ((IndxInfo *) obj1)->relpages;
+
+	if (obj2->objType == DO_TABLE_DATA)
+		obj2_size = ((TableDataInfo *) obj2)->tdtable->relpages;
+	if (obj2->objType == DO_INDEX)
+		obj2_size = ((IndxInfo *) obj2)->relpages;
+
+	/* we want to see the biggest item go first */
+	if (obj1_size > obj2_size)
+		return -1;
+	if (obj2_size > obj1_size)
+		return 1;
+
+	return 0;
+}
 
 /*
  * Sort the given objects into a type/name-based ordering
@@ -735,7 +839,7 @@ repairViewRuleMultiLoop(DumpableObject *viewobj,
 	/* remove view's dependency on rule */
 	removeObjectDependency(viewobj, ruleobj->dumpId);
 	/* pretend view is a plain table and dump it that way */
-	viewinfo->relkind = 'r';		/* RELKIND_RELATION */
+	viewinfo->relkind = 'r';	/* RELKIND_RELATION */
 	/* mark rule as needing its own dump */
 	ruleinfo->separate = true;
 	/* move any reloptions from view to rule */
diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c
index
5488021bca..b26aa99f03 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -657,8 +657,8 @@ dumpRoles(PGconn *conn) "rolcreaterole, rolcreatedb, " "rolcanlogin, rolconnlimit, rolpassword, " "rolvaliduntil, rolreplication, " - "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, " - "rolname = current_user AS is_current_user " + "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, " + "rolname = current_user AS is_current_user " "FROM pg_authid " "ORDER BY 2"); else if (server_version >= 80200) @@ -667,8 +667,8 @@ dumpRoles(PGconn *conn) "rolcreaterole, rolcreatedb, " "rolcanlogin, rolconnlimit, rolpassword, " "rolvaliduntil, false as rolreplication, " - "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, " - "rolname = current_user AS is_current_user " + "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, " + "rolname = current_user AS is_current_user " "FROM pg_authid " "ORDER BY 2"); else if (server_version >= 80100) @@ -678,7 +678,7 @@ dumpRoles(PGconn *conn) "rolcanlogin, rolconnlimit, rolpassword, " "rolvaliduntil, false as rolreplication, " "null as rolcomment, " - "rolname = current_user AS is_current_user " + "rolname = current_user AS is_current_user " "FROM pg_authid " "ORDER BY 2"); else @@ -694,7 +694,7 @@ dumpRoles(PGconn *conn) "valuntil as rolvaliduntil, " "false as rolreplication, " "null as rolcomment, " - "rolname = current_user AS is_current_user " + "rolname = current_user AS is_current_user " "FROM pg_shadow " "UNION ALL " "SELECT 0, groname as rolname, " @@ -755,7 +755,7 @@ dumpRoles(PGconn *conn) * will acquire the right properties even if it already exists (ie, it * won't hurt for the CREATE to fail). This is particularly important * for the role we are connected as, since even with --clean we will - * have failed to drop it. binary_upgrade cannot generate any errors, + * have failed to drop it. 
binary_upgrade cannot generate any errors, * so we assume the current role is already created. */ if (!binary_upgrade || @@ -1857,8 +1857,8 @@ connectDatabase(const char *dbname, const char *connection_string, } /* - * Ok, connected successfully. Remember the options used, in the form of - * a connection string. + * Ok, connected successfully. Remember the options used, in the form of a + * connection string. */ connstr = constructConnStr(keywords, values); @@ -2039,7 +2039,7 @@ static void doConnStrQuoting(PQExpBuffer buf, const char *str) { const char *s; - bool needquotes; + bool needquotes; /* * If the string consists entirely of plain ASCII characters, no need to diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index 5dbe98f714..0cc17fd416 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -71,6 +71,7 @@ main(int argc, char **argv) RestoreOptions *opts; int c; int exit_code; + int numWorkers = 1; Archive *AH; char *inputFileSpec; static int disable_triggers = 0; @@ -182,7 +183,7 @@ main(int argc, char **argv) break; case 'j': /* number of restore jobs */ - opts->number_of_jobs = atoi(optarg); + numWorkers = atoi(optarg); break; case 'l': /* Dump the TOC summary */ @@ -313,7 +314,7 @@ main(int argc, char **argv) } /* Can't do single-txn mode with multiple connections */ - if (opts->single_txn && opts->number_of_jobs > 1) + if (opts->single_txn && numWorkers > 1) { fprintf(stderr, _("%s: cannot specify both --single-transaction and multiple jobs\n"), progname); @@ -372,6 +373,18 @@ main(int argc, char **argv) if (opts->tocFile) SortTocFromFile(AH, opts); + /* See comments in pg_dump.c */ +#ifdef WIN32 + if (numWorkers > MAXIMUM_WAIT_OBJECTS) + { + fprintf(stderr, _("%s: maximum number of parallel jobs is %d\n"), + progname, MAXIMUM_WAIT_OBJECTS); + exit(1); + } +#endif + + AH->numWorkers = numWorkers; + if (opts->tocSummary) PrintTOCSummary(AH, opts); else diff --git a/src/tools/msvc/Mkvcbuild.pm 
b/src/tools/msvc/Mkvcbuild.pm index 5bf0e5949b..a4bd2b6210 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -395,6 +395,7 @@ sub mkvcbuild $psql->AddIncludeDir('src\bin\pg_dump'); $psql->AddIncludeDir('src\backend'); $psql->AddFile('src\bin\psql\psqlscan.l'); + $psql->AddLibrary('ws2_32.lib'); my $pgdump = AddSimpleFrontend('pg_dump', 1); $pgdump->AddIncludeDir('src\backend'); @@ -403,6 +404,7 @@ sub mkvcbuild $pgdump->AddFile('src\bin\pg_dump\pg_dump_sort.c'); $pgdump->AddFile('src\bin\pg_dump\keywords.c'); $pgdump->AddFile('src\backend\parser\kwlookup.c'); + $pgdump->AddLibrary('ws2_32.lib'); my $pgdumpall = AddSimpleFrontend('pg_dump', 1); @@ -419,6 +421,7 @@ sub mkvcbuild $pgdumpall->AddFile('src\bin\pg_dump\dumputils.c'); $pgdumpall->AddFile('src\bin\pg_dump\keywords.c'); $pgdumpall->AddFile('src\backend\parser\kwlookup.c'); + $pgdumpall->AddLibrary('ws2_32.lib'); my $pgrestore = AddSimpleFrontend('pg_dump', 1); $pgrestore->{name} = 'pg_restore'; @@ -426,6 +429,7 @@ sub mkvcbuild $pgrestore->AddFile('src\bin\pg_dump\pg_restore.c'); $pgrestore->AddFile('src\bin\pg_dump\keywords.c'); $pgrestore->AddFile('src\backend\parser\kwlookup.c'); + $pgrestore->AddLibrary('ws2_32.lib'); my $zic = $solution->AddProject('zic', 'exe', 'utils'); $zic->AddFiles('src\timezone', 'zic.c', 'ialloc.c', 'scheck.c', @@ -572,6 +576,7 @@ sub mkvcbuild $proj->AddIncludeDir('src\bin\psql'); $proj->AddReference($libpq, $libpgport, $libpgcommon); $proj->AddResourceFile('src\bin\scripts', 'PostgreSQL Utility'); + $proj->AddLibrary('ws2_32.lib'); } # Regression DLL and EXE