diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index 735b79484c..277639f6e9 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -662,6 +662,7 @@ fileBeginForeignScan(ForeignScanState *node, int eflags) node->ss.ss_currentRelation, filename, is_program, + NULL, NIL, options); @@ -737,6 +738,7 @@ fileReScanForeignScan(ForeignScanState *node) node->ss.ss_currentRelation, festate->filename, festate->is_program, + NULL, NIL, festate->options); } @@ -1100,7 +1102,8 @@ file_acquire_sample_rows(Relation onerel, int elevel, /* * Create CopyState from FDW options. */ - cstate = BeginCopyFrom(NULL, onerel, filename, is_program, NIL, options); + cstate = BeginCopyFrom(NULL, onerel, filename, is_program, NULL, NIL, + options); /* * Use per-tuple memory context to prevent leak of memory used to read diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index df0435c3f0..228ec78031 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -300,6 +300,11 @@ logical replication subscriptions + + pg_subscription_rel + relation state for subscriptions + + pg_tablespace tablespaces within this database cluster @@ -6418,6 +6423,79 @@ + + <structname>pg_subscription_rel</structname> + + + pg_subscription_rel + + + + The catalog pg_subscription_rel contains the + state for each replicated relation in each subscription. This is a + many-to-many mapping. + + + + This catalog only contains tables known to the subscription after running + either CREATE SUBSCRIPTION or + ALTER SUBSCRIPTION ... REFRESH. + + + + <structname>pg_subscription_rel</structname> Columns + + + + + Name + Type + References + Description + + + + + + srsubid + oid + pg_subscription.oid + Reference to subscription + + + + srrelid + oid + pg_class.oid + Reference to relation + + + + srsubstate + char + + + State code: + i = initialize, + d = data is being copied, + s = synchronized, + r = ready (normal replication) + + + + + srsublsn + pg_lsn + + + End LSN for s and r states. + + + + +
+
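A minimal sketch of reading this catalog, assuming only the columns shown above plus the existing pg_subscription catalog (the regclass cast and join are standard facilities, not part of this patch):

    -- Sketch: per-table synchronization state for every subscription.
    SELECT s.subname,
           sr.srrelid::regclass AS tablename,
           sr.srsubstate,                -- i / d / s / r as described above
           sr.srsublsn
      FROM pg_subscription_rel sr
      JOIN pg_subscription s ON s.oid = sr.srsubid
     ORDER BY s.subname, tablename;

Tables in state r are being replicated normally; the other states belong to the initial synchronization described in the logical replication chapter.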
+ <structname>pg_tablespace</structname>
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index b379b67b30..2de3540def 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml
@@ -3449,6 +3449,31 @@ ANY num_sync ( + max_sync_workers_per_subscription (integer) + + max_sync_workers_per_subscription configuration parameter + + + + + Maximum number of synchronization workers per subscription. This + parameter controls the amount of parallelism of the initial data copy + during the subscription initialization or when new tables are added. + + + Currently, there can be only one synchronization worker per table. + + + The synchronization workers are taken from the pool defined by + max_logical_replication_workers. + + + The default value is 2. + + + +
diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml index 44cd78563d..48db9cd08b 100644 --- a/doc/src/sgml/logical-replication.sgml +++ b/doc/src/sgml/logical-replication.sgml
@@ -24,9 +24,11 @@ - Logical replication sends changes on the publisher to the subscriber as - they occur in real-time. The subscriber applies the data in the same order - as the publisher so that transactional consistency is guaranteed for + Logical replication of a table typically starts with taking a snapshot + of the data on the publisher database and copying that to the subscriber. + Once that is done, the changes on the publisher are sent to the subscriber + as they occur in real-time. The subscriber applies the data in the same + order as the publisher so that transactional consistency is guaranteed for publications within a single subscription. This method of data replication is sometimes referred to as transactional replication.
@@ -159,7 +161,9 @@ Each subscription will receive changes via one replication slot (see - ). + ). Additional temporary + replication slots may be required for the initial data synchronization + of pre-existing table data.
@@ -264,9 +268,25 @@ to replica, which produces the usual effects on triggers and constraints. + + + Initial Snapshot + + The initial data in existing subscribed tables are snapshotted and + copied in a parallel instance of a special kind of apply process. + This process will create its own temporary replication slot and + copy the existing data. Once existing data is copied, the worker + enters synchronization mode, which ensures that the table is brought + up to a synchronized state with the main apply process by streaming + any changes that happened during the initial data copy using standard + logical replication. Once the synchronization is done, the control + of the replication of the table is given back to the main apply + process where the replication continues as normal. + + - + Monitoring
@@ -287,7 +307,9 @@ Normally, there is a single apply process running for an enabled subscription. A disabled subscription or a crashed subscription will have - zero rows in this view. + zero rows in this view. If the initial data synchronization of any + table is in progress, there will be additional workers for the tables + being synchronized.
@@ -337,10 +359,11 @@ On the publisher side, wal_level must be set to logical, and max_replication_slots - must be set to at least the number of subscriptions expected to connect. - And max_wal_senders should be set to at least the same - as max_replication_slots plus the number of physical replicas - that are connected at the same time.
+ must be set to at least the number of subscriptions expected to connect, + plus some reserve for table synchronization. And + max_wal_senders should be set to at least the same as + max_replication_slots plus the number of physical + replicas that are connected at the same time. @@ -348,9 +371,9 @@ to be set. In this case it should be set to at least the number of subscriptions that will be added to the subscriber. max_logical_replication_workers must be set to at - least the number of subscriptions. Additionally the - max_worker_processes may need to be adjusted to - accommodate for replication workers, at least + least the number of subscriptions, again plus some reserve for the table + synchronization. Additionally the max_worker_processes + may need to be adjusted to accommodate for replication workers, at least (max_logical_replication_workers + 1). Note that some extensions and parallel queries also take worker slots from max_worker_processes. @@ -393,8 +416,10 @@ CREATE SUBSCRIPTION mysub CONNECTION 'dbname=foo host=bar user=repuser' PUBLICAT - The above will start the replication process of changes to - users and departments tables. + The above will start the replication process, which synchronizes the + initial table contents of the tables users and + departments and then starts replicating + incremental changes to those tables. diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index dcb2d3303c..eb6f486677 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1863,6 +1863,12 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i integer Process ID of the subscription worker process
+ + relid + Oid + OID of the relation that the worker is synchronizing; null for the + main apply worker + received_lsn pg_lsn
@@ -1899,7 +1905,8 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i The pg_stat_subscription view will contain one row per subscription for main worker (with null PID if the worker is - not running). + not running), and additional rows for workers handling the initial data + copy of the subscribed tables.
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index 244e381de9..48ca414031 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml
@@ -1487,7 +1487,7 @@ The commands accepted in walsender mode are: - CREATE_REPLICATION_SLOT slot_name [ TEMPORARY ] { PHYSICAL [ RESERVE_WAL ] | LOGICAL output_plugin [ EXPORT_SNAPSHOT | NOEXPORT_SNAPSHOT ] } + CREATE_REPLICATION_SLOT slot_name [ TEMPORARY ] { PHYSICAL [ RESERVE_WAL ] | LOGICAL output_plugin [ EXPORT_SNAPSHOT | NOEXPORT_SNAPSHOT | USE_SNAPSHOT ] } CREATE_REPLICATION_SLOT
@@ -1542,12 +1542,17 @@ The commands accepted in walsender mode are: EXPORT_SNAPSHOT NOEXPORT_SNAPSHOT + USE_SNAPSHOT Decides what to do with the snapshot created during logical slot initialization. EXPORT_SNAPSHOT, which is the default, will export the snapshot for use in other sessions. This option can't - be used inside a transaction. NOEXPORT_SNAPSHOT will + be used inside a transaction. USE_SNAPSHOT will use the + snapshot for the current transaction executing the command. This + option must be used in a transaction, and + CREATE_REPLICATION_SLOT must be the first command + run in that transaction. Finally, NOEXPORT_SNAPSHOT will just use the snapshot for logical decoding as normal but won't do anything else with it.
diff --git a/doc/src/sgml/ref/alter_subscription.sgml b/doc/src/sgml/ref/alter_subscription.sgml index 5e18e2ff6c..6f94247b92 100644 --- a/doc/src/sgml/ref/alter_subscription.sgml +++ b/doc/src/sgml/ref/alter_subscription.sgml
@@ -21,15 +21,21 @@ PostgreSQL documentation -ALTER SUBSCRIPTION name WITH ( option [, ... ] ) ] +ALTER SUBSCRIPTION name WITH ( suboption [, ... ] ) ] -where option can be: +where suboption can be: - SLOT NAME = slot_name + SLOT NAME = slot_name + +ALTER SUBSCRIPTION name SET PUBLICATION publication_name [, ...] { REFRESH WITH ( puboption [, ... ] ) | NOREFRESH } +ALTER SUBSCRIPTION name REFRESH PUBLICATION WITH ( puboption [, ... ] ) + +where puboption can be: + + COPY DATA | NOCOPY DATA ALTER SUBSCRIPTION name OWNER TO { new_owner | CURRENT_USER | SESSION_USER } ALTER SUBSCRIPTION name CONNECTION 'conninfo' -ALTER SUBSCRIPTION name SET PUBLICATION publication_name [, ...] ALTER SUBSCRIPTION name ENABLE ALTER SUBSCRIPTION name DISABLE
@@ -65,7 +71,6 @@ ALTER SUBSCRIPTION name DISABLE CONNECTION 'conninfo' - SET PUBLICATION publication_name SLOT NAME = slot_name
@@ -76,6 +81,40 @@ ALTER SUBSCRIPTION name DISABLE + + SET PUBLICATION publication_name + + + Changes list of subscribed publications. See + for more information. + + + When REFRESH is specified, this command will also + act like REFRESH PUBLICATION. When + NOREFRESH is specified, the command will not try to + refresh table information. + + + + + + REFRESH PUBLICATION + + + Fetch missing table information from publisher. This will start + replication of tables that were added to the subscribed-to publications + since the last invocation of REFRESH PUBLICATION or + since CREATE SUBSCRIPTION.
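A minimal usage sketch following the synopsis above; the subscription name mysub and publication name mypub are hypothetical:

    -- Replace the publication list and fetch the newly subscribed tables,
    -- copying their pre-existing data:
    ALTER SUBSCRIPTION mysub SET PUBLICATION mypub REFRESH WITH (COPY DATA);

    -- Re-fetch table information only, without copying pre-existing data:
    ALTER SUBSCRIPTION mysub REFRESH PUBLICATION WITH (NOCOPY DATA);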
+ + + The COPY DATA and NOCOPY DATA + options specify if the existing data in the publications that are being + subscribed to should be copied. COPY DATA is the + default. + + + + ENABLE @@ -95,6 +134,7 @@ ALTER SUBSCRIPTION name DISABLE + diff --git a/doc/src/sgml/ref/create_subscription.sgml b/doc/src/sgml/ref/create_subscription.sgml index e200076700..8f3c30b9b0 100644 --- a/doc/src/sgml/ref/create_subscription.sgml +++ b/doc/src/sgml/ref/create_subscription.sgml @@ -31,6 +31,8 @@ CREATE SUBSCRIPTION subscription_nameslot_name + | COPY DATA | NOCOPY DATA + | NOCONNECT @@ -132,6 +134,42 @@ CREATE SUBSCRIPTION subscription_name + + + COPY DATA + NOCOPY DATA + + + Specifies if the existing data in the publications that are being + subscribed to should be copied once the replication starts. + COPY DATA is the default. + + + + + + NOCONNECT + + + Instructs CREATE SUBSCRIPTION to skip the initial + connection to the provider. This will change default values of other + options to DISABLED, + NOCREATE SLOT, and NOCOPY DATA. + + + It's not allowed to combine NOCONNECT and + ENABLED, CREATE SLOT, or + COPY DATA. + + + Since no connection is made when this option is specified, the tables + are not subscribed, so after you enable the subscription nothing will + be replicated. It is required to run + ALTER SUBSCRIPTION ... REFRESH PUBLICATION in order for + tables to be subscribed. + + + diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index bb32fb12e0..4f19b89232 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -799,22 +799,23 @@ PostgreSQL documentation - + - When dumping logical replication subscriptions, - generate CREATE SUBSCRIPTION commands that do not - create the remote replication slot. That way, the dump can be - restored without requiring network access to the remote servers. + Do not dump security labels. - + - Do not dump security labels. + When dumping logical replication subscriptions, + generate CREATE SUBSCRIPTION commands that do not + make remote connections for creating replication slot or initial table + copy. That way, the dump can be restored without requiring network + access to the remote servers. diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 31368585d2..159cab5c18 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -44,6 +44,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \ pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \ pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \ + pg_subscription_rel.h toasting.h indexing.h \ toasting.h indexing.h \ ) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 41c0056556..d49dcdc015 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -52,6 +52,7 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_partitioned_table.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_subscription_rel.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" #include "catalog/pg_type_fn.h" @@ -1831,6 +1832,11 @@ heap_drop_with_catalog(Oid relid) */ relation_close(rel, NoLock); + /* + * Remove any associated relation synchronization states. 
+ */ + RemoveSubscriptionRel(InvalidOid, relid); + /* * Forget any ON COMMIT action for the rel */ diff --git a/src/backend/catalog/pg_publication.c b/src/backend/catalog/pg_publication.c index 0f784690ce..9330e2380a 100644 --- a/src/backend/catalog/pg_publication.c +++ b/src/backend/catalog/pg_publication.c @@ -221,8 +221,8 @@ GetPublicationRelations(Oid pubid) BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(pubid)); - scan = systable_beginscan(pubrelsrel, PublicationRelMapIndexId, true, - NULL, 1, &scankey); + scan = systable_beginscan(pubrelsrel, PublicationRelPrrelidPrpubidIndexId, + true, NULL, 1, &scankey); result = NIL; while (HeapTupleIsValid(tup = systable_getnext(scan))) diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index 20fdd6a54f..e420ec14d2 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -19,15 +19,20 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/indexing.h" #include "catalog/pg_type.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "nodes/makefuncs.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" #include "utils/syscache.h" @@ -206,3 +211,280 @@ textarray_to_stringlist(ArrayType *textarray) return res; } + +/* + * Set the state of a subscription table. + */ +Oid +SetSubscriptionRelState(Oid subid, Oid relid, char state, + XLogRecPtr sublsn) +{ + Relation rel; + HeapTuple tup; + Oid subrelid; + bool nulls[Natts_pg_subscription_rel]; + Datum values[Natts_pg_subscription_rel]; + + /* Prevent concurrent changes. */ + rel = heap_open(SubscriptionRelRelationId, ShareRowExclusiveLock); + + /* Try finding existing mapping. */ + tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, + ObjectIdGetDatum(relid), + ObjectIdGetDatum(subid)); + + /* + * If the record for given table does not exist yet create new + * record, otherwise update the existing one. + */ + if (!HeapTupleIsValid(tup)) + { + /* Form the tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_subscription_rel_srsubid - 1] = ObjectIdGetDatum(subid); + values[Anum_pg_subscription_rel_srrelid - 1] = ObjectIdGetDatum(relid); + values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); + if (sublsn != InvalidXLogRecPtr) + values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn); + else + nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; + + tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); + + /* Insert tuple into catalog. */ + subrelid = CatalogTupleInsert(rel, tup); + + heap_freetuple(tup); + } + else + { + bool replaces[Natts_pg_subscription_rel]; + + /* Update the tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replaces, false, sizeof(replaces)); + + replaces[Anum_pg_subscription_rel_srsubstate - 1] = true; + values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); + + replaces[Anum_pg_subscription_rel_srsublsn - 1] = true; + if (sublsn != InvalidXLogRecPtr) + values[Anum_pg_subscription_rel_srsublsn - 1] = LSNGetDatum(sublsn); + else + nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; + + tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, + replaces); + + /* Update the catalog. 
*/ + CatalogTupleUpdate(rel, &tup->t_self, tup); + + subrelid = HeapTupleGetOid(tup); + } + + /* Cleanup. */ + heap_close(rel, NoLock); + + return subrelid; +} + +/* + * Get state of subscription table. + * + * Returns SUBREL_STATE_UNKNOWN when not found and missing_ok is true. + */ +char +GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn, + bool missing_ok) +{ + Relation rel; + HeapTuple tup; + char substate; + bool isnull; + Datum d; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + /* Try finding the mapping. */ + tup = SearchSysCache2(SUBSCRIPTIONRELMAP, + ObjectIdGetDatum(relid), + ObjectIdGetDatum(subid)); + + if (!HeapTupleIsValid(tup)) + { + if (missing_ok) + { + heap_close(rel, AccessShareLock); + *sublsn = InvalidXLogRecPtr; + return SUBREL_STATE_UNKNOWN; + } + + elog(ERROR, "subscription table %u in subscription %u does not exist", + relid, subid); + } + + /* Get the state. */ + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, + Anum_pg_subscription_rel_srsubstate, &isnull); + Assert(!isnull); + substate = DatumGetChar(d); + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, + Anum_pg_subscription_rel_srsublsn, &isnull); + if (isnull) + *sublsn = InvalidXLogRecPtr; + else + *sublsn = DatumGetLSN(d); + + /* Cleanup */ + ReleaseSysCache(tup); + heap_close(rel, AccessShareLock); + + return substate; +} + +/* + * Drop subscription relation mapping. These can be for a particular + * subscription, or for a particular relation, or both. + */ +void +RemoveSubscriptionRel(Oid subid, Oid relid) +{ + Relation rel; + HeapScanDesc scan; + ScanKeyData skey[2]; + HeapTuple tup; + int nkeys = 0; + + /* Prevent concurrent changes (see SetSubscriptionRelState()). */ + rel = heap_open(SubscriptionRelRelationId, ShareRowExclusiveLock); + + if (OidIsValid(subid)) + { + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(subid)); + } + + if (OidIsValid(relid)) + { + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srrelid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(relid)); + } + + /* Do the search and delete what we found. */ + scan = heap_beginscan_catalog(rel, nkeys, skey); + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + simple_heap_delete(rel, &tup->t_self); + } + heap_endscan(scan); + + heap_close(rel, ShareRowExclusiveLock); +} + + +/* + * Get all relations for subscription. + * + * Returned list is palloced in current memory context. + */ +List * +GetSubscriptionRelations(Oid subid) +{ + List *res = NIL; + Relation rel; + HeapTuple tup; + int nkeys = 0; + ScanKeyData skey[2]; + SysScanDesc scan; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, nkeys, skey); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_subscription_rel subrel; + SubscriptionRelState *relstate; + + subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); + + relstate = (SubscriptionRelState *)palloc(sizeof(SubscriptionRelState)); + relstate->relid = subrel->srrelid; + relstate->state = subrel->srsubstate; + relstate->lsn = subrel->srsublsn; + + res = lappend(res, relstate); + } + + /* Cleanup */ + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return res; +} + +/* + * Get all relations for subscription that are not in a ready state. 
+ * + * Returned list is palloced in current memory context. + */ +List * +GetSubscriptionNotReadyRelations(Oid subid) +{ + List *res = NIL; + Relation rel; + HeapTuple tup; + int nkeys = 0; + ScanKeyData skey[2]; + SysScanDesc scan; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + + ScanKeyInit(&skey[nkeys++], + Anum_pg_subscription_rel_srsubstate, + BTEqualStrategyNumber, F_CHARNE, + CharGetDatum(SUBREL_STATE_READY)); + + scan = systable_beginscan(rel, InvalidOid, false, + NULL, nkeys, skey); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_subscription_rel subrel; + SubscriptionRelState *relstate; + + subrel = (Form_pg_subscription_rel) GETSTRUCT(tup); + + relstate = (SubscriptionRelState *)palloc(sizeof(SubscriptionRelState)); + relstate->relid = subrel->srrelid; + relstate->state = subrel->srsubstate; + relstate->lsn = subrel->srsublsn; + + res = lappend(res, relstate); + } + + /* Cleanup */ + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return res; +} diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index c2b0bedc1d..5723714fb9 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -733,6 +733,7 @@ CREATE VIEW pg_stat_subscription AS su.oid AS subid, su.subname, st.pid, + st.relid, st.received_lsn, st.last_msg_send_time, st.last_msg_receipt_time, diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index ba89b292d1..b0fd09f458 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -60,7 +60,8 @@ typedef enum CopyDest { COPY_FILE, /* to/from file (or a piped program) */ COPY_OLD_FE, /* to/from frontend (2.0 protocol) */ - COPY_NEW_FE /* to/from frontend (3.0 protocol) */ + COPY_NEW_FE, /* to/from frontend (3.0 protocol) */ + COPY_CALLBACK /* to/from callback function */ } CopyDest; /* @@ -109,6 +110,7 @@ typedef struct CopyStateData List *attnumlist; /* integer list of attnums to copy */ char *filename; /* filename, or NULL for STDIN/STDOUT */ bool is_program; /* is 'filename' a program to popen? */ + copy_data_source_cb data_source_cb; /* function for reading data*/ bool binary; /* binary format? */ bool oids; /* include OIDs? */ bool freeze; /* freeze rows on loading? */ @@ -299,7 +301,6 @@ static uint64 DoCopyTo(CopyState cstate); static uint64 CopyTo(CopyState cstate); static void CopyOneRowTo(CopyState cstate, Oid tupleOid, Datum *values, bool *nulls); -static uint64 CopyFrom(CopyState cstate); static void CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, int hi_options, ResultRelInfo *resultRelInfo, TupleTableSlot *myslot, @@ -529,6 +530,9 @@ CopySendEndOfRow(CopyState cstate) /* Dump the accumulated row as one CopyData message */ (void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len); break; + case COPY_CALLBACK: + Assert(false); /* Not yet supported. 
*/ + break; } resetStringInfo(fe_msgbuf); @@ -643,6 +647,9 @@ CopyGetData(CopyState cstate, void *databuf, int minread, int maxread) bytesread += avail; } break; + case COPY_CALLBACK: + bytesread = cstate->data_source_cb(databuf, minread, maxread); + break; } return bytesread; @@ -969,7 +976,7 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, PreventCommandIfParallelMode("COPY FROM"); cstate = BeginCopyFrom(pstate, rel, stmt->filename, stmt->is_program, - stmt->attlist, stmt->options); + NULL, stmt->attlist, stmt->options); cstate->range_table = range_table; *processed = CopyFrom(cstate); /* copy from file to database */ EndCopyFrom(cstate); @@ -2286,7 +2293,7 @@ limit_printout_length(const char *str) /* * Copy FROM file to relation. */ -static uint64 +uint64 CopyFrom(CopyState cstate) { HeapTuple tuple; @@ -2878,6 +2885,7 @@ BeginCopyFrom(ParseState *pstate, Relation rel, const char *filename, bool is_program, + copy_data_source_cb data_source_cb, List *attnamelist, List *options) { @@ -2992,7 +3000,12 @@ BeginCopyFrom(ParseState *pstate, cstate->num_defaults = num_defaults; cstate->is_program = is_program; - if (pipe) + if (data_source_cb) + { + cstate->copy_dest = COPY_CALLBACK; + cstate->data_source_cb = data_source_cb; + } + else if (pipe) { Assert(!is_program); /* the grammar does not allow this */ if (whereToSendOutput == DestRemote) diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 0198e6d75b..0784ca7951 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -20,27 +20,36 @@ #include "access/htup_details.h" #include "access/xact.h" +#include "catalog/dependency.h" #include "catalog/indexing.h" +#include "catalog/namespace.h" #include "catalog/objectaccess.h" #include "catalog/objectaddress.h" #include "catalog/pg_type.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/subscriptioncmds.h" +#include "nodes/makefuncs.h" + #include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/walreceiver.h" +#include "replication/walsender.h" #include "replication/worker_internal.h" #include "storage/lmgr.h" #include "utils/builtins.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" +static List *fetch_table_list(WalReceiverConn *wrconn, List *publications); + /* * Common option parsing function for CREATE and ALTER SUBSCRIPTION commands. * @@ -49,17 +58,17 @@ * accomodate that. 
*/ static void -parse_subscription_options(List *options, char **conninfo, - List **publications, bool *enabled_given, - bool *enabled, bool *create_slot, char **slot_name) +parse_subscription_options(List *options, bool *connect, bool *enabled_given, + bool *enabled, bool *create_slot, char **slot_name, + bool *copy_data) { ListCell *lc; + bool connect_given = false; bool create_slot_given = false; + bool copy_data_given = false; - if (conninfo) - *conninfo = NULL; - if (publications) - *publications = NIL; + if (connect) + *connect = true; if (enabled) { *enabled_given = false; @@ -69,29 +78,23 @@ parse_subscription_options(List *options, char **conninfo, *create_slot = true; if (slot_name) *slot_name = NULL; + if (copy_data) + *copy_data = true; /* Parse options */ foreach (lc, options) { DefElem *defel = (DefElem *) lfirst(lc); - if (strcmp(defel->defname, "conninfo") == 0 && conninfo) + if (strcmp(defel->defname, "noconnect") == 0 && connect) { - if (*conninfo) + if (connect_given) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options"))); - *conninfo = defGetString(defel); - } - else if (strcmp(defel->defname, "publication") == 0 && publications) - { - if (*publications) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting or redundant options"))); - - *publications = defGetStringList(defel); + connect_given = true; + *connect = !defGetBoolean(defel); } else if (strcmp(defel->defname, "enabled") == 0 && enabled) { @@ -142,9 +145,57 @@ parse_subscription_options(List *options, char **conninfo, *slot_name = defGetString(defel); } + else if (strcmp(defel->defname, "copy data") == 0 && copy_data) + { + if (copy_data_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + copy_data_given = true; + *copy_data = defGetBoolean(defel); + } + else if (strcmp(defel->defname, "nocopy data") == 0 && copy_data) + { + if (copy_data_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + copy_data_given = true; + *copy_data = !defGetBoolean(defel); + } else elog(ERROR, "unrecognized option: %s", defel->defname); } + + /* + * We've been explicitly asked to not connect, that requires some + * additional processing. + */ + if (connect && !*connect) + { + /* Check for incompatible options from the user. */ + if (*enabled_given && *enabled) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("noconnect and enabled are mutually exclusive options"))); + + if (create_slot_given && *create_slot) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("noconnect and create slot are mutually exclusive options"))); + + if (copy_data_given && *copy_data) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("noconnect and copy data are mutually exclusive options"))); + + /* Change the defaults of other options. */ + *enabled = false; + *create_slot = false; + *copy_data = false; + } } /* @@ -214,8 +265,10 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) Datum values[Natts_pg_subscription]; Oid owner = GetUserId(); HeapTuple tup; + bool connect; bool enabled_given; bool enabled; + bool copy_data; char *conninfo; char *slotname; char originname[NAMEDATALEN]; @@ -226,9 +279,8 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) * Parse and check options. * Connection and publication should not be specified here. 
*/ - parse_subscription_options(stmt->options, NULL, NULL, - &enabled_given, &enabled, - &create_slot, &slotname); + parse_subscription_options(stmt->options, &connect, &enabled_given, + &enabled, &create_slot, &slotname, ©_data); /* * Since creating a replication slot is not transactional, rolling back @@ -297,14 +349,17 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) replorigin_create(originname); /* - * If requested, create the replication slot on remote side for our - * newly created subscription. + * Connect to remote side to execute requested commands and fetch table + * info. */ - if (create_slot) + if (connect) { XLogRecPtr lsn; char *err; WalReceiverConn *wrconn; + List *tables; + ListCell *lc; + char table_state; /* Try to connect to the publisher. */ wrconn = walrcv_connect(conninfo, true, stmt->subname, &err); @@ -315,13 +370,43 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) PG_TRY(); { /* - * Create permanent slot for the subscription. We won't use the - * initial snapshot for anything, so no need to export it. + * If requested, create permanent slot for the subscription. + * We won't use the initial snapshot for anything, so no need + * to export it. */ - walrcv_create_slot(wrconn, slotname, false, false, &lsn); + if (create_slot) + { + walrcv_create_slot(wrconn, slotname, false, + CRS_NOEXPORT_SNAPSHOT, &lsn); + ereport(NOTICE, + (errmsg("created replication slot \"%s\" on publisher", + slotname))); + } + + /* + * Set sync state based on if we were asked to do data copy or + * not. + */ + table_state = copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY; + + /* + * Get the table list from publisher and build local table status + * info. + */ + tables = fetch_table_list(wrconn, publications); + foreach (lc, tables) + { + RangeVar *rv = (RangeVar *) lfirst(lc); + Oid relid; + + relid = RangeVarGetRelid(rv, AccessShareLock, true); + + SetSubscriptionRelState(subid, relid, table_state, + InvalidXLogRecPtr); + } + ereport(NOTICE, - (errmsg("created replication slot \"%s\" on publisher", - slotname))); + (errmsg("synchronized table states"))); } PG_CATCH(); { @@ -334,6 +419,11 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) /* And we are done with the remote side. */ walrcv_disconnect(wrconn); } + else + ereport(WARNING, + (errmsg("tables were not subscribed, you will have to run " + "ALTER SUBSCRIPTION ... REFRESH PUBLICATION to " + "subscribe the tables"))); heap_close(rel, RowExclusiveLock); @@ -346,6 +436,108 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) return myself; } +static void +AlterSubscription_refresh(Subscription *sub, bool copy_data) +{ + char *err; + List *pubrel_names; + List *subrel_states; + Oid *subrel_local_oids; + Oid *pubrel_local_oids; + ListCell *lc; + int off; + + /* Load the library providing us libpq calls. */ + load_file("libpqwalreceiver", false); + + /* Try to connect to the publisher. */ + wrconn = walrcv_connect(sub->conninfo, true, sub->name, &err); + if (!wrconn) + ereport(ERROR, + (errmsg("could not connect to the publisher: %s", err))); + + /* Get the table list from publisher. */ + pubrel_names = fetch_table_list(wrconn, sub->publications); + + /* We are done with the remote side, close connection. */ + walrcv_disconnect(wrconn); + + /* Get local table list. */ + subrel_states = GetSubscriptionRelations(sub->oid); + + /* + * Build qsorted array of local table oids for faster lookup. 
+ * This can potentially contain all tables in the database so + * speed of lookup is important. + */ + subrel_local_oids = palloc(list_length(subrel_states) * sizeof(Oid)); + off = 0; + foreach(lc, subrel_states) + { + SubscriptionRelState *relstate = (SubscriptionRelState *) lfirst(lc); + subrel_local_oids[off++] = relstate->relid; + } + qsort(subrel_local_oids, list_length(subrel_states), + sizeof(Oid), oid_cmp); + + /* + * Walk over the remote tables and try to match them to locally + * known tables. If the table is not known locally create a new state + * for it. + * + * Also builds array of local oids of remote tables for the next step. + */ + off = 0; + pubrel_local_oids = palloc(list_length(pubrel_names) * sizeof(Oid)); + + foreach (lc, pubrel_names) + { + RangeVar *rv = (RangeVar *) lfirst(lc); + Oid relid; + + relid = RangeVarGetRelid(rv, AccessShareLock, false); + pubrel_local_oids[off++] = relid; + + if (!bsearch(&relid, subrel_local_oids, + list_length(subrel_states), sizeof(Oid), oid_cmp)) + { + SetSubscriptionRelState(sub->oid, relid, + copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY, + InvalidXLogRecPtr); + ereport(NOTICE, + (errmsg("added subscription for table %s.%s", + quote_identifier(rv->schemaname), + quote_identifier(rv->relname)))); + } + } + + /* + * Next remove state for tables we should not care about anymore using + * the data we collected above + */ + qsort(pubrel_local_oids, list_length(pubrel_names), + sizeof(Oid), oid_cmp); + + for (off = 0; off < list_length(subrel_states); off++) + { + Oid relid = subrel_local_oids[off]; + + if (!bsearch(&relid, pubrel_local_oids, + list_length(pubrel_names), sizeof(Oid), oid_cmp)) + { + char *namespace; + + RemoveSubscriptionRel(sub->oid, relid); + + namespace = get_namespace_name(get_rel_namespace(relid)); + ereport(NOTICE, + (errmsg("removed subscription for table %s.%s", + quote_identifier(namespace), + quote_identifier(get_rel_name(relid))))); + } + } +} + /* * Alter the existing subscription. */ @@ -359,11 +551,7 @@ AlterSubscription(AlterSubscriptionStmt *stmt) Datum values[Natts_pg_subscription]; HeapTuple tup; Oid subid; - bool enabled_given; - bool enabled; - char *conninfo; - char *slot_name; - List *publications; + bool update_tuple = false; rel = heap_open(SubscriptionRelationId, RowExclusiveLock); @@ -384,52 +572,113 @@ AlterSubscription(AlterSubscriptionStmt *stmt) subid = HeapTupleGetOid(tup); - /* Parse options. */ - parse_subscription_options(stmt->options, &conninfo, &publications, - &enabled_given, &enabled, - NULL, &slot_name); - /* Form a new tuple. 
*/ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); memset(replaces, false, sizeof(replaces)); - if (enabled_given) + switch (stmt->kind) { - values[Anum_pg_subscription_subenabled - 1] = BoolGetDatum(enabled); - replaces[Anum_pg_subscription_subenabled - 1] = true; - } - if (conninfo) - { - values[Anum_pg_subscription_subconninfo - 1] = - CStringGetTextDatum(conninfo); - replaces[Anum_pg_subscription_subconninfo - 1] = true; - } - if (slot_name) - { - values[Anum_pg_subscription_subslotname - 1] = - DirectFunctionCall1(namein, CStringGetDatum(slot_name)); - replaces[Anum_pg_subscription_subslotname - 1] = true; - } - if (publications != NIL) - { - values[Anum_pg_subscription_subpublications - 1] = - publicationListToArray(publications); - replaces[Anum_pg_subscription_subpublications - 1] = true; + case ALTER_SUBSCRIPTION_OPTIONS: + { + char *slot_name; + + parse_subscription_options(stmt->options, NULL, NULL, NULL, + NULL, &slot_name, NULL); + + values[Anum_pg_subscription_subslotname - 1] = + DirectFunctionCall1(namein, CStringGetDatum(slot_name)); + replaces[Anum_pg_subscription_subslotname - 1] = true; + + update_tuple = true; + break; + } + + case ALTER_SUBSCRIPTION_ENABLED: + { + bool enabled, + enabled_given; + + parse_subscription_options(stmt->options, NULL, + &enabled_given, &enabled, NULL, + NULL, NULL); + Assert(enabled_given); + + values[Anum_pg_subscription_subenabled - 1] = + BoolGetDatum(enabled); + replaces[Anum_pg_subscription_subenabled - 1] = true; + + update_tuple = true; + break; + } + + case ALTER_SUBSCRIPTION_CONNECTION: + values[Anum_pg_subscription_subconninfo - 1] = + CStringGetTextDatum(stmt->conninfo); + replaces[Anum_pg_subscription_subconninfo - 1] = true; + update_tuple = true; + break; + + case ALTER_SUBSCRIPTION_PUBLICATION: + case ALTER_SUBSCRIPTION_PUBLICATION_REFRESH: + { + bool copy_data; + Subscription *sub = GetSubscription(subid, false); + + parse_subscription_options(stmt->options, NULL, NULL, NULL, + NULL, NULL, ©_data); + + values[Anum_pg_subscription_subpublications - 1] = + publicationListToArray(stmt->publication); + replaces[Anum_pg_subscription_subpublications - 1] = true; + + update_tuple = true; + + /* Refresh if user asked us to. */ + if (stmt->kind == ALTER_SUBSCRIPTION_PUBLICATION_REFRESH) + { + /* Make sure refresh sees the new list of publications. */ + sub->publications = stmt->publication; + + AlterSubscription_refresh(sub, copy_data); + } + + break; + } + + case ALTER_SUBSCRIPTION_REFRESH: + { + bool copy_data; + Subscription *sub = GetSubscription(subid, false); + + parse_subscription_options(stmt->options, NULL, NULL, NULL, + NULL, NULL, ©_data); + + AlterSubscription_refresh(sub, copy_data); + + break; + } + + default: + elog(ERROR, "unrecognized ALTER SUBSCRIPTION kind %d", + stmt->kind); } - tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, - replaces); + /* Update the catalog if needed. */ + if (update_tuple) + { + tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, + replaces); - /* Update the catalog. */ - CatalogTupleUpdate(rel, &tup->t_self, tup); + CatalogTupleUpdate(rel, &tup->t_self, tup); + + heap_freetuple(tup); + } + + heap_close(rel, RowExclusiveLock); ObjectAddressSet(myself, SubscriptionRelationId, subid); - /* Cleanup. 
*/ - heap_freetuple(tup); - heap_close(rel, RowExclusiveLock); - InvokeObjectPostAlterHook(SubscriptionRelationId, subid, 0); return myself; @@ -537,8 +786,11 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) /* Clean up dependencies */ deleteSharedDependencyRecordsFor(SubscriptionRelationId, subid, 0); + /* Remove any associated relation synchronization states. */ + RemoveSubscriptionRel(subid, InvalidOid); + /* Kill the apply worker so that the slot becomes accessible. */ - logicalrep_worker_stop(subid); + logicalrep_worker_stop(subid, InvalidOid); /* Remove the origin tracking if exists. */ snprintf(originname, sizeof(originname), "pg_%u", subid); @@ -571,15 +823,20 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) PG_TRY(); { - if (!walrcv_command(wrconn, cmd.data, &err)) + WalRcvExecResult *res; + res = walrcv_exec(wrconn, cmd.data, 0, NULL); + + if (res->status != WALRCV_OK_COMMAND) ereport(ERROR, (errmsg("could not drop the replication slot \"%s\" on publisher", slotname), - errdetail("The error was: %s", err))); + errdetail("The error was: %s", res->err))); else ereport(NOTICE, (errmsg("dropped replication slot \"%s\" on publisher", slotname))); + + walrcv_clear_result(res); } PG_CATCH(); { @@ -691,3 +948,72 @@ AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId) heap_close(rel, RowExclusiveLock); } + +/* + * Get the list of tables which belong to specified publications on the + * publisher connection. + */ +static List * +fetch_table_list(WalReceiverConn *wrconn, List *publications) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[2] = {TEXTOID, TEXTOID}; + ListCell *lc; + bool first; + List *tablelist = NIL; + + Assert(list_length(publications) > 0); + + initStringInfo(&cmd); + appendStringInfo(&cmd, "SELECT DISTINCT t.schemaname, t.tablename\n" + " FROM pg_catalog.pg_publication_tables t\n" + " WHERE t.pubname IN ("); + first = true; + foreach (lc, publications) + { + char *pubname = strVal(lfirst(lc)); + + if (first) + first = false; + else + appendStringInfoString(&cmd, ", "); + + appendStringInfo(&cmd, "%s", quote_literal_cstr(pubname)); + } + appendStringInfoString(&cmd, ")"); + + res = walrcv_exec(wrconn, cmd.data, 2, tableRow); + pfree(cmd.data); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not receive list of replicated tables from the publisher: %s", + res->err))); + + /* Process tables. 
*/ + slot = MakeSingleTupleTableSlot(res->tupledesc); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + char *nspname; + char *relname; + bool isnull; + RangeVar *rv; + + nspname = TextDatumGetCString(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + relname = TextDatumGetCString(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + + rv = makeRangeVar(pstrdup(nspname), pstrdup(relname), -1); + tablelist = lappend(tablelist, rv); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); + + return tablelist; +} diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index d0d45a557b..50126baacf 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -651,7 +651,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE - NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF + NOREFRESH NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF NULLS_P NUMERIC OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR @@ -9095,6 +9095,7 @@ AlterSubscriptionStmt: { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_OPTIONS; n->subname = $3; n->options = $5; $$ = (Node *)n; @@ -9103,24 +9104,45 @@ AlterSubscriptionStmt: { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_CONNECTION; n->subname = $3; - n->options = list_make1(makeDefElem("conninfo", - (Node *)makeString($5), @1)); + n->conninfo = $5; $$ = (Node *)n; } - | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list + | ALTER SUBSCRIPTION name REFRESH PUBLICATION opt_definition { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_REFRESH; n->subname = $3; - n->options = list_make1(makeDefElem("publication", - (Node *)$6, @1)); + n->options = $6; + $$ = (Node *)n; + } + | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list REFRESH opt_definition + { + AlterSubscriptionStmt *n = + makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_PUBLICATION_REFRESH; + n->subname = $3; + n->publication = $6; + n->options = $8; + $$ = (Node *)n; + } + | ALTER SUBSCRIPTION name SET PUBLICATION publication_name_list NOREFRESH + { + AlterSubscriptionStmt *n = + makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_PUBLICATION; + n->subname = $3; + n->publication = $6; + n->options = NIL; $$ = (Node *)n; } | ALTER SUBSCRIPTION name ENABLE_P { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_ENABLED; n->subname = $3; n->options = list_make1(makeDefElem("enabled", (Node *)makeInteger(TRUE), @1)); @@ -9130,11 +9152,13 @@ AlterSubscriptionStmt: { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->kind = ALTER_SUBSCRIPTION_ENABLED; n->subname = $3; n->options = list_make1(makeDefElem("enabled", (Node *)makeInteger(FALSE), @1)); $$ = (Node *)n; - } ; + } + ; /***************************************************************************** * @@ -14548,6 +14572,7 @@ unreserved_keyword: | NEW | NEXT | NO + | NOREFRESH | NOTHING | NOTIFY | NOWAIT diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 3a50488db3..b704788eb5 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3415,6 +3415,12 @@ pgstat_get_wait_ipc(WaitEventIPC w) case 
WAIT_EVENT_SYNC_REP: event_name = "SyncRep"; break; + case WAIT_EVENT_LOGICAL_SYNC_DATA: + event_name = "LogicalSyncData"; + break; + case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE: + event_name = "LogicalSyncStateChange"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index 65a9e6c81c..4dd8eef1f9 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -22,14 +22,16 @@ #include "libpq-fe.h" #include "pqexpbuffer.h" #include "access/xlog.h" +#include "catalog/pg_type.h" +#include "funcapi.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "pgstat.h" -#include "replication/logicalproto.h" #include "replication/walreceiver.h" -#include "storage/proc.h" #include "utils/builtins.h" +#include "utils/memutils.h" #include "utils/pg_lsn.h" +#include "utils/tuplestore.h" PG_MODULE_MAGIC; @@ -68,10 +70,12 @@ static void libpqrcv_send(WalReceiverConn *conn, const char *buffer, static char *libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, bool temporary, - bool export_snapshot, + CRSSnapshotAction snapshot_action, XLogRecPtr *lsn); -static bool libpqrcv_command(WalReceiverConn *conn, - const char *cmd, char **err); +static WalRcvExecResult *libpqrcv_exec(WalReceiverConn *conn, + const char *query, + const int nRetTypes, + const Oid *retTypes); static void libpqrcv_disconnect(WalReceiverConn *conn); static WalReceiverFunctionsType PQWalReceiverFunctions = { @@ -85,7 +89,7 @@ static WalReceiverFunctionsType PQWalReceiverFunctions = { libpqrcv_receive, libpqrcv_send, libpqrcv_create_slot, - libpqrcv_command, + libpqrcv_exec, libpqrcv_disconnect }; @@ -431,10 +435,8 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli) * next timeline's ID, or just CommandComplete if the server was shut * down. * - * If we had not yet received CopyDone from the backend, PGRES_COPY_IN - * would also be possible. However, at the moment this function is only - * called after receiving CopyDone from the backend - the walreceiver - * never terminates replication on its own initiative. + * If we had not yet received CopyDone from the backend, PGRES_COPY_OUT + * is also possible in case we aborted the copy in mid-stream. */ res = PQgetResult(conn->streamConn); if (PQresultStatus(res) == PGRES_TUPLES_OK) @@ -531,7 +533,7 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, * Windows. * * The function is modeled on PQexec() in libpq, but only implements - * those parts that are in use in the walreceiver. + * those parts that are in use in the walreceiver api. * * Queries are always executed on the connection in streamConn. */ @@ -543,8 +545,9 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query) /* * PQexec() silently discards any prior query results on the connection. - * This is not required for walreceiver since it's expected that walsender - * won't generate any such junk results. + * This is not required for this function as it's expected that the + * caller (which is this library in all cases) will behave correctly and + * we don't have to be backwards compatible with old libpq. */ /* @@ -593,8 +596,7 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query) /* * Emulate the PQexec()'s behavior of returning the last result when - * there are many. Since walsender will never generate multiple - * results, we skip the concatenation of error messages. 
+ * there are many. We are fine with returning just last error message. */ result = PQgetResult(streamConn); if (result == NULL) @@ -675,8 +677,19 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, PGresult *res; res = PQgetResult(conn->streamConn); - if (PQresultStatus(res) == PGRES_COMMAND_OK || - PQresultStatus(res) == PGRES_COPY_IN) + if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + PQclear(res); + + /* Verify that there are no more results */ + res = PQgetResult(conn->streamConn); + if (res != NULL) + ereport(ERROR, + (errmsg("unexpected result after CommandComplete: %s", + PQerrorMessage(conn->streamConn)))); + return -1; + } + else if (PQresultStatus(res) == PGRES_COPY_IN) { PQclear(res); return -1; @@ -721,7 +734,8 @@ libpqrcv_send(WalReceiverConn *conn, const char *buffer, int nbytes) */ static char * libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, - bool temporary, bool export_snapshot, XLogRecPtr *lsn) + bool temporary, CRSSnapshotAction snapshot_action, + XLogRecPtr *lsn) { PGresult *res; StringInfoData cmd; @@ -737,10 +751,18 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, if (conn->logical) { appendStringInfo(&cmd, " LOGICAL pgoutput"); - if (export_snapshot) - appendStringInfo(&cmd, " EXPORT_SNAPSHOT"); - else - appendStringInfo(&cmd, " NOEXPORT_SNAPSHOT"); + switch (snapshot_action) + { + case CRS_EXPORT_SNAPSHOT: + appendStringInfo(&cmd, " EXPORT_SNAPSHOT"); + break; + case CRS_NOEXPORT_SNAPSHOT: + appendStringInfo(&cmd, " NOEXPORT_SNAPSHOT"); + break; + case CRS_USE_SNAPSHOT: + appendStringInfo(&cmd, " USE_SNAPSHOT"); + break; + } } res = libpqrcv_PQexec(conn->streamConn, cmd.data); @@ -767,28 +789,139 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, } /* - * Run command. - * - * Returns if the command has succeeded and fills the err with palloced - * error message if not. + * Convert tuple query result to tuplestore. */ -static bool -libpqrcv_command(WalReceiverConn *conn, const char *cmd, char **err) +static void +libpqrcv_processTuples(PGresult *pgres, WalRcvExecResult *walres, + const int nRetTypes, const Oid *retTypes) { - PGresult *res; + int tupn; + int coln; + int nfields = PQnfields(pgres); + HeapTuple tuple; + AttInMetadata *attinmeta; + MemoryContext rowcontext; + MemoryContext oldcontext; - res = libpqrcv_PQexec(conn->streamConn, cmd); + /* No point in doing anything here if there were no tuples returned. */ + if (PQntuples(pgres) == 0) + return; - if (PQresultStatus(res) != PGRES_COMMAND_OK) + /* Make sure we got expected number of fields. */ + if (nfields != nRetTypes) + ereport(ERROR, + (errmsg("invalid query responser"), + errdetail("Expected %d fields, got %d fields.", + nRetTypes, nfields))); + + + walres->tuplestore = tuplestore_begin_heap(true, false, work_mem); + + /* Create tuple descriptor corresponding to expected result. */ + walres->tupledesc = CreateTemplateTupleDesc(nRetTypes, false); + for (coln = 0; coln < nRetTypes; coln++) + TupleDescInitEntry(walres->tupledesc, (AttrNumber) coln + 1, + PQfname(pgres, coln), retTypes[coln], -1, 0); + attinmeta = TupleDescGetAttInMetadata(walres->tupledesc); + + /* Create temporary context for local allocations. */ + rowcontext = AllocSetContextCreate(CurrentMemoryContext, + "libpqrcv query result context", + ALLOCSET_DEFAULT_SIZES); + + /* Process returned rows. 
*/ + for (tupn = 0; tupn < PQntuples(pgres); tupn++) { - PQclear(res); - *err = pchomp(PQerrorMessage(conn->streamConn)); - return false; + char *cstrs[MaxTupleAttributeNumber]; + + CHECK_FOR_INTERRUPTS(); + + /* Do the allocations in temporary context. */ + oldcontext = MemoryContextSwitchTo(rowcontext); + + /* + * Fill cstrs with null-terminated strings of column values. + */ + for (coln = 0; coln < nfields; coln++) + { + if (PQgetisnull(pgres, tupn, coln)) + cstrs[coln] = NULL; + else + cstrs[coln] = PQgetvalue(pgres, tupn, coln); + } + + /* Convert row to a tuple, and add it to the tuplestore */ + tuple = BuildTupleFromCStrings(attinmeta, cstrs); + tuplestore_puttuple(walres->tuplestore, tuple); + + /* Clean up */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(rowcontext); } - PQclear(res); + MemoryContextDelete(rowcontext); +} - return true; +/* + * Public interface for sending generic queries (and commands). + * + * This can only be called from process connected to database. + */ +static WalRcvExecResult * +libpqrcv_exec(WalReceiverConn *conn, const char *query, + const int nRetTypes, const Oid *retTypes) +{ + PGresult *pgres = NULL; + WalRcvExecResult *walres = palloc0(sizeof(WalRcvExecResult)); + + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the query interface requires a database connection"))); + + pgres = libpqrcv_PQexec(conn->streamConn, query); + + switch (PQresultStatus(pgres)) + { + case PGRES_SINGLE_TUPLE: + case PGRES_TUPLES_OK: + walres->status = WALRCV_OK_TUPLES; + libpqrcv_processTuples(pgres, walres, nRetTypes, retTypes); + break; + + case PGRES_COPY_IN: + walres->status = WALRCV_OK_COPY_IN; + break; + + case PGRES_COPY_OUT: + walres->status = WALRCV_OK_COPY_OUT; + break; + + case PGRES_COPY_BOTH: + walres->status = WALRCV_OK_COPY_BOTH; + break; + + case PGRES_COMMAND_OK: + walres->status = WALRCV_OK_COMMAND; + break; + + /* Empty query is considered error. 
*/ + case PGRES_EMPTY_QUERY: + walres->status = WALRCV_ERROR; + walres->err = _("empty query"); + break; + + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_BAD_RESPONSE: + walres->status = WALRCV_ERROR; + walres->err = pchomp(PQerrorMessage(conn->streamConn)); + break; + } + + PQclear(pgres); + + return walres; } /* diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile index 259befa4e6..bb417b042e 100644 --- a/src/backend/replication/logical/Makefile +++ b/src/backend/replication/logical/Makefile @@ -15,6 +15,6 @@ include $(top_builddir)/src/Makefile.global override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) OBJS = decode.o launcher.o logical.o logicalfuncs.o message.o origin.o \ - proto.o relation.o reorderbuffer.o snapbuild.o worker.o + proto.o relation.o reorderbuffer.o snapbuild.o tablesync.o worker.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 20b43626dd..255b22597b 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -27,6 +27,7 @@ #include "access/xact.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "libpq/pqsignal.h" @@ -56,6 +57,8 @@ #define DEFAULT_NAPTIME_PER_CYCLE 180000L int max_logical_replication_workers = 4; +int max_sync_workers_per_subscription = 2; + LogicalRepWorker *MyLogicalRepWorker = NULL; typedef struct LogicalRepCtxStruct @@ -198,20 +201,22 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, /* * Walks the workers array and searches for one that matches given - * subscription id. + * subscription id and relid. */ LogicalRepWorker * -logicalrep_worker_find(Oid subid) +logicalrep_worker_find(Oid subid, Oid relid, bool only_running) { int i; LogicalRepWorker *res = NULL; Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + /* Search for attached worker for a given subscription id. */ for (i = 0; i < max_logical_replication_workers; i++) { LogicalRepWorker *w = &LogicalRepCtx->workers[i]; - if (w->subid == subid && w->proc && IsBackendPid(w->proc->pid)) + if (w->subid == subid && w->relid == relid && + (!only_running || (w->proc && IsBackendPid(w->proc->pid)))) { res = w; break; @@ -225,7 +230,8 @@ logicalrep_worker_find(Oid subid) * Start new apply background worker. */ void -logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) +logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, + Oid relid) { BackgroundWorker bgw; BackgroundWorkerHandle *bgw_handle; @@ -270,10 +276,18 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) } /* Prepare the worker info. 
*/ - memset(worker, 0, sizeof(LogicalRepWorker)); + worker->proc = NULL; worker->dbid = dbid; worker->userid = userid; worker->subid = subid; + worker->relid = relid; + worker->relstate = SUBREL_STATE_UNKNOWN; + worker->relstate_lsn = InvalidXLogRecPtr; + worker->last_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->last_send_time); + TIMESTAMP_NOBEGIN(worker->last_recv_time); + worker->reply_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->reply_time); LWLockRelease(LogicalRepWorkerLock); @@ -282,8 +296,12 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) BGWORKER_BACKEND_DATABASE_CONNECTION; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; bgw.bgw_main = ApplyWorkerMain; - snprintf(bgw.bgw_name, BGW_MAXLEN, - "logical replication worker for subscription %u", subid); + if (OidIsValid(relid)) + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication worker for subscription %u sync %u", subid, relid); + else + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication worker for subscription %u", subid); bgw.bgw_restart_time = BGW_NEVER_RESTART; bgw.bgw_notify_pid = MyProcPid; @@ -307,13 +325,13 @@ logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) * slot. */ void -logicalrep_worker_stop(Oid subid) +logicalrep_worker_stop(Oid subid, Oid relid) { LogicalRepWorker *worker; LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - worker = logicalrep_worker_find(subid); + worker = logicalrep_worker_find(subid, relid, false); /* No worker, nothing to do. */ if (!worker) @@ -395,6 +413,31 @@ logicalrep_worker_stop(Oid subid) } } +/* + * Wake up (using latch) the logical replication worker. + */ +void +logicalrep_worker_wakeup(Oid subid, Oid relid) +{ + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(subid, relid, true); + LWLockRelease(LogicalRepWorkerLock); + + if (worker) + logicalrep_worker_wakeup_ptr(worker); +} + +/* + * Wake up (using latch) the logical replication worker. + */ +void +logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker) +{ + SetLatch(&worker->proc->procLatch); +} + /* * Attach to a slot. */ @@ -457,6 +500,29 @@ logicalrep_worker_sigterm(SIGNAL_ARGS) SetLatch(MyLatch); } +/* + * Count the number of registered (not necessarily running) sync workers + * for a subscription. + */ +int +logicalrep_sync_worker_count(Oid subid) +{ + int i; + int res = 0; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + if (w->subid == subid && OidIsValid(w->relid)) + res++; + } + + return res; +} + /* * ApplyLauncherShmemSize * Compute space needed for replication launcher shared memory @@ -512,7 +578,20 @@ ApplyLauncherShmemInit(void) &found); if (!found) + { + int slot; + memset(LogicalRepCtx, 0, ApplyLauncherShmemSize()); + + /* Initialize memory and spin locks for each worker slot. 
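The relstate and relstate_lsn fields initialized here are shared between the launcher, the apply worker and the sync workers; later parts of this patch read them under LogicalRepWorkerLock plus the new per-slot relmutex spinlock. A sketch of that access pattern, with an invented helper name; it simply mirrors what process_syncing_tables_for_apply() does further down:

#include "postgres.h"

#include "catalog/pg_subscription_rel.h"
#include "replication/worker_internal.h"
#include "storage/lwlock.h"
#include "storage/spin.h"

/* Illustrative only: peek at another worker's synchronization state. */
static char
peek_sync_state(Oid subid, Oid relid, XLogRecPtr *lsn)
{
	LogicalRepWorker *w;
	char		state = SUBREL_STATE_UNKNOWN;

	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
	w = logicalrep_worker_find(subid, relid, false);
	if (w)
	{
		SpinLockAcquire(&w->relmutex);
		state = w->relstate;
		*lsn = w->relstate_lsn;
		SpinLockRelease(&w->relmutex);
	}
	LWLockRelease(LogicalRepWorkerLock);

	return state;
}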
*/ + for (slot = 0; slot < max_logical_replication_workers; slot++) + { + LogicalRepWorker *worker = &LogicalRepCtx->workers[slot]; + + memset(worker, 0, sizeof(LogicalRepWorker)); + SpinLockInit(&worker->relmutex); + } + } } /* @@ -607,12 +686,13 @@ ApplyLauncherMain(Datum main_arg) LogicalRepWorker *w; LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - w = logicalrep_worker_find(sub->oid); + w = logicalrep_worker_find(sub->oid, InvalidOid, false); LWLockRelease(LogicalRepWorkerLock); if (sub->enabled && w == NULL) { - logicalrep_worker_launch(sub->dbid, sub->oid, sub->name, sub->owner); + logicalrep_worker_launch(sub->dbid, sub->oid, sub->name, + sub->owner, InvalidOid); last_start_time = now; wait_time = wal_retrieve_retry_interval; /* Limit to one worker per mainloop cycle. */ @@ -664,7 +744,7 @@ ApplyLauncherMain(Datum main_arg) Datum pg_stat_get_subscription(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_SUBSCRIPTION_COLS 7 +#define PG_STAT_GET_SUBSCRIPTION_COLS 8 Oid subid = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0); int i; ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; @@ -723,27 +803,31 @@ pg_stat_get_subscription(PG_FUNCTION_ARGS) MemSet(nulls, 0, sizeof(nulls)); values[0] = ObjectIdGetDatum(worker.subid); - values[1] = Int32GetDatum(worker_pid); - if (XLogRecPtrIsInvalid(worker.last_lsn)) - nulls[2] = true; + if (OidIsValid(worker.relid)) + values[1] = ObjectIdGetDatum(worker.relid); else - values[2] = LSNGetDatum(worker.last_lsn); - if (worker.last_send_time == 0) + nulls[1] = true; + values[2] = Int32GetDatum(worker_pid); + if (XLogRecPtrIsInvalid(worker.last_lsn)) nulls[3] = true; else - values[3] = TimestampTzGetDatum(worker.last_send_time); - if (worker.last_recv_time == 0) + values[3] = LSNGetDatum(worker.last_lsn); + if (worker.last_send_time == 0) nulls[4] = true; else - values[4] = TimestampTzGetDatum(worker.last_recv_time); - if (XLogRecPtrIsInvalid(worker.reply_lsn)) + values[4] = TimestampTzGetDatum(worker.last_send_time); + if (worker.last_recv_time == 0) nulls[5] = true; else - values[5] = LSNGetDatum(worker.reply_lsn); - if (worker.reply_time == 0) + values[5] = TimestampTzGetDatum(worker.last_recv_time); + if (XLogRecPtrIsInvalid(worker.reply_lsn)) nulls[6] = true; else - values[6] = TimestampTzGetDatum(worker.reply_time); + values[6] = LSNGetDatum(worker.reply_lsn); + if (worker.reply_time == 0) + nulls[7] = true; + else + values[7] = TimestampTzGetDatum(worker.reply_time); tuplestore_putvalues(tupstore, tupdesc, values, nulls); diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index d8dc0c7194..875a08185a 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -19,6 +19,7 @@ #include "access/heapam.h" #include "access/sysattr.h" #include "catalog/namespace.h" +#include "catalog/pg_subscription_rel.h" #include "nodes/makefuncs.h" #include "replication/logicalrelation.h" #include "replication/worker_internal.h" @@ -357,6 +358,12 @@ logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode) else entry->localrel = heap_open(entry->localreloid, lockmode); + if (entry->state != SUBREL_STATE_READY) + entry->state = GetSubscriptionRelState(MySubscription->oid, + entry->localreloid, + &entry->statelsn, + true); + return entry; } diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 3f242a8ed7..a73a7b98f9 100644 --- a/src/backend/replication/logical/snapbuild.c +++ 
b/src/backend/replication/logical/snapbuild.c @@ -499,51 +499,32 @@ SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid) } /* - * Export a snapshot so it can be set in another session with SET TRANSACTION - * SNAPSHOT. + * Build the initial slot snapshot and convert it to normal snapshot that + * is understood by HeapTupleSatisfiesMVCC. * - * For that we need to start a transaction in the current backend as the - * importing side checks whether the source transaction is still open to make - * sure the xmin horizon hasn't advanced since then. - * - * After that we convert a locally built snapshot into the normal variant - * understood by HeapTupleSatisfiesMVCC et al. + * The snapshot will be usable directly in current transaction or exported + * for loading in different transaction. */ -const char * -SnapBuildExportSnapshot(SnapBuild *builder) +Snapshot +SnapBuildInitalSnapshot(SnapBuild *builder) { Snapshot snap; - char *snapname; TransactionId xid; TransactionId *newxip; int newxcnt = 0; + Assert(!FirstSnapshotSet); + Assert(XactIsoLevel = XACT_REPEATABLE_READ); + if (builder->state != SNAPBUILD_CONSISTENT) - elog(ERROR, "cannot export a snapshot before reaching a consistent state"); + elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state"); if (!builder->committed.includes_all_transactions) - elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore"); + elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); /* so we don't overwrite the existing value */ if (TransactionIdIsValid(MyPgXact->xmin)) - elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid"); - - if (IsTransactionOrTransactionBlock()) - elog(ERROR, "cannot export a snapshot from within a transaction"); - - if (SavedResourceOwnerDuringExport) - elog(ERROR, "can only export one snapshot at a time"); - - SavedResourceOwnerDuringExport = CurrentResourceOwner; - ExportInProgress = true; - - StartTransactionCommand(); - - Assert(!FirstSnapshotSet); - - /* There doesn't seem to a nice API to set these */ - XactIsoLevel = XACT_REPEATABLE_READ; - XactReadOnly = true; + elog(ERROR, "cannot build an initial slot snapshot when MyPgXact->xmin already is valid"); snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId()); @@ -578,7 +559,9 @@ SnapBuildExportSnapshot(SnapBuild *builder) if (test == NULL) { if (newxcnt >= GetMaxSnapshotXidCount()) - elog(ERROR, "snapshot too large"); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("initial slot snapshot too large"))); newxip[newxcnt++] = xid; } @@ -589,9 +572,43 @@ SnapBuildExportSnapshot(SnapBuild *builder) snap->xcnt = newxcnt; snap->xip = newxip; + return snap; +} + +/* + * Export a snapshot so it can be set in another session with SET TRANSACTION + * SNAPSHOT. + * + * For that we need to start a transaction in the current backend as the + * importing side checks whether the source transaction is still open to make + * sure the xmin horizon hasn't advanced since then. 
+ */ +const char * +SnapBuildExportSnapshot(SnapBuild *builder) +{ + Snapshot snap; + char *snapname; + + if (IsTransactionOrTransactionBlock()) + elog(ERROR, "cannot export a snapshot from within a transaction"); + + if (SavedResourceOwnerDuringExport) + elog(ERROR, "can only export one snapshot at a time"); + + SavedResourceOwnerDuringExport = CurrentResourceOwner; + ExportInProgress = true; + + StartTransactionCommand(); + + /* There doesn't seem to a nice API to set these */ + XactIsoLevel = XACT_REPEATABLE_READ; + XactReadOnly = true; + + snap = SnapBuildInitalSnapshot(builder); + /* - * now that we've built a plain snapshot, use the normal mechanisms for - * exporting it + * now that we've built a plain snapshot, make it active and use the + * normal mechanisms for exporting it */ snapname = ExportSnapshot(snap); diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c new file mode 100644 index 0000000000..3e16b0d576 --- /dev/null +++ b/src/backend/replication/logical/tablesync.c @@ -0,0 +1,840 @@ +/*------------------------------------------------------------------------- + * tablesync.c + * PostgreSQL logical replication + * + * Copyright (c) 2012-2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/tablesync.c + * + * NOTES + * This file contains code for initial table data synchronization for + * logical replication. + * + * The initial data synchronization is done separately for each table, + * in separate apply worker that only fetches the initial snapshot data + * from the publisher and then synchronizes the position in stream with + * the main apply worker. + * + * The are several reasons for doing the synchronization this way: + * - It allows us to parallelize the initial data synchronization + * which lowers the time needed for it to happen. + * - The initial synchronization does not have to hold the xid and LSN + * for the time it takes to copy data of all tables, causing less + * bloat and lower disk consumption compared to doing the + * synchronization in single process for whole database. + * - It allows us to synchronize the tables added after the initial + * synchronization has finished. + * + * The stream position synchronization works in multiple steps. + * - Sync finishes copy and sets table state as SYNCWAIT and waits + * for state to change in a loop. + * - Apply periodically checks tables that are synchronizing for SYNCWAIT. + * When the desired state appears it will compare its position in the + * stream with the SYNCWAIT position and based on that changes the + * state to based on following rules: + * - if the apply is in front of the sync in the wal stream the new + * state is set to CATCHUP and apply loops until the sync process + * catches up to the same LSN as apply + * - if the sync is in front of the apply in the wal stream the new + * state is set to SYNCDONE + * - if both apply and sync are at the same position in the wal stream + * the state of the table is set to READY + * - If the state was set to CATCHUP sync will read the stream and + * apply changes until it catches up to the specified stream + * position and then sets state to READY and signals apply that it + * can stop waiting and exits, if the state was set to something + * else than CATCHUP the sync process will simply end. 
+ * - If the state was set to SYNCDONE by apply, the apply will + * continue tracking the table until it reaches the SYNCDONE stream + * position at which point it sets state to READY and stops tracking. + * + * The catalog pg_subscription_rel is used to keep information about + * subscribed tables and their state and some transient state during + * data synchronization is kept in shared memory. + * + * Example flows look like this: + * - Apply is in front: + * sync:8 + * -> set SYNCWAIT + * apply:10 + * -> set CATCHUP + * -> enter wait-loop + * sync:10 + * -> set READY + * -> exit + * apply:10 + * -> exit wait-loop + * -> continue rep + * - Sync in front: + * sync:10 + * -> set SYNCWAIT + * apply:8 + * -> set SYNCDONE + * -> continue per-table filtering + * sync:10 + * -> exit + * apply:10 + * -> set READY + * -> stop per-table filtering + * -> continue rep + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "pgstat.h" + +#include "access/xact.h" + +#include "catalog/pg_subscription_rel.h" +#include "catalog/pg_type.h" + +#include "commands/copy.h" + +#include "replication/logicallauncher.h" +#include "replication/logicalrelation.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" + +#include "storage/ipc.h" + +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +static bool table_states_valid = false; + +StringInfo copybuf = NULL; + +/* + * Exit routine for synchronization worker. + */ +static void pg_attribute_noreturn() +finish_sync_worker(void) +{ + /* Commit any outstanding transaction. */ + if (IsTransactionState()) + CommitTransactionCommand(); + + /* And flush all writes. */ + XLogFlush(GetXLogWriteRecPtr()); + + /* Find the main apply worker and signal it. */ + logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); + + ereport(LOG, + (errmsg("logical replication synchronization worker finished processing"))); + + /* Stop gracefully */ + walrcv_disconnect(wrconn); + proc_exit(0); +} + +/* + * Wait until the table synchronization change. + * + * Returns false if the relation subscription state disappeared. + */ +static bool +wait_for_sync_status_change(Oid relid, char origstate) +{ + int rc; + char state = origstate; + + while (!got_SIGTERM) + { + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(MyLogicalRepWorker->subid, + relid, false); + if (!worker) + { + LWLockRelease(LogicalRepWorkerLock); + return false; + } + state = worker->relstate; + LWLockRelease(LogicalRepWorkerLock); + + if (state == SUBREL_STATE_UNKNOWN) + return false; + + if (state != origstate) + return true; + + rc = WaitLatch(&MyProc->procLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 10000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + + /* emergency bailout if postmaster has died */ + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + ResetLatch(&MyProc->procLatch); + } + + return false; +} + +/* + * Callback from syscache invalidation. + */ +void +invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue) +{ + table_states_valid = false; +} + +/* + * Handle table synchronization cooperation from the synchronization + * worker. + * + * If the sync worker is in catch up mode and reached the predetermined + * synchronization point in the WAL stream, mark the table as READY and + * finish. If it caught up too far, set to SYNCDONE and finish. 
Things will + * then proceed in the "sync in front" scenario. + */ +static void +process_syncing_tables_for_sync(XLogRecPtr current_lsn) +{ + Assert(IsTransactionState()); + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + + if (MyLogicalRepWorker->relstate == SUBREL_STATE_CATCHUP && + current_lsn >= MyLogicalRepWorker->relstate_lsn) + { + TimeLineID tli; + + MyLogicalRepWorker->relstate = + (current_lsn == MyLogicalRepWorker->relstate_lsn) + ? SUBREL_STATE_READY + : SUBREL_STATE_SYNCDONE; + MyLogicalRepWorker->relstate_lsn = current_lsn; + + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + SetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + + walrcv_endstreaming(wrconn, &tli); + finish_sync_worker(); + } + else + SpinLockRelease(&MyLogicalRepWorker->relmutex); +} + +/* + * Handle table synchronization cooperation from the apply worker. + * + * Walk over all subscription tables that are individually tracked by the + * apply process (currently, all that have state other than + * SUBREL_STATE_READY) and manage synchronization for them. + * + * If there are tables that need synchronizing and are not being synchronized + * yet, start sync workers for them (if there are free slots for sync + * workers). + * + * For tables that are being synchronized already, check if sync workers + * either need action from the apply worker or have finished. + * + * The usual scenario is that the apply got ahead of the sync while the sync + * ran, and then the action needed by apply is to mark a table for CATCHUP and + * wait for the catchup to happen. In the less common case that sync worker + * got in front of the apply worker, the table is marked as SYNCDONE but not + * ready yet, as it needs to be tracked until apply reaches the same position + * to which it was synced. + * + * If the synchronization position is reached, then the table can be marked as + * READY and is no longer tracked. + */ +static void +process_syncing_tables_for_apply(XLogRecPtr current_lsn) +{ + static List *table_states = NIL; + ListCell *lc; + + Assert(!IsTransactionState()); + + /* We need up to date sync state info for subscription tables here. */ + if (!table_states_valid) + { + MemoryContext oldctx; + List *rstates; + ListCell *lc; + SubscriptionRelState *rstate; + + /* Clean the old list. */ + list_free_deep(table_states); + table_states = NIL; + + StartTransactionCommand(); + + /* Fetch all non-ready tables. */ + rstates = GetSubscriptionNotReadyRelations(MySubscription->oid); + + /* Allocate the tracking info in a permanent memory context. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + foreach(lc, rstates) + { + rstate = palloc(sizeof(SubscriptionRelState)); + memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState)); + table_states = lappend(table_states, rstate); + } + MemoryContextSwitchTo(oldctx); + + CommitTransactionCommand(); + + table_states_valid = true; + } + + /* Process all tables that are being synchronized. */ + foreach(lc, table_states) + { + SubscriptionRelState *rstate = (SubscriptionRelState *)lfirst(lc); + + if (rstate->state == SUBREL_STATE_SYNCDONE) + { + /* + * Apply has caught up to the position where the table sync + * has finished. Time to mark the table as ready so that + * apply will just continue to replicate it normally. 
+ */ + if (current_lsn >= rstate->lsn) + { + rstate->state = SUBREL_STATE_READY; + rstate->lsn = current_lsn; + StartTransactionCommand(); + SetSubscriptionRelState(MyLogicalRepWorker->subid, + rstate->relid, rstate->state, + rstate->lsn); + CommitTransactionCommand(); + } + } + else + { + LogicalRepWorker *syncworker; + int nsyncworkers = 0; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid, + rstate->relid, false); + if (syncworker) + { + SpinLockAcquire(&syncworker->relmutex); + rstate->state = syncworker->relstate; + rstate->lsn = syncworker->relstate_lsn; + SpinLockRelease(&syncworker->relmutex); + } + else + /* + * If no sync worker for this table yet, could running sync + * workers for this subscription, while we have the lock, for + * later. + */ + nsyncworkers = logicalrep_sync_worker_count(MyLogicalRepWorker->subid); + LWLockRelease(LogicalRepWorkerLock); + + /* + * There is a worker synchronizing the relation and waiting for + * apply to do something. + */ + if (syncworker && rstate->state == SUBREL_STATE_SYNCWAIT) + { + /* + * There are three possible synchronization situations here. + * + * a) Apply is in front of the table sync: We tell the table + * sync to CATCHUP. + * + * b) Apply is behind the table sync: We tell the table sync + * to mark the table as SYNCDONE and finish. + + * c) Apply and table sync are at the same position: We tell + * table sync to mark the table as READY and finish. + * + * In any case we'll need to wait for table sync to change + * the state in catalog and only then continue ourselves. + */ + if (current_lsn > rstate->lsn) + { + rstate->state = SUBREL_STATE_CATCHUP; + rstate->lsn = current_lsn; + } + else if (current_lsn == rstate->lsn) + { + rstate->state = SUBREL_STATE_READY; + rstate->lsn = current_lsn; + } + else + rstate->state = SUBREL_STATE_SYNCDONE; + + SpinLockAcquire(&syncworker->relmutex); + syncworker->relstate = rstate->state; + syncworker->relstate_lsn = rstate->lsn; + SpinLockRelease(&syncworker->relmutex); + + /* Signal the sync worker, as it may be waiting for us. */ + logicalrep_worker_wakeup_ptr(syncworker); + + /* + * Enter busy loop and wait for synchronization status + * change. + */ + wait_for_sync_status_change(rstate->relid, rstate->state); + } + + /* + * If there is no sync worker registered for the table and + * there is some free sync worker slot, start new sync worker + * for the table. + */ + else if (!syncworker && nsyncworkers < max_sync_workers_per_subscription) + { + logicalrep_worker_launch(MyLogicalRepWorker->dbid, + MySubscription->oid, + MySubscription->name, + MyLogicalRepWorker->userid, + rstate->relid); + } + } + } +} + +/* + * Process state possible change(s) of tables that are being synchronized. + */ +void +process_syncing_tables(XLogRecPtr current_lsn) +{ + if (am_tablesync_worker()) + process_syncing_tables_for_sync(current_lsn); + else + process_syncing_tables_for_apply(current_lsn); +} + +/* + * Create list of columns for COPY based on logical relation mapping. + */ +static List * +make_copy_attnamelist(LogicalRepRelMapEntry *rel) +{ + List *attnamelist = NIL; + TupleDesc desc = RelationGetDescr(rel->localrel); + int i; + + for (i = 0; i < desc->natts; i++) + { + int remoteattnum = rel->attrmap[i]; + + /* Skip dropped attributes. */ + if (desc->attrs[i]->attisdropped) + continue; + + /* Skip attributes that are missing on remote side. 
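The three cases spelled out in the comment above reduce to a comparison of two LSNs. As a compact restatement (an illustrative helper, not something the patch itself defines):

#include "postgres.h"

#include "access/xlogdefs.h"
#include "catalog/pg_subscription_rel.h"

/*
 * Illustrative only: next state for a table whose sync worker reported
 * SYNCWAIT at sync_lsn, decided by an apply worker currently at apply_lsn.
 */
static char
decide_next_state(XLogRecPtr apply_lsn, XLogRecPtr sync_lsn)
{
	if (apply_lsn > sync_lsn)
		return SUBREL_STATE_CATCHUP;	/* sync must replay up to apply_lsn */
	else if (apply_lsn == sync_lsn)
		return SUBREL_STATE_READY;		/* both ends already agree */
	else
		return SUBREL_STATE_SYNCDONE;	/* apply keeps filtering until sync_lsn */
}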
*/ + if (remoteattnum < 0) + continue; + + attnamelist = lappend(attnamelist, + makeString(rel->remoterel.attnames[remoteattnum])); + } + + return attnamelist; +} + +/* + * Data source callback for the COPY FROM, which reads from the remote + * connection and passes the data back to our local COPY. + */ +static int +copy_read_data(void *outbuf, int minread, int maxread) +{ + int bytesread = 0; + int avail; + + /* If there are some leftover data from previous read, use them. */ + avail = copybuf->len - copybuf->cursor; + if (avail) + { + if (avail > maxread) + avail = maxread; + memcpy(outbuf, ©buf->data[copybuf->cursor], avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + while (!got_SIGTERM && maxread > 0 && bytesread < minread) + { + pgsocket fd = PGINVALID_SOCKET; + int rc; + int len; + char *buf = NULL; + + for (;;) + { + /* Try read the data. */ + len = walrcv_receive(wrconn, &buf, &fd); + + CHECK_FOR_INTERRUPTS(); + + if (len == 0) + break; + else if (len < 0) + return bytesread; + else + { + /* Process the data */ + copybuf->data = buf; + copybuf->len = len; + copybuf->cursor = 0; + + avail = copybuf->len - copybuf->cursor; + if (avail > maxread) + avail = maxread; + memcpy(outbuf, ©buf->data[copybuf->cursor], avail); + outbuf = (void *) ((char *) outbuf + avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + if (maxread <= 0 || bytesread >= minread) + return bytesread; + } + + /* + * Wait for more data or latch. + */ + rc = WaitLatchOrSocket(&MyProc->procLatch, + WL_SOCKET_READABLE | WL_LATCH_SET | + WL_TIMEOUT | WL_POSTMASTER_DEATH, + fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA); + + /* Emergency bailout if postmaster has died */ + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + ResetLatch(&MyProc->procLatch); + } + + /* Check for exit condition. */ + if (got_SIGTERM) + proc_exit(0); + + return bytesread; +} + + +/* + * Get information about remote relation in similar fashion the RELATION + * message provides during replication. + */ +static void +fetch_remote_table_info(char *nspname, char *relname, + LogicalRepRelation *lrel) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[2] = {OIDOID, CHAROID}; + Oid attrRow[4] = {TEXTOID, OIDOID, INT4OID, BOOLOID}; + bool isnull; + int natt; + + lrel->nspname = nspname; + lrel->relname = relname; + + /* First fetch Oid and replica identity. */ + initStringInfo(&cmd); + appendStringInfo(&cmd, "SELECT c.oid, c.relreplident" + " FROM pg_catalog.pg_class c," + " pg_catalog.pg_namespace n" + " WHERE n.nspname = %s" + " AND c.relname = %s" + " AND c.relkind = 'r'", + quote_literal_cstr(nspname), + quote_literal_cstr(relname)); + res = walrcv_exec(wrconn, cmd.data, 2, tableRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + ereport(ERROR, + (errmsg("table \"%s.%s\" not found on publisher", + nspname, relname))); + + lrel->remoteid = DatumGetObjectId(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + lrel->replident = DatumGetChar(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + + ExecDropSingleTupleTableSlot(slot); + walrcv_clear_result(res); + + /* Now fetch columns. 
*/ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT a.attname," + " a.atttypid," + " a.atttypmod," + " a.attnum = ANY(i.indkey)" + " FROM pg_catalog.pg_attribute a" + " LEFT JOIN pg_catalog.pg_index i" + " ON (i.indexrelid = pg_get_replica_identity_index(%u))" + " WHERE a.attnum > 0::pg_catalog.int2" + " AND NOT a.attisdropped" + " AND a.attrelid = %u" + " ORDER BY a.attnum", + lrel->remoteid, lrel->remoteid); + res = walrcv_exec(wrconn, cmd.data, 4, attrRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not fetch table info for table \"%s.%s\": %s", + nspname, relname, res->err))); + + /* We don't know number of rows coming, so allocate enough space. */ + lrel->attnames = palloc0(MaxTupleAttributeNumber * sizeof(char *)); + lrel->atttyps = palloc0(MaxTupleAttributeNumber * sizeof(Oid)); + lrel->attkeys = NULL; + + natt = 0; + slot = MakeSingleTupleTableSlot(res->tupledesc); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + lrel->attnames[natt] = + pstrdup(TextDatumGetCString(slot_getattr(slot, 1, &isnull))); + Assert(!isnull); + lrel->atttyps[natt] = DatumGetObjectId(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + if (DatumGetBool(slot_getattr(slot, 4, &isnull))) + lrel->attkeys = bms_add_member(lrel->attkeys, natt); + + /* Should never happen. */ + if (++natt >= MaxTupleAttributeNumber) + elog(ERROR, "too many columns in remote table \"%s.%s\"", + nspname, relname); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + lrel->natts = natt; + + walrcv_clear_result(res); + pfree(cmd.data); +} + +/* + * Copy existing data of a table from publisher. + * + * Caller is responsible for locking the local relation. + */ +static void +copy_table(Relation rel) +{ + LogicalRepRelMapEntry *relmapentry; + LogicalRepRelation lrel; + WalRcvExecResult *res; + StringInfoData cmd; + CopyState cstate; + List *attnamelist; + + /* Get the publisher relation info. */ + fetch_remote_table_info(get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel), &lrel); + + /* Put the relation into relmap. */ + logicalrep_relmap_update(&lrel); + + /* Map the publisher relation to local one. */ + relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock); + Assert(rel == relmapentry->localrel); + + /* Start copy on the publisher. */ + initStringInfo(&cmd); + appendStringInfo(&cmd, "COPY %s TO STDOUT", + quote_qualified_identifier(lrel.nspname, lrel.relname)); + res = walrcv_exec(wrconn, cmd.data, 0, NULL); + pfree(cmd.data); + if (res->status != WALRCV_OK_COPY_OUT) + ereport(ERROR, + (errmsg("could not start initial contents copy for table \"%s.%s\": %s", + lrel.nspname, lrel.relname, res->err))); + walrcv_clear_result(res); + + copybuf = makeStringInfo(); + + /* Create CopyState for ingestion of the data from publisher. */ + attnamelist = make_copy_attnamelist(relmapentry); + cstate = BeginCopyFrom(NULL, rel, NULL, false, copy_read_data, attnamelist, NIL); + + /* Do the copy */ + (void) CopyFrom(cstate); + + logicalrep_rel_close(relmapentry, NoLock); +} + +/* + * Start syncing the table in the sync worker. + * + * The returned slot name is palloced in current memory context. + */ +char * +LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) +{ + char *slotname; + char *err; + + /* Check the state of the table synchronization. 
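copy_table() above uses the callback form of BeginCopyFrom() that this patch introduces: instead of reading a file or program output, COPY pulls its input from a function with the same shape as copy_read_data(). A minimal sketch of such a callback feeding COPY from an in-memory buffer; the buffer contents and names are invented, and the buffering is deliberately simplistic:

#include "postgres.h"

#include "commands/copy.h"

/* Illustrative COPY data source: hand out bytes from a static text buffer. */
static const char *src_data = "1\tone\n2\ttwo\n";
static int	src_off = 0;

static int
buffer_read_data(void *outbuf, int minread, int maxread)
{
	int			avail = (int) strlen(src_data) - src_off;
	int			nbytes = Min(avail, maxread);

	if (nbytes > 0)
	{
		memcpy(outbuf, src_data + src_off, nbytes);
		src_off += nbytes;
	}

	return nbytes;				/* eventually 0, ending the COPY input */
}

/*
 * Used the same way copy_table() uses copy_read_data():
 *
 *	cstate = BeginCopyFrom(NULL, rel, NULL, false, buffer_read_data, NIL, NIL);
 *	(void) CopyFrom(cstate);
 */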
*/ + StartTransactionCommand(); + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = + GetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + &MyLogicalRepWorker->relstate_lsn, + false); + SpinLockRelease(&MyLogicalRepWorker->relmutex); + CommitTransactionCommand(); + + /* + * To build a slot name for the sync work, we are limited to NAMEDATALEN - + * 1 characters. We cut the original slot name to NAMEDATALEN - 28 chars + * and append _%u_sync_%u (1 + 10 + 6 + 10 + '\0'). (It's actually the + * NAMEDATALEN on the remote that matters, but this scheme will also work + * reasonably if that is different.) + */ + StaticAssertStmt(NAMEDATALEN >= 32, "NAMEDATALEN too small"); /* for sanity */ + slotname = psprintf("%.*s_%u_sync_%u", + NAMEDATALEN - 28, + MySubscription->slotname, + MySubscription->oid, + MyLogicalRepWorker->relid); + + wrconn = walrcv_connect(MySubscription->conninfo, true, slotname, &err); + if (wrconn == NULL) + ereport(ERROR, + (errmsg("could not connect to the publisher: %s", err))); + + switch (MyLogicalRepWorker->relstate) + { + case SUBREL_STATE_INIT: + case SUBREL_STATE_DATASYNC: + { + Relation rel; + WalRcvExecResult *res; + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_DATASYNC; + MyLogicalRepWorker->relstate_lsn = InvalidXLogRecPtr; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* Update the state and make it visible to others. */ + StartTransactionCommand(); + SetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + CommitTransactionCommand(); + + /* + * We want to do the table data sync in single + * transaction. + */ + StartTransactionCommand(); + + /* + * Use standard write lock here. It might be better to + * disallow access to table while it's being synchronized. + * But we don't want to block the main apply process from + * working and it has to open relation in RowExclusiveLock + * when remapping remote relation id to local one. + */ + rel = heap_open(MyLogicalRepWorker->relid, RowExclusiveLock); + + /* + * Create temporary slot for the sync process. + * We do this inside transaction so that we can use the + * snapshot made by the slot to get existing data. + */ + res = walrcv_exec(wrconn, + "BEGIN READ ONLY ISOLATION LEVEL " + "REPEATABLE READ", 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errmsg("table copy could not start transaction on publisher"), + errdetail("The error was: %s", res->err))); + walrcv_clear_result(res); + + /* + * Create new temporary logical decoding slot. + * + * We'll use slot for data copy so make sure the snapshot + * is used for the transaction, that way the COPY will get + * data that is consistent with the lsn used by the slot + * to start decoding. + */ + walrcv_create_slot(wrconn, slotname, true, + CRS_USE_SNAPSHOT, origin_startpos); + + copy_table(rel); + + res = walrcv_exec(wrconn, "COMMIT", 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errmsg("table copy could not finish transaction on publisher"), + errdetail("The error was: %s", res->err))); + walrcv_clear_result(res); + + heap_close(rel, NoLock); + + /* Make the copy visible. */ + CommandCounterIncrement(); + + /* + * We are done with the initial data synchronization, + * update the state. 
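The slot-name scheme described above is easiest to verify with the worst case in hand. A small check, assuming the default NAMEDATALEN of 64; the example subscription slot name is invented:

#include "postgres.h"

#include "utils/psprintf.h"

/*
 * Worst case: 36 characters of (possibly truncated) subscription slot name,
 * plus "_" (1), a ten-digit subscription OID, "_sync_" (6) and a ten-digit
 * relation OID = 63 characters, leaving room for the terminating '\0'.
 */
static void
check_sync_slot_name_length(void)
{
	char	   *name;

	name = psprintf("%.*s_%u_sync_%u",
					NAMEDATALEN - 28,
					"a_subscription_slot_name_that_gets_truncated_here",
					4294967295U, 4294967295U);

	Assert(strlen(name) <= NAMEDATALEN - 1);

	pfree(name);
}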
+ */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCWAIT; + MyLogicalRepWorker->relstate_lsn = *origin_startpos; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * Wait for main apply worker to either tell us to + * catchup or that we are done. + */ + wait_for_sync_status_change(MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate); + if (MyLogicalRepWorker->relstate != SUBREL_STATE_CATCHUP) + { + /* Update the new state. */ + SetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + finish_sync_worker(); + } + break; + } + case SUBREL_STATE_SYNCDONE: + case SUBREL_STATE_READY: + /* Nothing to do here but finish. */ + finish_sync_worker(); + break; + default: + elog(ERROR, "unknown relation state \"%c\"", + MyLogicalRepWorker->relstate); + } + + return slotname; +} diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index c3e54af259..bbf3506be0 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -32,6 +32,7 @@ #include "catalog/namespace.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "commands/trigger.h" @@ -101,7 +102,7 @@ typedef struct SlotErrCallbackArg } SlotErrCallbackArg; static MemoryContext ApplyContext = NULL; -static MemoryContext ApplyCacheContext = NULL; +MemoryContext ApplyCacheContext = NULL; WalReceiverConn *wrconn = NULL; @@ -109,6 +110,7 @@ Subscription *MySubscription = NULL; bool MySubscriptionValid = false; bool in_remote_transaction = false; +static XLogRecPtr remote_final_lsn = InvalidXLogRecPtr; static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); @@ -116,6 +118,30 @@ static void store_flush_position(XLogRecPtr remote_lsn); static void reread_subscription(void); +/* + * Should this worker apply changes for given relation. + * + * This is mainly needed for initial relation data sync as that runs in + * separate worker process running in parallel and we need some way to skip + * changes coming to the main apply worker during the sync of a table. + * + * Note we need to do smaller or equals comparison for SYNCDONE state because + * it might hold position of end of intitial slot consistent point WAL + * record + 1 (ie start of next record) and next record can be COMMIT of + * transaction we are now processing (which is what we set remote_final_lsn + * to in apply_handle_begin). + */ +static bool +should_apply_changes_for_rel(LogicalRepRelMapEntry *rel) +{ + if (am_tablesync_worker()) + return MyLogicalRepWorker->relid == rel->localreloid; + else + return (rel->state == SUBREL_STATE_READY || + (rel->state == SUBREL_STATE_SYNCDONE && + rel->statelsn <= remote_final_lsn)); +} + /* * Make sure that we started local transaction. * @@ -398,6 +424,8 @@ apply_handle_begin(StringInfo s) replorigin_session_origin_timestamp = begin_data.committime; replorigin_session_origin_lsn = begin_data.final_lsn; + remote_final_lsn = begin_data.final_lsn; + in_remote_transaction = true; pgstat_report_activity(STATE_RUNNING, NULL); @@ -418,7 +446,10 @@ apply_handle_commit(StringInfo s) Assert(commit_data.commit_lsn == replorigin_session_origin_lsn); Assert(commit_data.committime == replorigin_session_origin_timestamp); - if (IsTransactionState()) + Assert(commit_data.commit_lsn == remote_final_lsn); + + /* The synchronization worker runs in single transaction. 
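The "smaller or equals" note above is clearer with concrete numbers (hypothetical LSNs, purely for illustration):

/*
 * Suppose the sync worker's temporary slot became consistent at a record
 * whose end position is 0/3000060; statelsn is therefore stored as
 * 0/3000060, i.e. the start of the next record.  If that next record is
 * the COMMIT whose transaction the apply worker is currently processing,
 * remote_final_lsn is also 0/3000060 and the change still has to be
 * applied here, which is why the test is <= rather than <.
 */
XLogRecPtr	statelsn = 0x3000060;
XLogRecPtr	remote_final_lsn = 0x3000060;

Assert(statelsn <= remote_final_lsn);	/* '<' alone would wrongly skip it */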
*/ + if (IsTransactionState() && !am_tablesync_worker()) { CommitTransactionCommand(); @@ -427,6 +458,9 @@ apply_handle_commit(StringInfo s) in_remote_transaction = false; + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + pgstat_report_activity(STATE_IDLE, NULL); } @@ -442,7 +476,8 @@ apply_handle_origin(StringInfo s) * ORIGIN message can only come inside remote transaction and before * any actual writes. */ - if (!in_remote_transaction || IsTransactionState()) + if (!in_remote_transaction || + (IsTransactionState() && !am_tablesync_worker())) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("ORIGIN message sent out of order"))); @@ -515,6 +550,15 @@ apply_handle_insert(StringInfo s) relid = logicalrep_read_insert(s, &newtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Initialize the executor state. */ estate = create_estate_for_relation(rel); @@ -607,6 +651,15 @@ apply_handle_update(StringInfo s) relid = logicalrep_read_update(s, &has_oldtup, &oldtup, &newtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Check if we can do the update. */ check_relation_updatable(rel); @@ -716,6 +769,15 @@ apply_handle_delete(StringInfo s) relid = logicalrep_read_delete(s, &oldtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Check if we can do the delete. */ check_relation_updatable(rel); @@ -927,10 +989,8 @@ UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply) * Apply main loop. */ static void -ApplyLoop(void) +LogicalRepApplyLoop(XLogRecPtr last_received) { - XLogRecPtr last_received = InvalidXLogRecPtr; - /* Init the ApplyContext which we use for easier cleanup. */ ApplyContext = AllocSetContextCreate(TopMemoryContext, "ApplyContext", @@ -1014,15 +1074,18 @@ ApplyLoop(void) } else if (c == 'k') { - XLogRecPtr endpos; + XLogRecPtr end_lsn; TimestampTz timestamp; bool reply_requested; - endpos = pq_getmsgint64(&s); + end_lsn = pq_getmsgint64(&s); timestamp = pq_getmsgint64(&s); reply_requested = pq_getmsgbyte(&s); - send_feedback(endpos, reply_requested, false); + if (last_received < end_lsn) + last_received = end_lsn; + + send_feedback(last_received, reply_requested, false); UpdateWorkerStats(last_received, timestamp, true); } /* other message types are purposefully ignored */ @@ -1030,6 +1093,9 @@ ApplyLoop(void) len = walrcv_receive(wrconn, &buf, &fd); } + + /* confirm all writes at once */ + send_feedback(last_received, false, false); } if (!in_remote_transaction) @@ -1038,15 +1104,13 @@ ApplyLoop(void) * If we didn't get any transactions for a while there might be * unconsumed invalidation messages in the queue, consume them now. 
*/ - StartTransactionCommand(); - /* Check for subscription change */ + AcceptInvalidationMessages(); if (!MySubscriptionValid) reread_subscription(); - CommitTransactionCommand(); - } - /* confirm all writes at once */ - send_feedback(last_received, false, false); + /* Process any table synchronization changes. */ + process_syncing_tables(last_received); + } /* Cleanup the memory. */ MemoryContextResetAndDeleteChildren(ApplyContext); @@ -1054,7 +1118,11 @@ ApplyLoop(void) /* Check if we need to exit the streaming loop. */ if (endofstream) + { + TimeLineID tli; + walrcv_endstreaming(wrconn, &tli); break; + } /* * Wait for more data or latch. @@ -1222,6 +1290,14 @@ reread_subscription(void) { MemoryContext oldctx; Subscription *newsub; + bool started_tx = false; + + /* This function might be called inside or outside of transaction. */ + if (!IsTransactionState()) + { + StartTransactionCommand(); + started_tx = true; + } /* Ensure allocations in permanent context. */ oldctx = MemoryContextSwitchTo(ApplyCacheContext); @@ -1319,6 +1395,9 @@ reread_subscription(void) MemoryContextSwitchTo(oldctx); + if (started_tx) + CommitTransactionCommand(); + MySubscriptionValid = true; } @@ -1339,11 +1418,8 @@ ApplyWorkerMain(Datum main_arg) int worker_slot = DatumGetObjectId(main_arg); MemoryContext oldctx; char originname[NAMEDATALEN]; - RepOriginId originid; XLogRecPtr origin_startpos; - char *err; - int server_version; - TimeLineID startpointTLI; + char *myslotname; WalRcvStreamOptions options; /* Attach to slot */ @@ -1402,49 +1478,90 @@ ApplyWorkerMain(Datum main_arg) subscription_change_cb, (Datum) 0); - ereport(LOG, - (errmsg("logical replication apply for subscription \"%s\" has started", - MySubscription->name))); - - /* Setup replication origin tracking. */ - snprintf(originname, sizeof(originname), "pg_%u", MySubscription->oid); - originid = replorigin_by_name(originname, true); - if (!OidIsValid(originid)) - originid = replorigin_create(originname); - replorigin_session_setup(originid); - replorigin_session_origin = originid; - origin_startpos = replorigin_session_get_progress(false); + if (am_tablesync_worker()) + elog(LOG, "logical replication sync for subscription %s, table %s started", + MySubscription->name, get_rel_name(MyLogicalRepWorker->relid)); + else + elog(LOG, "logical replication apply for subscription %s started", + MySubscription->name); CommitTransactionCommand(); /* Connect to the origin and start the replication. */ elog(DEBUG1, "connecting to publisher using connection string \"%s\"", MySubscription->conninfo); - wrconn = walrcv_connect(MySubscription->conninfo, true, - MySubscription->name, &err); - if (wrconn == NULL) - ereport(ERROR, - (errmsg("could not connect to the publisher: %s", err))); + + if (am_tablesync_worker()) + { + char *syncslotname; + + /* This is table synchroniation worker, call initial sync. */ + syncslotname = LogicalRepSyncTableStart(&origin_startpos); + + /* The slot name needs to be allocated in permanent memory context. */ + oldctx = MemoryContextSwitchTo(ApplyCacheContext); + myslotname = pstrdup(syncslotname); + MemoryContextSwitchTo(oldctx); + + pfree(syncslotname); + } + else + { + /* This is main apply worker */ + RepOriginId originid; + TimeLineID startpointTLI; + char *err; + int server_version; + + myslotname = MySubscription->slotname; + + /* Setup replication origin tracking. 
*/ + StartTransactionCommand(); + snprintf(originname, sizeof(originname), "pg_%u", MySubscription->oid); + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + originid = replorigin_create(originname); + replorigin_session_setup(originid); + replorigin_session_origin = originid; + origin_startpos = replorigin_session_get_progress(false); + CommitTransactionCommand(); + + wrconn = walrcv_connect(MySubscription->conninfo, true, myslotname, + &err); + if (wrconn == NULL) + ereport(ERROR, + (errmsg("could not connect to the publisher: %s", err))); + + /* + * We don't really use the output identify_system for anything + * but it does some initializations on the upstream so let's still + * call it. + */ + (void) walrcv_identify_system(wrconn, &startpointTLI, + &server_version); + + } /* - * We don't really use the output identify_system for anything - * but it does some initializations on the upstream so let's still - * call it. + * Setup callback for syscache so that we know when something + * changes in the subscription relation state. */ - (void) walrcv_identify_system(wrconn, &startpointTLI, &server_version); + CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, + invalidate_syncing_table_states, + (Datum) 0); /* Build logical replication streaming options. */ options.logical = true; options.startpoint = origin_startpos; - options.slotname = MySubscription->slotname; + options.slotname = myslotname; options.proto.logical.proto_version = LOGICALREP_PROTO_VERSION_NUM; options.proto.logical.publication_names = MySubscription->publications; - /* Start streaming from the slot. */ + /* Start normal logical streaming replication. */ walrcv_startstreaming(wrconn, &options); /* Run the main loop. */ - ApplyLoop(); + LogicalRepApplyLoop(origin_startpos); walrcv_disconnect(wrconn); diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index f1e43bc9f3..ec047c827c 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -25,6 +25,8 @@ /* Result of the parsing is returned here */ Node *replication_parse_result; +static SQLCmd *make_sqlcmd(void); + /* * Bison doesn't allocate anything that needs to live across parser calls, @@ -57,6 +59,7 @@ Node *replication_parse_result; %token SCONST IDENT %token UCONST %token RECPTR +%token T_WORD /* Keyword tokens. */ %token K_BASE_BACKUP @@ -81,11 +84,12 @@ Node *replication_parse_result; %token K_TEMPORARY %token K_EXPORT_SNAPSHOT %token K_NOEXPORT_SNAPSHOT +%token K_USE_SNAPSHOT %type command %type base_backup start_replication start_logical_replication create_replication_slot drop_replication_slot identify_system - timeline_history show + timeline_history show sql_cmd %type base_backup_opt_list %type base_backup_opt %type opt_timeline @@ -118,6 +122,7 @@ command: | drop_replication_slot | timeline_history | show + | sql_cmd ; /* @@ -248,6 +253,11 @@ create_slot_opt: $$ = makeDefElem("export_snapshot", (Node *)makeInteger(FALSE), -1); } + | K_USE_SNAPSHOT + { + $$ = makeDefElem("use_snapshot", + (Node *)makeInteger(TRUE), -1); + } | K_RESERVE_WAL { $$ = makeDefElem("reserve_wal", @@ -373,6 +383,26 @@ plugin_opt_arg: SCONST { $$ = (Node *) makeString($1); } | /* EMPTY */ { $$ = NULL; } ; + +sql_cmd: + IDENT { $$ = (Node *) make_sqlcmd(); } + ; %% +static SQLCmd * +make_sqlcmd(void) +{ + SQLCmd *cmd = makeNode(SQLCmd); + int tok; + + /* Just move lexer to the end of command. 
*/ + for (;;) + { + tok = yylex(); + if (tok == ';' || tok == 0) + break; + } + return cmd; +} + #include "repl_scanner.c" diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index f56d41d59c..52ae7b343f 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -102,6 +102,7 @@ SLOT { return K_SLOT; } TEMPORARY { return K_TEMPORARY; } EXPORT_SNAPSHOT { return K_EXPORT_SNAPSHOT; } NOEXPORT_SNAPSHOT { return K_NOEXPORT_SNAPSHOT; } +USE_SNAPSHOT { return K_USE_SNAPSHOT; } "," { return ','; } ";" { return ';'; } @@ -180,9 +181,7 @@ NOEXPORT_SNAPSHOT { return K_NOEXPORT_SNAPSHOT; } } . { - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error: unexpected character \"%s\"", yytext))); + return T_WORD; } %% diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 75617709ec..c6ba916c49 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -753,7 +753,7 @@ logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int req static void parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd, bool *reserve_wal, - bool *export_snapshot) + CRSSnapshotAction *snapshot_action) { ListCell *lc; bool snapshot_action_given = false; @@ -772,7 +772,18 @@ parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd, errmsg("conflicting or redundant options"))); snapshot_action_given = true; - *export_snapshot = defGetBoolean(defel); + *snapshot_action = defGetBoolean(defel) ? CRS_EXPORT_SNAPSHOT : + CRS_NOEXPORT_SNAPSHOT; + } + else if (strcmp(defel->defname, "use_snapshot") == 0) + { + if (snapshot_action_given || cmd->kind != REPLICATION_KIND_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + snapshot_action_given = true; + *snapshot_action = CRS_USE_SNAPSHOT; } else if (strcmp(defel->defname, "reserve_wal") == 0) { @@ -799,7 +810,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) char xpos[MAXFNAMELEN]; char *slot_name; bool reserve_wal = false; - bool export_snapshot = true; + CRSSnapshotAction snapshot_action = CRS_EXPORT_SNAPSHOT; DestReceiver *dest; TupOutputState *tstate; TupleDesc tupdesc; @@ -808,7 +819,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) Assert(!MyReplicationSlot); - parseCreateReplSlotOptions(cmd, &reserve_wal, &export_snapshot); + parseCreateReplSlotOptions(cmd, &reserve_wal, &snapshot_action); /* setup state for XLogReadPage */ sendTimeLineIsHistoric = false; @@ -838,6 +849,40 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) { LogicalDecodingContext *ctx; + /* + * Do options check early so that we can bail before calling the + * DecodingContextFindStartpoint which can take long time. + */ + if (snapshot_action == CRS_EXPORT_SNAPSHOT) + { + if (IsTransactionBlock()) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... EXPORT_SNAPSHOT " + "must not be called inside a transaction"))); + } + else if (snapshot_action == CRS_USE_SNAPSHOT) + { + if (!IsTransactionBlock()) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must be called inside a transaction"))); + + if (XactIsoLevel != XACT_REPEATABLE_READ) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must be called in REPEATABLE READ isolation mode transaction"))); + + if (FirstSnapshotSet) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... 
USE_SNAPSHOT " + "must be called before any query"))); + + if (IsSubTransaction()) + ereport(ERROR, + (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must not be called in a subtransaction"))); + } + ctx = CreateInitDecodingContext(cmd->plugin, NIL, logical_read_xlog_page, WalSndPrepareWrite, WalSndWriteData); @@ -855,13 +900,22 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) DecodingContextFindStartpoint(ctx); /* - * Export the snapshot if we've been asked to do so. + * Export or use the snapshot if we've been asked to do so. * * NB. We will convert the snapbuild.c kind of snapshot to normal * snapshot when doing this. */ - if (export_snapshot) + if (snapshot_action == CRS_EXPORT_SNAPSHOT) + { snapshot_name = SnapBuildExportSnapshot(ctx->snapshot_builder); + } + else if (snapshot_action == CRS_USE_SNAPSHOT) + { + Snapshot snap; + + snap = SnapBuildInitalSnapshot(ctx->snapshot_builder); + RestoreTransactionSnapshot(snap, MyProc); + } /* don't need the decoding context anymore */ FreeDecodingContext(ctx); @@ -1277,8 +1331,11 @@ WalSndWaitForWal(XLogRecPtr loc) /* * Execute an incoming replication command. + * + * Returns true if the cmd_string was recognized as WalSender command, false + * if not. */ -void +bool exec_replication_command(const char *cmd_string) { int parse_rc; @@ -1317,6 +1374,25 @@ exec_replication_command(const char *cmd_string) cmd_node = replication_parse_result; + /* + * CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot. If it was + * called outside of transaction the snapshot should be cleared here. + */ + if (!IsTransactionBlock()) + SnapBuildClearExportedSnapshot(); + + /* + * For aborted transactions, don't allow anything except pure SQL, + * the exec_simple_query() will handle it correctly. + */ + if (IsAbortedTransactionBlockState() && !IsA(cmd_node, SQLCmd)) + ereport(ERROR, + (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), + errmsg("current transaction is aborted, " + "commands ignored until end of transaction block"))); + + CHECK_FOR_INTERRUPTS(); + /* * Allocate buffers that will be used for each outgoing and incoming * message. We do this just once per command to reduce palloc overhead. @@ -1332,6 +1408,7 @@ exec_replication_command(const char *cmd_string) break; case T_BaseBackupCmd: + PreventTransactionChain(true, "BASE_BACKUP"); SendBaseBackup((BaseBackupCmd *) cmd_node); break; @@ -1347,6 +1424,8 @@ exec_replication_command(const char *cmd_string) { StartReplicationCmd *cmd = (StartReplicationCmd *) cmd_node; + PreventTransactionChain(true, "START_REPLICATION"); + if (cmd->kind == REPLICATION_KIND_PHYSICAL) StartReplication(cmd); else @@ -1355,6 +1434,7 @@ exec_replication_command(const char *cmd_string) } case T_TimeLineHistoryCmd: + PreventTransactionChain(true, "TIMELINE_HISTORY"); SendTimeLineHistory((TimeLineHistoryCmd *) cmd_node); break; @@ -1367,6 +1447,14 @@ exec_replication_command(const char *cmd_string) } break; + case T_SQLCmd: + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errmsg("not connected to database"))); + + /* Tell the caller that this wasn't a WalSender command. 
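Taken together, these walsender changes let a client that connects with replication=database mix replication commands with plain SQL, and let it create a slot whose snapshot is installed directly in the current transaction. A minimal libpq sketch of the sequence the table-sync worker effectively performs; connection string, slot name and table name are invented, and error handling is mostly omitted:

#include <stdio.h>
#include <stdlib.h>

#include "libpq-fe.h"

int
main(void)
{
	PGconn	   *conn;
	char	   *buf;
	int			len;

	/* "replication=database" gives a walsender attached to a database. */
	conn = PQconnectdb("dbname=postgres replication=database");
	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
		return 1;
	}

	/* USE_SNAPSHOT requires an open REPEATABLE READ transaction. */
	PQclear(PQexec(conn, "BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ"));

	/* Create a temporary logical slot and adopt its snapshot. */
	PQclear(PQexec(conn,
				   "CREATE_REPLICATION_SLOT tmp_sync_slot TEMPORARY "
				   "LOGICAL pgoutput USE_SNAPSHOT"));

	/* Plain SQL is now dispatched to exec_simple_query() ... */
	PQclear(PQexec(conn, "SELECT count(*) FROM pg_catalog.pg_class"));

	/* ... as is COPY, which sees data consistent with the slot's start point. */
	PQclear(PQexec(conn, "COPY public.mytable TO STDOUT"));
	while ((len = PQgetCopyData(conn, &buf, 0)) > 0)
	{
		fwrite(buf, 1, len, stdout);
		PQfreemem(buf);
	}
	PQclear(PQgetResult(conn));		/* consume COPY's final result */

	PQclear(PQexec(conn, "COMMIT"));
	PQfinish(conn);
	return 0;
}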
*/ + return false; + default: elog(ERROR, "unrecognized replication command node tag: %u", cmd_node->type); @@ -1378,6 +1466,8 @@ exec_replication_command(const char *cmd_string) /* Send CommandComplete message */ EndCommand("SELECT", DestRemote); + + return true; } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b07d6c6cb9..ba41f90712 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -4061,7 +4061,10 @@ PostgresMain(int argc, char *argv[], pq_getmsgend(&input_message); if (am_walsender) - exec_replication_command(query_string); + { + if (!exec_replication_command(query_string)) + exec_simple_query(query_string); + } else exec_simple_query(query_string); diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c index 1ec7f32470..7dcecb2f0f 100644 --- a/src/backend/utils/adt/misc.c +++ b/src/backend/utils/adt/misc.c @@ -982,3 +982,23 @@ pg_current_logfile_1arg(PG_FUNCTION_ARGS) { return pg_current_logfile(fcinfo); } + +/* + * SQL wrapper around RelationGetReplicaIndex(). + */ +Datum +pg_get_replica_identity_index(PG_FUNCTION_ARGS) +{ + Oid reloid = PG_GETARG_OID(0); + Oid idxoid; + Relation rel; + + rel = heap_open(reloid, AccessShareLock); + idxoid = RelationGetReplicaIndex(rel); + heap_close(rel, AccessShareLock); + + if (OidIsValid(idxoid)) + PG_RETURN_OID(idxoid); + else + PG_RETURN_NULL(); +} diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index b1c0b4b1be..d5a376406f 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -62,6 +62,7 @@ #include "catalog/pg_replication_origin.h" #include "catalog/pg_statistic.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" #include "catalog/pg_ts_config.h" @@ -693,7 +694,7 @@ static const struct cachedesc cacheinfo[] = { 64 }, {PublicationRelRelationId, /* PUBLICATIONRELMAP */ - PublicationRelMapIndexId, + PublicationRelPrrelidPrpubidIndexId, 2, { Anum_pg_publication_rel_prrelid, @@ -758,6 +759,17 @@ static const struct cachedesc cacheinfo[] = { }, 4 }, + {SubscriptionRelRelationId, /* SUBSCRIPTIONRELMAP */ + SubscriptionRelSrrelidSrsubidIndexId, + 2, + { + Anum_pg_subscription_rel_srrelid, + Anum_pg_subscription_rel_srsubid, + 0, + 0 + }, + 64 + }, {TableSpaceRelationId, /* TABLESPACEOID */ TablespaceOidIndexId, 1, diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 4feb26aa7a..291bf7631d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2497,6 +2497,18 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_sync_workers_per_subscription", + PGC_SIGHUP, + RESOURCES_ASYNCHRONOUS, + gettext_noop("Maximum number of table synchronization workers per subscription."), + NULL, + }, + &max_sync_workers_per_subscription, + 2, 0, MAX_BACKENDS, + NULL, NULL, NULL + }, + { {"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Automatic log file rotation will occur after N minutes."), diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index 610bed531c..98bc1a586a 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -155,7 +155,7 @@ typedef struct _dumpOptions int use_setsessauth; int enable_row_security; int include_subscriptions; - int no_create_subscription_slots; + int no_subscription_connect; /* default, if no "inclusion" switches appear, is to dump everything */ bool 
include_everything; diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 2b5a52656c..a98747d89a 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -351,8 +351,8 @@ main(int argc, char **argv) {"snapshot", required_argument, NULL, 6}, {"strict-names", no_argument, &strict_names, 1}, {"use-set-session-authorization", no_argument, &dopt.use_setsessauth, 1}, - {"no-create-subscription-slots", no_argument, &dopt.no_create_subscription_slots, 1}, {"no-security-labels", no_argument, &dopt.no_security_labels, 1}, + {"no-subscription-connect", no_argument, &dopt.no_subscription_connect, 1}, {"no-synchronized-snapshots", no_argument, &dopt.no_synchronized_snapshots, 1}, {"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1}, {"no-sync", no_argument, NULL, 7}, @@ -951,9 +951,8 @@ help(const char *progname) printf(_(" --if-exists use IF EXISTS when dropping objects\n")); printf(_(" --include-subscriptions dump logical replication subscriptions\n")); printf(_(" --inserts dump data as INSERT commands, rather than COPY\n")); - printf(_(" --no-create-subscription-slots\n" - " do not create replication slots for subscriptions\n")); printf(_(" --no-security-labels do not dump security label assignments\n")); + printf(_(" --no-subscription-connect dump subscriptions so they don't connect on restore\n")); printf(_(" --no-synchronized-snapshots do not use synchronized snapshots in parallel jobs\n")); printf(_(" --no-tablespaces do not dump tablespace assignments\n")); printf(_(" --no-unlogged-table-data do not dump unlogged table data\n")); @@ -3774,8 +3773,8 @@ dumpSubscription(Archive *fout, SubscriptionInfo *subinfo) appendPQExpBufferStr(query, ", SLOT NAME = "); appendStringLiteralAH(query, subinfo->subslotname, fout); - if (dopt->no_create_subscription_slots) - appendPQExpBufferStr(query, ", NOCREATE SLOT"); + if (dopt->no_subscription_connect) + appendPQExpBufferStr(query, ", NOCONNECT"); appendPQExpBufferStr(query, ");\n"); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index a46dcdbcd7..021f4bf081 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -4224,7 +4224,7 @@ qr/CREATE TRANSFORM FOR integer LANGUAGE sql \(FROM SQL WITH FUNCTION pg_catalog create_order => 50, create_sql => 'CREATE SUBSCRIPTION sub1 CONNECTION \'dbname=doesnotexist\' PUBLICATION pub1 - WITH (DISABLED, NOCREATE SLOT);', + WITH (DISABLED, NOCONNECT);', regexp => qr/^ \QCREATE SUBSCRIPTION sub1 CONNECTION 'dbname=doesnotexist' PUBLICATION pub1 WITH (DISABLED, SLOT NAME = 'sub1');\E /xm, diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 315f155b64..d8679f5f59 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201703221 +#define CATALOG_VERSION_NO 201703231 #endif diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h index 6bce7328a2..5d4190c05e 100644 --- a/src/include/catalog/indexing.h +++ b/src/include/catalog/indexing.h @@ -340,8 +340,8 @@ DECLARE_UNIQUE_INDEX(pg_publication_pubname_index, 6111, on pg_publication using DECLARE_UNIQUE_INDEX(pg_publication_rel_oid_index, 6112, on pg_publication_rel using btree(oid oid_ops)); #define PublicationRelObjectIndexId 6112 -DECLARE_UNIQUE_INDEX(pg_publication_rel_map_index, 6113, on pg_publication_rel using btree(prrelid oid_ops, prpubid oid_ops)); -#define PublicationRelMapIndexId 6113 
+DECLARE_UNIQUE_INDEX(pg_publication_rel_prrelid_prpubid_index, 6113, on pg_publication_rel using btree(prrelid oid_ops, prpubid oid_ops)); +#define PublicationRelPrrelidPrpubidIndexId 6113 DECLARE_UNIQUE_INDEX(pg_subscription_oid_index, 6114, on pg_subscription using btree(oid oid_ops)); #define SubscriptionObjectIndexId 6114 @@ -349,6 +349,9 @@ DECLARE_UNIQUE_INDEX(pg_subscription_oid_index, 6114, on pg_subscription using b DECLARE_UNIQUE_INDEX(pg_subscription_subname_index, 6115, on pg_subscription using btree(subdbid oid_ops, subname name_ops)); #define SubscriptionNameIndexId 6115 +DECLARE_UNIQUE_INDEX(pg_subscription_rel_srrelid_srsubid_index, 6117, on pg_subscription_rel using btree(srrelid oid_ops, srsubid oid_ops)); +#define SubscriptionRelSrrelidSrsubidIndexId 6117 + /* last step of initialization script: build the indexes declared above */ BUILD_INDICES diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 22635655f5..78c23e3f5d 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2021,6 +2021,9 @@ DESCR("is a relation insertable/updatable/deletable"); DATA(insert OID = 3843 ( pg_column_is_updatable PGNSP PGUID 12 10 0 0 0 f f f f t f s s 3 0 16 "2205 21 16" _null_ _null_ _null_ _null_ _null_ pg_column_is_updatable _null_ _null_ _null_ )); DESCR("is a column updatable"); +DATA(insert OID = 6120 ( pg_get_replica_identity_index PGNSP PGUID 12 10 0 0 0 f f f f t f s s 1 0 2205 "2205" _null_ _null_ _null_ _null_ _null_ pg_get_replica_identity_index _null_ _null_ _null_ )); +DESCR("oid of replica identity index if any"); + /* Deferrable unique constraint trigger */ DATA(insert OID = 1250 ( unique_key_recheck PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ unique_key_recheck _null_ _null_ _null_ )); DESCR("deferred UNIQUE constraint check"); @@ -2805,7 +2808,7 @@ DATA(insert OID = 3099 ( pg_stat_get_wal_senders PGNSP PGUID 12 1 10 0 0 f f f DESCR("statistics: information about currently active replication"); DATA(insert OID = 3317 ( pg_stat_get_wal_receiver PGNSP PGUID 12 1 0 0 0 f f f f f f s r 0 0 2249 "" "{23,25,3220,23,3220,23,1184,1184,3220,1184,25,25}" "{o,o,o,o,o,o,o,o,o,o,o,o}" "{pid,status,receive_start_lsn,receive_start_tli,received_lsn,received_tli,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time,slot_name,conninfo}" _null_ _null_ pg_stat_get_wal_receiver _null_ _null_ _null_ )); DESCR("statistics: information about WAL receiver"); -DATA(insert OID = 6118 ( pg_stat_get_subscription PGNSP PGUID 12 1 0 0 0 f f f f f f s r 1 0 2249 "26" "{26,26,23,3220,1184,1184,3220,1184}" "{i,o,o,o,o,o,o,o}" "{subid,subid,pid,received_lsn,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time}" _null_ _null_ pg_stat_get_subscription _null_ _null_ _null_ )); +DATA(insert OID = 6118 ( pg_stat_get_subscription PGNSP PGUID 12 1 0 0 0 f f f f f f s r 1 0 2249 "26" "{26,26,26,23,3220,1184,1184,3220,1184}" "{i,o,o,o,o,o,o,o,o}" "{subid,subid,relid,pid,received_lsn,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time}" _null_ _null_ pg_stat_get_subscription _null_ _null_ _null_ )); DESCR("statistics: information about subscription"); DATA(insert OID = 2026 ( pg_backend_pid PGNSP PGUID 12 1 0 0 0 f f f f t f s r 0 0 23 "" _null_ _null_ _null_ _null_ _null_ pg_backend_pid _null_ _null_ _null_ )); DESCR("statistics: current backend PID"); diff --git a/src/include/catalog/pg_subscription_rel.h b/src/include/catalog/pg_subscription_rel.h new file mode 
100644 index 0000000000..129aa99f29 --- /dev/null +++ b/src/include/catalog/pg_subscription_rel.h @@ -0,0 +1,78 @@ +/* ------------------------------------------------------------------------- + * + * pg_subscription_rel.h + * Local info about tables that come from the publisher of a + * subscription (pg_subscription_rel). + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * ------------------------------------------------------------------------- + */ +#ifndef PG_SUBSCRIPTION_REL_H +#define PG_SUBSCRIPTION_REL_H + +#include "catalog/genbki.h" + +/* ---------------- + * pg_subscription_rel definition. cpp turns this into + * typedef struct FormData_pg_subscription_rel + * ---------------- + */ +#define SubscriptionRelRelationId 6102 + +/* Workaround for genbki not knowing about XLogRecPtr */ +#define pg_lsn XLogRecPtr + +CATALOG(pg_subscription_rel,6102) BKI_WITHOUT_OIDS +{ + Oid srsubid; /* Oid of subscription */ + Oid srrelid; /* Oid of relation */ + char srsubstate; /* state of the relation in subscription */ + pg_lsn srsublsn; /* remote lsn of the state change + * used for synchronization coordination */ +} FormData_pg_subscription_rel; + +typedef FormData_pg_subscription_rel *Form_pg_subscription_rel; + +/* ---------------- + * compiler constants for pg_subscription_rel + * ---------------- + */ +#define Natts_pg_subscription_rel 4 +#define Anum_pg_subscription_rel_srsubid 1 +#define Anum_pg_subscription_rel_srrelid 2 +#define Anum_pg_subscription_rel_srsubstate 3 +#define Anum_pg_subscription_rel_srsublsn 4 + +/* ---------------- + * substate constants + * ---------------- + */ +#define SUBREL_STATE_INIT 'i' /* initializing (sublsn NULL) */ +#define SUBREL_STATE_DATASYNC 'd' /* data is being synchronized (sublsn NULL) */ +#define SUBREL_STATE_SYNCDONE 's' /* synchronization finished in front of apply (sublsn set) */ +#define SUBREL_STATE_READY 'r' /* ready (sublsn set) */ + +/* These are never stored in the catalog; we only use them for IPC.
*/ +#define SUBREL_STATE_UNKNOWN '\0' /* unknown state */ +#define SUBREL_STATE_SYNCWAIT 'w' /* waiting for sync */ +#define SUBREL_STATE_CATCHUP 'c' /* catching up with apply */ + +typedef struct SubscriptionRelState +{ + Oid relid; + XLogRecPtr lsn; + char state; +} SubscriptionRelState; + +extern Oid SetSubscriptionRelState(Oid subid, Oid relid, char state, + XLogRecPtr sublsn); +extern char GetSubscriptionRelState(Oid subid, Oid relid, + XLogRecPtr *sublsn, bool missing_ok); +extern void RemoveSubscriptionRel(Oid subid, Oid relid); + +extern List *GetSubscriptionRelations(Oid subid); +extern List *GetSubscriptionNotReadyRelations(Oid subid); + +#endif /* PG_SUBSCRIPTION_REL_H */ diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h index d63ca0f5e9..f081f2219f 100644 --- a/src/include/commands/copy.h +++ b/src/include/commands/copy.h @@ -21,6 +21,7 @@ /* CopyStateData is private in commands/copy.c */ typedef struct CopyStateData *CopyState; +typedef int (*copy_data_source_cb) (void *outbuf, int minread, int maxread); extern void DoCopy(ParseState *state, const CopyStmt *stmt, int stmt_location, int stmt_len, @@ -28,7 +29,7 @@ extern void DoCopy(ParseState *state, const CopyStmt *stmt, extern void ProcessCopyOptions(ParseState *pstate, CopyState cstate, bool is_from, List *options); extern CopyState BeginCopyFrom(ParseState *pstate, Relation rel, const char *filename, - bool is_program, List *attnamelist, List *options); + bool is_program, copy_data_source_cb data_source_cb, List *attnamelist, List *options); extern void EndCopyFrom(CopyState cstate); extern bool NextCopyFrom(CopyState cstate, ExprContext *econtext, Datum *values, bool *nulls, Oid *tupleOid); @@ -36,6 +37,8 @@ extern bool NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields); extern void CopyFromErrorCallback(void *arg); +extern uint64 CopyFrom(CopyState cstate); + extern DestReceiver *CreateCopyDestReceiver(void); #endif /* COPY_H */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 2cbd6d77b8..9a4221a9e7 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -488,6 +488,7 @@ typedef enum NodeTag T_DropReplicationSlotCmd, T_StartReplicationCmd, T_TimeLineHistoryCmd, + T_SQLCmd, /* * TAGS FOR RANDOM OTHER STUFF diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index a15df229a4..582e0e0ebe 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3319,10 +3319,23 @@ typedef struct CreateSubscriptionStmt List *options; /* List of DefElem nodes */ } CreateSubscriptionStmt; +typedef enum AlterSubscriptionType +{ + ALTER_SUBSCRIPTION_OPTIONS, + ALTER_SUBSCRIPTION_CONNECTION, + ALTER_SUBSCRIPTION_PUBLICATION, + ALTER_SUBSCRIPTION_PUBLICATION_REFRESH, + ALTER_SUBSCRIPTION_REFRESH, + ALTER_SUBSCRIPTION_ENABLED +} AlterSubscriptionType; + typedef struct AlterSubscriptionStmt { NodeTag type; + AlterSubscriptionType kind; /* ALTER_SUBSCRIPTION_OPTIONS, etc */ char *subname; /* Name of of the subscription */ + char *conninfo; /* Connection string to publisher */ + List *publication; /* One or more publication to subscribe to */ List *options; /* List of DefElem nodes */ } AlterSubscriptionStmt; diff --git a/src/include/nodes/replnodes.h b/src/include/nodes/replnodes.h index 996da3c02e..92ada41b6d 100644 --- a/src/include/nodes/replnodes.h +++ b/src/include/nodes/replnodes.h @@ -96,4 +96,13 @@ typedef struct TimeLineHistoryCmd TimeLineID timeline; } TimeLineHistoryCmd; +/* ---------------------- + * SQL 
commands + * ---------------------- + */ +typedef struct SQLCmd +{ + NodeTag type; +} SQLCmd; + #endif /* REPLNODES_H */ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 28c4dab258..6cd36c7fe3 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -258,6 +258,7 @@ PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD) PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD) PG_KEYWORD("no", NO, UNRESERVED_KEYWORD) PG_KEYWORD("none", NONE, COL_NAME_KEYWORD) +PG_KEYWORD("norefresh", NOREFRESH, UNRESERVED_KEYWORD) PG_KEYWORD("not", NOT, RESERVED_KEYWORD) PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD) PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD) diff --git a/src/include/pgstat.h b/src/include/pgstat.h index f2daf32e1a..a675242971 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -790,7 +790,9 @@ typedef enum WAIT_EVENT_PARALLEL_FINISH, WAIT_EVENT_PARALLEL_BITMAP_SCAN, WAIT_EVENT_SAFE_SNAPSHOT, - WAIT_EVENT_SYNC_REP + WAIT_EVENT_SYNC_REP, + WAIT_EVENT_LOGICAL_SYNC_DATA, + WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE } WaitEventIPC; /* ---------- diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index fd34964bad..d10dd2c90a 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -31,9 +31,11 @@ typedef struct LogicalDecodingContext /* memory context this is all allocated in */ MemoryContext context; - /* infrastructure pieces */ - XLogReaderState *reader; + /* The associated replication slot */ ReplicationSlot *slot; + + /* infrastructure pieces for decoding */ + XLogReaderState *reader; struct ReorderBuffer *reorder; struct SnapBuild *snapshot_builder; @@ -75,6 +77,7 @@ typedef struct LogicalDecodingContext TransactionId write_xid; } LogicalDecodingContext; + extern void CheckLogicalDecodingRequirements(void); extern LogicalDecodingContext *CreateInitDecodingContext(char *plugin, @@ -92,6 +95,12 @@ extern void DecodingContextFindStartpoint(LogicalDecodingContext *ctx); extern bool DecodingContextReady(LogicalDecodingContext *ctx); extern void FreeDecodingContext(LogicalDecodingContext *ctx); +extern LogicalDecodingContext *CreateCopyDecodingContext( + List *output_plugin_options, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write); +extern List *DecodingContextGetTableList(LogicalDecodingContext *ctx); + extern void LogicalIncreaseXminForSlot(XLogRecPtr lsn, TransactionId xmin); extern void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn); diff --git a/src/include/replication/logicallauncher.h b/src/include/replication/logicallauncher.h index cfe3db10dd..060946a096 100644 --- a/src/include/replication/logicallauncher.h +++ b/src/include/replication/logicallauncher.h @@ -13,6 +13,7 @@ #define LOGICALLAUNCHER_H extern int max_logical_replication_workers; +extern int max_sync_workers_per_subscription; extern void ApplyLauncherRegister(void); extern void ApplyLauncherMain(Datum main_arg); diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index 5e824ae6fc..091a9f91e3 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -59,6 +59,7 @@ extern void FreeSnapshotBuilder(SnapBuild *cache); extern void SnapBuildSnapDecRefcount(Snapshot snap); +extern Snapshot SnapBuildInitalSnapshot(SnapBuild *builder); extern const char *SnapBuildExportSnapshot(SnapBuild *snapstate); extern void SnapBuildClearExportedSnapshot(void); diff --git 
a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index 78e577c89b..fb55c30fa1 100644 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -15,9 +15,12 @@ #include "access/xlog.h" #include "access/xlogdefs.h" #include "fmgr.h" +#include "replication/logicalproto.h" +#include "replication/walsender.h" #include "storage/latch.h" #include "storage/spin.h" #include "pgtime.h" +#include "utils/tuplestore.h" /* user-settable parameters */ extern int wal_receiver_status_interval; @@ -160,6 +163,33 @@ typedef struct struct WalReceiverConn; typedef struct WalReceiverConn WalReceiverConn; +/* + * Status of walreceiver query execution. + * + * We only define statuses that are currently used. + */ +typedef enum +{ + WALRCV_ERROR, /* There was an error while executing the query. */ + WALRCV_OK_COMMAND, /* Query executed a utility or replication command. */ + WALRCV_OK_TUPLES, /* Query returned tuples. */ + WALRCV_OK_COPY_IN, /* Query started COPY FROM. */ + WALRCV_OK_COPY_OUT, /* Query started COPY TO. */ + WALRCV_OK_COPY_BOTH /* Query started COPY BOTH replication protocol. */ +} WalRcvExecStatus; + +/* + * Return value for walrcv_exec; returns the status of the execution and + * tuples, if any. + */ +typedef struct WalRcvExecResult +{ + WalRcvExecStatus status; + char *err; + Tuplestorestate *tuplestore; + TupleDesc tupledesc; +} WalRcvExecResult; + /* libpqwalreceiver hooks */ typedef WalReceiverConn *(*walrcv_connect_fn) (const char *conninfo, bool logical, const char *appname, @@ -183,9 +213,12 @@ typedef void (*walrcv_send_fn) (WalReceiverConn *conn, const char *buffer, int nbytes); typedef char *(*walrcv_create_slot_fn) (WalReceiverConn *conn, const char *slotname, bool temporary, - bool export_snapshot, XLogRecPtr *lsn); -typedef bool (*walrcv_command_fn) (WalReceiverConn *conn, const char *cmd, - char **err); + CRSSnapshotAction snapshot_action, + XLogRecPtr *lsn); +typedef WalRcvExecResult *(*walrcv_exec_fn) (WalReceiverConn *conn, + const char *query, + const int nRetTypes, + const Oid *retTypes); typedef void (*walrcv_disconnect_fn) (WalReceiverConn *conn); typedef struct WalReceiverFunctionsType @@ -200,7 +233,7 @@ typedef struct WalReceiverFunctionsType walrcv_receive_fn walrcv_receive; walrcv_send_fn walrcv_send; walrcv_create_slot_fn walrcv_create_slot; - walrcv_command_fn walrcv_command; + walrcv_exec_fn walrcv_exec; walrcv_disconnect_fn walrcv_disconnect; } WalReceiverFunctionsType; @@ -224,13 +257,31 @@ extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions; WalReceiverFunctions->walrcv_receive(conn, buffer, wait_fd) #define walrcv_send(conn, buffer, nbytes) \ WalReceiverFunctions->walrcv_send(conn, buffer, nbytes) -#define walrcv_create_slot(conn, slotname, temporary, export_snapshot, lsn) \ - WalReceiverFunctions->walrcv_create_slot(conn, slotname, temporary, export_snapshot, lsn) -#define walrcv_command(conn, cmd, err) \ - WalReceiverFunctions->walrcv_command(conn, cmd, err) +#define walrcv_create_slot(conn, slotname, temporary, snapshot_action, lsn) \ + WalReceiverFunctions->walrcv_create_slot(conn, slotname, temporary, snapshot_action, lsn) +#define walrcv_exec(conn, exec, nRetTypes, retTypes) \ + WalReceiverFunctions->walrcv_exec(conn, exec, nRetTypes, retTypes) #define walrcv_disconnect(conn) \ WalReceiverFunctions->walrcv_disconnect(conn) +static inline void +walrcv_clear_result(WalRcvExecResult *walres) +{ + if (!walres) + return; + + if (walres->err) + pfree(walres->err); + + if (walres->tuplestore) +
tuplestore_end(walres->tuplestore); + + if (walres->tupledesc) + FreeTupleDesc(walres->tupledesc); + + pfree(walres); +} + /* prototypes for functions in walreceiver.c */ extern void WalReceiverMain(void) pg_attribute_noreturn(); diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index fe23f6619f..2ca903872e 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -16,6 +16,16 @@ #include "fmgr.h" +/* + * What to do with a snapshot in create replication slot command. + */ +typedef enum +{ + CRS_EXPORT_SNAPSHOT, + CRS_NOEXPORT_SNAPSHOT, + CRS_USE_SNAPSHOT +} CRSSnapshotAction; + /* global state */ extern bool am_walsender; extern bool am_cascading_walsender; @@ -28,7 +38,7 @@ extern int wal_sender_timeout; extern bool log_replication_commands; extern void InitWalSender(void); -extern void exec_replication_command(const char *query_string); +extern bool exec_replication_command(const char *query_string); extern void WalSndErrorCleanup(void); extern void WalSndSignals(void); extern Size WalSndShmemSize(void); diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 8cbf2687a9..bf96d340ca 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -33,6 +33,9 @@ typedef struct LogicalRepWorker /* Used for initial table synchronization. */ Oid relid; + char relstate; + XLogRecPtr relstate_lsn; + slock_t relmutex; /* Stats. */ XLogRecPtr last_lsn; @@ -42,6 +45,9 @@ typedef struct LogicalRepWorker TimestampTz reply_time; } LogicalRepWorker; +/* Memory context for cached variables in apply worker. */ +MemoryContext ApplyCacheContext; + /* libpqreceiver connection */ extern struct WalReceiverConn *wrconn; @@ -53,12 +59,26 @@ extern bool in_remote_transaction; extern bool got_SIGTERM; extern void logicalrep_worker_attach(int slot); -extern LogicalRepWorker *logicalrep_worker_find(Oid subid); -extern int logicalrep_worker_count(Oid subid); -extern void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid); -extern void logicalrep_worker_stop(Oid subid); -extern void logicalrep_worker_wakeup(Oid subid); +extern LogicalRepWorker *logicalrep_worker_find(Oid subid, Oid relid, + bool only_running); +extern void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, + Oid userid, Oid relid); +extern void logicalrep_worker_stop(Oid subid, Oid relid); +extern void logicalrep_worker_wakeup(Oid subid, Oid relid); +extern void logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker); + +extern int logicalrep_sync_worker_count(Oid subid); extern void logicalrep_worker_sigterm(SIGNAL_ARGS); +extern char *LogicalRepSyncTableStart(XLogRecPtr *origin_startpos); +void process_syncing_tables(XLogRecPtr current_lsn); +void invalidate_syncing_table_states(Datum arg, int cacheid, + uint32 hashvalue); + +static inline bool +am_tablesync_worker(void) +{ + return OidIsValid(MyLogicalRepWorker->relid); +} #endif /* WORKER_INTERNAL_H */ diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index 66f60d271e..b35faf81b9 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -89,6 +89,7 @@ enum SysCacheIdentifier STATRELATTINH, SUBSCRIPTIONOID, SUBSCRIPTIONNAME, + SUBSCRIPTIONRELMAP, TABLESPACEOID, TRFOID, TRFTYPELANG, diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out index 90c4ba4608..978d9a9a0f 100644 --- 
a/src/test/regress/expected/object_address.out +++ b/src/test/regress/expected/object_address.out @@ -37,7 +37,8 @@ CREATE TRANSFORM FOR int LANGUAGE SQL ( FROM SQL WITH FUNCTION varchar_transform(internal), TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; -CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT); +CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCONNECT); +WARNING: tables were not subscribed, you will have to run ALTER SUBSCRIPTION ... REFRESH PUBLICATION to subscribe the tables -- test some error cases SELECT pg_get_object_address('stone', '{}', '{}'); ERROR: unrecognized object type "stone" diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index bd13ae6010..f7c3a637b5 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1847,13 +1847,14 @@ pg_stat_ssl| SELECT s.pid, pg_stat_subscription| SELECT su.oid AS subid, su.subname, st.pid, + st.relid, st.received_lsn, st.last_msg_send_time, st.last_msg_receipt_time, st.latest_end_lsn, st.latest_end_time FROM (pg_subscription su - LEFT JOIN pg_stat_get_subscription(NULL::oid) st(subid, pid, received_lsn, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time) ON ((st.subid = su.oid))); + LEFT JOIN pg_stat_get_subscription(NULL::oid) st(subid, relid, pid, received_lsn, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time) ON ((st.subid = su.oid))); pg_stat_sys_indexes| SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 88b4c973a1..8e3028edaa 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -143,6 +143,7 @@ pg_shdescription|t pg_shseclabel|t pg_statistic|t pg_subscription|t +pg_subscription_rel|t pg_tablespace|t pg_transform|t pg_trigger|t diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out index 3471d88ca7..0912bef657 100644 --- a/src/test/regress/expected/subscription.out +++ b/src/test/regress/expected/subscription.out @@ -14,7 +14,6 @@ CREATE SUBSCRIPTION testsub PUBLICATION foo; ERROR: syntax error at or near "PUBLICATION" LINE 1: CREATE SUBSCRIPTION testsub PUBLICATION foo; ^ -set client_min_messages to error; -- fail - cannot do CREATE SUBSCRIPTION CREATE SLOT inside transaction block BEGIN; CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub WITH (CREATE SLOT); @@ -23,8 +22,8 @@ COMMIT; CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub; ERROR: invalid connection string syntax: missing "=" after "testconn" in connection info string -CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (DISABLED, NOCREATE SLOT); -reset client_min_messages; +CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (NOCONNECT); +WARNING: tables were not subscribed, you will have to run ALTER SUBSCRIPTION ... 
REFRESH PUBLICATION to subscribe the tables \dRs+ List of subscriptions Name | Owner | Enabled | Publication | Conninfo @@ -32,38 +31,30 @@ reset client_min_messages; testsub | regress_subscription_user | f | {testpub} | dbname=doesnotexist (1 row) -ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3; -\dRs - List of subscriptions - Name | Owner | Enabled | Publication ----------+---------------------------+---------+--------------------- - testsub | regress_subscription_user | f | {testpub2,testpub3} -(1 row) - +ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3 NOREFRESH; ALTER SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist2'; -ALTER SUBSCRIPTION testsub SET PUBLICATION testpub, testpub1; \dRs+ List of subscriptions - Name | Owner | Enabled | Publication | Conninfo ----------+---------------------------+---------+--------------------+---------------------- - testsub | regress_subscription_user | f | {testpub,testpub1} | dbname=doesnotexist2 + Name | Owner | Enabled | Publication | Conninfo +---------+---------------------------+---------+---------------------+---------------------- + testsub | regress_subscription_user | f | {testpub2,testpub3} | dbname=doesnotexist2 (1 row) BEGIN; ALTER SUBSCRIPTION testsub ENABLE; \dRs - List of subscriptions - Name | Owner | Enabled | Publication ----------+---------------------------+---------+-------------------- - testsub | regress_subscription_user | t | {testpub,testpub1} + List of subscriptions + Name | Owner | Enabled | Publication +---------+---------------------------+---------+--------------------- + testsub | regress_subscription_user | t | {testpub2,testpub3} (1 row) ALTER SUBSCRIPTION testsub DISABLE; \dRs - List of subscriptions - Name | Owner | Enabled | Publication ----------+---------------------------+---------+-------------------- - testsub | regress_subscription_user | f | {testpub,testpub1} + List of subscriptions + Name | Owner | Enabled | Publication +---------+---------------------------+---------+--------------------- + testsub | regress_subscription_user | f | {testpub2,testpub3} (1 row) COMMIT; @@ -74,10 +65,10 @@ ERROR: must be owner of subscription testsub RESET ROLE; ALTER SUBSCRIPTION testsub RENAME TO testsub_foo; \dRs - List of subscriptions - Name | Owner | Enabled | Publication --------------+---------------------------+---------+-------------------- - testsub_foo | regress_subscription_user | f | {testpub,testpub1} + List of subscriptions + Name | Owner | Enabled | Publication +-------------+---------------------------+---------+--------------------- + testsub_foo | regress_subscription_user | f | {testpub2,testpub3} (1 row) -- rename back to keep the rest simple diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql index 6b85fe2949..28476daff1 100644 --- a/src/test/regress/sql/object_address.sql +++ b/src/test/regress/sql/object_address.sql @@ -40,7 +40,7 @@ CREATE TRANSFORM FOR int LANGUAGE SQL ( FROM SQL WITH FUNCTION varchar_transform(internal), TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; -CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCREATE SLOT); +CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (DISABLED, NOCONNECT); -- test some error cases SELECT pg_get_object_address('stone', '{}', '{}'); diff --git a/src/test/regress/sql/subscription.sql b/src/test/regress/sql/subscription.sql index 5c05b14f9e..c1199ee629 100644 --- 
a/src/test/regress/sql/subscription.sql +++ b/src/test/regress/sql/subscription.sql @@ -12,24 +12,19 @@ CREATE SUBSCRIPTION testsub CONNECTION 'foo'; -- fail - no connection CREATE SUBSCRIPTION testsub PUBLICATION foo; -set client_min_messages to error; -- fail - cannot do CREATE SUBSCRIPTION CREATE SLOT inside transaction block BEGIN; CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub WITH (CREATE SLOT); COMMIT; CREATE SUBSCRIPTION testsub CONNECTION 'testconn' PUBLICATION testpub; -CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (DISABLED, NOCREATE SLOT); -reset client_min_messages; + +CREATE SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist' PUBLICATION testpub WITH (NOCONNECT); \dRs+ -ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3; - -\dRs - +ALTER SUBSCRIPTION testsub SET PUBLICATION testpub2, testpub3 NOREFRESH; ALTER SUBSCRIPTION testsub CONNECTION 'dbname=doesnotexist2'; -ALTER SUBSCRIPTION testsub SET PUBLICATION testpub, testpub1; \dRs+ diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl index b81028aed1..d1817f57da 100644 --- a/src/test/subscription/t/001_rep_changes.pl +++ b/src/test/subscription/t/001_rep_changes.pl @@ -3,7 +3,7 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 11; +use Test::More tests => 14; # Initialize publisher node my $node_publisher = get_new_node('publisher'); @@ -19,7 +19,7 @@ $node_subscriber->start; $node_publisher->safe_psql('postgres', "CREATE TABLE tab_notrep AS SELECT generate_series(1,10) AS a"); $node_publisher->safe_psql('postgres', - "CREATE TABLE tab_ins (a int)"); + "CREATE TABLE tab_ins AS SELECT generate_series(1,1002) AS a"); $node_publisher->safe_psql('postgres', "CREATE TABLE tab_full AS SELECT generate_series(1,10) AS a"); $node_publisher->safe_psql('postgres', @@ -56,10 +56,20 @@ my $caughtup_query = $node_publisher->poll_query_until('postgres', $caughtup_query) or die "Timed out while waiting for subscriber to catch up"; +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + my $result = $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_notrep"); is($result, qq(0), 'check non-replicated table is empty on subscriber'); +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_ins"); +is($result, qq(1002), 'check initial data was copied to subscriber'); + $node_publisher->safe_psql('postgres', "INSERT INTO tab_ins SELECT generate_series(1,50)"); $node_publisher->safe_psql('postgres', @@ -79,7 +89,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query) $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins"); -is($result, qq(50|1|50), 'check replicated inserts on subscriber'); +is($result, qq(1052|1|1002), 'check replicated inserts on subscriber'); $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_rep"); @@ -109,7 +119,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query) $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_full"); -is($result, qq(10|1|100), 'update works with REPLICA IDENTITY FULL and duplicate tuples'); +is($result, qq(20|1|100), 'update works with 
REPLICA IDENTITY FULL and duplicate tuples'); # check that change of connection string and/or publication list causes # restart of subscription workers. Not all of these are registered as tests @@ -126,7 +136,7 @@ $node_publisher->poll_query_until('postgres', $oldpid = $node_publisher->safe_psql('postgres', "SELECT pid FROM pg_stat_replication WHERE application_name = '$appname';"); $node_subscriber->safe_psql('postgres', - "ALTER SUBSCRIPTION tap_sub SET PUBLICATION tap_pub_ins_only"); + "ALTER SUBSCRIPTION tap_sub SET PUBLICATION tap_pub_ins_only REFRESH WITH (NOCOPY DATA)"); $node_publisher->poll_query_until('postgres', "SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = '$appname';") or die "Timed out while waiting for apply to restart"; @@ -141,7 +151,7 @@ $node_publisher->poll_query_until('postgres', $caughtup_query) $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins"); -is($result, qq(150|1|1100), 'check replicated inserts after subscription publication change'); +is($result, qq(1152|1|1100), 'check replicated inserts after subscription publication change'); $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_rep"); @@ -154,6 +164,8 @@ $node_publisher->safe_psql('postgres', "ALTER PUBLICATION tap_pub_ins_only ADD TABLE tab_full"); $node_publisher->safe_psql('postgres', "DELETE FROM tab_ins WHERE a > 0"); +$node_subscriber->safe_psql('postgres', + "ALTER SUBSCRIPTION tap_sub REFRESH PUBLICATION WITH (NOCOPY DATA)"); $node_publisher->safe_psql('postgres', "INSERT INTO tab_full VALUES(0)"); @@ -163,11 +175,11 @@ $node_publisher->poll_query_until('postgres', $caughtup_query) # note that data are different on provider and subscriber $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_ins"); -is($result, qq(50|1|50), 'check replicated deletes after alter publication'); +is($result, qq(1052|1|1002), 'check replicated deletes after alter publication'); $result = $node_subscriber->safe_psql('postgres', "SELECT count(*), min(a), max(a) FROM tab_full"); -is($result, qq(11|0|100), 'check replicated insert after alter publication'); +is($result, qq(21|0|100), 'check replicated insert after alter publication'); # check restart on rename $oldpid = $node_publisher->safe_psql('postgres', @@ -189,6 +201,14 @@ $result = $node_publisher->safe_psql('postgres', "SELECT count(*) FROM pg_replication_slots"); is($result, qq(0), 'check replication slot was dropped on publisher'); +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_subscription_rel"); +is($result, qq(0), 'check subscription relation status was dropped on subscriber'); + +$result = + $node_publisher->safe_psql('postgres', "SELECT count(*) FROM pg_replication_slots"); +is($result, qq(0), 'check replication slot was dropped on publisher'); + $result = $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_replication_origin"); is($result, qq(0), 'check replication origin was dropped on subscriber'); diff --git a/src/test/subscription/t/002_types.pl b/src/test/subscription/t/002_types.pl index f44e1e671d..ad15e85c0c 100644 --- a/src/test/subscription/t/002_types.pl +++ b/src/test/subscription/t/002_types.pl @@ -111,6 +111,12 @@ my $caughtup_query = $node_publisher->poll_query_until('postgres', $caughtup_query) or die "Timed out while waiting for subscriber to catch up"; +# Wait for initial sync to finish as well +my $synced_query = +"SELECT count(1) = 0 FROM 
pg_subscription_rel WHERE srsubstate NOT IN ('s', 'r');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + # Insert initial test data $node_publisher->safe_psql('postgres', qq( -- test_tbl_one_array_col diff --git a/src/test/subscription/t/003_constraints.pl b/src/test/subscription/t/003_constraints.pl index b785132f5b..11b8254155 100644 --- a/src/test/subscription/t/003_constraints.pl +++ b/src/test/subscription/t/003_constraints.pl @@ -34,7 +34,7 @@ $node_publisher->safe_psql('postgres', my $appname = 'tap_sub'; $node_subscriber->safe_psql('postgres', - "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub;"); + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (NOCOPY DATA)"); # Wait for subscriber to finish initialization my $caughtup_query = diff --git a/src/test/subscription/t/004_sync.pl b/src/test/subscription/t/004_sync.pl new file mode 100644 index 0000000000..87541a0e6e --- /dev/null +++ b/src/test/subscription/t/004_sync.pl @@ -0,0 +1,159 @@ +# Tests for logical replication table syncing +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 7; + +# Initialize publisher node +my $node_publisher = get_new_node('publisher'); +$node_publisher->init(allows_streaming => 'logical'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = get_new_node('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE tab_rep (a int primary key)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO tab_rep SELECT generate_series(1,10)"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', + "CREATE TABLE tab_rep (a int primary key)"); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . 
' dbname=postgres'; +$node_publisher->safe_psql('postgres', + "CREATE PUBLICATION tap_pub FOR ALL TABLES"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub"); + +# Wait for subscriber to finish initialization +my $caughtup_query = +"SELECT pg_current_wal_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$appname';"; +$node_publisher->poll_query_until('postgres', $caughtup_query) + or die "Timed out while waiting for subscriber to catch up"; + +# Also wait for initial table sync to finish +my $synced_query = +"SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep"); +is($result, qq(10), 'initial data synced for first sub'); + +# drop subscription so that there is unreplicated data +$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub"); + +$node_publisher->safe_psql('postgres', + "INSERT INTO tab_rep SELECT generate_series(11,20)"); + +# recreate the subscription, it will try to do initial copy +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub"); + +# but it will be stuck on data copy as it will fail on constraint +my $started_query = +"SELECT srsubstate = 'd' FROM pg_subscription_rel;"; +$node_subscriber->poll_query_until('postgres', $started_query) + or die "Timed out while waiting for subscriber to start sync"; + +# remove the conflicting data +$node_subscriber->safe_psql('postgres', + "DELETE FROM tab_rep;"); + +# wait for sync to finish this time +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +# check that all data is synced +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep"); +is($result, qq(20), 'initial data synced for second sub'); + +# now check another subscription for the same node pair +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub2 CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (NOCOPY DATA)"); + +# wait for it to start +$node_subscriber->poll_query_until('postgres', "SELECT pid IS NOT NULL FROM pg_stat_subscription WHERE subname = 'tap_sub2' AND relid IS NULL") + or die "Timed out while waiting for subscriber to start"; + +# and drop both subscriptions +$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub"); +$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub2"); + +# check subscriptions are removed +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM pg_subscription"); +is($result, qq(0), 'second and third sub are dropped'); + +# remove the conflicting data +$node_subscriber->safe_psql('postgres', + "DELETE FROM tab_rep;"); + +# recreate the subscription again +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub"); + +# and wait for data sync to finish again +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +# check that all data is synced +$result 
= + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep"); +is($result, qq(20), 'initial data synced for fourth sub'); + +# add new table on subscriber +$node_subscriber->safe_psql('postgres', + "CREATE TABLE tab_rep_next (a int)"); + +# setup structure with existing data on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE tab_rep_next (a) AS SELECT generate_series(1,10)"); + +# Wait for subscription to catch up +$node_publisher->poll_query_until('postgres', $caughtup_query) + or die "Timed out while waiting for subscriber to catch up"; + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next"); +is($result, qq(0), 'no data for table added after subscription initialized'); + +# ask for data sync +$node_subscriber->safe_psql('postgres', + "ALTER SUBSCRIPTION tap_sub REFRESH PUBLICATION"); + +# wait for sync to finish +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next"); +is($result, qq(10), 'data for table added after subscription initialized are now synced'); + +# Add some data +$node_publisher->safe_psql('postgres', + "INSERT INTO tab_rep_next SELECT generate_series(1,10)"); + +# Wait for subscription to catch up +$node_publisher->poll_query_until('postgres', $caughtup_query) + or die "Timed out while waiting for subscriber to catch up"; + +$result = + $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_rep_next"); +is($result, qq(20), 'changes for table added after subscription initialized replicated'); + +$node_subscriber->safe_psql('postgres', "DROP SUBSCRIPTION tap_sub"); + +$node_subscriber->stop('fast'); +$node_publisher->stop('fast');
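
For orientation, the SQL below sketches the user-visible workflow exercised by the regression and TAP tests above; it is a minimal sketch, not part of the patch, and the publication, subscription, table, and host names are illustrative placeholders.

    -- Publisher: publish a table that already contains rows.
    CREATE PUBLICATION mypub FOR TABLE tab_existing;

    -- Subscriber: creating the subscription now also copies the existing rows
    -- (initial table synchronization) unless NOCOPY DATA is given.
    CREATE SUBSCRIPTION mysub
        CONNECTION 'host=publisher.example dbname=postgres'
        PUBLICATION mypub;

    -- The per-table copy is tracked in the new catalog; srsubstate is
    -- i (initialize), d (data copy), s (synchronized), r (ready).
    SELECT srrelid::regclass, srsubstate, srsublsn FROM pg_subscription_rel;

    -- Tables added to the publication later are picked up only after a refresh,
    -- optionally without copying their pre-existing data.
    ALTER SUBSCRIPTION mysub REFRESH PUBLICATION;
    ALTER SUBSCRIPTION mysub REFRESH PUBLICATION WITH (NOCOPY DATA);

The state query is the same condition the TAP tests poll (srsubstate NOT IN ('r', 's')) to decide when initial synchronization has completed.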
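
Internally, the table synchronization worker drives the copy over a walsender connection using the reworked walreceiver API (walrcv_create_slot with CRS_USE_SNAPSHOT, walrcv_exec). On the wire this corresponds roughly to the sketch below; the slot name is a placeholder (the worker generates its own), the transaction wrapper is simplified rather than a literal transcript of the worker's commands, and pgoutput is the built-in logical decoding plugin used by subscriptions.

    -- Issued on one walsender connection, which this patch teaches to accept
    -- plain SQL (T_SQLCmd is handed back to exec_simple_query()).
    BEGIN;
    -- USE_SNAPSHOT installs the new slot's snapshot in the current transaction ...
    CREATE_REPLICATION_SLOT "mysub_sync_slot" TEMPORARY LOGICAL pgoutput USE_SNAPSHOT;
    -- ... so this COPY sees exactly the data as of the slot's consistent point.
    COPY public.tab_existing TO STDOUT;
    COMMIT;

On the subscriber side the COPY OUT stream is fed into BeginCopyFrom() through the new copy_data_source_cb callback, after which the table is streamed forward from the slot's start LSN until it catches up with the main apply worker.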