From 32a9c0bdf493cf5fc029ab44a22384d547290eff Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Tue, 6 Oct 2020 10:31:09 +0900 Subject: [PATCH] postgres_fdw: reestablish new connection if cached one is detected as broken. In postgres_fdw, once remote connections are established, they are cached and re-used for subsequent queries and transactions. There can be some cases where those cached connections are unavaiable, for example, by the restart of remote server. In these cases, previously an error was reported and the query accessing to remote server failed if new remote transaction failed to start because the cached connection was broken. This commit improves postgres_fdw so that new connection is remade if broken connection is detected when starting new remote transaction. This is useful to avoid unnecessary failure of queries when connection is broken but can be reestablished. Author: Bharath Rupireddy, tweaked a bit by Fujii Masao Reviewed-by: Ashutosh Bapat, Tatsuhito Kasahara, Fujii Masao Discussion: https://postgr.es/m/CALj2ACUAi23vf1WiHNar_LksM9EDOWXcbHCo-fD4Mbr1d=78YQ@mail.gmail.com --- contrib/postgres_fdw/connection.c | 60 ++++++++++++++----- .../postgres_fdw/expected/postgres_fdw.out | 48 +++++++++++++++ contrib/postgres_fdw/sql/postgres_fdw.sql | 41 +++++++++++++ 3 files changed, 134 insertions(+), 15 deletions(-) diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 08daf26fdf..76994f3820 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -108,6 +108,7 @@ PGconn * GetConnection(UserMapping *user, bool will_prep_stmt) { bool found; + volatile bool retry_conn = false; ConnCacheEntry *entry; ConnCacheKey key; @@ -159,22 +160,25 @@ GetConnection(UserMapping *user, bool will_prep_stmt) /* Reject further use of connections which failed abort cleanup. */ pgfdw_reject_incomplete_xact_state_change(entry); - /* - * If the connection needs to be remade due to invalidation, disconnect as - * soon as we're out of all transactions. - */ - if (entry->conn != NULL && entry->invalidated && entry->xact_depth == 0) - { - elog(DEBUG3, "closing connection %p for option changes to take effect", - entry->conn); - disconnect_pg_server(entry); - } +retry: /* - * We don't check the health of cached connection here, because it would - * require some overhead. Broken connection will be detected when the - * connection is actually used. + * If the connection needs to be remade due to invalidation, disconnect as + * soon as we're out of all transactions. Also, if previous attempt to + * start new remote transaction failed on the cached connection, + * disconnect it to retry a new connection. */ + if ((entry->conn != NULL && entry->invalidated && + entry->xact_depth == 0) || retry_conn) + { + if (retry_conn) + elog(DEBUG3, "closing connection %p to reestablish a new one", + entry->conn); + else + elog(DEBUG3, "closing connection %p for option changes to take effect", + entry->conn); + disconnect_pg_server(entry); + } /* * If cache entry doesn't have a connection, we have to establish a new @@ -206,9 +210,35 @@ GetConnection(UserMapping *user, bool will_prep_stmt) } /* - * Start a new transaction or subtransaction if needed. + * We check the health of the cached connection here when starting a new + * remote transaction. If a broken connection is detected in the first + * attempt, we try to reestablish a new connection. If broken connection + * is detected again here, we give up getting a connection. */ - begin_remote_xact(entry); + PG_TRY(); + { + /* Start a new transaction or subtransaction if needed. */ + begin_remote_xact(entry); + retry_conn = false; + } + PG_CATCH(); + { + if (PQstatus(entry->conn) != CONNECTION_BAD || + entry->xact_depth > 0 || + retry_conn) + PG_RE_THROW(); + retry_conn = true; + } + PG_END_TRY(); + + if (retry_conn) + { + ereport(DEBUG3, + (errmsg_internal("could not start remote transaction on connection %p", + entry->conn)), + errdetail_internal("%s", pchomp(PQerrorMessage(entry->conn)))); + goto retry; + } /* Remember if caller will prepare statements */ entry->have_prep_stmt |= will_prep_stmt; diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 10e23d02ed..2c5614073f 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -8987,3 +8987,51 @@ PREPARE TRANSACTION 'fdw_tpc'; ERROR: cannot PREPARE a transaction that has operated on postgres_fdw foreign tables ROLLBACK; WARNING: there is no transaction in progress +-- =================================================================== +-- reestablish new connection +-- =================================================================== +-- Terminate the backend having the specified application_name and wait for +-- the termination to complete. +CREATE OR REPLACE PROCEDURE terminate_backend_and_wait(appname text) AS $$ +BEGIN + PERFORM pg_terminate_backend(pid) FROM pg_stat_activity + WHERE application_name = appname; + LOOP + PERFORM * FROM pg_stat_activity WHERE application_name = appname; + EXIT WHEN NOT FOUND; + PERFORM pg_sleep(1), pg_stat_clear_snapshot(); + END LOOP; +END; +$$ LANGUAGE plpgsql; +-- Change application_name of remote connection to special one +-- so that we can easily terminate the connection later. +ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check'); +SELECT 1 FROM ft1 LIMIT 1; + ?column? +---------- + 1 +(1 row) + +-- Terminate the remote connection. +CALL terminate_backend_and_wait('fdw_retry_check'); +-- This query should detect the broken connection when starting new remote +-- transaction, reestablish new connection, and then succeed. +BEGIN; +SELECT 1 FROM ft1 LIMIT 1; + ?column? +---------- + 1 +(1 row) + +-- If the query detects the broken connection when starting new remote +-- subtransaction, it doesn't reestablish new connection and should fail. +CALL terminate_backend_and_wait('fdw_retry_check'); +SAVEPOINT s; +SELECT 1 FROM ft1 LIMIT 1; -- should fail +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +CONTEXT: remote SQL command: SAVEPOINT s2 +COMMIT; +-- Clean up +DROP PROCEDURE terminate_backend_and_wait(text); diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 78156d10b4..4da1f78956 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -2653,3 +2653,44 @@ SELECT count(*) FROM ft1; -- error here PREPARE TRANSACTION 'fdw_tpc'; ROLLBACK; + +-- =================================================================== +-- reestablish new connection +-- =================================================================== + +-- Terminate the backend having the specified application_name and wait for +-- the termination to complete. +CREATE OR REPLACE PROCEDURE terminate_backend_and_wait(appname text) AS $$ +BEGIN + PERFORM pg_terminate_backend(pid) FROM pg_stat_activity + WHERE application_name = appname; + LOOP + PERFORM * FROM pg_stat_activity WHERE application_name = appname; + EXIT WHEN NOT FOUND; + PERFORM pg_sleep(1), pg_stat_clear_snapshot(); + END LOOP; +END; +$$ LANGUAGE plpgsql; + +-- Change application_name of remote connection to special one +-- so that we can easily terminate the connection later. +ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check'); +SELECT 1 FROM ft1 LIMIT 1; + +-- Terminate the remote connection. +CALL terminate_backend_and_wait('fdw_retry_check'); + +-- This query should detect the broken connection when starting new remote +-- transaction, reestablish new connection, and then succeed. +BEGIN; +SELECT 1 FROM ft1 LIMIT 1; + +-- If the query detects the broken connection when starting new remote +-- subtransaction, it doesn't reestablish new connection and should fail. +CALL terminate_backend_and_wait('fdw_retry_check'); +SAVEPOINT s; +SELECT 1 FROM ft1 LIMIT 1; -- should fail +COMMIT; + +-- Clean up +DROP PROCEDURE terminate_backend_and_wait(text);