postgres_fdw: reestablish new connection if cached one is detected as broken.

In postgres_fdw, once remote connections are established, they are cached
and re-used for subsequent queries and transactions. There can be some
cases where those cached connections are unavaiable, for example,
by the restart of remote server. In these cases, previously an error was
reported and the query accessing to remote server failed if new remote
transaction failed to start because the cached connection was broken.

This commit improves postgres_fdw so that new connection is remade
if broken connection is detected when starting new remote transaction.
This is useful to avoid unnecessary failure of queries when connection is
broken but can be reestablished.

Author: Bharath Rupireddy, tweaked a bit by Fujii Masao
Reviewed-by: Ashutosh Bapat, Tatsuhito Kasahara, Fujii Masao
Discussion: https://postgr.es/m/CALj2ACUAi23vf1WiHNar_LksM9EDOWXcbHCo-fD4Mbr1d=78YQ@mail.gmail.com
This commit is contained in:
Fujii Masao 2020-10-06 10:31:09 +09:00
parent dd0a64ed43
commit 32a9c0bdf4
3 changed files with 134 additions and 15 deletions

View File

@ -108,6 +108,7 @@ PGconn *
GetConnection(UserMapping *user, bool will_prep_stmt)
{
bool found;
volatile bool retry_conn = false;
ConnCacheEntry *entry;
ConnCacheKey key;
@ -159,22 +160,25 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
/* Reject further use of connections which failed abort cleanup. */
pgfdw_reject_incomplete_xact_state_change(entry);
/*
* If the connection needs to be remade due to invalidation, disconnect as
* soon as we're out of all transactions.
*/
if (entry->conn != NULL && entry->invalidated && entry->xact_depth == 0)
{
elog(DEBUG3, "closing connection %p for option changes to take effect",
entry->conn);
disconnect_pg_server(entry);
}
retry:
/*
* We don't check the health of cached connection here, because it would
* require some overhead. Broken connection will be detected when the
* connection is actually used.
* If the connection needs to be remade due to invalidation, disconnect as
* soon as we're out of all transactions. Also, if previous attempt to
* start new remote transaction failed on the cached connection,
* disconnect it to retry a new connection.
*/
if ((entry->conn != NULL && entry->invalidated &&
entry->xact_depth == 0) || retry_conn)
{
if (retry_conn)
elog(DEBUG3, "closing connection %p to reestablish a new one",
entry->conn);
else
elog(DEBUG3, "closing connection %p for option changes to take effect",
entry->conn);
disconnect_pg_server(entry);
}
/*
* If cache entry doesn't have a connection, we have to establish a new
@ -206,9 +210,35 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
}
/*
* Start a new transaction or subtransaction if needed.
* We check the health of the cached connection here when starting a new
* remote transaction. If a broken connection is detected in the first
* attempt, we try to reestablish a new connection. If broken connection
* is detected again here, we give up getting a connection.
*/
begin_remote_xact(entry);
PG_TRY();
{
/* Start a new transaction or subtransaction if needed. */
begin_remote_xact(entry);
retry_conn = false;
}
PG_CATCH();
{
if (PQstatus(entry->conn) != CONNECTION_BAD ||
entry->xact_depth > 0 ||
retry_conn)
PG_RE_THROW();
retry_conn = true;
}
PG_END_TRY();
if (retry_conn)
{
ereport(DEBUG3,
(errmsg_internal("could not start remote transaction on connection %p",
entry->conn)),
errdetail_internal("%s", pchomp(PQerrorMessage(entry->conn))));
goto retry;
}
/* Remember if caller will prepare statements */
entry->have_prep_stmt |= will_prep_stmt;

View File

@ -8987,3 +8987,51 @@ PREPARE TRANSACTION 'fdw_tpc';
ERROR: cannot PREPARE a transaction that has operated on postgres_fdw foreign tables
ROLLBACK;
WARNING: there is no transaction in progress
-- ===================================================================
-- reestablish new connection
-- ===================================================================
-- Terminate the backend having the specified application_name and wait for
-- the termination to complete.
CREATE OR REPLACE PROCEDURE terminate_backend_and_wait(appname text) AS $$
BEGIN
PERFORM pg_terminate_backend(pid) FROM pg_stat_activity
WHERE application_name = appname;
LOOP
PERFORM * FROM pg_stat_activity WHERE application_name = appname;
EXIT WHEN NOT FOUND;
PERFORM pg_sleep(1), pg_stat_clear_snapshot();
END LOOP;
END;
$$ LANGUAGE plpgsql;
-- Change application_name of remote connection to special one
-- so that we can easily terminate the connection later.
ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
SELECT 1 FROM ft1 LIMIT 1;
?column?
----------
1
(1 row)
-- Terminate the remote connection.
CALL terminate_backend_and_wait('fdw_retry_check');
-- This query should detect the broken connection when starting new remote
-- transaction, reestablish new connection, and then succeed.
BEGIN;
SELECT 1 FROM ft1 LIMIT 1;
?column?
----------
1
(1 row)
-- If the query detects the broken connection when starting new remote
-- subtransaction, it doesn't reestablish new connection and should fail.
CALL terminate_backend_and_wait('fdw_retry_check');
SAVEPOINT s;
SELECT 1 FROM ft1 LIMIT 1; -- should fail
ERROR: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
CONTEXT: remote SQL command: SAVEPOINT s2
COMMIT;
-- Clean up
DROP PROCEDURE terminate_backend_and_wait(text);

View File

@ -2653,3 +2653,44 @@ SELECT count(*) FROM ft1;
-- error here
PREPARE TRANSACTION 'fdw_tpc';
ROLLBACK;
-- ===================================================================
-- reestablish new connection
-- ===================================================================
-- Terminate the backend having the specified application_name and wait for
-- the termination to complete.
CREATE OR REPLACE PROCEDURE terminate_backend_and_wait(appname text) AS $$
BEGIN
PERFORM pg_terminate_backend(pid) FROM pg_stat_activity
WHERE application_name = appname;
LOOP
PERFORM * FROM pg_stat_activity WHERE application_name = appname;
EXIT WHEN NOT FOUND;
PERFORM pg_sleep(1), pg_stat_clear_snapshot();
END LOOP;
END;
$$ LANGUAGE plpgsql;
-- Change application_name of remote connection to special one
-- so that we can easily terminate the connection later.
ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
SELECT 1 FROM ft1 LIMIT 1;
-- Terminate the remote connection.
CALL terminate_backend_and_wait('fdw_retry_check');
-- This query should detect the broken connection when starting new remote
-- transaction, reestablish new connection, and then succeed.
BEGIN;
SELECT 1 FROM ft1 LIMIT 1;
-- If the query detects the broken connection when starting new remote
-- subtransaction, it doesn't reestablish new connection and should fail.
CALL terminate_backend_and_wait('fdw_retry_check');
SAVEPOINT s;
SELECT 1 FROM ft1 LIMIT 1; -- should fail
COMMIT;
-- Clean up
DROP PROCEDURE terminate_backend_and_wait(text);