Take care to de-duplicate entries in standby.c's table of locks.

The RecoveryLockLists data structure, which tracks all exclusive
locks that the startup process is holding on behalf of transactions
being replayed, did not have any provision for avoiding duplicate
entries for the same lock.  Maybe that was okay when the code was
first written.  However, modern practice is for checkpoints to
write fresh lists of all active exclusive locks into the WAL.
Thus, an exclusive lock that survives across multiple checkpoints
causes bloat in standbys' startup processes.  If there are a lot
of such locks this can look like a memory leak, and it's even
possible to drive the startup process into a palloc failure from
an over-length List.

To fix, use a hash table instead of simple lists to track the
locks being held.  Allowing for dynahash overhead, this requires
a little more space per lock than the old way (although it's the
same size as what we were allocating prior to c6e0fe1f2).  It's
probably a shade slower too.  However, testing indicates that the
penalty is negligible on ordinary workloads, so let's make this
change to improve robustness in extreme cases.

Patch by me, per report from Dmitriy Kuzmin.  No back-patch
(for now anyway), since it seems that a significant improvement
would only occur in corner cases.

Discussion: https://postgr.es/m/CAHLDt=_ts0A7Agn=hCpUh+RCFkxd+G6uuT=kcTfqFtGur0dp=A@mail.gmail.com
This commit is contained in:
Tom Lane 2022-10-06 12:27:36 -04:00
parent 58640f37d9
commit 66c2922e76
1 changed files with 106 additions and 65 deletions

View File

@ -42,7 +42,29 @@ int max_standby_archive_delay = 30 * 1000;
int max_standby_streaming_delay = 30 * 1000;
bool log_recovery_conflict_waits = false;
static HTAB *RecoveryLockLists;
/*
* Keep track of all the exclusive locks owned by original transactions.
* For each known exclusive lock, there is a RecoveryLockEntry in the
* RecoveryLockHash hash table. All RecoveryLockEntrys belonging to a
* given XID are chained together so that we can find them easily.
* For each original transaction that is known to have any such locks,
* there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
* which stores the head of the chain of its locks.
*/
typedef struct RecoveryLockEntry
{
xl_standby_lock key; /* hash key: xid, dbOid, relOid */
struct RecoveryLockEntry *next; /* chain link */
} RecoveryLockEntry;
typedef struct RecoveryLockXidEntry
{
TransactionId xid; /* hash key -- must be first */
struct RecoveryLockEntry *head; /* chain head */
} RecoveryLockXidEntry;
static HTAB *RecoveryLockHash = NULL;
static HTAB *RecoveryLockXidHash = NULL;
/* Flags set by timeout handlers */
static volatile sig_atomic_t got_standby_deadlock_timeout = false;
@ -58,15 +80,6 @@ static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
static const char *get_recovery_conflict_desc(ProcSignalReason reason);
/*
* Keep track of all the locks owned by a given transaction.
*/
typedef struct RecoveryLockListsEntry
{
TransactionId xid;
List *locks;
} RecoveryLockListsEntry;
/*
* InitRecoveryTransactionEnvironment
* Initialize tracking of our primary's in-progress transactions.
@ -85,16 +98,24 @@ InitRecoveryTransactionEnvironment(void)
VirtualTransactionId vxid;
HASHCTL hash_ctl;
Assert(RecoveryLockHash == NULL); /* don't run this twice */
/*
* Initialize the hash table for tracking the list of locks held by each
* Initialize the hash tables for tracking the locks held by each
* transaction.
*/
hash_ctl.keysize = sizeof(xl_standby_lock);
hash_ctl.entrysize = sizeof(RecoveryLockEntry);
RecoveryLockHash = hash_create("RecoveryLockHash",
64,
&hash_ctl,
HASH_ELEM | HASH_BLOBS);
hash_ctl.keysize = sizeof(TransactionId);
hash_ctl.entrysize = sizeof(RecoveryLockListsEntry);
RecoveryLockLists = hash_create("RecoveryLockLists",
64,
&hash_ctl,
HASH_ELEM | HASH_BLOBS);
hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
64,
&hash_ctl,
HASH_ELEM | HASH_BLOBS);
/*
* Initialize shared invalidation management for Startup process, being
@ -140,12 +161,12 @@ void
ShutdownRecoveryTransactionEnvironment(void)
{
/*
* Do nothing if RecoveryLockLists is NULL because which means that
* transaction tracking has not been yet initialized or has been already
* shutdowned. This prevents transaction tracking from being shutdowned
* unexpectedly more than once.
* Do nothing if RecoveryLockHash is NULL because that means that
* transaction tracking has not yet been initialized or has already been
* shut down. This makes it safe to have possibly-redundant calls of this
* function during process exit.
*/
if (RecoveryLockLists == NULL)
if (RecoveryLockHash == NULL)
return;
/* Mark all tracked in-progress transactions as finished. */
@ -154,9 +175,11 @@ ShutdownRecoveryTransactionEnvironment(void)
/* Release all locks the tracked transactions were holding */
StandbyReleaseAllLocks();
/* Destroy the hash table of locks. */
hash_destroy(RecoveryLockLists);
RecoveryLockLists = NULL;
/* Destroy the lock hash tables. */
hash_destroy(RecoveryLockHash);
hash_destroy(RecoveryLockXidHash);
RecoveryLockHash = NULL;
RecoveryLockXidHash = NULL;
/* Cleanup our VirtualTransaction */
VirtualXactLockTableCleanup();
@ -932,12 +955,12 @@ StandbyLockTimeoutHandler(void)
* We only keep track of AccessExclusiveLocks, which are only ever held by
* one transaction on one relation.
*
* We keep a hash table of lists of locks in local memory keyed by xid,
* RecoveryLockLists, so we can keep track of the various entries made by
* the Startup process's virtual xid in the shared lock table.
*
* List elements use type xl_standby_lock, since the WAL record type exactly
* matches the information that we need to keep track of.
* We keep a table of known locks in the RecoveryLockHash hash table.
* The point of that table is to let us efficiently de-duplicate locks,
* which is important because checkpoints will re-report the same locks
* already held. There is also a RecoveryLockXidHash table with one entry
* per xid, which allows us to efficiently find all the locks held by a
* given original transaction.
*
* We use session locks rather than normal locks so we don't need
* ResourceOwners.
@ -947,8 +970,9 @@ StandbyLockTimeoutHandler(void)
void
StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
{
RecoveryLockListsEntry *entry;
xl_standby_lock *newlock;
RecoveryLockXidEntry *xidentry;
RecoveryLockEntry *lockentry;
xl_standby_lock key;
LOCKTAG locktag;
bool found;
@ -964,62 +988,79 @@ StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
/* dbOid is InvalidOid when we are locking a shared relation. */
Assert(OidIsValid(relOid));
/* Create a new list for this xid, if we don't have one already. */
entry = hash_search(RecoveryLockLists, &xid, HASH_ENTER, &found);
/* Create a hash entry for this xid, if we don't have one already. */
xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
if (!found)
{
entry->xid = xid;
entry->locks = NIL;
Assert(xidentry->xid == xid); /* dynahash should have set this */
xidentry->head = NULL;
}
newlock = palloc(sizeof(xl_standby_lock));
newlock->xid = xid;
newlock->dbOid = dbOid;
newlock->relOid = relOid;
entry->locks = lappend(entry->locks, newlock);
/* Create a hash entry for this lock, unless we have one already. */
key.xid = xid;
key.dbOid = dbOid;
key.relOid = relOid;
lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
if (!found)
{
/* It's new, so link it into the XID's list ... */
lockentry->next = xidentry->head;
xidentry->head = lockentry;
SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
/* ... and acquire the lock locally. */
SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
(void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
(void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
}
}
/*
* Release all the locks associated with this RecoveryLockXidEntry.
*/
static void
StandbyReleaseLockList(List *locks)
StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
{
ListCell *lc;
RecoveryLockEntry *entry;
RecoveryLockEntry *next;
foreach(lc, locks)
for (entry = xidentry->head; entry != NULL; entry = next)
{
xl_standby_lock *lock = (xl_standby_lock *) lfirst(lc);
LOCKTAG locktag;
elog(trace_recovery(DEBUG4),
"releasing recovery lock: xid %u db %u rel %u",
lock->xid, lock->dbOid, lock->relOid);
SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
entry->key.xid, entry->key.dbOid, entry->key.relOid);
/* Release the lock ... */
SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
if (!LockRelease(&locktag, AccessExclusiveLock, true))
{
elog(LOG,
"RecoveryLockLists contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
lock->xid, lock->dbOid, lock->relOid);
"RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
entry->key.xid, entry->key.dbOid, entry->key.relOid);
Assert(false);
}
/* ... and remove the per-lock hash entry */
next = entry->next;
hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
}
list_free_deep(locks);
xidentry->head = NULL; /* just for paranoia */
}
/*
* Release locks for specific XID, or all locks if it's InvalidXid.
*/
static void
StandbyReleaseLocks(TransactionId xid)
{
RecoveryLockListsEntry *entry;
RecoveryLockXidEntry *entry;
if (TransactionIdIsValid(xid))
{
if ((entry = hash_search(RecoveryLockLists, &xid, HASH_FIND, NULL)))
if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
{
StandbyReleaseLockList(entry->locks);
hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
StandbyReleaseXidEntryLocks(entry);
hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
}
}
else
@ -1028,7 +1069,7 @@ StandbyReleaseLocks(TransactionId xid)
/*
* Release locks for a transaction tree, starting at xid down, from
* RecoveryLockLists.
* RecoveryLockXidHash.
*
* Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
* to remove any AccessExclusiveLocks requested by a transaction.
@ -1051,15 +1092,15 @@ void
StandbyReleaseAllLocks(void)
{
HASH_SEQ_STATUS status;
RecoveryLockListsEntry *entry;
RecoveryLockXidEntry *entry;
elog(trace_recovery(DEBUG2), "release all standby locks");
hash_seq_init(&status, RecoveryLockLists);
hash_seq_init(&status, RecoveryLockXidHash);
while ((entry = hash_seq_search(&status)))
{
StandbyReleaseLockList(entry->locks);
hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
StandbyReleaseXidEntryLocks(entry);
hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
}
}
@ -1072,9 +1113,9 @@ void
StandbyReleaseOldLocks(TransactionId oldxid)
{
HASH_SEQ_STATUS status;
RecoveryLockListsEntry *entry;
RecoveryLockXidEntry *entry;
hash_seq_init(&status, RecoveryLockLists);
hash_seq_init(&status, RecoveryLockXidHash);
while ((entry = hash_seq_search(&status)))
{
Assert(TransactionIdIsValid(entry->xid));
@ -1088,8 +1129,8 @@ StandbyReleaseOldLocks(TransactionId oldxid)
continue;
/* Remove all locks and hash table entry. */
StandbyReleaseLockList(entry->locks);
hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL);
StandbyReleaseXidEntryLocks(entry);
hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
}
}