diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 9a3864f0b0..6185dc9215 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -2783,74 +2783,74 @@ pgstat_initialize(void) * Apart from auxiliary processes, MyBackendId, MyDatabaseId, * session userid, and application_name must be set for a * backend (hence, this cannot be combined with pgstat_initialize). + * Note also that we must be inside a transaction if this isn't an aux + * process, as we may need to do encoding conversion on some strings. * ---------- */ void pgstat_bestart(void) { - TimestampTz proc_start_timestamp; - SockAddr clientaddr; - volatile PgBackendStatus *beentry; - - /* - * To minimize the time spent modifying the PgBackendStatus entry, fetch - * all the needed data first. - * - * If we have a MyProcPort, use its session start time (for consistency, - * and to save a kernel call). - */ - if (MyProcPort) - proc_start_timestamp = MyProcPort->SessionStartTime; - else - proc_start_timestamp = GetCurrentTimestamp(); - - /* - * We may not have a MyProcPort (eg, if this is the autovacuum process). - * If so, use all-zeroes client address, which is dealt with specially in - * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port. - */ - if (MyProcPort) - memcpy(&clientaddr, &MyProcPort->raddr, sizeof(clientaddr)); - else - MemSet(&clientaddr, 0, sizeof(clientaddr)); - - /* - * Initialize my status entry, following the protocol of bumping - * st_changecount before and after; and make sure it's even afterwards. We - * use a volatile pointer here to ensure the compiler doesn't try to get - * cute. - */ - beentry = MyBEEntry; + volatile PgBackendStatus *vbeentry = MyBEEntry; + PgBackendStatus lbeentry; +#ifdef USE_SSL + PgBackendSSLStatus lsslstatus; +#endif /* pgstats state must be initialized from pgstat_initialize() */ - Assert(beentry != NULL); + Assert(vbeentry != NULL); + + /* + * To minimize the time spent modifying the PgBackendStatus entry, and + * avoid risk of errors inside the critical section, we first copy the + * shared-memory struct to a local variable, then modify the data in the + * local variable, then copy the local variable back to shared memory. + * Only the last step has to be inside the critical section. + * + * Most of the data we copy from shared memory is just going to be + * overwritten, but the struct's not so large that it's worth the + * maintenance hassle to copy only the needful fields. + */ + memcpy(&lbeentry, + (char *) vbeentry, + sizeof(PgBackendStatus)); + + /* This struct can just start from zeroes each time, though */ +#ifdef USE_SSL + memset(&lsslstatus, 0, sizeof(lsslstatus)); +#endif + + /* + * Now fill in all the fields of lbeentry, except for strings that are + * out-of-line data. Those have to be handled separately, below. + */ + lbeentry.st_procpid = MyProcPid; if (MyBackendId != InvalidBackendId) { if (IsAutoVacuumLauncherProcess()) { /* Autovacuum Launcher */ - beentry->st_backendType = B_AUTOVAC_LAUNCHER; + lbeentry.st_backendType = B_AUTOVAC_LAUNCHER; } else if (IsAutoVacuumWorkerProcess()) { /* Autovacuum Worker */ - beentry->st_backendType = B_AUTOVAC_WORKER; + lbeentry.st_backendType = B_AUTOVAC_WORKER; } else if (am_walsender) { /* Wal sender */ - beentry->st_backendType = B_WAL_SENDER; + lbeentry.st_backendType = B_WAL_SENDER; } else if (IsBackgroundWorker) { /* bgworker */ - beentry->st_backendType = B_BG_WORKER; + lbeentry.st_backendType = B_BG_WORKER; } else { /* client-backend */ - beentry->st_backendType = B_BACKEND; + lbeentry.st_backendType = B_BACKEND; } } else @@ -2860,79 +2860,80 @@ pgstat_bestart(void) switch (MyAuxProcType) { case StartupProcess: - beentry->st_backendType = B_STARTUP; + lbeentry.st_backendType = B_STARTUP; break; case BgWriterProcess: - beentry->st_backendType = B_BG_WRITER; + lbeentry.st_backendType = B_BG_WRITER; break; case CheckpointerProcess: - beentry->st_backendType = B_CHECKPOINTER; + lbeentry.st_backendType = B_CHECKPOINTER; break; case WalWriterProcess: - beentry->st_backendType = B_WAL_WRITER; + lbeentry.st_backendType = B_WAL_WRITER; break; case WalReceiverProcess: - beentry->st_backendType = B_WAL_RECEIVER; + lbeentry.st_backendType = B_WAL_RECEIVER; break; default: elog(FATAL, "unrecognized process type: %d", (int) MyAuxProcType); - proc_exit(1); } } - do - { - pgstat_increment_changecount_before(beentry); - } while ((beentry->st_changecount & 1) == 0); + /* + * If we have a MyProcPort, use its session start time (for consistency, + * and to save a kernel call). + */ + if (MyProcPort) + lbeentry.st_proc_start_timestamp = MyProcPort->SessionStartTime; + else + lbeentry.st_proc_start_timestamp = GetCurrentTimestamp(); - beentry->st_procpid = MyProcPid; - beentry->st_proc_start_timestamp = proc_start_timestamp; - beentry->st_activity_start_timestamp = 0; - beentry->st_state_start_timestamp = 0; - beentry->st_xact_start_timestamp = 0; - beentry->st_databaseid = MyDatabaseId; + lbeentry.st_activity_start_timestamp = 0; + lbeentry.st_state_start_timestamp = 0; + lbeentry.st_xact_start_timestamp = 0; + lbeentry.st_databaseid = MyDatabaseId; /* We have userid for client-backends, wal-sender and bgworker processes */ - if (beentry->st_backendType == B_BACKEND - || beentry->st_backendType == B_WAL_SENDER - || beentry->st_backendType == B_BG_WORKER) - beentry->st_userid = GetSessionUserId(); + if (lbeentry.st_backendType == B_BACKEND + || lbeentry.st_backendType == B_WAL_SENDER + || lbeentry.st_backendType == B_BG_WORKER) + lbeentry.st_userid = GetSessionUserId(); else - beentry->st_userid = InvalidOid; + lbeentry.st_userid = InvalidOid; - beentry->st_clientaddr = clientaddr; - if (MyProcPort && MyProcPort->remote_hostname) - strlcpy(beentry->st_clienthostname, MyProcPort->remote_hostname, - NAMEDATALEN); + /* + * We may not have a MyProcPort (eg, if this is the autovacuum process). + * If so, use all-zeroes client address, which is dealt with specially in + * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port. + */ + if (MyProcPort) + memcpy(&lbeentry.st_clientaddr, &MyProcPort->raddr, + sizeof(lbeentry.st_clientaddr)); else - beentry->st_clienthostname[0] = '\0'; + MemSet(&lbeentry.st_clientaddr, 0, sizeof(lbeentry.st_clientaddr)); + #ifdef USE_SSL if (MyProcPort && MyProcPort->ssl != NULL) { - beentry->st_ssl = true; - beentry->st_sslstatus->ssl_bits = be_tls_get_cipher_bits(MyProcPort); - beentry->st_sslstatus->ssl_compression = be_tls_get_compression(MyProcPort); - strlcpy(beentry->st_sslstatus->ssl_version, be_tls_get_version(MyProcPort), NAMEDATALEN); - strlcpy(beentry->st_sslstatus->ssl_cipher, be_tls_get_cipher(MyProcPort), NAMEDATALEN); - be_tls_get_peerdn_name(MyProcPort, beentry->st_sslstatus->ssl_clientdn, NAMEDATALEN); + lbeentry.st_ssl = true; + lsslstatus.ssl_bits = be_tls_get_cipher_bits(MyProcPort); + lsslstatus.ssl_compression = be_tls_get_compression(MyProcPort); + strlcpy(lsslstatus.ssl_version, be_tls_get_version(MyProcPort), NAMEDATALEN); + strlcpy(lsslstatus.ssl_cipher, be_tls_get_cipher(MyProcPort), NAMEDATALEN); + be_tls_get_peerdn_name(MyProcPort, lsslstatus.ssl_clientdn, NAMEDATALEN); } else { - beentry->st_ssl = false; + lbeentry.st_ssl = false; } #else - beentry->st_ssl = false; + lbeentry.st_ssl = false; #endif - beentry->st_state = STATE_UNDEFINED; - beentry->st_appname[0] = '\0'; - beentry->st_activity_raw[0] = '\0'; - /* Also make sure the last byte in each string area is always 0 */ - beentry->st_clienthostname[NAMEDATALEN - 1] = '\0'; - beentry->st_appname[NAMEDATALEN - 1] = '\0'; - beentry->st_activity_raw[pgstat_track_activity_query_size - 1] = '\0'; - beentry->st_progress_command = PROGRESS_COMMAND_INVALID; - beentry->st_progress_command_target = InvalidOid; + + lbeentry.st_state = STATE_UNDEFINED; + lbeentry.st_progress_command = PROGRESS_COMMAND_INVALID; + lbeentry.st_progress_command_target = InvalidOid; /* * we don't zero st_progress_param here to save cycles; nobody should @@ -2940,7 +2941,42 @@ pgstat_bestart(void) * than PROGRESS_COMMAND_INVALID */ - pgstat_increment_changecount_after(beentry); + /* + * We're ready to enter the critical section that fills the shared-memory + * status entry. We follow the protocol of bumping st_changecount before + * and after; and make sure it's even afterwards. We use a volatile + * pointer here to ensure the compiler doesn't try to get cute. + */ + PGSTAT_BEGIN_WRITE_ACTIVITY(vbeentry); + + /* make sure we'll memcpy the same st_changecount back */ + lbeentry.st_changecount = vbeentry->st_changecount; + + memcpy((char *) vbeentry, + &lbeentry, + sizeof(PgBackendStatus)); + + /* + * We can write the out-of-line strings and structs using the pointers + * that are in lbeentry; this saves some de-volatilizing messiness. + */ + lbeentry.st_appname[0] = '\0'; + if (MyProcPort && MyProcPort->remote_hostname) + strlcpy(lbeentry.st_clienthostname, MyProcPort->remote_hostname, + NAMEDATALEN); + else + lbeentry.st_clienthostname[0] = '\0'; + lbeentry.st_activity_raw[0] = '\0'; + /* Also make sure the last byte in each string area is always 0 */ + lbeentry.st_appname[NAMEDATALEN - 1] = '\0'; + lbeentry.st_clienthostname[NAMEDATALEN - 1] = '\0'; + lbeentry.st_activity_raw[pgstat_track_activity_query_size - 1] = '\0'; + +#ifdef USE_SSL + memcpy(lbeentry.st_sslstatus, &lsslstatus, sizeof(PgBackendSSLStatus)); +#endif + + PGSTAT_END_WRITE_ACTIVITY(vbeentry); /* Update app name to current GUC setting */ if (application_name) @@ -2975,11 +3011,11 @@ pgstat_beshutdown_hook(int code, Datum arg) * before and after. We use a volatile pointer here to ensure the * compiler doesn't try to get cute. */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_procpid = 0; /* mark invalid */ - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } @@ -3018,7 +3054,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) * non-disabled state. As our final update, change the state and * clear fields we will not be updating anymore. */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_state = STATE_DISABLED; beentry->st_state_start_timestamp = 0; beentry->st_activity_raw[0] = '\0'; @@ -3026,14 +3062,14 @@ pgstat_report_activity(BackendState state, const char *cmd_str) /* st_xact_start_timestamp and wait_event_info are also disabled */ beentry->st_xact_start_timestamp = 0; proc->wait_event_info = 0; - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } return; } /* - * To minimize the time spent modifying the entry, fetch all the needed - * data first. + * To minimize the time spent modifying the entry, and avoid risk of + * errors inside the critical section, fetch all the needed data first. */ start_timestamp = GetCurrentStatementStartTimestamp(); if (cmd_str != NULL) @@ -3050,7 +3086,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) /* * Now update the status entry */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_state = state; beentry->st_state_start_timestamp = current_timestamp; @@ -3062,7 +3098,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) beentry->st_activity_start_timestamp = start_timestamp; } - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /*----------- @@ -3080,11 +3116,11 @@ pgstat_progress_start_command(ProgressCommandType cmdtype, Oid relid) if (!beentry || !pgstat_track_activities) return; - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_progress_command = cmdtype; beentry->st_progress_command_target = relid; MemSet(&beentry->st_progress_param, 0, sizeof(beentry->st_progress_param)); - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /*----------- @@ -3103,9 +3139,9 @@ pgstat_progress_update_param(int index, int64 val) if (!beentry || !pgstat_track_activities) return; - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_progress_param[index] = val; - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /*----------- @@ -3125,7 +3161,7 @@ pgstat_progress_update_multi_param(int nparam, const int *index, if (!beentry || !pgstat_track_activities || nparam == 0) return; - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); for (i = 0; i < nparam; ++i) { @@ -3134,7 +3170,7 @@ pgstat_progress_update_multi_param(int nparam, const int *index, beentry->st_progress_param[index[i]] = val[i]; } - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /*----------- @@ -3155,10 +3191,10 @@ pgstat_progress_end_command(void) && beentry->st_progress_command == PROGRESS_COMMAND_INVALID) return; - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); beentry->st_progress_command = PROGRESS_COMMAND_INVALID; beentry->st_progress_command_target = InvalidOid; - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /* ---------- @@ -3184,12 +3220,12 @@ pgstat_report_appname(const char *appname) * st_changecount before and after. We use a volatile pointer here to * ensure the compiler doesn't try to get cute. */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); memcpy((char *) beentry->st_appname, appname, len); beentry->st_appname[len] = '\0'; - pgstat_increment_changecount_after(beentry); + PGSTAT_END_WRITE_ACTIVITY(beentry); } /* @@ -3209,9 +3245,11 @@ pgstat_report_xact_timestamp(TimestampTz tstamp) * st_changecount before and after. We use a volatile pointer here to * ensure the compiler doesn't try to get cute. */ - pgstat_increment_changecount_before(beentry); + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + beentry->st_xact_start_timestamp = tstamp; - pgstat_increment_changecount_after(beentry); + + PGSTAT_END_WRITE_ACTIVITY(beentry); } /* ---------- @@ -3277,14 +3315,19 @@ pgstat_read_current_status(void) int before_changecount; int after_changecount; - pgstat_save_changecount_before(beentry, before_changecount); + pgstat_begin_read_activity(beentry, before_changecount); localentry->backendStatus.st_procpid = beentry->st_procpid; + /* Skip all the data-copying work if entry is not in use */ if (localentry->backendStatus.st_procpid > 0) { memcpy(&localentry->backendStatus, (char *) beentry, sizeof(PgBackendStatus)); /* + * For each PgBackendStatus field that is a pointer, copy the + * pointed-to data, then adjust the local copy of the pointer + * field to point at the local copy of the data. + * * strcpy is safe even if the string is modified concurrently, * because there's always a \0 at the end of the buffer. */ @@ -3294,7 +3337,6 @@ pgstat_read_current_status(void) localentry->backendStatus.st_clienthostname = localclienthostname; strcpy(localactivity, (char *) beentry->st_activity_raw); localentry->backendStatus.st_activity_raw = localactivity; - localentry->backendStatus.st_ssl = beentry->st_ssl; #ifdef USE_SSL if (beentry->st_ssl) { @@ -3304,9 +3346,10 @@ pgstat_read_current_status(void) #endif } - pgstat_save_changecount_after(beentry, after_changecount); - if (before_changecount == after_changecount && - (before_changecount & 1) == 0) + pgstat_end_read_activity(beentry, after_changecount); + + if (pgstat_read_activity_complete(before_changecount, + after_changecount)) break; /* Make sure we can break out of loop if stuck... */ @@ -3989,14 +4032,14 @@ pgstat_get_backend_current_activity(int pid, bool checkUser) int before_changecount; int after_changecount; - pgstat_save_changecount_before(vbeentry, before_changecount); + pgstat_begin_read_activity(vbeentry, before_changecount); found = (vbeentry->st_procpid == pid); - pgstat_save_changecount_after(vbeentry, after_changecount); + pgstat_end_read_activity(vbeentry, after_changecount); - if (before_changecount == after_changecount && - (before_changecount & 1) == 0) + if (pgstat_read_activity_complete(before_changecount, + after_changecount)) break; /* Make sure we can break out of loop if stuck... */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 19d75ccdda..682b088eda 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -983,11 +983,12 @@ typedef struct PgBackendStatus * the copy is valid; otherwise start over. This makes updates cheap * while reads are potentially expensive, but that's the tradeoff we want. * - * The above protocol needs the memory barriers to ensure that the - * apparent order of execution is as it desires. Otherwise, for example, - * the CPU might rearrange the code so that st_changecount is incremented - * twice before the modification on a machine with weak memory ordering. - * This surprising result can lead to bugs. + * The above protocol needs memory barriers to ensure that the apparent + * order of execution is as it desires. Otherwise, for example, the CPU + * might rearrange the code so that st_changecount is incremented twice + * before the modification on a machine with weak memory ordering. Hence, + * use the macros defined below for manipulating st_changecount, rather + * than touching it directly. */ int st_changecount; @@ -1043,41 +1044,75 @@ typedef struct PgBackendStatus } PgBackendStatus; /* - * Macros to load and store st_changecount with the memory barriers. + * Macros to load and store st_changecount with appropriate memory barriers. * - * pgstat_increment_changecount_before() and - * pgstat_increment_changecount_after() need to be called before and after - * PgBackendStatus entries are modified, respectively. This makes sure that - * st_changecount is incremented around the modification. + * Use PGSTAT_BEGIN_WRITE_ACTIVITY() before, and PGSTAT_END_WRITE_ACTIVITY() + * after, modifying the current process's PgBackendStatus data. Note that, + * since there is no mechanism for cleaning up st_changecount after an error, + * THESE MACROS FORM A CRITICAL SECTION. Any error between them will be + * promoted to PANIC, causing a database restart to clean up shared memory! + * Hence, keep the critical section as short and straight-line as possible. + * Aside from being safer, that minimizes the window in which readers will + * have to loop. * - * Also pgstat_save_changecount_before() and pgstat_save_changecount_after() - * need to be called before and after PgBackendStatus entries are copied into - * private memory, respectively. + * Reader logic should follow this sketch: + * + * for (;;) + * { + * int before_ct, after_ct; + * + * pgstat_begin_read_activity(beentry, before_ct); + * ... copy beentry data to local memory ... + * pgstat_end_read_activity(beentry, after_ct); + * if (pgstat_read_activity_complete(before_ct, after_ct)) + * break; + * CHECK_FOR_INTERRUPTS(); + * } + * + * For extra safety, we generally use volatile beentry pointers, although + * the memory barriers should theoretically be sufficient. */ -#define pgstat_increment_changecount_before(beentry) \ - do { \ - beentry->st_changecount++; \ +#define PGSTAT_BEGIN_WRITE_ACTIVITY(beentry) \ + do { \ + START_CRIT_SECTION(); \ + (beentry)->st_changecount++; \ pg_write_barrier(); \ } while (0) +#define PGSTAT_END_WRITE_ACTIVITY(beentry) \ + do { \ + pg_write_barrier(); \ + (beentry)->st_changecount++; \ + Assert(((beentry)->st_changecount & 1) == 0); \ + END_CRIT_SECTION(); \ + } while (0) + +#define pgstat_begin_read_activity(beentry, before_changecount) \ + do { \ + (before_changecount) = (beentry)->st_changecount; \ + pg_read_barrier(); \ + } while (0) + +#define pgstat_end_read_activity(beentry, after_changecount) \ + do { \ + pg_read_barrier(); \ + (after_changecount) = (beentry)->st_changecount; \ + } while (0) + +#define pgstat_read_activity_complete(before_changecount, after_changecount) \ + ((before_changecount) == (after_changecount) && \ + ((before_changecount) & 1) == 0) + +/* deprecated names for above macros; these are gone in v12 */ +#define pgstat_increment_changecount_before(beentry) \ + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry) #define pgstat_increment_changecount_after(beentry) \ - do { \ - pg_write_barrier(); \ - beentry->st_changecount++; \ - Assert((beentry->st_changecount & 1) == 0); \ - } while (0) + PGSTAT_END_WRITE_ACTIVITY(beentry) +#define pgstat_save_changecount_before(beentry, save_changecount) \ + pgstat_begin_read_activity(beentry, save_changecount) +#define pgstat_save_changecount_after(beentry, save_changecount) \ + pgstat_end_read_activity(beentry, save_changecount) -#define pgstat_save_changecount_before(beentry, save_changecount) \ - do { \ - save_changecount = beentry->st_changecount; \ - pg_read_barrier(); \ - } while (0) - -#define pgstat_save_changecount_after(beentry, save_changecount) \ - do { \ - pg_read_barrier(); \ - save_changecount = beentry->st_changecount; \ - } while (0) /* ---------- * LocalPgBackendStatus