postgresql/src/backend/access/transam/timeline.c

582 lines
14 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* timeline.c
* Functions for reading and writing timeline history files.
*
* A timeline history file lists the timeline changes of the timeline, in
* a simple text format. They are archived along with the WAL segments.
*
* The files are named like "<tli>.history". For example, if the database
* starts up and switches to timeline 5, the timeline history file would be
* called "00000005.history".
*
* Each line in the file represents a timeline switch:
*
* <parentTLI> <switchpoint> <reason>
*
* parentTLI ID of the parent timeline
* switchpoint XLogRecPtr of the WAL location where the switch happened
* reason human-readable explanation of why the timeline was changed
*
* The fields are separated by tabs. Lines beginning with # are comments, and
* are ignored. Empty lines are also ignored.
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/access/transam/timeline.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <sys/stat.h>
#include <unistd.h>
#include "access/timeline.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogdefs.h"
#include "pgstat.h"
#include "storage/fd.h"
Fix more issues with cascading replication and timeline switches. When a standby server follows the master using WAL archive, and it chooses a new timeline (recovery_target_timeline='latest'), it only fetches the timeline history file for the chosen target timeline, not any other history files that might be missing from pg_xlog. For example, if the current timeline is 2, and we choose 4 as the new recovery target timeline, the history file for timeline 3 is not fetched, even if it's part of this server's history. That's enough for the standby itself - the history file for timeline 4 includes timeline 3 as well - but if a cascading standby server wants to recover to timeline 3, it needs the history file. To fix, when a new recovery target timeline is chosen, try to copy any missing history files from the archive to pg_xlog between the old and new target timeline. A second similar issue was with the WAL files. When a standby recovers from archive, and it reaches a segment that contains a switch to a new timeline, recovery fetches only the WAL file labelled with the new timeline's ID. The file from the new timeline contains a copy of the WAL from the old timeline up to the point where the switch happened, and recovery recovers it from the new file. But in streaming replication, walsender only tries to read it from the old timeline's file. To fix, change walsender to read it from the new file, so that it behaves the same as recovery in that sense, and doesn't try to open the possibly nonexistent file with the old timeline's ID.
2013-01-23 09:01:04 +01:00
/*
* Copies all timeline history files with id's between 'begin' and 'end'
* from archive to pg_wal.
Fix more issues with cascading replication and timeline switches. When a standby server follows the master using WAL archive, and it chooses a new timeline (recovery_target_timeline='latest'), it only fetches the timeline history file for the chosen target timeline, not any other history files that might be missing from pg_xlog. For example, if the current timeline is 2, and we choose 4 as the new recovery target timeline, the history file for timeline 3 is not fetched, even if it's part of this server's history. That's enough for the standby itself - the history file for timeline 4 includes timeline 3 as well - but if a cascading standby server wants to recover to timeline 3, it needs the history file. To fix, when a new recovery target timeline is chosen, try to copy any missing history files from the archive to pg_xlog between the old and new target timeline. A second similar issue was with the WAL files. When a standby recovers from archive, and it reaches a segment that contains a switch to a new timeline, recovery fetches only the WAL file labelled with the new timeline's ID. The file from the new timeline contains a copy of the WAL from the old timeline up to the point where the switch happened, and recovery recovers it from the new file. But in streaming replication, walsender only tries to read it from the old timeline's file. To fix, change walsender to read it from the new file, so that it behaves the same as recovery in that sense, and doesn't try to open the possibly nonexistent file with the old timeline's ID.
2013-01-23 09:01:04 +01:00
*/
void
restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
{
char path[MAXPGPATH];
char histfname[MAXFNAMELEN];
TimeLineID tli;
Fix more issues with cascading replication and timeline switches. When a standby server follows the master using WAL archive, and it chooses a new timeline (recovery_target_timeline='latest'), it only fetches the timeline history file for the chosen target timeline, not any other history files that might be missing from pg_xlog. For example, if the current timeline is 2, and we choose 4 as the new recovery target timeline, the history file for timeline 3 is not fetched, even if it's part of this server's history. That's enough for the standby itself - the history file for timeline 4 includes timeline 3 as well - but if a cascading standby server wants to recover to timeline 3, it needs the history file. To fix, when a new recovery target timeline is chosen, try to copy any missing history files from the archive to pg_xlog between the old and new target timeline. A second similar issue was with the WAL files. When a standby recovers from archive, and it reaches a segment that contains a switch to a new timeline, recovery fetches only the WAL file labelled with the new timeline's ID. The file from the new timeline contains a copy of the WAL from the old timeline up to the point where the switch happened, and recovery recovers it from the new file. But in streaming replication, walsender only tries to read it from the old timeline's file. To fix, change walsender to read it from the new file, so that it behaves the same as recovery in that sense, and doesn't try to open the possibly nonexistent file with the old timeline's ID.
2013-01-23 09:01:04 +01:00
for (tli = begin; tli < end; tli++)
{
if (tli == 1)
continue;
TLHistoryFileName(histfname, tli);
if (RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false))
KeepFileRestoredFromArchive(path, histfname);
}
}
/*
* Try to read a timeline's history file.
*
* If successful, return the list of component TLIs (the given TLI followed by
* its ancestor TLIs). If we can't find the history file, assume that the
* timeline has no parents, and return a list of just the specified timeline
* ID.
*/
List *
readTimeLineHistory(TimeLineID targetTLI)
{
List *result;
char path[MAXPGPATH];
char histfname[MAXFNAMELEN];
char fline[MAXPGPATH];
FILE *fd;
TimeLineHistoryEntry *entry;
TimeLineID lasttli = 0;
XLogRecPtr prevend;
bool fromArchive = false;
/* Timeline 1 does not have a history file, so no need to check */
if (targetTLI == 1)
{
entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
entry->tli = targetTLI;
entry->begin = entry->end = InvalidXLogRecPtr;
return list_make1(entry);
}
if (ArchiveRecoveryRequested)
{
TLHistoryFileName(histfname, targetTLI);
fromArchive =
RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false);
}
else
TLHistoryFilePath(path, targetTLI);
fd = AllocateFile(path, "r");
if (fd == NULL)
{
if (errno != ENOENT)
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
/* Not there, so assume no parents */
entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
entry->tli = targetTLI;
entry->begin = entry->end = InvalidXLogRecPtr;
return list_make1(entry);
}
result = NIL;
/*
* Parse the file...
*/
prevend = InvalidXLogRecPtr;
while (fgets(fline, sizeof(fline), fd) != NULL)
{
/* skip leading whitespace and check for # comment */
char *ptr;
TimeLineID tli;
uint32 switchpoint_hi;
uint32 switchpoint_lo;
int nfields;
for (ptr = fline; *ptr; ptr++)
{
if (!isspace((unsigned char) *ptr))
break;
}
if (*ptr == '\0' || *ptr == '#')
continue;
nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo);
if (nfields < 1)
{
/* expect a numeric timeline ID as first field of line */
ereport(FATAL,
(errmsg("syntax error in history file: %s", fline),
errhint("Expected a numeric timeline ID.")));
}
if (nfields != 3)
ereport(FATAL,
(errmsg("syntax error in history file: %s", fline),
errhint("Expected a write-ahead log switchpoint location.")));
if (result && tli <= lasttli)
ereport(FATAL,
(errmsg("invalid data in history file: %s", fline),
errhint("Timeline IDs must be in increasing sequence.")));
lasttli = tli;
entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
entry->tli = tli;
entry->begin = prevend;
entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo;
prevend = entry->end;
/* Build list with newest item first */
result = lcons(entry, result);
/* we ignore the remainder of each line */
}
FreeFile(fd);
if (result && targetTLI <= lasttli)
ereport(FATAL,
(errmsg("invalid data in history file \"%s\"", path),
errhint("Timeline IDs must be less than child timeline's ID.")));
/*
* Create one more entry for the "tip" of the timeline, which has no entry
* in the history file.
*/
entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
entry->tli = targetTLI;
entry->begin = prevend;
entry->end = InvalidXLogRecPtr;
result = lcons(entry, result);
/*
* If the history file was fetched from archive, save it in pg_wal for
* future reference.
*/
if (fromArchive)
KeepFileRestoredFromArchive(path, histfname);
return result;
}
/*
* Probe whether a timeline history file exists for the given timeline ID
*/
bool
existsTimeLineHistory(TimeLineID probeTLI)
{
char path[MAXPGPATH];
char histfname[MAXFNAMELEN];
FILE *fd;
/* Timeline 1 does not have a history file, so no need to check */
if (probeTLI == 1)
return false;
if (ArchiveRecoveryRequested)
{
TLHistoryFileName(histfname, probeTLI);
RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false);
}
else
TLHistoryFilePath(path, probeTLI);
fd = AllocateFile(path, "r");
if (fd != NULL)
{
FreeFile(fd);
return true;
}
else
{
if (errno != ENOENT)
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
return false;
}
}
/*
* Find the newest existing timeline, assuming that startTLI exists.
*
* Note: while this is somewhat heuristic, it does positively guarantee
* that (result + 1) is not a known timeline, and therefore it should
* be safe to assign that ID to a new timeline.
*/
TimeLineID
findNewestTimeLine(TimeLineID startTLI)
{
TimeLineID newestTLI;
TimeLineID probeTLI;
/*
* The algorithm is just to probe for the existence of timeline history
* files. XXX is it useful to allow gaps in the sequence?
*/
newestTLI = startTLI;
for (probeTLI = startTLI + 1;; probeTLI++)
{
if (existsTimeLineHistory(probeTLI))
{
Phase 2 of pgindent updates. Change pg_bsd_indent to follow upstream rules for placement of comments to the right of code, and remove pgindent hack that caused comments following #endif to not obey the general rule. Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using the published version of pg_bsd_indent, but a hacked-up version that tried to minimize the amount of movement of comments to the right of code. The situation of interest is where such a comment has to be moved to the right of its default placement at column 33 because there's code there. BSD indent has always moved right in units of tab stops in such cases --- but in the previous incarnation, indent was working in 8-space tab stops, while now it knows we use 4-space tabs. So the net result is that in about half the cases, such comments are placed one tab stop left of before. This is better all around: it leaves more room on the line for comment text, and it means that in such cases the comment uniformly starts at the next 4-space tab stop after the code, rather than sometimes one and sometimes two tabs after. Also, ensure that comments following #endif are indented the same as comments following other preprocessor commands such as #else. That inconsistency turns out to have been self-inflicted damage from a poorly-thought-through post-indent "fixup" in pgindent. This patch is much less interesting than the first round of indent changes, but also bulkier, so I thought it best to separate the effects. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:18:54 +02:00
newestTLI = probeTLI; /* probeTLI exists */
}
else
{
/* doesn't exist, assume we're done */
break;
}
}
return newestTLI;
}
/*
* Create a new timeline history file.
*
* newTLI: ID of the new timeline
* parentTLI: ID of its immediate parent
* switchpoint: WAL location where the system switched to the new timeline
* reason: human-readable explanation of why the timeline was switched
*
* Currently this is only used at the end recovery, and so there are no locking
* considerations. But we should be just as tense as XLogFileInit to avoid
* emplacing a bogus file.
*/
void
writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
XLogRecPtr switchpoint, char *reason)
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
char histfname[MAXFNAMELEN];
char buffer[BLCKSZ];
int srcfd;
int fd;
int nbytes;
Assert(newTLI > parentTLI); /* else bad selection of newTLI */
/*
* Write into a temp file name.
*/
snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
unlink(tmppath);
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m", tmppath)));
/*
* If a history file exists for the parent, copy it verbatim
*/
if (ArchiveRecoveryRequested)
{
TLHistoryFileName(histfname, parentTLI);
RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false);
}
else
TLHistoryFilePath(path, parentTLI);
srcfd = OpenTransientFile(path, O_RDONLY);
if (srcfd < 0)
{
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
/* Not there, so assume parent has no parents */
}
else
{
for (;;)
{
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ);
nbytes = (int) read(srcfd, buffer, sizeof(buffer));
pgstat_report_wait_end();
if (nbytes < 0 || errno != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m", path)));
if (nbytes == 0)
break;
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE);
if ((int) write(fd, buffer, nbytes) != nbytes)
{
int save_errno = errno;
/*
* If we fail to make the file, delete it to release disk
* space
*/
unlink(tmppath);
/*
* if write didn't set errno, assume problem is no disk space
*/
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tmppath)));
}
pgstat_report_wait_end();
}
CloseTransientFile(srcfd);
}
/*
* Append one line with the details of this timeline split.
*
* If we did have a parent file, insert an extra newline just in case the
* parent file failed to end with one.
*/
snprintf(buffer, sizeof(buffer),
"%s%u\t%X/%X\t%s\n",
(srcfd < 0) ? "" : "\n",
parentTLI,
(uint32) (switchpoint >> 32), (uint32) (switchpoint),
reason);
nbytes = strlen(buffer);
errno = 0;
if ((int) write(fd, buffer, nbytes) != nbytes)
{
int save_errno = errno;
/*
* If we fail to make the file, delete it to release disk space
*/
unlink(tmppath);
/* if write didn't set errno, assume problem is no disk space */
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tmppath)));
}
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC);
if (pg_fsync(fd) != 0)
PANIC on fsync() failure. On some operating systems, it doesn't make sense to retry fsync(), because dirty data cached by the kernel may have been dropped on write-back failure. In that case the only remaining copy of the data is in the WAL. A subsequent fsync() could appear to succeed, but not have flushed the data. That means that a future checkpoint could apparently complete successfully but have lost data. Therefore, violently prevent any future checkpoint attempts by panicking on the first fsync() failure. Note that we already did the same for WAL data; this change extends that behavior to non-temporary data files. Provide a GUC data_sync_retry to control this new behavior, for users of operating systems that don't eject dirty data, and possibly forensic/testing uses. If it is set to on and the write-back error was transient, a later checkpoint might genuinely succeed (on a system that does not throw away buffers on failure); if the error is permanent, later checkpoints will continue to fail. The GUC defaults to off, meaning that we panic. Back-patch to all supported releases. There is still a narrow window for error-loss on some operating systems: if the file is closed and later reopened and a write-back error occurs in the intervening time, but the inode has the bad luck to be evicted due to memory pressure before we reopen, we could miss the error. A later patch will address that with a scheme for keeping files with dirty data open at all times, but we judge that to be too complicated to back-patch. Author: Craig Ringer, with some adjustments by Thomas Munro Reported-by: Craig Ringer Reviewed-by: Robert Haas, Thomas Munro, Andres Freund Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
2018-11-19 01:31:10 +01:00
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
pgstat_report_wait_end();
if (CloseTransientFile(fd))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", tmppath)));
/*
* Now move the completed history file into place with its final name.
*/
TLHistoryFilePath(path, newTLI);
/*
* Perform the rename using link if available, paranoidly trying to avoid
* overwriting an existing file (there shouldn't be one).
*/
durable_link_or_rename(tmppath, path, ERROR);
/* The history file can be archived immediately. */
if (XLogArchivingActive())
{
TLHistoryFileName(histfname, newTLI);
XLogArchiveNotify(histfname);
}
}
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
/*
* Writes a history file for given timeline and contents.
*
* Currently this is only used in the walreceiver process, and so there are
* no locking considerations. But we should be just as tense as XLogFileInit
* to avoid emplacing a bogus file.
*/
void
writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
int fd;
/*
* Write into a temp file name.
*/
snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
unlink(tmppath);
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL);
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m", tmppath)));
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE);
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
if ((int) write(fd, content, size) != size)
{
int save_errno = errno;
/*
* If we fail to make the file, delete it to release disk space
*/
unlink(tmppath);
/* if write didn't set errno, assume problem is no disk space */
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tmppath)));
}
pgstat_report_wait_end();
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC);
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
if (pg_fsync(fd) != 0)
PANIC on fsync() failure. On some operating systems, it doesn't make sense to retry fsync(), because dirty data cached by the kernel may have been dropped on write-back failure. In that case the only remaining copy of the data is in the WAL. A subsequent fsync() could appear to succeed, but not have flushed the data. That means that a future checkpoint could apparently complete successfully but have lost data. Therefore, violently prevent any future checkpoint attempts by panicking on the first fsync() failure. Note that we already did the same for WAL data; this change extends that behavior to non-temporary data files. Provide a GUC data_sync_retry to control this new behavior, for users of operating systems that don't eject dirty data, and possibly forensic/testing uses. If it is set to on and the write-back error was transient, a later checkpoint might genuinely succeed (on a system that does not throw away buffers on failure); if the error is permanent, later checkpoints will continue to fail. The GUC defaults to off, meaning that we panic. Back-patch to all supported releases. There is still a narrow window for error-loss on some operating systems: if the file is closed and later reopened and a write-back error occurs in the intervening time, but the inode has the bad luck to be evicted due to memory pressure before we reopen, we could miss the error. A later patch will address that with a scheme for keeping files with dirty data open at all times, but we judge that to be too complicated to back-patch. Author: Craig Ringer, with some adjustments by Thomas Munro Reported-by: Craig Ringer Reviewed-by: Robert Haas, Thomas Munro, Andres Freund Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
2018-11-19 01:31:10 +01:00
ereport(data_sync_elevel(ERROR),
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
pgstat_report_wait_end();
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
if (CloseTransientFile(fd))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", tmppath)));
/*
* Now move the completed history file into place with its final name.
*/
TLHistoryFilePath(path, tli);
/*
* Perform the rename using link if available, paranoidly trying to avoid
* overwriting an existing file (there shouldn't be one).
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
*/
durable_link_or_rename(tmppath, path, ERROR);
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
}
/*
* Returns true if 'expectedTLEs' contains a timeline with id 'tli'
*/
bool
tliInHistory(TimeLineID tli, List *expectedTLEs)
{
ListCell *cell;
foreach(cell, expectedTLEs)
{
if (((TimeLineHistoryEntry *) lfirst(cell))->tli == tli)
return true;
}
return false;
}
/*
* Returns the ID of the timeline in use at a particular point in time, in
* the given timeline history.
*/
TimeLineID
tliOfPointInHistory(XLogRecPtr ptr, List *history)
{
ListCell *cell;
foreach(cell, history)
{
TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell);
if ((XLogRecPtrIsInvalid(tle->begin) || tle->begin <= ptr) &&
(XLogRecPtrIsInvalid(tle->end) || ptr < tle->end))
{
/* found it */
return tle->tli;
}
}
/* shouldn't happen. */
elog(ERROR, "timeline history was not contiguous");
return 0; /* keep compiler quiet */
}
/*
* Returns the point in history where we branched off the given timeline,
* and the timeline we branched to (*nextTLI). Returns InvalidXLogRecPtr if
* the timeline is current, ie. we have not branched off from it, and throws
* an error if the timeline is not part of this server's history.
*/
XLogRecPtr
tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
{
ListCell *cell;
if (nextTLI)
*nextTLI = 0;
foreach(cell, history)
{
TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell);
if (tle->tli == tli)
return tle->end;
if (nextTLI)
*nextTLI = tle->tli;
}
ereport(ERROR,
(errmsg("requested timeline %u is not in this server's history",
tli)));
return InvalidXLogRecPtr; /* keep compiler quiet */
}