postgresql/src/backend/access/transam/xlogfuncs.c

700 lines
19 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* xlogfuncs.c
*
* PostgreSQL write-ahead log manager user interface functions
*
* This file contains WAL control and information functions.
*
*
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/access/transam/xlogfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/htup_details.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "replication/walreceiver.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/numeric.h"
#include "utils/guc.h"
#include "utils/pg_lsn.h"
#include "utils/timestamp.h"
#include "utils/tuplestore.h"
#include "storage/fd.h"
#include "storage/ipc.h"
/*
* Store label file and tablespace map during non-exclusive backups.
*/
static StringInfo label_file;
static StringInfo tblspc_map_file;
/*
* Called when the backend exits with a running non-exclusive base backup,
* to clean up state.
*/
static void
nonexclusive_base_backup_cleanup(int code, Datum arg)
{
do_pg_abort_backup();
ereport(WARNING,
(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
}
/*
* pg_start_backup: set up for taking an on-line backup dump
*
* Essentially what this does is to create a backup label file in $PGDATA,
* where it will be archived as part of the backup dump. The label file
* contains the user-supplied label string (typically this would be used
* to tell where the backup dump will be stored) and the starting time and
* starting WAL location for the dump.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_start_backup(PG_FUNCTION_ARGS)
{
text *backupid = PG_GETARG_TEXT_PP(0);
bool fast = PG_GETARG_BOOL(1);
bool exclusive = PG_GETARG_BOOL(2);
char *backupidstr;
XLogRecPtr startpoint;
SessionBackupState status = get_backup_status();
backupidstr = text_to_cstring(backupid);
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress in this session")));
if (exclusive)
{
startpoint = do_pg_start_backup(backupidstr, fast, NULL, NULL,
NULL, NULL, false, true);
}
else
{
MemoryContext oldcontext;
/*
2016-06-10 00:02:36 +02:00
* Label file and tablespace map file need to be long-lived, since
* they are read in pg_stop_backup.
*/
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
label_file = makeStringInfo();
tblspc_map_file = makeStringInfo();
MemoryContextSwitchTo(oldcontext);
startpoint = do_pg_start_backup(backupidstr, fast, NULL, label_file,
NULL, tblspc_map_file, false, true);
before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0);
}
PG_RETURN_LSN(startpoint);
}
/*
* pg_stop_backup: finish taking an on-line backup dump
*
* We write an end-of-backup WAL record, and remove the backup label file
* created by pg_start_backup, creating a backup history file in pg_wal
* instead (whence it will immediately be archived). The backup history file
* contains the same info found in the label file, plus the backup-end time
* and WAL location. Before 9.0, the backup-end time was read from the backup
* history file at the beginning of archive recovery, but we now use the WAL
* record for that and the file is for informational and debug purposes only.
*
* Note: different from CancelBackup which just cancels online backup mode.
*
* Note: this version is only called to stop an exclusive backup. The function
2016-06-10 00:02:36 +02:00
* pg_stop_backup_v2 (overloaded as pg_stop_backup in SQL) is called to
* stop non-exclusive backups.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_stop_backup(PG_FUNCTION_ARGS)
{
XLogRecPtr stoppoint;
SessionBackupState status = get_backup_status();
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup in progress"),
2016-06-07 20:18:08 +02:00
errhint("Did you mean to use pg_stop_backup('f')?")));
/*
2016-06-10 00:02:36 +02:00
* Exclusive backups were typically started in a different connection, so
* don't try to verify that status of backup is set to
* SESSION_BACKUP_EXCLUSIVE in this function. Actual verification that an
* exclusive backup is in fact running is handled inside
* do_pg_stop_backup.
*/
stoppoint = do_pg_stop_backup(NULL, true, NULL);
PG_RETURN_LSN(stoppoint);
}
/*
* pg_stop_backup_v2: finish taking exclusive or nonexclusive on-line backup.
*
* Works the same as pg_stop_backup, except for non-exclusive backups it returns
* the backup label and tablespace map files as text fields in as part of the
* resultset.
*
* The first parameter (variable 'exclusive') allows the user to tell us if
* this is an exclusive or a non-exclusive backup.
*
* The second parameter (variable 'waitforarchive'), which is optional,
* allows the user to choose if they want to wait for the WAL to be archived
* or if we should just return as soon as the WAL record is written.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_stop_backup_v2(PG_FUNCTION_ARGS)
{
2016-06-10 00:02:36 +02:00
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
TupleDesc tupdesc;
Tuplestorestate *tupstore;
2016-06-10 00:02:36 +02:00
MemoryContext per_query_ctx;
MemoryContext oldcontext;
Datum values[3];
bool nulls[3];
2016-06-10 00:02:36 +02:00
bool exclusive = PG_GETARG_BOOL(0);
bool waitforarchive = PG_GETARG_BOOL(1);
2016-06-10 00:02:36 +02:00
XLogRecPtr stoppoint;
SessionBackupState status = get_backup_status();
/* check to see if caller supports us returning a tuplestore */
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("set-valued function called in context that cannot accept a set")));
if (!(rsinfo->allowedModes & SFRM_Materialize))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("materialize mode required, but it is not " \
"allowed in this context")));
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
oldcontext = MemoryContextSwitchTo(per_query_ctx);
tupstore = tuplestore_begin_heap(true, false, work_mem);
rsinfo->returnMode = SFRM_Materialize;
rsinfo->setResult = tupstore;
rsinfo->setDesc = tupdesc;
MemoryContextSwitchTo(oldcontext);
MemSet(values, 0, sizeof(values));
MemSet(nulls, 0, sizeof(nulls));
if (exclusive)
{
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup in progress"),
2016-06-07 20:18:08 +02:00
errhint("Did you mean to use pg_stop_backup('f')?")));
/*
* Stop the exclusive backup, and since we're in an exclusive backup
* return NULL for both backup_label and tablespace_map.
*/
stoppoint = do_pg_stop_backup(NULL, waitforarchive, NULL);
nulls[1] = true;
nulls[2] = true;
}
else
{
if (status != SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup is not in progress"),
2016-06-07 20:18:08 +02:00
errhint("Did you mean to use pg_stop_backup('t')?")));
/*
2016-06-10 00:02:36 +02:00
* Stop the non-exclusive backup. Return a copy of the backup label
* and tablespace map so they can be written to disk by the caller.
*/
stoppoint = do_pg_stop_backup(label_file->data, waitforarchive, NULL);
cancel_before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0);
values[1] = CStringGetTextDatum(label_file->data);
values[2] = CStringGetTextDatum(tblspc_map_file->data);
/* Free structures allocated in TopMemoryContext */
pfree(label_file->data);
pfree(label_file);
label_file = NULL;
pfree(tblspc_map_file->data);
pfree(tblspc_map_file);
tblspc_map_file = NULL;
}
/* Stoppoint is included on both exclusive and nonexclusive backups */
2016-06-10 00:02:36 +02:00
values[0] = LSNGetDatum(stoppoint);
tuplestore_putvalues(tupstore, tupdesc, values, nulls);
tuplestore_donestoring(typstore);
return (Datum) 0;
}
/*
* pg_switch_wal: switch to next xlog file
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_switch_wal(PG_FUNCTION_ARGS)
{
XLogRecPtr switchpoint;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
switchpoint = RequestXLogSwitch(false);
/*
* As a convenience, return the WAL location of the switch record
*/
PG_RETURN_LSN(switchpoint);
}
/*
* pg_create_restore_point: a named point for restore
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_create_restore_point(PG_FUNCTION_ARGS)
{
text *restore_name = PG_GETARG_TEXT_PP(0);
char *restore_name_str;
XLogRecPtr restorepoint;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
(errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery."))));
if (!XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for creating a restore point"),
errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
restore_name_str = text_to_cstring(restore_name);
if (strlen(restore_name_str) >= MAXFNAMELEN)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1)));
restorepoint = XLogRestorePoint(restore_name_str);
/*
* As a convenience, return the WAL location of the restore point record
*/
PG_RETURN_LSN(restorepoint);
}
/*
* Report the current WAL write location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is visible to an external
* archiving process. Note that the data before this point is written out
* to the kernel, but is not necessarily synced to disk.
*/
Datum
pg_current_wal_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetXLogWriteRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the current WAL insert location (same format as pg_start_backup etc)
*
* This function is mostly for debugging purposes.
*/
Datum
pg_current_wal_insert_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetXLogInsertRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the current WAL flush location (same format as pg_start_backup etc)
*
* This function is mostly for debugging purposes.
*/
Datum
pg_current_wal_flush_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetFlushRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the last WAL receive location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is guaranteed to be received
* and synced to disk by walreceiver.
*/
Datum
pg_last_wal_receive_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr recptr;
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
recptr = GetWalRcvWriteRecPtr(NULL, NULL);
if (recptr == 0)
PG_RETURN_NULL();
PG_RETURN_LSN(recptr);
}
/*
* Report the last WAL replay location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is visible to read-only
* connections during recovery.
*/
Datum
pg_last_wal_replay_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr recptr;
Follow TLI of last replayed record, not recovery target TLI, in walsenders. Most of the time, the last replayed record comes from the recovery target timeline, but there is a corner case where it makes a difference. When the startup process scans for a new timeline, and decides to change recovery target timeline, there is a window where the recovery target TLI has already been bumped, but there are no WAL segments from the new timeline in pg_xlog yet. For example, if we have just replayed up to point 0/30002D8, on timeline 1, there is a WAL file called 000000010000000000000003 in pg_xlog that contains the WAL up to that point. When recovery switches recovery target timeline to 2, a walsender can immediately try to read WAL from 0/30002D8, from timeline 2, so it will try to open WAL file 000000020000000000000003. However, that doesn't exist yet - the startup process hasn't copied that file from the archive yet nor has the walreceiver streamed it yet, so walsender fails with error "requested WAL segment 000000020000000000000003 has already been removed". That's harmless, in that the standby will try to reconnect later and by that time the segment is already created, but error messages that should be ignored are not good. To fix that, have walsender track the TLI of the last replayed record, instead of the recovery target timeline. That way walsender will not try to read anything from timeline 2, until the WAL segment has been created and at least one record has been replayed from it. The recovery target timeline is now xlog.c's internal affair, it doesn't need to be exposed in shared memory anymore. This fixes the error reported by Thom Brown. depesz the same error message, but I'm not sure if this fixes his scenario.
2012-12-20 13:23:31 +01:00
recptr = GetXLogReplayRecPtr(NULL);
if (recptr == 0)
PG_RETURN_NULL();
PG_RETURN_LSN(recptr);
}
/*
* Compute an xlog file name and decimal byte offset given a WAL location,
* such as is returned by pg_stop_backup() or pg_switch_wal().
*
* Note that a location exactly at a segment boundary is taken to be in
* the previous segment. This is usually the right thing, since the
* expected usage is to determine which xlog file(s) are ready to archive.
*/
Datum
pg_walfile_name_offset(PG_FUNCTION_ARGS)
{
XLogSegNo xlogsegno;
uint32 xrecoff;
XLogRecPtr locationpoint = PG_GETARG_LSN(0);
char xlogfilename[MAXFNAMELEN];
Datum values[2];
bool isnull[2];
TupleDesc resultTupleDesc;
HeapTuple resultHeapTuple;
Datum result;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("pg_walfile_name_offset() cannot be executed during recovery.")));
/*
* Construct a tuple descriptor for the result row. This must match this
* function's pg_proc entry!
*/
resultTupleDesc = CreateTemplateTupleDesc(2, false);
TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
TEXTOID, -1, 0);
TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
INT4OID, -1, 0);
resultTupleDesc = BlessTupleDesc(resultTupleDesc);
/*
* xlogfilename
*/
XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size);
values[0] = CStringGetTextDatum(xlogfilename);
isnull[0] = false;
/*
* offset
*/
xrecoff = XLogSegmentOffset(locationpoint, wal_segment_size);
values[1] = UInt32GetDatum(xrecoff);
isnull[1] = false;
/*
* Tuple jam: Having first prepared your Datums, then squash together
*/
resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
result = HeapTupleGetDatum(resultHeapTuple);
PG_RETURN_DATUM(result);
}
/*
* Compute an xlog file name given a WAL location,
* such as is returned by pg_stop_backup() or pg_switch_wal().
*/
Datum
pg_walfile_name(PG_FUNCTION_ARGS)
{
XLogSegNo xlogsegno;
XLogRecPtr locationpoint = PG_GETARG_LSN(0);
char xlogfilename[MAXFNAMELEN];
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("pg_walfile_name() cannot be executed during recovery.")));
XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size);
PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
}
/*
* pg_wal_replay_pause - pause recovery now
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_wal_replay_pause(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
SetRecoveryPause(true);
PG_RETURN_VOID();
}
/*
* pg_wal_replay_resume - resume recovery now
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_wal_replay_resume(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
SetRecoveryPause(false);
PG_RETURN_VOID();
}
/*
* pg_is_wal_replay_paused
*/
Datum
pg_is_wal_replay_paused(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
PG_RETURN_BOOL(RecoveryIsPaused());
}
/*
* Returns timestamp of latest processed commit/abort record.
*
* When the server has been started normally without recovery the function
* returns NULL.
*/
Datum
pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS)
{
TimestampTz xtime;
xtime = GetLatestXTime();
if (xtime == 0)
PG_RETURN_NULL();
PG_RETURN_TIMESTAMPTZ(xtime);
}
/*
* Returns bool with current recovery mode, a global state.
*/
Datum
pg_is_in_recovery(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(RecoveryInProgress());
}
/*
* Compute the difference in bytes between two WAL locations.
*/
Datum
pg_wal_lsn_diff(PG_FUNCTION_ARGS)
{
Datum result;
result = DirectFunctionCall2(pg_lsn_mi,
PG_GETARG_DATUM(0),
PG_GETARG_DATUM(1));
PG_RETURN_NUMERIC(result);
}
/*
* Returns bool with current on-line backup mode, a global state.
*/
Datum
pg_is_in_backup(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(BackupInProgress());
}
/*
* Returns start time of an online exclusive backup.
*
* When there's no exclusive backup in progress, the function
* returns NULL.
*/
Datum
pg_backup_start_time(PG_FUNCTION_ARGS)
{
Datum xtime;
FILE *lfp;
char fline[MAXPGPATH];
char backup_start_time[30];
/*
* See if label file is present
*/
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
if (lfp == NULL)
{
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
BACKUP_LABEL_FILE)));
PG_RETURN_NULL();
}
/*
* Parse the file to find the START TIME line.
*/
backup_start_time[0] = '\0';
while (fgets(fline, sizeof(fline), lfp) != NULL)
{
if (sscanf(fline, "START TIME: %25[^\n]\n", backup_start_time) == 1)
break;
}
/* Check for a read error. */
if (ferror(lfp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE)));
/* Close the backup label file. */
if (FreeFile(lfp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", BACKUP_LABEL_FILE)));
if (strlen(backup_start_time) == 0)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
/*
* Convert the time string read from file to TimestampTz form.
*/
xtime = DirectFunctionCall3(timestamptz_in,
CStringGetDatum(backup_start_time),
ObjectIdGetDatum(InvalidOid),
Int32GetDatum(-1));
PG_RETURN_DATUM(xtime);
}