postgresql/src/bin/pg_basebackup/receivelog.c

722 lines
17 KiB
C

/*-------------------------------------------------------------------------
*
* receivelog.c - receive transaction log files using the streaming
* replication protocol.
*
* Author: Magnus Hagander <magnus@hagander.net>
*
* Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/bin/pg_basebackup/receivelog.c
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
/* for ntohl/htonl */
#include <netinet/in.h>
#include <arpa/inet.h>
#include "libpq-fe.h"
#include "access/xlog_internal.h"
#include "receivelog.h"
#include "streamutil.h"
/* fd for currently open WAL file */
static int walfile = -1;
/*
* Open a new WAL file in the specified directory. Store the name
* (not including the full directory) in namebuf. Assumes there is
* enough room in this buffer...
*
* The file will be padded to 16Mb with zeroes.
*/
static int
open_walfile(XLogRecPtr startpoint, uint32 timeline, char *basedir,
char *namebuf)
{
int f;
char fn[MAXPGPATH];
struct stat statbuf;
char *zerobuf;
int bytes;
XLogSegNo segno;
XLByteToSeg(startpoint, segno);
XLogFileName(namebuf, timeline, segno);
snprintf(fn, sizeof(fn), "%s/%s.partial", basedir, namebuf);
f = open(fn, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR);
if (f == -1)
{
fprintf(stderr,
_("%s: could not open transaction log file \"%s\": %s\n"),
progname, fn, strerror(errno));
return -1;
}
/*
* Verify that the file is either empty (just created), or a complete
* XLogSegSize segment. Anything in between indicates a corrupt file.
*/
if (fstat(f, &statbuf) != 0)
{
fprintf(stderr,
_("%s: could not stat transaction log file \"%s\": %s\n"),
progname, fn, strerror(errno));
close(f);
return -1;
}
if (statbuf.st_size == XLogSegSize)
return f; /* File is open and ready to use */
if (statbuf.st_size != 0)
{
fprintf(stderr,
_("%s: transaction log file \"%s\" has %d bytes, should be 0 or %d\n"),
progname, fn, (int) statbuf.st_size, XLogSegSize);
close(f);
return -1;
}
/* New, empty, file. So pad it to 16Mb with zeroes */
zerobuf = pg_malloc0(XLOG_BLCKSZ);
for (bytes = 0; bytes < XLogSegSize; bytes += XLOG_BLCKSZ)
{
if (write(f, zerobuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
{
fprintf(stderr,
_("%s: could not pad transaction log file \"%s\": %s\n"),
progname, fn, strerror(errno));
free(zerobuf);
close(f);
unlink(fn);
return -1;
}
}
free(zerobuf);
if (lseek(f, SEEK_SET, 0) != 0)
{
fprintf(stderr,
_("%s: could not seek to beginning of transaction log file \"%s\": %s\n"),
progname, fn, strerror(errno));
close(f);
return -1;
}
return f;
}
/*
* Close the current WAL file, and rename it to the correct filename if it's
* complete.
*
* If segment_complete is true, rename the current WAL file even if we've not
* completed writing the whole segment.
*/
static bool
close_walfile(char *basedir, char *walname, bool segment_complete)
{
off_t currpos = lseek(walfile, 0, SEEK_CUR);
if (currpos == -1)
{
fprintf(stderr,
_("%s: could not determine seek position in file \"%s\": %s\n"),
progname, walname, strerror(errno));
return false;
}
if (fsync(walfile) != 0)
{
fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
progname, walname, strerror(errno));
return false;
}
if (close(walfile) != 0)
{
fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
progname, walname, strerror(errno));
walfile = -1;
return false;
}
walfile = -1;
/*
* Rename the .partial file only if we've completed writing the whole
* segment or segment_complete is true.
*/
if (currpos == XLOG_SEG_SIZE || segment_complete)
{
char oldfn[MAXPGPATH];
char newfn[MAXPGPATH];
snprintf(oldfn, sizeof(oldfn), "%s/%s.partial", basedir, walname);
snprintf(newfn, sizeof(newfn), "%s/%s", basedir, walname);
if (rename(oldfn, newfn) != 0)
{
fprintf(stderr, _("%s: could not rename file \"%s\": %s\n"),
progname, walname, strerror(errno));
return false;
}
}
else
fprintf(stderr,
_("%s: not renaming \"%s\", segment is not complete\n"),
progname, walname);
return true;
}
/*
* Local version of GetCurrentTimestamp(), since we are not linked with
* backend code. The protocol always uses integer timestamps, regardless of
* server setting.
*/
static int64
localGetCurrentTimestamp(void)
{
int64 result;
struct timeval tp;
gettimeofday(&tp, NULL);
result = (int64) tp.tv_sec -
((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY);
result = (result * USECS_PER_SEC) + tp.tv_usec;
return result;
}
/*
* Local version of TimestampDifference(), since we are not linked with
* backend code.
*/
static void
localTimestampDifference(int64 start_time, int64 stop_time,
long *secs, int *microsecs)
{
int64 diff = stop_time - start_time;
if (diff <= 0)
{
*secs = 0;
*microsecs = 0;
}
else
{
*secs = (long) (diff / USECS_PER_SEC);
*microsecs = (int) (diff % USECS_PER_SEC);
}
}
/*
* Local version of TimestampDifferenceExceeds(), since we are not
* linked with backend code.
*/
static bool
localTimestampDifferenceExceeds(int64 start_time,
int64 stop_time,
int msec)
{
int64 diff = stop_time - start_time;
return (diff >= msec * INT64CONST(1000));
}
/*
* Converts an int64 to network byte order.
*/
static void
sendint64(int64 i, char *buf)
{
uint32 n32;
/* High order half first, since we're doing MSB-first */
n32 = (uint32) (i >> 32);
n32 = htonl(n32);
memcpy(&buf[0], &n32, 4);
/* Now the low order half */
n32 = (uint32) i;
n32 = htonl(n32);
memcpy(&buf[4], &n32, 4);
}
/*
* Converts an int64 from network byte order to native format.
*/
static int64
recvint64(char *buf)
{
int64 result;
uint32 h32;
uint32 l32;
memcpy(&h32, buf, 4);
memcpy(&l32, buf + 4, 4);
h32 = ntohl(h32);
l32 = ntohl(l32);
result = h32;
result <<= 32;
result |= l32;
return result;
}
/*
* Send a Standby Status Update message to server.
*/
static bool
sendFeedback(PGconn *conn, XLogRecPtr blockpos, int64 now, bool replyRequested)
{
char replybuf[1 + 8 + 8 + 8 + 8 + 1];
int len = 0;
replybuf[len] = 'r';
len += 1;
sendint64(blockpos, &replybuf[len]); /* write */
len += 8;
sendint64(InvalidXLogRecPtr, &replybuf[len]); /* flush */
len += 8;
sendint64(InvalidXLogRecPtr, &replybuf[len]); /* apply */
len += 8;
sendint64(now, &replybuf[len]); /* sendTime */
len += 8;
replybuf[len] = replyRequested ? 1 : 0; /* replyRequested */
len += 1;
if (PQputCopyData(conn, replybuf, len) <= 0 || PQflush(conn))
{
fprintf(stderr, _("%s: could not send feedback packet: %s"),
progname, PQerrorMessage(conn));
return false;
}
return true;
}
/*
* Receive a log stream starting at the specified position.
*
* If sysidentifier is specified, validate that both the system
* identifier and the timeline matches the specified ones
* (by sending an extra IDENTIFY_SYSTEM command)
*
* All received segments will be written to the directory
* specified by basedir.
*
* The stream_stop callback will be called every time data
* is received, and whenever a segment is completed. If it returns
* true, the streaming will stop and the function
* return. As long as it returns false, streaming will continue
* indefinitely.
*
* standby_message_timeout controls how often we send a message
* back to the master letting it know our progress, in seconds.
* This message will only contain the write location, and never
* flush or replay.
*
* Note: The log position *must* be at a log segment start!
*/
bool
ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline,
char *sysidentifier, char *basedir,
stream_stop_callback stream_stop,
int standby_message_timeout, bool rename_partial)
{
char query[128];
char current_walfile_name[MAXPGPATH];
PGresult *res;
char *copybuf = NULL;
int64 last_status = -1;
XLogRecPtr blockpos = InvalidXLogRecPtr;
/*
* The message format used in streaming replication changed in 9.3, so we
* cannot stream from older servers. Don't know if we would work with
* newer versions, but let's not take the risk.
*/
if (PQserverVersion(conn) / 100 != PG_VERSION_NUM / 100)
{
const char *serverver = PQparameterStatus(conn, "server_version");
fprintf(stderr, _("%s: incompatible server version %s; streaming is only supported with server version %s\n"),
progname,
serverver ? serverver : "'unknown'",
PG_MAJORVERSION);
return false;
}
if (sysidentifier != NULL)
{
/* Validate system identifier and timeline hasn't changed */
res = PQexec(conn, "IDENTIFY_SYSTEM");
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
fprintf(stderr,
_("%s: could not send replication command \"%s\": %s"),
progname, "IDENTIFY_SYSTEM", PQerrorMessage(conn));
PQclear(res);
return false;
}
if (PQnfields(res) != 3 || PQntuples(res) != 1)
{
fprintf(stderr,
_("%s: could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n"),
progname, PQntuples(res), PQnfields(res), 1, 3);
PQclear(res);
return false;
}
if (strcmp(sysidentifier, PQgetvalue(res, 0, 0)) != 0)
{
fprintf(stderr,
_("%s: system identifier does not match between base backup and streaming connection\n"),
progname);
PQclear(res);
return false;
}
if (timeline != atoi(PQgetvalue(res, 0, 1)))
{
fprintf(stderr,
_("%s: timeline does not match between base backup and streaming connection\n"),
progname);
PQclear(res);
return false;
}
PQclear(res);
}
/* Initiate the replication stream at specified location */
snprintf(query, sizeof(query), "START_REPLICATION %X/%X",
(uint32) (startpos >> 32), (uint32) startpos);
res = PQexec(conn, query);
if (PQresultStatus(res) != PGRES_COPY_BOTH)
{
fprintf(stderr, _("%s: could not send replication command \"%s\": %s"),
progname, "START_REPLICATION", PQresultErrorMessage(res));
PQclear(res);
return false;
}
PQclear(res);
/*
* Receive the actual xlog data
*/
while (1)
{
int r;
int xlogoff;
int bytes_left;
int bytes_written;
int64 now;
int hdr_len;
if (copybuf != NULL)
{
PQfreemem(copybuf);
copybuf = NULL;
}
/*
* Check if we should continue streaming, or abort at this point.
*/
if (stream_stop && stream_stop(blockpos, timeline, false))
{
if (walfile != -1 && !close_walfile(basedir, current_walfile_name,
rename_partial))
/* Potential error message is written by close_walfile */
goto error;
return true;
}
/*
* Potentially send a status message to the master
*/
now = localGetCurrentTimestamp();
if (standby_message_timeout > 0 &&
localTimestampDifferenceExceeds(last_status, now,
standby_message_timeout))
{
/* Time to send feedback! */
if (!sendFeedback(conn, blockpos, now, false))
goto error;
last_status = now;
}
r = PQgetCopyData(conn, &copybuf, 1);
if (r == 0)
{
/*
* In async mode, and no data available. We block on reading but
* not more than the specified timeout, so that we can send a
* response back to the client.
*/
fd_set input_mask;
struct timeval timeout;
struct timeval *timeoutptr;
FD_ZERO(&input_mask);
FD_SET(PQsocket(conn), &input_mask);
if (standby_message_timeout)
{
int64 targettime;
long secs;
int usecs;
targettime = last_status + (standby_message_timeout - 1) * ((int64) 1000);
localTimestampDifference(now,
targettime,
&secs,
&usecs);
if (secs <= 0)
timeout.tv_sec = 1; /* Always sleep at least 1 sec */
else
timeout.tv_sec = secs;
timeout.tv_usec = usecs;
timeoutptr = &timeout;
}
else
timeoutptr = NULL;
r = select(PQsocket(conn) + 1, &input_mask, NULL, NULL, timeoutptr);
if (r == 0 || (r < 0 && errno == EINTR))
{
/*
* Got a timeout or signal. Continue the loop and either
* deliver a status packet to the server or just go back into
* blocking.
*/
continue;
}
else if (r < 0)
{
fprintf(stderr, _("%s: select() failed: %s\n"),
progname, strerror(errno));
goto error;
}
/* Else there is actually data on the socket */
if (PQconsumeInput(conn) == 0)
{
fprintf(stderr,
_("%s: could not receive data from WAL stream: %s"),
progname, PQerrorMessage(conn));
goto error;
}
continue;
}
if (r == -1)
/* End of copy stream */
break;
if (r == -2)
{
fprintf(stderr, _("%s: could not read COPY data: %s"),
progname, PQerrorMessage(conn));
goto error;
}
/* Check the message type. */
if (copybuf[0] == 'k')
{
int pos;
bool replyRequested;
/*
* Parse the keepalive message, enclosed in the CopyData message.
* We just check if the server requested a reply, and ignore the
* rest.
*/
pos = 1; /* skip msgtype 'k' */
pos += 8; /* skip walEnd */
pos += 8; /* skip sendTime */
if (r < pos + 1)
{
fprintf(stderr, _("%s: streaming header too small: %d\n"),
progname, r);
goto error;
}
replyRequested = copybuf[pos];
/* If the server requested an immediate reply, send one. */
if (replyRequested)
{
now = localGetCurrentTimestamp();
if (!sendFeedback(conn, blockpos, now, false))
goto error;
last_status = now;
}
continue;
}
else if (copybuf[0] != 'w')
{
fprintf(stderr, _("%s: unrecognized streaming header: \"%c\"\n"),
progname, copybuf[0]);
goto error;
}
/*
* Read the header of the XLogData message, enclosed in the CopyData
* message. We only need the WAL location field (dataStart), the rest
* of the header is ignored.
*/
hdr_len = 1; /* msgtype 'w' */
hdr_len += 8; /* dataStart */
hdr_len += 8; /* walEnd */
hdr_len += 8; /* sendTime */
if (r < hdr_len + 1)
{
fprintf(stderr, _("%s: streaming header too small: %d\n"),
progname, r);
goto error;
}
blockpos = recvint64(&copybuf[1]);
/* Extract WAL location for this block */
xlogoff = blockpos % XLOG_SEG_SIZE;
/*
* Verify that the initial location in the stream matches where we
* think we are.
*/
if (walfile == -1)
{
/* No file open yet */
if (xlogoff != 0)
{
fprintf(stderr,
_("%s: received transaction log record for offset %u with no file open\n"),
progname, xlogoff);
goto error;
}
}
else
{
/* More data in existing segment */
/* XXX: store seek value don't reseek all the time */
if (lseek(walfile, 0, SEEK_CUR) != xlogoff)
{
fprintf(stderr,
_("%s: got WAL data offset %08x, expected %08x\n"),
progname, xlogoff, (int) lseek(walfile, 0, SEEK_CUR));
goto error;
}
}
bytes_left = r - hdr_len;
bytes_written = 0;
while (bytes_left)
{
int bytes_to_write;
/*
* If crossing a WAL boundary, only write up until we reach
* XLOG_SEG_SIZE.
*/
if (xlogoff + bytes_left > XLOG_SEG_SIZE)
bytes_to_write = XLOG_SEG_SIZE - xlogoff;
else
bytes_to_write = bytes_left;
if (walfile == -1)
{
walfile = open_walfile(blockpos, timeline,
basedir, current_walfile_name);
if (walfile == -1)
/* Error logged by open_walfile */
goto error;
}
if (write(walfile,
copybuf + hdr_len + bytes_written,
bytes_to_write) != bytes_to_write)
{
fprintf(stderr,
_("%s: could not write %u bytes to WAL file \"%s\": %s\n"),
progname, bytes_to_write, current_walfile_name,
strerror(errno));
goto error;
}
/* Write was successful, advance our position */
bytes_written += bytes_to_write;
bytes_left -= bytes_to_write;
blockpos += bytes_to_write;
xlogoff += bytes_to_write;
/* Did we reach the end of a WAL segment? */
if (blockpos % XLOG_SEG_SIZE == 0)
{
if (!close_walfile(basedir, current_walfile_name, false))
/* Error message written in close_walfile() */
goto error;
xlogoff = 0;
if (stream_stop != NULL)
{
/*
* Callback when the segment finished, and return if it
* told us to.
*/
if (stream_stop(blockpos, timeline, true))
return true;
}
}
}
/* No more data left to write, start receiving next copy packet */
}
/*
* The only way to get out of the loop is if the server shut down the
* replication stream. If it's a controlled shutdown, the server will send
* a shutdown message, and we'll return the latest xlog location that has
* been streamed.
*/
res = PQgetResult(conn);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
fprintf(stderr,
_("%s: unexpected termination of replication stream: %s"),
progname, PQresultErrorMessage(res));
goto error;
}
PQclear(res);
/* Complain if we've not reached stop point yet */
if (stream_stop != NULL && !stream_stop(blockpos, timeline, false))
{
fprintf(stderr, _("%s: replication stream was terminated before stop point\n"),
progname);
goto error;
}
if (copybuf != NULL)
PQfreemem(copybuf);
if (walfile != -1 && close(walfile) != 0)
fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
progname, current_walfile_name, strerror(errno));
walfile = -1;
return true;
error:
if (copybuf != NULL)
PQfreemem(copybuf);
if (walfile != -1 && close(walfile) != 0)
fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
progname, current_walfile_name, strerror(errno));
walfile = -1;
return false;
}