2013-01-16 20:12:53 +01:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* xlogreader.h
|
|
|
|
* Definitions for the generic XLog reading facility
|
|
|
|
*
|
2016-01-02 19:33:40 +01:00
|
|
|
* Portions Copyright (c) 2013-2016, PostgreSQL Global Development Group
|
2013-01-16 20:12:53 +01:00
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
|
|
|
* src/include/access/xlogreader.h
|
|
|
|
*
|
|
|
|
* NOTES
|
|
|
|
* See the definition of the XLogReaderState struct for instructions on
|
|
|
|
* how to use the XLogReader infrastructure.
|
|
|
|
*
|
|
|
|
* The basic idea is to allocate an XLogReaderState via
|
|
|
|
* XLogReaderAllocate(), and call XLogReadRecord() until it returns NULL.
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
*
|
|
|
|
* After reading a record with XLogReadRecord(), it's decomposed into
|
|
|
|
* the per-block and main data parts, and the parts can be accessed
|
|
|
|
* with the XLogRec* macros and functions. You can also decode a
|
|
|
|
* record that's already constructed in memory, without reading from
|
|
|
|
* disk, by calling the DecodeXLogRecord() function.
|
2013-01-16 20:12:53 +01:00
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#ifndef XLOGREADER_H
|
|
|
|
#define XLOGREADER_H
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
#include "access/xlogrecord.h"
|
2013-01-16 20:12:53 +01:00
|
|
|
|
|
|
|
typedef struct XLogReaderState XLogReaderState;
|
|
|
|
|
|
|
|
/* Function type definition for the read_page callback */
|
|
|
|
typedef int (*XLogPageReadCB) (XLogReaderState *xlogreader,
|
|
|
|
XLogRecPtr targetPagePtr,
|
|
|
|
int reqLen,
|
Use the right timeline when beginning to stream from master.
The xlogreader refactoring broke the logic to decide which timeline to start
streaming from. XLogPageRead() uses the timeline history to check which
timeline the requested WAL position falls into. However, after the
refactoring, XLogPageRead() is always first called with the first page in
the segment, to verify the segment header, and only then with the actual WAL
position we're interested in. That first read of the segment's header made
XLogPageRead() always start streaming from the old timeline containing
the segment header, not the timeline containing the actual record, if there
was a timeline switch within the segment.
I thought I fixed this yesterday, but that fix was too narrow and only fixed
this for the corner-case that the timeline switch happened in the first page
of the segment. To fix this more robustly, pass explicitly the position of
the record we're actually interested in to XLogPageRead, and use that to
decide which timeline to read from, rather than deduce it from the page and
offset.
Per report from Fujii Masao.
2013-01-18 10:41:36 +01:00
|
|
|
XLogRecPtr targetRecPtr,
|
2013-01-16 20:12:53 +01:00
|
|
|
char *readBuf,
|
|
|
|
TimeLineID *pageTLI);
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
/* Is this block ref in use? */
|
|
|
|
bool in_use;
|
|
|
|
|
|
|
|
/* Identify the block this refers to */
|
|
|
|
RelFileNode rnode;
|
|
|
|
ForkNumber forknum;
|
|
|
|
BlockNumber blkno;
|
|
|
|
|
|
|
|
/* copy of the fork_flags field from the XLogRecordBlockHeader */
|
|
|
|
uint8 flags;
|
|
|
|
|
|
|
|
/* Information on full-page image, if any */
|
|
|
|
bool has_image;
|
|
|
|
char *bkp_image;
|
|
|
|
uint16 hole_offset;
|
|
|
|
uint16 hole_length;
|
Add GUC to enable compression of full page images stored in WAL.
When the newly added GUC parameter, wal_compression, is on, the PostgreSQL server
compresses a full page image written to WAL when full_page_writes is on or
during a base backup. A compressed page image will be decompressed during WAL
replay. Turning this parameter on can reduce the WAL volume without increasing
the risk of unrecoverable data corruption, but at the cost of some extra CPU
spent on the compression during WAL logging and on the decompression during
WAL replay.
This commit changes the WAL format (so bumping WAL version number) so that
the one-byte flag indicating whether a full page image is compressed or not is
included in its header information. This means that the commit increases the
WAL volume by one byte per full page image even if WAL compression is not used
at all. We could save that byte by borrowing one bit from an existing field
like hole_offset in the header and using it as the flag, for example, but that
would reduce the code readability and the extensibility of the feature.
Per discussion, it's not worth paying that price to save only one byte, so we
decided to add the one-byte flag to the header.
This commit doesn't introduce any new compression algorithm like lz4.
Currently a full page image is compressed using the existing PGLZ algorithm.
Per discussion, we decided to use it at least in the first version of the
feature because there were no performance reports showing that its compression
ratio is unacceptably lower than that of other algorithms. Of course,
in the future, it's worth considering support for other compression
algorithms for better compression.
Rahila Syed and Michael Paquier, reviewed in various versions by myself,
Andres Freund, Robert Haas, Abhijit Menon-Sen and many others.
2015-03-11 07:52:24 +01:00
|
|
|
uint16 bimg_len;
|
|
|
|
uint8 bimg_info;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
|
|
|
|
/* Buffer holding the rmgr-specific data associated with this block */
|
|
|
|
bool has_data;
|
|
|
|
char *data;
|
|
|
|
uint16 data_len;
|
|
|
|
uint16 data_bufsz;
|
|
|
|
} DecodedBkpBlock;
|
|
|
|
|
2013-01-16 20:12:53 +01:00
|
|
|
struct XLogReaderState
|
|
|
|
{
|
|
|
|
/* ----------------------------------------
|
|
|
|
* Public parameters
|
|
|
|
* ----------------------------------------
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Data input callback (mandatory).
|
|
|
|
*
|
|
|
|
* This callback shall read at least reqLen valid bytes of the xlog page
|
|
|
|
* starting at targetPagePtr, and store them in readBuf. The callback
|
|
|
|
* shall return the number of bytes read (never more than XLOG_BLCKSZ), or
|
|
|
|
* -1 on failure. The callback shall sleep, if necessary, to wait for the
|
|
|
|
* requested bytes to become available. The callback will not be invoked
|
|
|
|
* again for the same page unless more than the returned number of bytes
|
Use the right timeline when beginning to stream from master.
The xlogreader refactoring broke the logic to decide which timeline to start
streaming from. XLogPageRead() uses the timeline history to check which
timeline the requested WAL position falls into. However, after the
refactoring, XLogPageRead() is always first called with the first page in
the segment, to verify the segment header, and only then with the actual WAL
position we're interested in. That first read of the segment's header made
XLogPageRead() always start streaming from the old timeline containing
the segment header, not the timeline containing the actual record, if there
was a timeline switch within the segment.
I thought I fixed this yesterday, but that fix was too narrow and only fixed
this for the corner-case that the timeline switch happened in the first page
of the segment. To fix this more robustly, pass explicitly the position of
the record we're actually interested in to XLogPageRead, and use that to
decide which timeline to read from, rather than deduce it from the page and
offset.
Per report from Fujii Masao.
2013-01-18 10:41:36 +01:00
|
|
|
* are needed.
|
2013-01-16 20:12:53 +01:00
|
|
|
*
|
Use the right timeline when beginning to stream from master.
The xlogreader refactoring broke the logic to decide which timeline to start
streaming from. XLogPageRead() uses the timeline history to check which
timeline the requested WAL position falls into. However, after the
refactoring, XLogPageRead() is always first called with the first page in
the segment, to verify the segment header, and only then with the actual WAL
position we're interested in. That first read of the segment's header made
XLogPageRead() always start streaming from the old timeline containing
the segment header, not the timeline containing the actual record, if there
was a timeline switch within the segment.
I thought I fixed this yesterday, but that fix was too narrow and only fixed
this for the corner-case that the timeline switch happened in the first page
of the segment. To fix this more robustly, pass explicitly the position of
the record we're actually interested in to XLogPageRead, and use that to
decide which timeline to read from, rather than deduce it from the page and
offset.
Per report from Fujii Masao.
2013-01-18 10:41:36 +01:00
|
|
|
* targetRecPtr is the position of the WAL record we're reading. Usually
|
|
|
|
* it is equal to targetPagePtr + reqLen, but sometimes xlogreader needs
|
|
|
|
* to read and verify the page or segment header, before it reads the
|
|
|
|
* actual WAL record it's interested in. In that case, targetRecPtr can
|
|
|
|
* be used to determine which timeline to read the page from.
|
|
|
|
*
|
|
|
|
* The callback shall set *pageTLI to the TLI of the file the page was
|
|
|
|
* read from. It is currently used only for error reporting purposes, to
|
|
|
|
* reconstruct the name of the WAL file where an error occurred.
|
2013-01-16 20:12:53 +01:00
|
|
|
*/
|
|
|
|
XLogPageReadCB read_page;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* System identifier of the xlog files we're about to read. Set to zero
|
|
|
|
* (the default value) if unknown or unimportant.
|
|
|
|
*/
|
|
|
|
uint64 system_identifier;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Opaque data for callbacks to use. Not used by XLogReader.
|
|
|
|
*/
|
|
|
|
void *private_data;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Start and end point of last record read. EndRecPtr is also used as the
|
|
|
|
* position to read next, if XLogReadRecord receives an invalid recptr.
|
|
|
|
*/
|
|
|
|
XLogRecPtr ReadRecPtr; /* start of last record read */
|
|
|
|
XLogRecPtr EndRecPtr; /* end+1 of last record read */
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
|
|
|
|
/* ----------------------------------------
|
|
|
|
* Decoded representation of current record
|
|
|
|
*
|
|
|
|
* Use XLogRecGet* functions to investigate the record; these fields
|
|
|
|
* should not be accessed directly.
|
|
|
|
* ----------------------------------------
|
|
|
|
*/
|
|
|
|
XLogRecord *decoded_record; /* currently decoded record */
|
|
|
|
|
|
|
|
char *main_data; /* record's main data portion */
|
|
|
|
uint32 main_data_len; /* main data portion's length */
|
|
|
|
uint32 main_data_bufsz; /* allocated size of the buffer */
|
|
|
|
|
2015-05-24 03:35:49 +02:00
|
|
|
RepOriginId record_origin;
|
Introduce replication progress tracking infrastructure.
When implementing a replication solution on top of logical decoding, two
related problems exist:
* How to safely keep track of replication progress
* How to change replication behavior, based on the origin of a row;
e.g. to avoid loops in bi-directional replication setups
The solution to these problems, as implemented here, consists of
three parts:
1) 'replication origins', which identify nodes in a replication setup.
2) 'replication progress tracking', which remembers, for each
replication origin, how far replay has progressed in an efficient and
crash safe manner.
3) The ability to filter out changes performed at the behest of a
replication origin during logical decoding; this allows complex
replication topologies. E.g. by filtering all replayed changes out.
Most of this could also be implemented in "userspace", e.g. by inserting
additional rows containing origin information, but that ends up being much
less efficient and more complicated. We don't want to require various
replication solutions to reimplement logic for this independently. The
infrastructure is intended to be generic enough to be reusable.
This infrastructure also replaces the 'nodeid' infrastructure of commit
timestamps. It is intended to provide all the former capabilities,
except that there's only 2^16 different origins; but now they integrate
with logical decoding. Additionally more functionality is accessible via
SQL. Since the commit timestamp infrastructure has also been introduced
in 9.5 (commit 73c986add) changing the API is not a problem.
For now the number of origins for which the replication progress can be
tracked simultaneously is determined by the max_replication_slots
GUC. That GUC is not a perfect match to configure this, but there
doesn't seem to be sufficient reason to introduce a separate new one.
Bumps both catversion and wal page magic.
Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer
Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer
Discussion: 20150216002155.GI15326@awork2.anarazel.de,
20140923182422.GA15776@alap3.anarazel.de,
20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
/* information about blocks referenced by the record. */
|
|
|
|
DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1];
|
|
|
|
|
|
|
|
int max_block_id; /* highest block_id in use (-1 if none) */
|
|
|
|
|
2013-01-16 20:12:53 +01:00
|
|
|
/* ----------------------------------------
|
|
|
|
* private/internal state
|
|
|
|
* ----------------------------------------
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
|
|
|
|
char *readBuf;
|
|
|
|
|
|
|
|
/* last read segment, segment offset, read length, TLI */
|
|
|
|
XLogSegNo readSegNo;
|
|
|
|
uint32 readOff;
|
|
|
|
uint32 readLen;
|
|
|
|
TimeLineID readPageTLI;
|
|
|
|
|
|
|
|
/* beginning of last page read, and its TLI */
|
|
|
|
XLogRecPtr latestPagePtr;
|
|
|
|
TimeLineID latestPageTLI;
|
|
|
|
|
Use the right timeline when beginning to stream from master.
The xlogreader refactoring broke the logic to decide which timeline to start
streaming from. XLogPageRead() uses the timeline history to check which
timeline the requested WAL position falls into. However, after the
refactoring, XLogPageRead() is always first called with the first page in
the segment, to verify the segment header, and only then with the actual WAL
position we're interested in. That first read of the segment's header made
XLogPageRead() always start streaming from the old timeline containing
the segment header, not the timeline containing the actual record, if there
was a timeline switch within the segment.
I thought I fixed this yesterday, but that fix was too narrow and only fixed
this for the corner-case that the timeline switch happened in the first page
of the segment. To fix this more robustly, pass explicitly the position of
the record we're actually interested in to XLogPageRead, and use that to
decide which timeline to read from, rather than deduce it from the page and
offset.
Per report from Fujii Masao.
2013-01-18 10:41:36 +01:00
|
|
|
/* beginning of the WAL record being read. */
|
|
|
|
XLogRecPtr currRecPtr;
|
|
|
|
|
2013-01-16 20:12:53 +01:00
|
|
|
/* Buffer for current ReadRecord result (expandable) */
|
|
|
|
char *readRecordBuf;
|
|
|
|
uint32 readRecordBufSize;
|
|
|
|
|
|
|
|
/* Buffer to hold error message */
|
|
|
|
char *errormsg_buf;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Get a new XLogReader */
|
|
|
|
extern XLogReaderState *XLogReaderAllocate(XLogPageReadCB pagereadfunc,
|
|
|
|
void *private_data);
|
|
|
|
|
|
|
|
/*
 * Free an XLogReader.
 *
 * NOTE(review): presumably `state` must have been obtained from
 * XLogReaderAllocate() and must not be used afterwards -- confirm against
 * xlogreader.c.
 */
|
|
|
|
extern void XLogReaderFree(XLogReaderState *state);
|
|
|
|
|
|
|
|
/* Read the next XLog record. Returns NULL on end-of-WAL or failure */
|
|
|
|
extern struct XLogRecord *XLogReadRecord(XLogReaderState *state,
|
|
|
|
XLogRecPtr recptr, char **errormsg);
|
|
|
|
|
|
|
|
#ifdef FRONTEND
/*
 * Declared only when FRONTEND is defined.
 *
 * NOTE(review): presumably locates the first valid record at or after
 * RecPtr -- confirm against xlogreader.c.
 */
extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr);
#endif   /* FRONTEND */
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
/* Functions for decoding an XLogRecord */
|
|
|
|
|
|
|
|
/*
 * Decode a record that is already constructed in memory, without reading
 * it from disk.
 *
 * NOTE(review): presumably returns false and sets *errmsg on failure --
 * confirm against xlogreader.c.
 */
extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record,
				 char **errmsg);
|
|
|
|
|
|
|
|
/* xl_tot_len of the record currently decoded in the given reader. */
#define XLogRecGetTotalLen(reader) \
	((reader)->decoded_record->xl_tot_len)
|
|
|
|
/* xl_prev of the record currently decoded in the given reader. */
#define XLogRecGetPrev(reader) \
	((reader)->decoded_record->xl_prev)
|
|
|
|
/* xl_info of the record currently decoded in the given reader. */
#define XLogRecGetInfo(reader) \
	((reader)->decoded_record->xl_info)
|
|
|
|
/* xl_rmid of the record currently decoded in the given reader. */
#define XLogRecGetRmid(reader) \
	((reader)->decoded_record->xl_rmid)
|
|
|
|
/* xl_xid of the record currently decoded in the given reader. */
#define XLogRecGetXid(reader) \
	((reader)->decoded_record->xl_xid)
|
Introduce replication progress tracking infrastructure.
When implementing a replication solution on top of logical decoding, two
related problems exist:
* How to safely keep track of replication progress
* How to change replication behavior, based on the origin of a row;
e.g. to avoid loops in bi-directional replication setups
The solution to these problems, as implemented here, consists of
three parts:
1) 'replication origins', which identify nodes in a replication setup.
2) 'replication progress tracking', which remembers, for each
replication origin, how far replay has progressed in an efficient and
crash safe manner.
3) The ability to filter out changes performed at the behest of a
replication origin during logical decoding; this allows complex
replication topologies. E.g. by filtering all replayed changes out.
Most of this could also be implemented in "userspace", e.g. by inserting
additional rows containing origin information, but that ends up being much
less efficient and more complicated. We don't want to require various
replication solutions to reimplement logic for this independently. The
infrastructure is intended to be generic enough to be reusable.
This infrastructure also replaces the 'nodeid' infrastructure of commit
timestamps. It is intended to provide all the former capabilities,
except that there's only 2^16 different origins; but now they integrate
with logical decoding. Additionally more functionality is accessible via
SQL. Since the commit timestamp infrastructure has also been introduced
in 9.5 (commit 73c986add) changing the API is not a problem.
For now the number of origins for which the replication progress can be
tracked simultaneously is determined by the max_replication_slots
GUC. That GUC is not a perfect match to configure this, but there
doesn't seem to be sufficient reason to introduce a separate new one.
Bumps both catversion and wal page magic.
Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer
Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer
Discussion: 20150216002155.GI15326@awork2.anarazel.de,
20140923182422.GA15776@alap3.anarazel.de,
20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
|
|
|
/* record_origin stored in the given reader for the current record. */
#define XLogRecGetOrigin(reader) \
	((reader)->record_origin)
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
/* Pointer to the main data portion of the current record. */
#define XLogRecGetData(reader) \
	((reader)->main_data)
|
|
|
|
/* Length of the main data portion of the current record. */
#define XLogRecGetDataLen(reader) \
	((reader)->main_data_len)
|
|
|
|
/* True when max_block_id is non-negative, i.e. at least one block ref is in use. */
#define XLogRecHasAnyBlockRefs(reader) \
	((reader)->max_block_id >= 0)
|
|
|
|
/*
 * True if block reference block_id is marked in_use in the decoded record.
 * The continuation line must directly follow the backslash.
 */
#define XLogRecHasBlockRef(decoder, block_id) \
	((decoder)->blocks[block_id].in_use)
|
|
|
|
/*
 * True if block reference block_id carries a full-page image (has_image).
 * The continuation line must directly follow the backslash.
 */
#define XLogRecHasBlockImage(decoder, block_id) \
	((decoder)->blocks[block_id].has_image)
|
|
|
|
|
|
|
|
/*
 * NOTE(review): presumably reconstructs the full-page image of block
 * block_id into dst and returns false on failure -- confirm against
 * xlogreader.c.
 */
extern bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *dst);
|
|
|
|
/*
 * NOTE(review): presumably returns the rmgr-specific data attached to block
 * block_id and stores its length in *len -- confirm against xlogreader.c.
 */
extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len);
|
|
|
|
extern bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id,
|
|
|
|
RelFileNode *rnode, ForkNumber *forknum,
|
|
|
|
BlockNumber *blknum);
|
|
|
|
|
2013-01-16 20:12:53 +01:00
|
|
|
#endif /* XLOGREADER_H */
|