2013-01-16 20:12:53 +01:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* xlogreader.h
|
|
|
|
* Definitions for the generic XLog reading facility
|
|
|
|
*
|
2015-01-06 17:43:47 +01:00
|
|
|
* Portions Copyright (c) 2013-2015, PostgreSQL Global Development Group
|
2013-01-16 20:12:53 +01:00
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
|
|
|
* src/include/access/xlogreader.h
|
|
|
|
*
|
|
|
|
* NOTES
|
|
|
|
* See the definition of the XLogReaderState struct for instructions on
|
|
|
|
* how to use the XLogReader infrastructure.
|
|
|
|
*
|
|
|
|
* The basic idea is to allocate an XLogReaderState via
|
|
|
|
* XLogReaderAllocate(), and call XLogReadRecord() until it returns NULL.
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
*
|
|
|
|
* After reading a record with XLogReadRecord(), it's decomposed into
|
|
|
|
* the per-block and main data parts, and the parts can be accessed
|
|
|
|
* with the XLogRec* macros and functions. You can also decode a
|
|
|
|
* record that's already constructed in memory, without reading from
|
|
|
|
* disk, by calling the DecodeXLogRecord() function.
|
2013-01-16 20:12:53 +01:00
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#ifndef XLOGREADER_H
|
|
|
|
#define XLOGREADER_H
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
#include "access/xlogrecord.h"
|
2013-01-16 20:12:53 +01:00
|
|
|
|
|
|
|
/*
 * Forward declaration: lets the XLogPageReadCB callback type below take an
 * XLogReaderState pointer before the struct itself is defined.
 */
typedef struct XLogReaderState XLogReaderState;
|
|
|
|
|
|
|
|
/*
 * Function type definition for the read_page callback.
 *
 * The full contract (meaning of targetPagePtr, reqLen, targetRecPtr, readBuf
 * and *pageTLI, and the return value) is spelled out in the comment on the
 * read_page member of struct XLogReaderState.
 */
|
|
|
|
typedef int (*XLogPageReadCB) (XLogReaderState *xlogreader,
|
|
|
|
XLogRecPtr targetPagePtr,
|
|
|
|
int reqLen,
|
Use the right timeline when beginning to stream from master.
The xlogreader refactoring broke the logic to decide which timeline to start
streaming from. XLogPageRead() uses the timeline history to check which
timeline the requested WAL position falls into. However, after the
refactoring, XLogPageRead() is always first called with the first page in
the segment, to verify the segment header, and only then with the actual WAL
position we're interested in. That first read of the segment's header made
XLogPageRead() to always start streaming from the old timeline containing
the segment header, not the timeline containing the actual record, if there
was a timeline switch within the segment.
I thought I fixed this yesterday, but that fix was too narrow and only fixed
this for the corner-case that the timeline switch happened in the first page
of the segment. To fix this more robustly, pass explicitly the position of
the record we're actually interested in to XLogPageRead, and use that to
decide which timeline to read from, rather than deduce it from the page and
offset.
Per report from Fujii Masao.
2013-01-18 10:41:36 +01:00
|
|
|
XLogRecPtr targetRecPtr,
|
2013-01-16 20:12:53 +01:00
|
|
|
char *readBuf,
|
|
|
|
TimeLineID *pageTLI);
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
/*
 * DecodedBkpBlock: decoded form of one block reference of a WAL record.
 *
 * XLogReaderState holds an array of these (blocks[]), one slot per possible
 * block_id.  Each slot identifies the referenced block and carries the
 * optional full-page image and the optional rmgr-specific data for it.
 */
typedef struct
|
|
|
|
{
|
|
|
|
/* Is this block ref in use?  Tested by the XLogRecHasBlockRef() macro. */
|
|
|
|
bool in_use;
|
|
|
|
|
|
|
|
/* Identify the block this refers to */
|
|
|
|
RelFileNode rnode;
|
|
|
|
ForkNumber forknum;
|
|
|
|
BlockNumber blkno;
|
|
|
|
|
|
|
|
/* copy of the fork_flags field from the XLogRecordBlockHeader */
|
|
|
|
uint8 flags;
|
|
|
|
|
|
|
|
/*
 * Information on full-page image, if any.  has_image is what the
 * XLogRecHasBlockImage() macro tests.
 * NOTE(review): hole_offset/hole_length presumably describe a gap omitted
 * from bkp_image -- confirm against the record decoding code.
 */
|
|
|
|
bool has_image;
|
|
|
|
char *bkp_image;
|
|
|
|
uint16 hole_offset;
|
|
|
|
uint16 hole_length;
|
|
|
|
|
|
|
|
/*
 * Buffer holding the rmgr-specific data associated with this block.
 * NOTE(review): data_len/data_bufsz presumably mirror main_data_len and
 * main_data_bufsz in XLogReaderState (bytes in use vs. allocated size of
 * the buffer) -- confirm.
 */
|
|
|
|
bool has_data;
|
|
|
|
char *data;
|
|
|
|
uint16 data_len;
|
|
|
|
uint16 data_bufsz;
|
|
|
|
} DecodedBkpBlock;
|
|
|
|
|
2013-01-16 20:12:53 +01:00
|
|
|
/*
 * XLogReaderState: all state of one XLog reader.
 *
 * Obtained from XLogReaderAllocate() and passed to XLogReadRecord() and the
 * XLogRec* accessors.  The members fall into three groups, each introduced
 * by its own banner comment below:
 *   - public parameters (callback, system identifier, private data, and the
 *     start/end position of the last record read),
 *   - the decoded representation of the current record, which should only
 *     be inspected through the XLogRecGet* functions and macros,
 *   - private/internal state of the reader.
 */
struct XLogReaderState
|
|
|
|
{
|
|
|
|
/* ----------------------------------------
|
|
|
|
* Public parameters
|
|
|
|
* ----------------------------------------
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Data input callback (mandatory).
|
|
|
|
*
|
|
|
|
* This callback shall read at least reqLen valid bytes of the xlog page
|
|
|
|
* starting at targetPagePtr, and store them in readBuf. The callback
|
|
|
|
* shall return the number of bytes read (never more than XLOG_BLCKSZ), or
|
|
|
|
* -1 on failure. The callback shall sleep, if necessary, to wait for the
|
|
|
|
* requested bytes to become available. The callback will not be invoked
|
|
|
|
* again for the same page unless more than the returned number of bytes
|
Use the right timeline when beginning to stream from master.
The xlogreader refactoring broke the logic to decide which timeline to start
streaming from. XLogPageRead() uses the timeline history to check which
timeline the requested WAL position falls into. However, after the
refactoring, XLogPageRead() is always first called with the first page in
the segment, to verify the segment header, and only then with the actual WAL
position we're interested in. That first read of the segment's header made
XLogPageRead() to always start streaming from the old timeline containing
the segment header, not the timeline containing the actual record, if there
was a timeline switch within the segment.
I thought I fixed this yesterday, but that fix was too narrow and only fixed
this for the corner-case that the timeline switch happened in the first page
of the segment. To fix this more robustly, pass explicitly the position of
the record we're actually interested in to XLogPageRead, and use that to
decide which timeline to read from, rather than deduce it from the page and
offset.
Per report from Fujii Masao.
2013-01-18 10:41:36 +01:00
|
|
|
* are needed.
|
2013-01-16 20:12:53 +01:00
|
|
|
*
|
Use the right timeline when beginning to stream from master.
The xlogreader refactoring broke the logic to decide which timeline to start
streaming from. XLogPageRead() uses the timeline history to check which
timeline the requested WAL position falls into. However, after the
refactoring, XLogPageRead() is always first called with the first page in
the segment, to verify the segment header, and only then with the actual WAL
position we're interested in. That first read of the segment's header made
XLogPageRead() to always start streaming from the old timeline containing
the segment header, not the timeline containing the actual record, if there
was a timeline switch within the segment.
I thought I fixed this yesterday, but that fix was too narrow and only fixed
this for the corner-case that the timeline switch happened in the first page
of the segment. To fix this more robustly, pass explicitly the position of
the record we're actually interested in to XLogPageRead, and use that to
decide which timeline to read from, rather than deduce it from the page and
offset.
Per report from Fujii Masao.
2013-01-18 10:41:36 +01:00
|
|
|
* targetRecPtr is the position of the WAL record we're reading. Usually
|
|
|
|
* it is equal to targetPagePtr + reqLen, but sometimes xlogreader needs
|
|
|
|
* to read and verify the page or segment header, before it reads the
|
|
|
|
* actual WAL record it's interested in. In that case, targetRecPtr can
|
|
|
|
* be used to determine which timeline to read the page from.
|
|
|
|
*
|
|
|
|
* The callback shall set *pageTLI to the TLI of the file the page was
|
|
|
|
* read from. It is currently used only for error reporting purposes, to
|
|
|
|
* reconstruct the name of the WAL file where an error occurred.
|
2013-01-16 20:12:53 +01:00
|
|
|
*/
|
|
|
|
XLogPageReadCB read_page;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* System identifier of the xlog files we're about to read. Set to zero
|
|
|
|
* (the default value) if unknown or unimportant.
|
|
|
|
*/
|
|
|
|
uint64 system_identifier;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Opaque data for callbacks to use. Not used by XLogReader.
|
|
|
|
*/
|
|
|
|
void *private_data;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Start and end point of last record read. EndRecPtr is also used as the
|
|
|
|
* position to read next, if XLogReadRecord receives an invalid recptr.
|
|
|
|
*/
|
|
|
|
XLogRecPtr ReadRecPtr; /* start of last record read */
|
|
|
|
XLogRecPtr EndRecPtr; /* end+1 of last record read */
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
|
|
|
|
/* ----------------------------------------
|
|
|
|
* Decoded representation of current record
|
|
|
|
*
|
|
|
|
* Use XLogRecGet* functions to investigate the record; these fields
|
|
|
|
* should not be accessed directly.
|
|
|
|
* ----------------------------------------
|
|
|
|
*/
|
|
|
|
XLogRecord *decoded_record; /* currently decoded record */
|
|
|
|
|
|
|
|
char *main_data; /* record's main data portion */
|
|
|
|
uint32 main_data_len; /* main data portion's length */
|
|
|
|
uint32 main_data_bufsz; /* allocated size of the buffer */
|
|
|
|
|
|
|
|
/*
 * information about blocks referenced by the record, indexed by block_id
 * (see the XLogRecHasBlockRef / XLogRecHasBlockImage macros).
 */
|
|
|
|
DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1];
|
|
|
|
|
|
|
|
int max_block_id; /* highest block_id in use (-1 if none) */
|
|
|
|
|
2013-01-16 20:12:53 +01:00
|
|
|
/* ----------------------------------------
|
|
|
|
* private/internal state
|
|
|
|
* ----------------------------------------
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
|
|
|
|
char *readBuf;
|
|
|
|
|
|
|
|
/* last read segment, segment offset, read length, TLI */
|
|
|
|
XLogSegNo readSegNo;
|
|
|
|
uint32 readOff;
|
|
|
|
uint32 readLen;
|
|
|
|
TimeLineID readPageTLI;
|
|
|
|
|
|
|
|
/* beginning of last page read, and its TLI */
|
|
|
|
XLogRecPtr latestPagePtr;
|
|
|
|
TimeLineID latestPageTLI;
|
|
|
|
|
Use the right timeline when beginning to stream from master.
The xlogreader refactoring broke the logic to decide which timeline to start
streaming from. XLogPageRead() uses the timeline history to check which
timeline the requested WAL position falls into. However, after the
refactoring, XLogPageRead() is always first called with the first page in
the segment, to verify the segment header, and only then with the actual WAL
position we're interested in. That first read of the segment's header made
XLogPageRead() to always start streaming from the old timeline containing
the segment header, not the timeline containing the actual record, if there
was a timeline switch within the segment.
I thought I fixed this yesterday, but that fix was too narrow and only fixed
this for the corner-case that the timeline switch happened in the first page
of the segment. To fix this more robustly, pass explicitly the position of
the record we're actually interested in to XLogPageRead, and use that to
decide which timeline to read from, rather than deduce it from the page and
offset.
Per report from Fujii Masao.
2013-01-18 10:41:36 +01:00
|
|
|
/* beginning of the WAL record being read. */
|
|
|
|
XLogRecPtr currRecPtr;
|
|
|
|
|
2013-01-16 20:12:53 +01:00
|
|
|
/* Buffer for current ReadRecord result (expandable) */
|
|
|
|
char *readRecordBuf;
|
|
|
|
uint32 readRecordBufSize;
|
|
|
|
|
|
|
|
/* Buffer to hold error message */
|
|
|
|
char *errormsg_buf;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Get a new XLogReader.
 *
 * pagereadfunc is the mandatory data input callback and private_data is
 * opaque data for that callback; both are described on the read_page and
 * private_data members of struct XLogReaderState.
 * NOTE(review): presumably they are stored into those members, and NULL is
 * returned when allocation fails -- confirm against the implementation.
 */
|
|
|
|
extern XLogReaderState *XLogReaderAllocate(XLogPageReadCB pagereadfunc,
|
|
|
|
void *private_data);
|
|
|
|
|
|
|
|
/*
 * Free an XLogReader.
 *
 * NOTE(review): presumably also releases the buffers hanging off the state
 * (readBuf, readRecordBuf, errormsg_buf, decoded data) -- confirm.
 */
|
|
|
|
extern void XLogReaderFree(XLogReaderState *state);
|
|
|
|
|
|
|
|
/*
 * Read the next XLog record. Returns NULL on end-of-WAL or failure.
 *
 * recptr is the position to read from; per the comment on EndRecPtr in
 * struct XLogReaderState, an invalid recptr means "continue at
 * state->EndRecPtr".  The file header describes the intended usage: call
 * this repeatedly until it returns NULL, then inspect the record through
 * the XLogRec* macros and functions.
 * NOTE(review): *errormsg presumably receives a message on failure (see
 * errormsg_buf in the state) -- confirm.
 */
|
|
|
|
extern struct XLogRecord *XLogReadRecord(XLogReaderState *state,
|
|
|
|
XLogRecPtr recptr, char **errormsg);
|
|
|
|
|
|
|
|
/*
 * Declared only when FRONTEND is defined.
 * NOTE(review): judging by the name, this presumably locates the first
 * valid record at or after RecPtr and returns its position -- confirm
 * against the implementation.
 */
#ifdef FRONTEND
|
|
|
|
extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr);
|
|
|
|
#endif /* FRONTEND */
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
/*
 * Functions for decoding an XLogRecord.
 *
 * DecodeXLogRecord() decodes a record that is already constructed in
 * memory, without reading from disk (see the file header).
 * NOTE(review): presumably the result is stored in the decoded-record
 * fields of *state, the return value is false on failure, and *errmsg then
 * receives a message -- confirm against the implementation.
 */
|
|
|
|
|
|
|
|
extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record,
|
|
|
|
char **errmsg);
|
|
|
|
|
|
|
|
/*
 * Accessor macros for the decoded record held in an XLogReaderState
 * ("decoder").  The first five read fields of the record header through
 * decoded_record; the rest read the decoded main data and block references.
 * Every macro evaluates its decoder argument exactly once.
 */
/* xl_tot_len field of the record header */
#define XLogRecGetTotalLen(decoder) ((decoder)->decoded_record->xl_tot_len)
|
|
|
|
/* xl_prev field of the record header */
#define XLogRecGetPrev(decoder) ((decoder)->decoded_record->xl_prev)
|
|
|
|
/* xl_info field of the record header */
#define XLogRecGetInfo(decoder) ((decoder)->decoded_record->xl_info)
|
|
|
|
/* xl_rmid field of the record header */
#define XLogRecGetRmid(decoder) ((decoder)->decoded_record->xl_rmid)
|
|
|
|
/* xl_xid field of the record header */
#define XLogRecGetXid(decoder) ((decoder)->decoded_record->xl_xid)
|
|
|
|
/* pointer to the main data portion of the record */
#define XLogRecGetData(decoder) ((decoder)->main_data)
|
|
|
|
/* length of the main data portion */
#define XLogRecGetDataLen(decoder) ((decoder)->main_data_len)
|
|
|
|
/* true if any block_id is in use (max_block_id is -1 when none is) */
#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->max_block_id >= 0)
|
|
|
|
/* true if the block reference slot block_id is in use */
#define XLogRecHasBlockRef(decoder, block_id) \
|
|
|
|
((decoder)->blocks[block_id].in_use)
|
|
|
|
/* true if the block reference slot block_id has a full-page image */
#define XLogRecHasBlockImage(decoder, block_id) \
|
|
|
|
((decoder)->blocks[block_id].has_image)
|
|
|
|
|
|
|
|
/*
 * Operates on the full-page image of block reference block_id of the
 * decoded record.
 * NOTE(review): judging by the name and the has_image/bkp_image/hole_*
 * fields of DecodedBkpBlock, this presumably copies the image into dst and
 * returns false on failure -- confirm, including the required size of dst.
 *
 * The first parameter is named "record" to match XLogRecGetBlockData() and
 * XLogRecGetBlockTag(); it was previously misspelled "recoder".
 */
extern bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *dst);
|
|
|
|
/*
 * Accessor for the per-block data of block reference block_id.
 * NOTE(review): presumably returns the rmgr-specific data buffer of that
 * block (the data field of DecodedBkpBlock) and stores its length in *len;
 * behaviour for a block without data is not visible here -- confirm.
 */
extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len);
|
|
|
|
/*
 * Accessor for the identity of block reference block_id.
 * NOTE(review): presumably fills *rnode, *forknum and *blknum from the
 * rnode/forknum/blkno fields of the matching DecodedBkpBlock and returns
 * false when that block reference is not in use -- confirm.
 */
extern bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id,
|
|
|
|
RelFileNode *rnode, ForkNumber *forknum,
|
|
|
|
BlockNumber *blknum);
|
|
|
|
|
2013-01-16 20:12:53 +01:00
|
|
|
#endif /* XLOGREADER_H */
|