postgresql/src/include/access/xlogreader.h
Thomas Munro 1d257577e0 Optionally prefetch referenced data in recovery.
Introduce a new GUC recovery_prefetch, disabled by default.  When
enabled, look ahead in the WAL and try to initiate asynchronous reading
of referenced data blocks that are not yet cached in our buffer pool.
For now, this is done with posix_fadvise(), which has several caveats.
Better mechanisms will follow in later work on the I/O subsystem.

The GUC maintenance_io_concurrency is used to limit the number of
concurrent I/Os we allow ourselves to initiate, based on pessimistic
heuristics used to infer that I/Os have begun and completed.

The GUC wal_decode_buffer_size is used to limit the maximum distance we
are prepared to read ahead in the WAL to find uncached blocks.

Reviewed-by: Alvaro Herrera <alvherre@2ndquadrant.com> (parts)
Reviewed-by: Andres Freund <andres@anarazel.de> (parts)
Reviewed-by: Tomas Vondra <tomas.vondra@2ndquadrant.com> (parts)
Tested-by: Tomas Vondra <tomas.vondra@2ndquadrant.com>
Tested-by: Jakub Wartak <Jakub.Wartak@tomtom.com>
Tested-by: Dmitry Dolgov <9erthalion6@gmail.com>
Tested-by: Sait Talha Nisanci <Sait.Nisanci@microsoft.com>
Discussion: https://postgr.es/m/CA%2BhUKGJ4VJN8ttxScUFM8dOKX0BrBiboo5uz1cq%3DAovOddfHpA%40mail.gmail.com
2021-04-08 23:20:42 +12:00

409 lines
14 KiB
C

/*-------------------------------------------------------------------------
*
* xlogreader.h
* Definitions for the generic XLog reading facility
*
* Portions Copyright (c) 2013-2021, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/include/access/xlogreader.h
*
* NOTES
* See the definition of the XLogReaderState struct for instructions on
* how to use the XLogReader infrastructure.
*
* The basic idea is to allocate an XLogReaderState via
* XLogReaderAllocate(), position the reader to the first record with
* XLogBeginRead() or XLogFindNextRecord(), and call XLogReadRecord()
* until it returns NULL.
*
* Callers supply a page_read callback if they want to call
* XLogReadRecord or XLogFindNextRecord; it can be passed in as NULL
* otherwise. The WALRead function can be used as a helper to write
* page_read callbacks, but it is not mandatory; callers that use it,
* must supply segment_open callbacks. The segment_close callback
* must always be supplied.
*
* After reading a record with XLogReadRecord(), it's decomposed into
* the per-block and main data parts, and the parts can be accessed
* with the XLogRec* macros and functions. You can also decode a
* record that's already constructed in memory, without reading from
* disk, by calling the DecodeXLogRecord() function.
*-------------------------------------------------------------------------
*/
#ifndef XLOGREADER_H
#define XLOGREADER_H
#ifndef FRONTEND
#include "access/transam.h"
#endif
#include "access/xlogrecord.h"
#include "storage/buf.h"
/* WALOpenSegment represents a WAL segment being read. */
typedef struct WALOpenSegment
{
int ws_file; /* segment file descriptor */
XLogSegNo ws_segno; /* segment number */
TimeLineID ws_tli; /* timeline ID of the currently open file */
} WALOpenSegment;
/* WALSegmentContext carries context information about WAL segments to read */
typedef struct WALSegmentContext
{
char ws_dir[MAXPGPATH];
int ws_segsize;
} WALSegmentContext;
typedef struct XLogReaderState XLogReaderState;
typedef struct XLogFindNextRecordState XLogFindNextRecordState;
/* Function type definition for the segment cleanup callback */
typedef void (*WALSegmentCleanupCB) (XLogReaderState *xlogreader);
/* Function type definition for the open/close callbacks for WALRead() */
typedef void (*WALSegmentOpenCB) (XLogReaderState *xlogreader,
XLogSegNo nextSegNo,
TimeLineID *tli_p);
typedef void (*WALSegmentCloseCB) (XLogReaderState *xlogreader);
typedef struct
{
/* Is this block ref in use? */
bool in_use;
/* Identify the block this refers to */
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blkno;
/* Workspace for remembering last known buffer holding this block. */
Buffer recent_buffer;
/* copy of the fork_flags field from the XLogRecordBlockHeader */
uint8 flags;
/* Information on full-page image, if any */
bool has_image; /* has image, even for consistency checking */
bool apply_image; /* has image that should be restored */
char *bkp_image;
uint16 hole_offset;
uint16 hole_length;
uint16 bimg_len;
uint8 bimg_info;
/* Buffer holding the rmgr-specific data associated with this block */
bool has_data;
char *data;
uint16 data_len;
uint16 data_bufsz;
} DecodedBkpBlock;
/* Return code from XLogReadRecord */
typedef enum XLogReadRecordResult
{
XLREAD_SUCCESS, /* record is successfully read */
XLREAD_NEED_DATA, /* need more data. see XLogReadRecord. */
XLREAD_FULL, /* cannot hold more data while reading ahead */
XLREAD_FAIL /* failed during reading a record */
} XLogReadRecordResult;
/*
* internal state of XLogReadRecord
*
* XLogReadState runs a state machine while reading a record. Theses states
* are not seen outside the function. Each state may repeat several times
* exiting requesting caller for new data. See the comment of XLogReadRecrod
* for details.
*/
typedef enum XLogReadRecordState
{
XLREAD_NEXT_RECORD,
XLREAD_TOT_LEN,
XLREAD_FIRST_FRAGMENT,
XLREAD_CONTINUATION
} XLogReadRecordState;
/*
* The decoded contents of a record. This occupies a contiguous region of
* memory, with main_data and blocks[n].data pointing to memory after the
* members declared here.
*/
typedef struct DecodedXLogRecord
{
/* Private member used for resource management. */
size_t size; /* total size of decoded record */
bool oversized; /* outside the regular decode buffer? */
struct DecodedXLogRecord *next; /* decoded record queue link */
/* Public members. */
XLogRecPtr lsn; /* location */
XLogRecPtr next_lsn; /* location of next record */
XLogRecord header; /* header */
RepOriginId record_origin;
TransactionId toplevel_xid; /* XID of top-level transaction */
char *main_data; /* record's main data portion */
uint32 main_data_len; /* main data portion's length */
int max_block_id; /* highest block_id in use (-1 if none) */
DecodedBkpBlock blocks[FLEXIBLE_ARRAY_MEMBER];
} DecodedXLogRecord;
struct XLogReaderState
{
/*
* Operational callbacks
*/
WALSegmentCleanupCB cleanup_cb;
/* ----------------------------------------
* Public parameters
* ----------------------------------------
*/
/*
* System identifier of the xlog files we're about to read. Set to zero
* (the default value) if unknown or unimportant.
*/
uint64 system_identifier;
/*
* Start and end point of last record read. EndRecPtr is also used as the
* position to read next. Calling XLogBeginRead() sets EndRecPtr to the
* starting position and ReadRecPtr to invalid.
*
* Start and end point of last record returned by XLogReadRecord(). These
* are also available as record->lsn and record->next_lsn.
*/
XLogRecPtr ReadRecPtr; /* start of last record read or being read */
XLogRecPtr EndRecPtr; /* end+1 of last record read */
/* ----------------------------------------
* Communication with page reader
* readBuf is XLOG_BLCKSZ bytes, valid up to at least reqLen bytes.
* ----------------------------------------
*/
/* variables the clients of xlogreader can examine */
XLogRecPtr readPagePtr; /* page pointer to read */
int32 reqLen; /* bytes requested to the caller */
char *readBuf; /* buffer to store data */
bool page_verified; /* is the page header on the buffer verified? */
bool record_verified;/* is the current record header verified? */
/* variables set by the client of xlogreader */
int32 readLen; /* actual bytes copied into readBuf by client,
* which should be >= reqLen. Client should
* use XLogReaderSetInputData() to set. */
/* ----------------------------------------
* Decoded representation of current record
*
* Use XLogRecGet* functions to investigate the record; these fields
* should not be accessed directly.
* ----------------------------------------
* Start and end point of the last record read and decoded by
* XLogReadRecordInternal(). NextRecPtr is also used as the position to
* decode next. Calling XLogBeginRead() sets NextRecPtr and EndRecPtr to
* the requested starting position.
*/
XLogRecPtr DecodeRecPtr; /* start of last record decoded */
XLogRecPtr NextRecPtr; /* end+1 of last record decoded */
XLogRecPtr PrevRecPtr; /* start of previous record decoded */
/* Last record returned by XLogReadRecord(). */
DecodedXLogRecord *record;
/* ----------------------------------------
* private/internal state
* ----------------------------------------
*/
/*
* Buffer for decoded records. This is a circular buffer, though
* individual records can't be split in the middle, so some space is often
* wasted at the end. Oversized records that don't fit in this space are
* allocated separately.
*/
char *decode_buffer;
size_t decode_buffer_size;
bool free_decode_buffer; /* need to free? */
char *decode_buffer_head; /* write head */
char *decode_buffer_tail; /* read head */
/*
* Queue of records that have been decoded. This is a linked list that
* usually consists of consecutive records in decode_buffer, but may also
* contain oversized records allocated with palloc().
*/
DecodedXLogRecord *decode_queue_head; /* newest decoded record */
DecodedXLogRecord *decode_queue_tail; /* oldest decoded record */
/* last read XLOG position for data currently in readBuf */
WALSegmentContext segcxt;
WALOpenSegment seg;
uint32 segoff;
/*
* beginning of prior page read, and its TLI. Doesn't necessarily
* correspond to what's in readBuf; used for timeline sanity checks.
*/
XLogRecPtr latestPagePtr;
TimeLineID latestPageTLI;
/* timeline to read it from, 0 if a lookup is required */
TimeLineID currTLI;
/*
* Safe point to read to in currTLI if current TLI is historical
* (tliSwitchPoint) or InvalidXLogRecPtr if on current timeline.
*
* Actually set to the start of the segment containing the timeline switch
* that ends currTLI's validity, not the LSN of the switch its self, since
* we can't assume the old segment will be present.
*/
XLogRecPtr currTLIValidUntil;
/*
* If currTLI is not the most recent known timeline, the next timeline to
* read from when currTLIValidUntil is reached.
*/
TimeLineID nextTLI;
/*
* Buffer for current ReadRecord result (expandable), used when a record
* crosses a page boundary.
*/
char *readRecordBuf;
uint32 readRecordBufSize;
/*
* XLogReadRecordInternal() state
*/
XLogReadRecordState readRecordState; /* state machine state */
int recordGotLen; /* amount of current record that has already
* been read */
int recordRemainLen; /* length of current record that remains */
XLogRecPtr recordContRecPtr; /* where the current record continues */
DecodedXLogRecord *decoding; /* record currently being decoded */
/* Buffer to hold error message */
char *errormsg_buf;
bool errormsg_deferred;
};
struct XLogFindNextRecordState
{
XLogReaderState *reader_state;
XLogRecPtr targetRecPtr;
XLogRecPtr currRecPtr;
};
/* Report that data is available for decoding. */
static inline void
XLogReaderSetInputData(XLogReaderState *state, int32 len)
{
state->readLen = len;
}
/* Get a new XLogReader */
extern XLogReaderState *XLogReaderAllocate(int wal_segment_size,
const char *waldir,
WALSegmentCleanupCB cleanup_cb);
/* Free an XLogReader */
extern void XLogReaderFree(XLogReaderState *state);
/* Optionally provide a circular decoding buffer to allow readahead. */
extern void XLogReaderSetDecodeBuffer(XLogReaderState *state,
void *buffer,
size_t size);
/* Position the XLogReader to given record */
extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr);
#ifdef FRONTEND
extern XLogFindNextRecordState *InitXLogFindNextRecord(XLogReaderState *reader_state, XLogRecPtr start_ptr);
extern bool XLogFindNextRecord(XLogFindNextRecordState *state);
#endif /* FRONTEND */
/* Read the next record's header. Returns NULL on end-of-WAL or failure. */
extern XLogReadRecordResult XLogReadRecord(XLogReaderState *state,
XLogRecord **record,
char **errormsg);
/* Read the next decoded record. Returns NULL on end-of-WAL or failure. */
extern XLogReadRecordResult XLogNextRecord(XLogReaderState *state,
DecodedXLogRecord **record,
char **errormsg);
/* Try to read ahead, if there is space in the decoding buffer. */
extern XLogReadRecordResult XLogReadAhead(XLogReaderState *state,
DecodedXLogRecord **record,
char **errormsg);
/* Validate a page */
extern bool XLogReaderValidatePageHeader(XLogReaderState *state,
XLogRecPtr recptr, char *phdr);
/*
* Error information from WALRead that both backend and frontend caller can
* process. Currently only errors from pg_pread can be reported.
*/
typedef struct WALReadError
{
int wre_errno; /* errno set by the last pg_pread() */
int wre_off; /* Offset we tried to read from. */
int wre_req; /* Bytes requested to be read. */
int wre_read; /* Bytes read by the last read(). */
WALOpenSegment wre_seg; /* Segment we tried to read from. */
} WALReadError;
extern bool WALRead(XLogReaderState *state,
WALSegmentOpenCB segopenfn, WALSegmentCloseCB sgclosefn,
char *buf, XLogRecPtr startptr, Size count,
TimeLineID tli, WALReadError *errinfo);
/* Functions for decoding an XLogRecord */
extern size_t DecodeXLogRecordRequiredSpace(size_t xl_tot_len);
extern bool DecodeXLogRecord(XLogReaderState *state,
DecodedXLogRecord *decoded,
XLogRecord *record,
XLogRecPtr lsn,
char **errmsg);
#define XLogRecGetTotalLen(decoder) ((decoder)->record->header.xl_tot_len)
#define XLogRecGetPrev(decoder) ((decoder)->record->header.xl_prev)
#define XLogRecGetInfo(decoder) ((decoder)->record->header.xl_info)
#define XLogRecGetRmid(decoder) ((decoder)->record->header.xl_rmid)
#define XLogRecGetXid(decoder) ((decoder)->record->header.xl_xid)
#define XLogRecGetOrigin(decoder) ((decoder)->record->record_origin)
#define XLogRecGetTopXid(decoder) ((decoder)->record->toplevel_xid)
#define XLogRecGetData(decoder) ((decoder)->record->main_data)
#define XLogRecGetDataLen(decoder) ((decoder)->record->main_data_len)
#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->record->max_block_id >= 0)
#define XLogRecMaxBlockId(decoder) ((decoder)->record->max_block_id)
#define XLogRecGetBlock(decoder, i) (&(decoder)->record->blocks[(i)])
#define XLogRecHasBlockRef(decoder, block_id) \
((decoder)->record->max_block_id >= (block_id)) && \
((decoder)->record->blocks[block_id].in_use)
#define XLogRecHasBlockImage(decoder, block_id) \
((decoder)->record->blocks[block_id].has_image)
#define XLogRecBlockImageApply(decoder, block_id) \
((decoder)->record->blocks[block_id].apply_image)
#ifndef FRONTEND
extern FullTransactionId XLogRecGetFullXid(XLogReaderState *record);
#endif
extern bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page);
extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len);
extern bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id,
RelFileNode *rnode, ForkNumber *forknum,
BlockNumber *blknum);
extern bool XLogRecGetRecentBuffer(XLogReaderState *record, uint8 block_id,
RelFileNode *rnode, ForkNumber *forknum,
BlockNumber *blknum, Buffer *recent_buffer);
#endif /* XLOGREADER_H */