Add circular WAL decoding buffer.

Teach xlogreader.c to decode its output into a circular buffer, to
support optimizations based on looking ahead.

 * XLogReadRecord() works as before, consuming records one by one, and
   allowing them to be examined via the traditional XLogRecGetXXX()
   macros.

 * An alternative new interface XLogNextRecord() is added that returns
   pointers to DecodedXLogRecord structs that can be examined directly.

 * XLogReadAhead() provides a second cursor that lets you see
   further ahead, as long as data is available and there is enough space
   in the decoding buffer.  This returns DecodedXLogRecord pointers to the
   caller, but also adds them to a queue of records that will later be
   consumed by XLogNextRecord()/XLogReadRecord().

The buffer's size is controlled with wal_decode_buffer_size.  The buffer
could potentially be placed into shared memory, for future projects.
Large records that don't fit in the circular buffer are called
"oversized" and allocated separately with palloc().

Discussion: https://postgr.es/m/CA+hUKGJ4VJN8ttxScUFM8dOKX0BrBiboo5uz1cq=AovOddfHpA@mail.gmail.com
This commit is contained in:
Thomas Munro 2021-04-08 23:03:34 +12:00
parent 323cbe7c7d
commit f003d9f872
8 changed files with 734 additions and 200 deletions

View File

@ -482,10 +482,10 @@ generic_redo(XLogReaderState *record)
uint8 block_id;
/* Protect limited size of buffers[] array */
Assert(record->max_block_id < MAX_GENERIC_XLOG_PAGES);
Assert(XLogRecMaxBlockId(record) < MAX_GENERIC_XLOG_PAGES);
/* Iterate over blocks */
for (block_id = 0; block_id <= record->max_block_id; block_id++)
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
XLogRedoAction action;
@ -525,7 +525,7 @@ generic_redo(XLogReaderState *record)
}
/* Changes are done: unlock and release all buffers */
for (block_id = 0; block_id <= record->max_block_id; block_id++)
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
if (BufferIsValid(buffers[block_id]))
UnlockReleaseBuffer(buffers[block_id]);

View File

@ -1209,6 +1209,7 @@ XLogInsertRecord(XLogRecData *rdata,
StringInfoData recordBuf;
char *errormsg = NULL;
MemoryContext oldCxt;
DecodedXLogRecord *decoded;
oldCxt = MemoryContextSwitchTo(walDebugCxt);
@ -1224,6 +1225,9 @@ XLogInsertRecord(XLogRecData *rdata,
for (; rdata != NULL; rdata = rdata->next)
appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
/* How much space would it take to decode this record? */
decoded = palloc(DecodeXLogRecordRequiredSpace(recordBuf.len));
if (!debug_reader)
debug_reader = XLogReaderAllocate(wal_segment_size, NULL, NULL);
@ -1231,7 +1235,9 @@ XLogInsertRecord(XLogRecData *rdata,
{
appendStringInfoString(&buf, "error decoding record: out of memory");
}
else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
else if (!DecodeXLogRecord(debug_reader, decoded,
(XLogRecord *) recordBuf.data,
EndPos,
&errormsg))
{
appendStringInfo(&buf, "error decoding record: %s",
@ -1240,10 +1246,17 @@ XLogInsertRecord(XLogRecData *rdata,
else
{
appendStringInfoString(&buf, " - ");
/*
* Temporarily make this decoded record the current record for
* XLogRecGetXXX() macros.
*/
debug_reader->record = decoded;
xlog_outdesc(&buf, debug_reader);
debug_reader->record = NULL;
}
elog(LOG, "%s", buf.data);
pfree(decoded);
pfree(buf.data);
pfree(recordBuf.data);
MemoryContextSwitchTo(oldCxt);
@ -1417,7 +1430,7 @@ checkXLogConsistency(XLogReaderState *record)
Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
for (block_id = 0; block_id <= record->max_block_id; block_id++)
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
Buffer buf;
Page page;
@ -4383,6 +4396,7 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
ReadRecPtr = xlogreader->ReadRecPtr;
EndRecPtr = xlogreader->EndRecPtr;
if (record == NULL)
{
if (readFile >= 0)
@ -10300,7 +10314,7 @@ xlog_redo(XLogReaderState *record)
* XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
* code just to distinguish them for statistics purposes.
*/
for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
Buffer buffer;
@ -10435,7 +10449,7 @@ xlog_block_info(StringInfo buf, XLogReaderState *record)
int block_id;
/* decode block references */
for (block_id = 0; block_id <= record->max_block_id; block_id++)
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
RelFileNode rnode;
ForkNumber forknum;
@ -12104,7 +12118,7 @@ XLogPageRead(XLogReaderState *state,
XLogRecPtr targetPagePtr = state->readPagePtr;
int reqLen = state->reqLen;
int readLen = 0;
XLogRecPtr targetRecPtr = state->ReadRecPtr;
XLogRecPtr targetRecPtr = state->DecodeRecPtr;
uint32 targetPageOff;
XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
int r;
@ -12122,6 +12136,9 @@ XLogPageRead(XLogReaderState *state,
/*
* Request a restartpoint if we've replayed too much xlog since the
* last one.
*
* XXX Why is this here? Move it to recovery loop, since it's based
* on replay position, not read position?
*/
if (bgwriterLaunched)
{
@ -12613,6 +12630,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* be updated on each cycle. When we are behind,
* XLogReceiptTime will not advance, so the grace time
* allotted to conflicting queries will decrease.
*
*/
if (RecPtr < flushedUpto)
havedata = true;

File diff suppressed because it is too large Load Diff

View File

@ -350,7 +350,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
* going to initialize it. And vice versa.
*/
zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0;
willinit = (record->record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0;
if (willinit && !zeromode)
elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine");
if (!willinit && zeromode)

View File

@ -123,7 +123,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor
{
ReorderBufferAssignChild(ctx->reorder,
txid,
record->decoded_record->xl_xid,
XLogRecGetXid(record),
buf.origptr);
}

View File

@ -439,7 +439,7 @@ extractPageInfo(XLogReaderState *record)
RmgrNames[rmid], info);
}
for (block_id = 0; block_id <= record->max_block_id; block_id++)
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
RelFileNode rnode;
ForkNumber forknum;

View File

@ -397,10 +397,10 @@ XLogDumpRecordLen(XLogReaderState *record, uint32 *rec_len, uint32 *fpi_len)
* add an accessor macro for this.
*/
*fpi_len = 0;
for (block_id = 0; block_id <= record->max_block_id; block_id++)
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
if (XLogRecHasBlockImage(record, block_id))
*fpi_len += record->blocks[block_id].bimg_len;
*fpi_len += record->record->blocks[block_id].bimg_len;
}
/*
@ -498,7 +498,7 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record)
if (!config->bkp_details)
{
/* print block references (short format) */
for (block_id = 0; block_id <= record->max_block_id; block_id++)
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
if (!XLogRecHasBlockRef(record, block_id))
continue;
@ -529,7 +529,7 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record)
{
/* print block references (detailed format) */
putchar('\n');
for (block_id = 0; block_id <= record->max_block_id; block_id++)
for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
if (!XLogRecHasBlockRef(record, block_id))
continue;
@ -542,26 +542,26 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record)
blk);
if (XLogRecHasBlockImage(record, block_id))
{
if (record->blocks[block_id].bimg_info &
if (record->record->blocks[block_id].bimg_info &
BKPIMAGE_IS_COMPRESSED)
{
printf(" (FPW%s); hole: offset: %u, length: %u, "
"compression saved: %u",
XLogRecBlockImageApply(record, block_id) ?
"" : " for WAL verification",
record->blocks[block_id].hole_offset,
record->blocks[block_id].hole_length,
record->record->blocks[block_id].hole_offset,
record->record->blocks[block_id].hole_length,
BLCKSZ -
record->blocks[block_id].hole_length -
record->blocks[block_id].bimg_len);
record->record->blocks[block_id].hole_length -
record->record->blocks[block_id].bimg_len);
}
else
{
printf(" (FPW%s); hole: offset: %u, length: %u",
XLogRecBlockImageApply(record, block_id) ?
"" : " for WAL verification",
record->blocks[block_id].hole_offset,
record->blocks[block_id].hole_length);
record->record->blocks[block_id].hole_offset,
record->record->blocks[block_id].hole_length);
}
}
putchar('\n');

View File

@ -101,6 +101,7 @@ typedef enum XLogReadRecordResult
{
XLREAD_SUCCESS, /* record is successfully read */
XLREAD_NEED_DATA, /* need more data. see XLogReadRecord. */
XLREAD_FULL, /* cannot hold more data while reading ahead */
XLREAD_FAIL /* failed during reading a record */
} XLogReadRecordResult;
@ -120,6 +121,30 @@ typedef enum XLogReadRecordState
XLREAD_CONTINUATION
} XLogReadRecordState;
/*
 * The decoded contents of a record. This occupies a contiguous region of
 * memory, with main_data and blocks[n].data pointing to memory after the
 * members declared here.
 *
 * Decoded records normally live in the reader's circular decode buffer;
 * records too large to fit there are flagged "oversized" and allocated
 * separately. Either way they are chained together through "next" to form
 * the reader's queue of decoded records.
 */
typedef struct DecodedXLogRecord
{
/* Private members used for resource management. */
size_t size; /* total size of decoded record */
bool oversized; /* outside the regular decode buffer? */
struct DecodedXLogRecord *next; /* decoded record queue link */
/* Public members. */
XLogRecPtr lsn; /* start location of this record */
XLogRecPtr next_lsn; /* location of next record (end of this one) */
XLogRecord header; /* record header (xl_tot_len, xl_prev, xl_info, ...) */
/* NOTE(review): presumably the record's replication origin -- confirm */
RepOriginId record_origin;
TransactionId toplevel_xid; /* XID of top-level transaction */
char *main_data; /* record's main data portion */
uint32 main_data_len; /* main data portion's length */
int max_block_id; /* highest block_id in use (-1 if none) */
/*
 * Block references, indexed by block_id. This is a flexible array member,
 * so only entries 0..max_block_id may be examined.
 */
DecodedBkpBlock blocks[FLEXIBLE_ARRAY_MEMBER];
} DecodedXLogRecord;
struct XLogReaderState
{
/*
@ -142,10 +167,12 @@ struct XLogReaderState
* Start and end point of last record read. EndRecPtr is also used as the
* position to read next. Calling XLogBeginRead() sets EndRecPtr to the
* starting position and ReadRecPtr to invalid.
*
* Start and end point of last record returned by XLogReadRecord(). These
* are also available as record->lsn and record->next_lsn.
*/
XLogRecPtr ReadRecPtr; /* start of last record read or being read */
XLogRecPtr EndRecPtr; /* end+1 of last record read */
XLogRecPtr PrevRecPtr; /* start of previous record read */
/* ----------------------------------------
* Communication with page reader
@ -170,27 +197,43 @@ struct XLogReaderState
* Use XLogRecGet* functions to investigate the record; these fields
* should not be accessed directly.
* ----------------------------------------
* Start and end point of the last record read and decoded by
* XLogReadRecordInternal(). NextRecPtr is also used as the position to
* decode next. Calling XLogBeginRead() sets NextRecPtr and EndRecPtr to
* the requested starting position.
*/
XLogRecord *decoded_record; /* currently decoded record */
XLogRecPtr DecodeRecPtr; /* start of last record decoded */
XLogRecPtr NextRecPtr; /* end+1 of last record decoded */
XLogRecPtr PrevRecPtr; /* start of previous record decoded */
char *main_data; /* record's main data portion */
uint32 main_data_len; /* main data portion's length */
uint32 main_data_bufsz; /* allocated size of the buffer */
RepOriginId record_origin;
TransactionId toplevel_xid; /* XID of top-level transaction */
/* information about blocks referenced by the record. */
DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1];
int max_block_id; /* highest block_id in use (-1 if none) */
/* Last record returned by XLogReadRecord(). */
DecodedXLogRecord *record;
/* ----------------------------------------
* private/internal state
* ----------------------------------------
*/
/*
* Buffer for decoded records. This is a circular buffer, though
* individual records can't be split in the middle, so some space is often
* wasted at the end. Oversized records that don't fit in this space are
* allocated separately.
*/
char *decode_buffer;
size_t decode_buffer_size;
bool free_decode_buffer; /* need to free? */
char *decode_buffer_head; /* write head */
char *decode_buffer_tail; /* read head */
/*
* Queue of records that have been decoded. This is a linked list that
* usually consists of consecutive records in decode_buffer, but may also
* contain oversized records allocated with palloc().
*/
DecodedXLogRecord *decode_queue_head; /* newest decoded record */
DecodedXLogRecord *decode_queue_tail; /* oldest decoded record */
/* last read XLOG position for data currently in readBuf */
WALSegmentContext segcxt;
WALOpenSegment seg;
@ -230,7 +273,7 @@ struct XLogReaderState
uint32 readRecordBufSize;
/*
* XLogReadRecord() state
* XLogReadRecordInternal() state
*/
XLogReadRecordState readRecordState; /* state machine state */
int recordGotLen; /* amount of current record that has already
@ -238,8 +281,11 @@ struct XLogReaderState
int recordRemainLen; /* length of current record that remains */
XLogRecPtr recordContRecPtr; /* where the current record continues */
DecodedXLogRecord *decoding; /* record currently being decoded */
/* Buffer to hold error message */
char *errormsg_buf;
bool errormsg_deferred;
};
struct XLogFindNextRecordState
@ -264,6 +310,11 @@ extern XLogReaderState *XLogReaderAllocate(int wal_segment_size,
/* Free an XLogReader */
extern void XLogReaderFree(XLogReaderState *state);
/* Optionally provide a circular decoding buffer to allow readahead. */
extern void XLogReaderSetDecodeBuffer(XLogReaderState *state,
void *buffer,
size_t size);
/* Position the XLogReader to given record */
extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr);
#ifdef FRONTEND
@ -271,11 +322,21 @@ extern XLogFindNextRecordState *InitXLogFindNextRecord(XLogReaderState *reader_s
extern bool XLogFindNextRecord(XLogFindNextRecordState *state);
#endif /* FRONTEND */
/* Read the next XLog record. Returns NULL on end-of-WAL or failure */
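/*
 * Read the next record's header. The outcome is reported through the
 * XLogReadRecordResult return value; the header is passed back via *record
 * (presumably NULL on end-of-WAL or failure -- confirm in xlogreader.c).
 */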
extern XLogReadRecordResult XLogReadRecord(XLogReaderState *state,
XLogRecord **record,
char **errormsg);
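/*
 * Read the next decoded record. The outcome is reported through the
 * XLogReadRecordResult return value; the decoded record is passed back via
 * *record (presumably NULL on end-of-WAL or failure -- confirm in
 * xlogreader.c).
 */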
extern XLogReadRecordResult XLogNextRecord(XLogReaderState *state,
DecodedXLogRecord **record,
char **errormsg);
/* Try to read ahead, if there is space in the decoding buffer. */
extern XLogReadRecordResult XLogReadAhead(XLogReaderState *state,
DecodedXLogRecord **record,
char **errormsg);
/* Validate a page */
extern bool XLogReaderValidatePageHeader(XLogReaderState *state,
XLogRecPtr recptr, char *phdr);
@ -300,25 +361,32 @@ extern bool WALRead(XLogReaderState *state,
/* Functions for decoding an XLogRecord */
extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record,
extern size_t DecodeXLogRecordRequiredSpace(size_t xl_tot_len);
extern bool DecodeXLogRecord(XLogReaderState *state,
DecodedXLogRecord *decoded,
XLogRecord *record,
XLogRecPtr lsn,
char **errmsg);
#define XLogRecGetTotalLen(decoder) ((decoder)->decoded_record->xl_tot_len)
#define XLogRecGetPrev(decoder) ((decoder)->decoded_record->xl_prev)
#define XLogRecGetInfo(decoder) ((decoder)->decoded_record->xl_info)
#define XLogRecGetRmid(decoder) ((decoder)->decoded_record->xl_rmid)
#define XLogRecGetXid(decoder) ((decoder)->decoded_record->xl_xid)
#define XLogRecGetOrigin(decoder) ((decoder)->record_origin)
#define XLogRecGetTopXid(decoder) ((decoder)->toplevel_xid)
#define XLogRecGetData(decoder) ((decoder)->main_data)
#define XLogRecGetDataLen(decoder) ((decoder)->main_data_len)
#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->max_block_id >= 0)
#define XLogRecGetTotalLen(decoder) ((decoder)->record->header.xl_tot_len)
#define XLogRecGetPrev(decoder) ((decoder)->record->header.xl_prev)
#define XLogRecGetInfo(decoder) ((decoder)->record->header.xl_info)
#define XLogRecGetRmid(decoder) ((decoder)->record->header.xl_rmid)
#define XLogRecGetXid(decoder) ((decoder)->record->header.xl_xid)
#define XLogRecGetOrigin(decoder) ((decoder)->record->record_origin)
#define XLogRecGetTopXid(decoder) ((decoder)->record->toplevel_xid)
#define XLogRecGetData(decoder) ((decoder)->record->main_data)
#define XLogRecGetDataLen(decoder) ((decoder)->record->main_data_len)
#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->record->max_block_id >= 0)
#define XLogRecMaxBlockId(decoder) ((decoder)->record->max_block_id)
#define XLogRecGetBlock(decoder, i) (&(decoder)->record->blocks[(i)])
#define XLogRecHasBlockRef(decoder, block_id) \
((decoder)->blocks[block_id].in_use)
((decoder)->record->max_block_id >= (block_id)) && \
((decoder)->record->blocks[block_id].in_use)
#define XLogRecHasBlockImage(decoder, block_id) \
((decoder)->blocks[block_id].has_image)
((decoder)->record->blocks[block_id].has_image)
#define XLogRecBlockImageApply(decoder, block_id) \
((decoder)->blocks[block_id].apply_image)
((decoder)->record->blocks[block_id].apply_image)
#ifndef FRONTEND
extern FullTransactionId XLogRecGetFullXid(XLogReaderState *record);