diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index f129a87501..45bd1f1b7e 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3588,89 +3588,6 @@ include_dir 'conf.d' - - - Recovery - - - configuration - of recovery - general settings - - - - This section describes the settings that apply to recovery in general, - affecting crash recovery, streaming replication and archive-based - replication. - - - - - - recovery_prefetch (boolean) - - recovery_prefetch configuration parameter - - - - - Whether to try to prefetch blocks that are referenced in the WAL that - are not yet in the buffer pool, during recovery. Prefetching blocks - that will soon be needed can reduce I/O wait times in some workloads. - See also the and - settings, which limit - prefetching activity. - This setting is disabled by default. - - - This feature currently depends on an effective - posix_fadvise function, which some - operating systems lack. - - - - - - recovery_prefetch_fpw (boolean) - - recovery_prefetch_fpw configuration parameter - - - - - Whether to prefetch blocks that were logged with full page images, - during recovery. Often this doesn't help, since such blocks will not - be read the first time they are needed and might remain in the buffer - pool after that. However, on file systems with a block size larger - than - PostgreSQL's, prefetching can avoid a - costly read-before-write when blocks are later written. - The default is off. - - - - - - wal_decode_buffer_size (integer) - - wal_decode_buffer_size configuration parameter - - - - - A limit on how far ahead the server can look in the WAL, to find - blocks to prefetch. Setting it too high might be counterproductive, - if it means that data falls out of the - kernel cache before it is needed. If this value is specified without - units, it is taken as bytes. - The default is 512kB. - - - - - - - Archive Recovery diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 370cdc2e1a..dcbb10fb6f 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -337,13 +337,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser - - pg_stat_prefetch_recoverypg_stat_prefetch_recovery - Only one row, showing statistics about blocks prefetched during recovery. - See for details. - - - pg_stat_subscriptionpg_stat_subscription At least one row per subscription, showing information about @@ -2948,78 +2941,6 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i copy of the subscribed tables. - - <structname>pg_stat_prefetch_recovery</structname> View - - - - Column - Type - Description - - - - - - prefetch - bigint - Number of blocks prefetched because they were not in the buffer pool - - - skip_hit - bigint - Number of blocks not prefetched because they were already in the buffer pool - - - skip_new - bigint - Number of blocks not prefetched because they were new (usually relation extension) - - - skip_fpw - bigint - Number of blocks not prefetched because a full page image was included in the WAL and was set to off - - - skip_seq - bigint - Number of blocks not prefetched because of repeated access - - - distance - integer - How far ahead of recovery the prefetcher is currently reading, in bytes - - - queue_depth - integer - How many prefetches have been initiated but are not yet known to have completed - - - avg_distance - float4 - How far ahead of recovery the prefetcher is on average, while recovery is not idle - - - avg_queue_depth - float4 - Average number of prefetches in flight while recovery is not idle - - - -
- - - The pg_stat_prefetch_recovery view will contain only - one row. It is filled with nulls if recovery is not running or WAL - prefetching is not enabled. See - for more information. The counters in this view are reset whenever the - , - or - setting is changed and - the server configuration is reloaded. - - <structname>pg_stat_subscription</structname> View @@ -5152,11 +5073,8 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i all the counters shown in the pg_stat_bgwriter view, archiver to reset all the counters shown in - the pg_stat_archiver view, - wal to reset all the counters shown in the - pg_stat_wal view or - prefetch_recovery to reset all the counters shown - in the pg_stat_prefetch_recovery view. + the pg_stat_archiver view or wal + to reset all the counters shown in the pg_stat_wal view. This function is restricted to superusers by default, but other users diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index 9606c617d4..60f066d247 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -803,21 +803,6 @@ counted as wal_write and wal_sync in pg_stat_wal, respectively. - - - The parameter can - be used to improve I/O performance during recovery by instructing - PostgreSQL to initiate reads - of disk blocks that will soon be needed but are not currently in - PostgreSQL's buffer pool. - The and - settings limit prefetching - concurrency and distance, respectively. The - prefetching mechanism is most likely to be effective on systems - with full_page_writes set to - off (where that is safe), and where the working - set is larger than RAM. By default, prefetching in recovery is disabled. - diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 39f9d4e77d..595e02de72 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -31,7 +31,6 @@ OBJS = \ xlogarchive.o \ xlogfuncs.o \ xloginsert.o \ - xlogprefetch.o \ xlogreader.o \ xlogutils.o diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index 0e9bcc7159..63301a1ab1 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -482,10 +482,10 @@ generic_redo(XLogReaderState *record) uint8 block_id; /* Protect limited size of buffers[] array */ - Assert(XLogRecMaxBlockId(record) < MAX_GENERIC_XLOG_PAGES); + Assert(record->max_block_id < MAX_GENERIC_XLOG_PAGES); /* Iterate over blocks */ - for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (block_id = 0; block_id <= record->max_block_id; block_id++) { XLogRedoAction action; @@ -525,7 +525,7 @@ generic_redo(XLogReaderState *record) } /* Changes are done: unlock and release all buffers */ - for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (block_id = 0; block_id <= record->max_block_id; block_id++) { if (BufferIsValid(buffers[block_id])) UnlockReleaseBuffer(buffers[block_id]); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index b6581349a3..46f3d08249 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1330,8 +1330,11 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) char *errormsg; TimeLineID save_currtli = ThisTimeLineID; - xlogreader = XLogReaderAllocate(wal_segment_size, NULL, wal_segment_close); - + xlogreader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &read_local_xlog_page, + .segment_open = &wal_segment_open, + .segment_close = &wal_segment_close), + NULL); if (!xlogreader) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), @@ -1339,12 +1342,7 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) errdetail("Failed while allocating a WAL reading processor."))); XLogBeginRead(xlogreader, lsn); - while (XLogReadRecord(xlogreader, &record, &errormsg) == - XLREAD_NEED_DATA) - { - if (!read_local_xlog_page(xlogreader)) - break; - } + record = XLogReadRecord(xlogreader, &errormsg); /* * Restore immediately the timeline where it was previously, as diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index adfc6f67e2..c1d4415a43 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -35,7 +35,6 @@ #include "access/xlog_internal.h" #include "access/xlogarchive.h" #include "access/xloginsert.h" -#include "access/xlogprefetch.h" #include "access/xlogreader.h" #include "access/xlogutils.h" #include "catalog/catversion.h" @@ -111,7 +110,6 @@ int CommitDelay = 0; /* precommit delay in microseconds */ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; -int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; #ifdef WAL_DEBUG @@ -813,13 +811,17 @@ static XLogSegNo openLogSegNo = 0; * These variables are used similarly to the ones above, but for reading * the XLOG. Note, however, that readOff generally represents the offset * of the page just read, not the seek position of the FD itself, which - * will be just past that page. readSource indicates where we got the - * currently open file from. + * will be just past that page. readLen indicates how much of the current + * page has been read into readBuf, and readSource indicates where we got + * the currently open file from. * Note: we could use Reserve/ReleaseExternalFD to track consumption of * this FD too; but it doesn't currently seem worthwhile, since the XLOG is * not read by general-purpose sessions. */ static int readFile = -1; +static XLogSegNo readSegNo = 0; +static uint32 readOff = 0; +static uint32 readLen = 0; static XLogSource readSource = XLOG_FROM_ANY; /* @@ -836,6 +838,13 @@ static XLogSource currentSource = XLOG_FROM_ANY; static bool lastSourceFailed = false; static bool pendingWalRcvRestart = false; +typedef struct XLogPageReadPrivate +{ + int emode; + bool fetching_ckpt; /* are we fetching a checkpoint record? */ + bool randAccess; +} XLogPageReadPrivate; + /* * These variables track when we last obtained some WAL data to process, * and where we got it from. (XLogReceiptSource is initially the same as @@ -911,13 +920,10 @@ static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, XLogSource source, bool notfoundOk); static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); -static bool XLogPageRead(XLogReaderState *state, - bool fetching_ckpt, int emode, bool randAccess, - bool nowait); +static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *readBuf); static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, - bool fetching_ckpt, - XLogRecPtr tliRecPtr, - XLogSegNo readSegNo); + bool fetching_ckpt, XLogRecPtr tliRecPtr); static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); static void XLogFileClose(void); static void PreallocXlogFiles(XLogRecPtr endptr); @@ -1212,7 +1218,6 @@ XLogInsertRecord(XLogRecData *rdata, StringInfoData recordBuf; char *errormsg = NULL; MemoryContext oldCxt; - DecodedXLogRecord *decoded; oldCxt = MemoryContextSwitchTo(walDebugCxt); @@ -1228,19 +1233,15 @@ XLogInsertRecord(XLogRecData *rdata, for (; rdata != NULL; rdata = rdata->next) appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len); - /* How much space would it take to decode this record? */ - decoded = palloc(DecodeXLogRecordRequiredSpace(recordBuf.len)); - if (!debug_reader) - debug_reader = XLogReaderAllocate(wal_segment_size, NULL, NULL); + debug_reader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(), NULL); if (!debug_reader) { appendStringInfoString(&buf, "error decoding record: out of memory"); } - else if (!DecodeXLogRecord(debug_reader, decoded, - (XLogRecord *) recordBuf.data, - EndPos, + else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data, &errormsg)) { appendStringInfo(&buf, "error decoding record: %s", @@ -1249,17 +1250,10 @@ XLogInsertRecord(XLogRecData *rdata, else { appendStringInfoString(&buf, " - "); - /* - * Temporarily make this decoded record the current record for - * XLogRecGetXXX() macros. - */ - debug_reader->record = decoded; xlog_outdesc(&buf, debug_reader); - debug_reader->record = NULL; } elog(LOG, "%s", buf.data); - pfree(decoded); pfree(buf.data); pfree(recordBuf.data); MemoryContextSwitchTo(oldCxt); @@ -1433,7 +1427,7 @@ checkXLogConsistency(XLogReaderState *record) Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); - for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (block_id = 0; block_id <= record->max_block_id; block_id++) { Buffer buf; Page page; @@ -1464,7 +1458,7 @@ checkXLogConsistency(XLogReaderState *record) * temporary page. */ buf = XLogReadBufferExtended(rnode, forknum, blkno, - RBM_NORMAL_NO_LOG, InvalidBuffer); + RBM_NORMAL_NO_LOG); if (!BufferIsValid(buf)) continue; @@ -3732,6 +3726,7 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", xlogfname); set_ps_display(activitymsg); + restoredFromArchive = RestoreArchivedFile(path, xlogfname, "RECOVERYXLOG", wal_segment_size, @@ -4378,7 +4373,12 @@ ReadRecord(XLogReaderState *xlogreader, int emode, bool fetching_ckpt) { XLogRecord *record; - bool randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); + XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; + + /* Pass through parameters to XLogPageRead */ + private->fetching_ckpt = fetching_ckpt; + private->emode = emode; + private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); /* This is the first attempt to read this page. */ lastSourceFailed = false; @@ -4386,19 +4386,10 @@ ReadRecord(XLogReaderState *xlogreader, int emode, for (;;) { char *errormsg; - XLogReadRecordResult result; - - while ((result = XLogReadRecord(xlogreader, &record, &errormsg)) - == XLREAD_NEED_DATA) - { - if (!XLogPageRead(xlogreader, fetching_ckpt, emode, randAccess, - false /* wait for data if streaming */)) - break; - } + record = XLogReadRecord(xlogreader, &errormsg); ReadRecPtr = xlogreader->ReadRecPtr; EndRecPtr = xlogreader->EndRecPtr; - if (record == NULL) { if (readFile >= 0) @@ -6466,6 +6457,7 @@ StartupXLOG(void) bool backupFromStandby = false; DBState dbstate_at_startup; XLogReaderState *xlogreader; + XLogPageReadPrivate private; bool promoted = false; struct stat st; @@ -6624,9 +6616,13 @@ StartupXLOG(void) OwnLatch(&XLogCtl->recoveryWakeupLatch); /* Set up XLOG reader facility */ + MemSet(&private, 0, sizeof(XLogPageReadPrivate)); xlogreader = - XLogReaderAllocate(wal_segment_size, NULL, wal_segment_close); - + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &XLogPageRead, + .segment_open = NULL, + .segment_close = wal_segment_close), + &private); if (!xlogreader) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), @@ -6634,12 +6630,6 @@ StartupXLOG(void) errdetail("Failed while allocating a WAL reading processor."))); xlogreader->system_identifier = ControlFile->system_identifier; - /* - * Set the WAL decode buffer size. This limits how far ahead we can read - * in the WAL. - */ - XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size); - /* * Allocate two page buffers dedicated to WAL consistency checks. We do * it this way, rather than just making static arrays, for two reasons: @@ -7320,7 +7310,6 @@ StartupXLOG(void) { ErrorContextCallback errcallback; TimestampTz xtime; - XLogPrefetchState prefetch; PGRUsage ru0; pg_rusage_init(&ru0); @@ -7331,9 +7320,6 @@ StartupXLOG(void) (errmsg("redo starts at %X/%X", LSN_FORMAT_ARGS(ReadRecPtr)))); - /* Prepare to prefetch, if configured. */ - XLogPrefetchBegin(&prefetch, xlogreader); - /* * main redo apply loop */ @@ -7363,14 +7349,6 @@ StartupXLOG(void) /* Handle interrupt signals of startup process */ HandleStartupProcInterrupts(); - /* Perform WAL prefetching, if enabled. */ - while (XLogPrefetch(&prefetch, xlogreader->ReadRecPtr) == XLREAD_NEED_DATA) - { - if (!XLogPageRead(xlogreader, false, LOG, false, - true /* don't wait for streaming data */)) - break; - } - /* * Pause WAL replay, if requested by a hot-standby session via * SetRecoveryPause(). @@ -7544,9 +7522,6 @@ StartupXLOG(void) */ if (AllowCascadeReplication()) WalSndWakeup(); - - /* Reset the prefetcher. */ - XLogPrefetchReconfigure(); } /* Exit loop if we reached inclusive recovery target */ @@ -7563,7 +7538,6 @@ StartupXLOG(void) /* * end of main redo apply loop */ - XLogPrefetchEnd(&prefetch); if (reachedRecoveryTarget) { @@ -7845,8 +7819,7 @@ StartupXLOG(void) XLogRecPtr pageBeginPtr; pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ); - Assert(XLogSegmentOffset(xlogreader->readPagePtr, wal_segment_size) == - XLogSegmentOffset(pageBeginPtr, wal_segment_size)); + Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); firstIdx = XLogRecPtrToBufIdx(EndOfLog); @@ -10338,7 +10311,7 @@ xlog_redo(XLogReaderState *record) * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info * code just to distinguish them for statistics purposes. */ - for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++) { Buffer buffer; @@ -10473,7 +10446,7 @@ xlog_block_info(StringInfo buf, XLogReaderState *record) int block_id; /* decode block references */ - for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (block_id = 0; block_id <= record->max_block_id; block_id++) { RelFileNode rnode; ForkNumber forknum; @@ -12133,19 +12106,14 @@ CancelBackup(void) * and call XLogPageRead() again with the same arguments. This lets * XLogPageRead() to try fetching the record from another source, or to * sleep and retry. - * - * If nowait is true, then return false immediately if the requested data isn't - * available yet. */ -static bool -XLogPageRead(XLogReaderState *state, - bool fetching_ckpt, int emode, bool randAccess, bool nowait) +static int +XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *readBuf) { - char *readBuf = state->readBuf; - XLogRecPtr targetPagePtr = state->readPagePtr; - int reqLen = state->reqLen; - int readLen = 0; - XLogRecPtr targetRecPtr = state->DecodeRecPtr; + XLogPageReadPrivate *private = + (XLogPageReadPrivate *) xlogreader->private_data; + int emode = private->emode; uint32 targetPageOff; XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; int r; @@ -12158,7 +12126,7 @@ XLogPageRead(XLogReaderState *state, * is not in the currently open one. */ if (readFile >= 0 && - !XLByteInSeg(targetPagePtr, state->seg.ws_segno, wal_segment_size)) + !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) { /* * Request a restartpoint if we've replayed too much xlog since the @@ -12166,10 +12134,10 @@ XLogPageRead(XLogReaderState *state, */ if (bgwriterLaunched) { - if (XLogCheckpointNeeded(state->seg.ws_segno)) + if (XLogCheckpointNeeded(readSegNo)) { (void) GetRedoRecPtr(); - if (XLogCheckpointNeeded(state->seg.ws_segno)) + if (XLogCheckpointNeeded(readSegNo)) RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); } } @@ -12179,7 +12147,7 @@ XLogPageRead(XLogReaderState *state, readSource = XLOG_FROM_ANY; } - XLByteToSeg(targetPagePtr, state->seg.ws_segno, wal_segment_size); + XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); retry: /* See if we need to retrieve more data */ @@ -12187,22 +12155,18 @@ retry: (readSource == XLOG_FROM_STREAM && flushedUpto < targetPagePtr + reqLen)) { - if (nowait) - { - XLogReaderSetInputData(state, -1); - return false; - } - if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen, - randAccess, fetching_ckpt, - targetRecPtr, state->seg.ws_segno)) + private->randAccess, + private->fetching_ckpt, + targetRecPtr)) { if (readFile >= 0) close(readFile); readFile = -1; + readLen = 0; readSource = XLOG_FROM_ANY; - XLogReaderSetInputData(state, -1); - return false; + + return -1; } } @@ -12229,36 +12193,40 @@ retry: else readLen = XLOG_BLCKSZ; + /* Read the requested page */ + readOff = targetPageOff; + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); - r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) targetPageOff); + r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); if (r != XLOG_BLCKSZ) { char fname[MAXFNAMELEN]; int save_errno = errno; pgstat_report_wait_end(); - XLogFileName(fname, curFileTLI, state->seg.ws_segno, wal_segment_size); + XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); if (r < 0) { errno = save_errno; ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode_for_file_access(), errmsg("could not read from log segment %s, offset %u: %m", - fname, targetPageOff))); + fname, readOff))); } else ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read from log segment %s, offset %u: read %d of %zu", - fname, targetPageOff, r, (Size) XLOG_BLCKSZ))); + fname, readOff, r, (Size) XLOG_BLCKSZ))); goto next_record_is_invalid; } pgstat_report_wait_end(); - Assert(targetSegNo == state->seg.ws_segno); - Assert(readLen >= reqLen); + Assert(targetSegNo == readSegNo); + Assert(targetPageOff == readOff); + Assert(reqLen <= readLen); - state->seg.ws_tli = curFileTLI; + xlogreader->seg.ws_tli = curFileTLI; /* * Check the page header immediately, so that we can retry immediately if @@ -12286,16 +12254,14 @@ retry: * Validating the page header is cheap enough that doing it twice * shouldn't be a big deal from a performance point of view. */ - if (!XLogReaderValidatePageHeader(state, targetPagePtr, readBuf)) + if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) { - /* reset any error StateValidatePageHeader() might have set */ - state->errormsg_buf[0] = '\0'; + /* reset any error XLogReaderValidatePageHeader() might have set */ + xlogreader->errormsg_buf[0] = '\0'; goto next_record_is_invalid; } - Assert(state->readPagePtr == targetPagePtr); - XLogReaderSetInputData(state, readLen); - return true; + return readLen; next_record_is_invalid: lastSourceFailed = true; @@ -12303,14 +12269,14 @@ next_record_is_invalid: if (readFile >= 0) close(readFile); readFile = -1; + readLen = 0; readSource = XLOG_FROM_ANY; /* In standby-mode, keep trying */ if (StandbyMode) goto retry; - - XLogReaderSetInputData(state, -1); - return false; + else + return -1; } /* @@ -12341,8 +12307,7 @@ next_record_is_invalid: */ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, - bool fetching_ckpt, XLogRecPtr tliRecPtr, - XLogSegNo readSegNo) + bool fetching_ckpt, XLogRecPtr tliRecPtr) { static TimestampTz last_fail_time = 0; TimestampTz now; @@ -12426,7 +12391,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, */ currentSource = XLOG_FROM_STREAM; startWalReceiver = true; - XLogPrefetchReconfigure(); break; case XLOG_FROM_STREAM: @@ -12661,7 +12625,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * be updated on each cycle. When we are behind, * XLogReceiptTime will not advance, so the grace time * allotted to conflicting queries will decrease. - * */ if (RecPtr < flushedUpto) havedata = true; @@ -12682,7 +12645,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, else havedata = false; } - if (havedata) { /* diff --git a/src/backend/access/transam/xlogprefetch.c b/src/backend/access/transam/xlogprefetch.c deleted file mode 100644 index ae4585232b..0000000000 --- a/src/backend/access/transam/xlogprefetch.c +++ /dev/null @@ -1,923 +0,0 @@ -/*------------------------------------------------------------------------- - * - * xlogprefetch.c - * Prefetching support for recovery. - * - * Portions Copyright (c) 2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/access/transam/xlogprefetch.c - * - * The goal of this module is to read future WAL records and issue - * PrefetchSharedBuffer() calls for referenced blocks, so that we avoid I/O - * stalls in the main recovery loop. - * - * When examining a WAL record from the future, we need to consider that a - * referenced block or segment file might not exist on disk until this record - * or some earlier record has been replayed. After a crash, a file might also - * be missing because it was dropped by a later WAL record; in that case, it - * will be recreated when this record is replayed. These cases are handled by - * recognizing them and adding a "filter" that prevents all prefetching of a - * certain block range until the present WAL record has been replayed. Blocks - * skipped for these reasons are counted as "skip_new" (that is, cases where we - * didn't try to prefetch "new" blocks). - * - * Blocks found in the buffer pool already are counted as "skip_hit". - * Repeated access to the same buffer is detected and skipped, and this is - * counted with "skip_seq". Blocks that were logged with FPWs are skipped if - * recovery_prefetch_fpw is off, since on most systems there will be no I/O - * stall; this is counted with "skip_fpw". - * - * The only way we currently have to know that an I/O initiated with - * PrefetchSharedBuffer() has completed is to wait for the corresponding call - * to XLogReadBufferInRedo() to return. Therefore, we track the number of - * potentially in-flight I/Os by using a circular buffer of LSNs. When it's - * full, we have to wait for recovery to replay enough records to remove some - * LSNs, and only then can we initiate more prefetching. Ideally, this keeps - * us just the right distance ahead to respect maintenance_io_concurrency, - * though in practice it errs on the side of being too conservative because - * many I/Os complete sooner than we know. - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" - -#include "access/xlog.h" -#include "access/xlogprefetch.h" -#include "access/xlogreader.h" -#include "access/xlogutils.h" -#include "catalog/storage_xlog.h" -#include "utils/fmgrprotos.h" -#include "utils/timestamp.h" -#include "funcapi.h" -#include "pgstat.h" -#include "miscadmin.h" -#include "port/atomics.h" -#include "storage/bufmgr.h" -#include "storage/shmem.h" -#include "storage/smgr.h" -#include "utils/guc.h" -#include "utils/hsearch.h" - -/* - * Sample the queue depth and distance every time we replay this much WAL. - * This is used to compute avg_queue_depth and avg_distance for the log - * message that appears at the end of crash recovery. It's also used to send - * messages periodically to the stats collector, to save the counters on disk. - */ -#define XLOGPREFETCHER_SAMPLE_DISTANCE 0x40000 - -/* GUCs */ -bool recovery_prefetch = false; -bool recovery_prefetch_fpw = false; - -int XLogPrefetchReconfigureCount; - -/* - * A prefetcher object. There is at most one of these in existence at a time, - * recreated whenever there is a configuration change. - */ -struct XLogPrefetcher -{ - /* Reader and current reading state. */ - XLogReaderState *reader; - DecodedXLogRecord *record; - int next_block_id; - bool shutdown; - - /* Details of last prefetch to skip repeats and seq scans. */ - SMgrRelation last_reln; - RelFileNode last_rnode; - BlockNumber last_blkno; - - /* Online averages. */ - uint64 samples; - double avg_queue_depth; - double avg_distance; - XLogRecPtr next_sample_lsn; - - /* Book-keeping required to avoid accessing non-existing blocks. */ - HTAB *filter_table; - dlist_head filter_queue; - - /* Book-keeping required to limit concurrent prefetches. */ - int prefetch_head; - int prefetch_tail; - int prefetch_queue_size; - XLogRecPtr prefetch_queue[MAX_IO_CONCURRENCY + 1]; -}; - -/* - * A temporary filter used to track block ranges that haven't been created - * yet, whole relations that haven't been created yet, and whole relations - * that we must assume have already been dropped. - */ -typedef struct XLogPrefetcherFilter -{ - RelFileNode rnode; - XLogRecPtr filter_until_replayed; - BlockNumber filter_from_block; - dlist_node link; -} XLogPrefetcherFilter; - -/* - * Counters exposed in shared memory for pg_stat_prefetch_recovery. - */ -typedef struct XLogPrefetchStats -{ - pg_atomic_uint64 reset_time; /* Time of last reset. */ - pg_atomic_uint64 prefetch; /* Prefetches initiated. */ - pg_atomic_uint64 skip_hit; /* Blocks already buffered. */ - pg_atomic_uint64 skip_new; /* New/missing blocks filtered. */ - pg_atomic_uint64 skip_fpw; /* FPWs skipped. */ - pg_atomic_uint64 skip_seq; /* Repeat blocks skipped. */ - float avg_distance; - float avg_queue_depth; - - /* Reset counters */ - pg_atomic_uint32 reset_request; - uint32 reset_handled; - - /* Dynamic values */ - int distance; /* Number of bytes ahead in the WAL. */ - int queue_depth; /* Number of I/Os possibly in progress. */ -} XLogPrefetchStats; - -static inline void XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher, - RelFileNode rnode, - BlockNumber blockno, - XLogRecPtr lsn); -static inline bool XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, - RelFileNode rnode, - BlockNumber blockno); -static inline void XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher, - XLogRecPtr replaying_lsn); -static inline void XLogPrefetcherInitiatedIO(XLogPrefetcher *prefetcher, - XLogRecPtr prefetching_lsn); -static inline void XLogPrefetcherCompletedIO(XLogPrefetcher *prefetcher, - XLogRecPtr replaying_lsn); -static inline bool XLogPrefetcherSaturated(XLogPrefetcher *prefetcher); -static bool XLogPrefetcherScanRecords(XLogPrefetcher *prefetcher, - XLogRecPtr replaying_lsn); -static bool XLogPrefetcherScanBlocks(XLogPrefetcher *prefetcher); -static void XLogPrefetchSaveStats(void); -static void XLogPrefetchRestoreStats(void); - -static XLogPrefetchStats *SharedStats; - -size_t -XLogPrefetchShmemSize(void) -{ - return sizeof(XLogPrefetchStats); -} - -static void -XLogPrefetchResetStats(void) -{ - pg_atomic_write_u64(&SharedStats->reset_time, GetCurrentTimestamp()); - pg_atomic_write_u64(&SharedStats->prefetch, 0); - pg_atomic_write_u64(&SharedStats->skip_hit, 0); - pg_atomic_write_u64(&SharedStats->skip_new, 0); - pg_atomic_write_u64(&SharedStats->skip_fpw, 0); - pg_atomic_write_u64(&SharedStats->skip_seq, 0); - SharedStats->avg_distance = 0; - SharedStats->avg_queue_depth = 0; -} - -void -XLogPrefetchShmemInit(void) -{ - bool found; - - SharedStats = (XLogPrefetchStats *) - ShmemInitStruct("XLogPrefetchStats", - sizeof(XLogPrefetchStats), - &found); - - if (!found) - { - pg_atomic_init_u32(&SharedStats->reset_request, 0); - SharedStats->reset_handled = 0; - - pg_atomic_init_u64(&SharedStats->reset_time, GetCurrentTimestamp()); - pg_atomic_init_u64(&SharedStats->prefetch, 0); - pg_atomic_init_u64(&SharedStats->skip_hit, 0); - pg_atomic_init_u64(&SharedStats->skip_new, 0); - pg_atomic_init_u64(&SharedStats->skip_fpw, 0); - pg_atomic_init_u64(&SharedStats->skip_seq, 0); - SharedStats->avg_distance = 0; - SharedStats->avg_queue_depth = 0; - SharedStats->distance = 0; - SharedStats->queue_depth = 0; - } -} - -/* - * Called when any GUC is changed that affects prefetching. - */ -void -XLogPrefetchReconfigure(void) -{ - XLogPrefetchReconfigureCount++; -} - -/* - * Called by any backend to request that the stats be reset. - */ -void -XLogPrefetchRequestResetStats(void) -{ - pg_atomic_fetch_add_u32(&SharedStats->reset_request, 1); -} - -/* - * Tell the stats collector to serialize the shared memory counters into the - * stats file. - */ -static void -XLogPrefetchSaveStats(void) -{ - PgStat_RecoveryPrefetchStats serialized = { - .prefetch = pg_atomic_read_u64(&SharedStats->prefetch), - .skip_hit = pg_atomic_read_u64(&SharedStats->skip_hit), - .skip_new = pg_atomic_read_u64(&SharedStats->skip_new), - .skip_fpw = pg_atomic_read_u64(&SharedStats->skip_fpw), - .skip_seq = pg_atomic_read_u64(&SharedStats->skip_seq), - .stat_reset_timestamp = pg_atomic_read_u64(&SharedStats->reset_time) - }; - - pgstat_send_recoveryprefetch(&serialized); -} - -/* - * Try to restore the shared memory counters from the stats file. - */ -static void -XLogPrefetchRestoreStats(void) -{ - PgStat_RecoveryPrefetchStats *serialized = pgstat_fetch_recoveryprefetch(); - - if (serialized->stat_reset_timestamp != 0) - { - pg_atomic_write_u64(&SharedStats->prefetch, serialized->prefetch); - pg_atomic_write_u64(&SharedStats->skip_hit, serialized->skip_hit); - pg_atomic_write_u64(&SharedStats->skip_new, serialized->skip_new); - pg_atomic_write_u64(&SharedStats->skip_fpw, serialized->skip_fpw); - pg_atomic_write_u64(&SharedStats->skip_seq, serialized->skip_seq); - pg_atomic_write_u64(&SharedStats->reset_time, serialized->stat_reset_timestamp); - } -} - -/* - * Increment a counter in shared memory. This is equivalent to *counter++ on a - * plain uint64 without any memory barrier or locking, except on platforms - * where readers can't read uint64 without possibly observing a torn value. - */ -static inline void -XLogPrefetchIncrement(pg_atomic_uint64 *counter) -{ - Assert(AmStartupProcess() || !IsUnderPostmaster); - pg_atomic_write_u64(counter, pg_atomic_read_u64(counter) + 1); -} - -/* - * Initialize an XLogPrefetchState object and restore the last saved - * statistics from disk. - */ -void -XLogPrefetchBegin(XLogPrefetchState *state, XLogReaderState *reader) -{ - XLogPrefetchRestoreStats(); - - /* We'll reconfigure on the first call to XLogPrefetch(). */ - state->reader = reader; - state->prefetcher = NULL; - state->reconfigure_count = XLogPrefetchReconfigureCount - 1; -} - -/* - * Shut down the prefetching infrastructure, if configured. - */ -void -XLogPrefetchEnd(XLogPrefetchState *state) -{ - XLogPrefetchSaveStats(); - - if (state->prefetcher) - XLogPrefetcherFree(state->prefetcher); - state->prefetcher = NULL; - - SharedStats->queue_depth = 0; - SharedStats->distance = 0; -} - -/* - * Create a prefetcher that is ready to begin prefetching blocks referenced by - * WAL records. - */ -XLogPrefetcher * -XLogPrefetcherAllocate(XLogReaderState *reader) -{ - XLogPrefetcher *prefetcher; - static HASHCTL hash_table_ctl = { - .keysize = sizeof(RelFileNode), - .entrysize = sizeof(XLogPrefetcherFilter) - }; - - /* - * The size of the queue is based on the maintenance_io_concurrency - * setting. In theory we might have a separate queue for each tablespace, - * but it's not clear how that should work, so for now we'll just use the - * general GUC to rate-limit all prefetching. The queue has space for up - * the highest possible value of the GUC + 1, because our circular buffer - * has a gap between head and tail when full. - */ - prefetcher = palloc0(sizeof(XLogPrefetcher)); - prefetcher->prefetch_queue_size = maintenance_io_concurrency + 1; - prefetcher->reader = reader; - prefetcher->filter_table = hash_create("XLogPrefetcherFilterTable", 1024, - &hash_table_ctl, - HASH_ELEM | HASH_BLOBS); - dlist_init(&prefetcher->filter_queue); - - SharedStats->queue_depth = 0; - SharedStats->distance = 0; - - return prefetcher; -} - -/* - * Destroy a prefetcher and release all resources. - */ -void -XLogPrefetcherFree(XLogPrefetcher *prefetcher) -{ - /* Log final statistics. */ - ereport(LOG, - (errmsg("recovery finished prefetching at %X/%X; " - "prefetch = %llu, " - "skip_hit = %llu, " - "skip_new = %llu, " - "skip_fpw = %llu, " - "skip_seq = %llu, " - "avg_distance = %f, " - "avg_queue_depth = %f", - LSN_FORMAT_ARGS(prefetcher->reader->EndRecPtr), - (unsigned long long) pg_atomic_read_u64(&SharedStats->prefetch), - (unsigned long long) pg_atomic_read_u64(&SharedStats->skip_hit), - (unsigned long long) pg_atomic_read_u64(&SharedStats->skip_new), - (unsigned long long) pg_atomic_read_u64(&SharedStats->skip_fpw), - (unsigned long long) pg_atomic_read_u64(&SharedStats->skip_seq), - SharedStats->avg_distance, - SharedStats->avg_queue_depth))); - hash_destroy(prefetcher->filter_table); - pfree(prefetcher); -} - -/* - * Called when recovery is replaying a new LSN, to check if we can read ahead. - */ -bool -XLogPrefetcherReadAhead(XLogPrefetcher *prefetcher, XLogRecPtr replaying_lsn) -{ - uint32 reset_request; - - /* If an error has occurred or we've hit the end of the WAL, do nothing. */ - if (prefetcher->shutdown) - return false; - - /* - * Have any in-flight prefetches definitely completed, judging by the LSN - * that is currently being replayed? - */ - XLogPrefetcherCompletedIO(prefetcher, replaying_lsn); - - /* - * Do we already have the maximum permitted number of I/Os running - * (according to the information we have)? If so, we have to wait for at - * least one to complete, so give up early and let recovery catch up. - */ - if (XLogPrefetcherSaturated(prefetcher)) - return false; - - /* - * Can we drop any filters yet? This happens when the LSN that is - * currently being replayed has moved past a record that prevents - * prefetching of a block range, such as relation extension. - */ - XLogPrefetcherCompleteFilters(prefetcher, replaying_lsn); - - /* - * Have we been asked to reset our stats counters? This is checked with - * an unsynchronized memory read, but we'll see it eventually and we'll be - * accessing that cache line anyway. - */ - reset_request = pg_atomic_read_u32(&SharedStats->reset_request); - if (reset_request != SharedStats->reset_handled) - { - XLogPrefetchResetStats(); - SharedStats->reset_handled = reset_request; - - prefetcher->avg_distance = 0; - prefetcher->avg_queue_depth = 0; - prefetcher->samples = 0; - } - - /* OK, we can now try reading ahead. */ - return XLogPrefetcherScanRecords(prefetcher, replaying_lsn); -} - -/* - * Read ahead as far as we are allowed to, considering the LSN that recovery - * is currently replaying. - * - * Return true if the xlogreader would like more data. - */ -static bool -XLogPrefetcherScanRecords(XLogPrefetcher *prefetcher, XLogRecPtr replaying_lsn) -{ - XLogReaderState *reader = prefetcher->reader; - DecodedXLogRecord *record; - - Assert(!XLogPrefetcherSaturated(prefetcher)); - - for (;;) - { - char *error; - int64 distance; - - /* If we don't already have a record, then try to read one. */ - if (prefetcher->record == NULL) - { - switch (XLogReadAhead(reader, &record, &error)) - { - case XLREAD_NEED_DATA: - return true; - case XLREAD_FAIL: - if (error) - ereport(LOG, - (errmsg("recovery no longer prefetching: %s", - error))); - else - ereport(LOG, - (errmsg("recovery no longer prefetching"))); - prefetcher->shutdown = true; - SharedStats->queue_depth = 0; - SharedStats->distance = 0; - - return false; - case XLREAD_FULL: - return false; - case XLREAD_SUCCESS: - prefetcher->record = record; - prefetcher->next_block_id = 0; - break; - } - } - else - { - /* - * We ran out of I/O queue while part way through a record. We'll - * carry on where we left off, according to next_block_id. - */ - record = prefetcher->record; - } - - /* How far ahead of replay are we now? */ - distance = record->lsn - replaying_lsn; - - /* Update distance shown in shm. */ - SharedStats->distance = distance; - - /* Periodically recompute some statistics. */ - if (unlikely(replaying_lsn >= prefetcher->next_sample_lsn)) - { - /* Compute online averages. */ - prefetcher->samples++; - if (prefetcher->samples == 1) - { - prefetcher->avg_distance = SharedStats->distance; - prefetcher->avg_queue_depth = SharedStats->queue_depth; - } - else - { - prefetcher->avg_distance += - (SharedStats->distance - prefetcher->avg_distance) / - prefetcher->samples; - prefetcher->avg_queue_depth += - (SharedStats->queue_depth - prefetcher->avg_queue_depth) / - prefetcher->samples; - } - - /* Expose it in shared memory. */ - SharedStats->avg_distance = prefetcher->avg_distance; - SharedStats->avg_queue_depth = prefetcher->avg_queue_depth; - - /* Also periodically save the simple counters. */ - XLogPrefetchSaveStats(); - - prefetcher->next_sample_lsn = - replaying_lsn + XLOGPREFETCHER_SAMPLE_DISTANCE; - } - - /* Are we not far enough ahead? */ - if (distance <= 0) - { - /* XXX Is this still possible? */ - prefetcher->record = NULL; /* skip this record */ - continue; - } - - /* - * If this is a record that creates a new SMGR relation, we'll avoid - * prefetching anything from that rnode until it has been replayed. - */ - if (replaying_lsn < record->lsn && - record->header.xl_rmid == RM_SMGR_ID && - (record->header.xl_info & ~XLR_INFO_MASK) == XLOG_SMGR_CREATE) - { - xl_smgr_create *xlrec = (xl_smgr_create *) record->main_data; - - XLogPrefetcherAddFilter(prefetcher, xlrec->rnode, 0, record->lsn); - } - - /* Scan the record's block references. */ - if (!XLogPrefetcherScanBlocks(prefetcher)) - return false; - - /* Advance to the next record. */ - prefetcher->record = NULL; - } -} - -/* - * Scan the current record for block references, and consider prefetching. - * - * Return true if we processed the current record to completion and still have - * queue space to process a new record, and false if we saturated the I/O - * queue and need to wait for recovery to advance before we continue. - */ -static bool -XLogPrefetcherScanBlocks(XLogPrefetcher *prefetcher) -{ - DecodedXLogRecord *record = prefetcher->record; - - Assert(!XLogPrefetcherSaturated(prefetcher)); - - /* - * We might already have been partway through processing this record when - * our queue became saturated, so we need to start where we left off. - */ - for (int block_id = prefetcher->next_block_id; - block_id <= record->max_block_id; - ++block_id) - { - DecodedBkpBlock *block = &record->blocks[block_id]; - PrefetchBufferResult prefetch; - SMgrRelation reln; - - /* Ignore everything but the main fork for now. */ - if (block->forknum != MAIN_FORKNUM) - continue; - - /* - * If there is a full page image attached, we won't be reading the - * page, so you might think we should skip it. However, if the - * underlying filesystem uses larger logical blocks than us, it might - * still need to perform a read-before-write some time later. - * Therefore, only prefetch if configured to do so. - */ - if (block->has_image && !recovery_prefetch_fpw) - { - XLogPrefetchIncrement(&SharedStats->skip_fpw); - continue; - } - - /* - * If this block will initialize a new page then it's probably a - * relation extension. Since that might create a new segment, we - * can't try to prefetch this block until the record has been - * replayed, or we might try to open a file that doesn't exist yet. - */ - if (block->flags & BKPBLOCK_WILL_INIT) - { - XLogPrefetcherAddFilter(prefetcher, block->rnode, block->blkno, - record->lsn); - XLogPrefetchIncrement(&SharedStats->skip_new); - continue; - } - - /* Should we skip this block due to a filter? */ - if (XLogPrefetcherIsFiltered(prefetcher, block->rnode, block->blkno)) - { - XLogPrefetchIncrement(&SharedStats->skip_new); - continue; - } - - /* Fast path for repeated references to the same relation. */ - if (RelFileNodeEquals(block->rnode, prefetcher->last_rnode)) - { - /* - * If this is a repeat access to the same block, then skip it. - * - * XXX We could also check for last_blkno + 1 too, and also update - * last_blkno; it's not clear if the kernel would do a better job - * of sequential prefetching. - */ - if (block->blkno == prefetcher->last_blkno) - { - XLogPrefetchIncrement(&SharedStats->skip_seq); - continue; - } - - /* We can avoid calling smgropen(). */ - reln = prefetcher->last_reln; - } - else - { - /* Otherwise we have to open it. */ - reln = smgropen(block->rnode, InvalidBackendId); - prefetcher->last_rnode = block->rnode; - prefetcher->last_reln = reln; - } - prefetcher->last_blkno = block->blkno; - - /* Try to prefetch this block! */ - prefetch = PrefetchSharedBuffer(reln, block->forknum, block->blkno); - if (BufferIsValid(prefetch.recent_buffer)) - { - /* - * It was already cached, so do nothing. We'll remember the - * buffer, so that recovery can try to avoid looking it up again. - */ - block->recent_buffer = prefetch.recent_buffer; - XLogPrefetchIncrement(&SharedStats->skip_hit); - } - else if (prefetch.initiated_io) - { - /* - * I/O has possibly been initiated (though we don't know if it was - * already cached by the kernel, so we just have to assume that it - * has due to lack of better information). Record this as an I/O - * in progress until eventually we replay this LSN. - */ - XLogPrefetchIncrement(&SharedStats->prefetch); - XLogPrefetcherInitiatedIO(prefetcher, record->lsn); - - /* - * If the queue is now full, we'll have to wait before processing - * any more blocks from this record, or move to a new record if - * that was the last block. - */ - if (XLogPrefetcherSaturated(prefetcher)) - { - prefetcher->next_block_id = block_id + 1; - return false; - } - } - else - { - /* - * Neither cached nor initiated. The underlying segment file - * doesn't exist. Presumably it will be unlinked by a later WAL - * record. When recovery reads this block, it will use the - * EXTENSION_CREATE_RECOVERY flag. We certainly don't want to do - * that sort of thing while merely prefetching, so let's just - * ignore references to this relation until this record is - * replayed, and let recovery create the dummy file or complain if - * something is wrong. - */ - XLogPrefetcherAddFilter(prefetcher, block->rnode, 0, - record->lsn); - XLogPrefetchIncrement(&SharedStats->skip_new); - } - } - - return true; -} - -/* - * Expose statistics about recovery prefetching. - */ -Datum -pg_stat_get_prefetch_recovery(PG_FUNCTION_ARGS) -{ -#define PG_STAT_GET_PREFETCH_RECOVERY_COLS 10 - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - TupleDesc tupdesc; - Tuplestorestate *tupstore; - MemoryContext per_query_ctx; - MemoryContext oldcontext; - Datum values[PG_STAT_GET_PREFETCH_RECOVERY_COLS]; - bool nulls[PG_STAT_GET_PREFETCH_RECOVERY_COLS]; - - if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("set-valued function called in context that cannot accept a set"))); - if (!(rsinfo->allowedModes & SFRM_Materialize)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("materialize mode required, but it is not allowed in this context"))); - - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - - per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; - oldcontext = MemoryContextSwitchTo(per_query_ctx); - - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->returnMode = SFRM_Materialize; - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - - MemoryContextSwitchTo(oldcontext); - - if (pg_atomic_read_u32(&SharedStats->reset_request) != SharedStats->reset_handled) - { - /* There's an unhandled reset request, so just show NULLs */ - for (int i = 0; i < PG_STAT_GET_PREFETCH_RECOVERY_COLS; ++i) - nulls[i] = true; - } - else - { - for (int i = 0; i < PG_STAT_GET_PREFETCH_RECOVERY_COLS; ++i) - nulls[i] = false; - } - - values[0] = TimestampTzGetDatum(pg_atomic_read_u64(&SharedStats->reset_time)); - values[1] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->prefetch)); - values[2] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_hit)); - values[3] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_new)); - values[4] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_fpw)); - values[5] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_seq)); - values[6] = Int32GetDatum(SharedStats->distance); - values[7] = Int32GetDatum(SharedStats->queue_depth); - values[8] = Float4GetDatum(SharedStats->avg_distance); - values[9] = Float4GetDatum(SharedStats->avg_queue_depth); - tuplestore_putvalues(tupstore, tupdesc, values, nulls); - tuplestore_donestoring(tupstore); - - return (Datum) 0; -} - -/* - * Compute (n + 1) % prefetch_queue_size, assuming n < prefetch_queue_size, - * without using division. - */ -static inline int -XLogPrefetcherNext(XLogPrefetcher *prefetcher, int n) -{ - int next = n + 1; - - return next == prefetcher->prefetch_queue_size ? 0 : next; -} - -/* - * Don't prefetch any blocks >= 'blockno' from a given 'rnode', until 'lsn' - * has been replayed. - */ -static inline void -XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher, RelFileNode rnode, - BlockNumber blockno, XLogRecPtr lsn) -{ - XLogPrefetcherFilter *filter; - bool found; - - filter = hash_search(prefetcher->filter_table, &rnode, HASH_ENTER, &found); - if (!found) - { - /* - * Don't allow any prefetching of this block or higher until replayed. - */ - filter->filter_until_replayed = lsn; - filter->filter_from_block = blockno; - dlist_push_head(&prefetcher->filter_queue, &filter->link); - } - else - { - /* - * We were already filtering this rnode. Extend the filter's lifetime - * to cover this WAL record, but leave the (presumably lower) block - * number there because we don't want to have to track individual - * blocks. - */ - filter->filter_until_replayed = lsn; - dlist_delete(&filter->link); - dlist_push_head(&prefetcher->filter_queue, &filter->link); - } -} - -/* - * Have we replayed the records that caused us to begin filtering a block - * range? That means that relations should have been created, extended or - * dropped as required, so we can drop relevant filters. - */ -static inline void -XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher, XLogRecPtr replaying_lsn) -{ - while (unlikely(!dlist_is_empty(&prefetcher->filter_queue))) - { - XLogPrefetcherFilter *filter = dlist_tail_element(XLogPrefetcherFilter, - link, - &prefetcher->filter_queue); - - if (filter->filter_until_replayed >= replaying_lsn) - break; - dlist_delete(&filter->link); - hash_search(prefetcher->filter_table, filter, HASH_REMOVE, NULL); - } -} - -/* - * Check if a given block should be skipped due to a filter. - */ -static inline bool -XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileNode rnode, - BlockNumber blockno) -{ - /* - * Test for empty queue first, because we expect it to be empty most of - * the time and we can avoid the hash table lookup in that case. - */ - if (unlikely(!dlist_is_empty(&prefetcher->filter_queue))) - { - XLogPrefetcherFilter *filter = hash_search(prefetcher->filter_table, &rnode, - HASH_FIND, NULL); - - if (filter && filter->filter_from_block <= blockno) - return true; - } - - return false; -} - -/* - * Insert an LSN into the queue. The queue must not be full already. This - * tracks the fact that we have (to the best of our knowledge) initiated an - * I/O, so that we can impose a cap on concurrent prefetching. - */ -static inline void -XLogPrefetcherInitiatedIO(XLogPrefetcher *prefetcher, - XLogRecPtr prefetching_lsn) -{ - Assert(!XLogPrefetcherSaturated(prefetcher)); - prefetcher->prefetch_queue[prefetcher->prefetch_head] = prefetching_lsn; - prefetcher->prefetch_head = - XLogPrefetcherNext(prefetcher, prefetcher->prefetch_head); - SharedStats->queue_depth++; - - Assert(SharedStats->queue_depth <= prefetcher->prefetch_queue_size); -} - -/* - * Have we replayed the records that caused us to initiate the oldest - * prefetches yet? That means that they're definitely finished, so we can can - * forget about them and allow ourselves to initiate more prefetches. For now - * we don't have any awareness of when I/O really completes. - */ -static inline void -XLogPrefetcherCompletedIO(XLogPrefetcher *prefetcher, XLogRecPtr replaying_lsn) -{ - while (prefetcher->prefetch_head != prefetcher->prefetch_tail && - prefetcher->prefetch_queue[prefetcher->prefetch_tail] < replaying_lsn) - { - prefetcher->prefetch_tail = - XLogPrefetcherNext(prefetcher, prefetcher->prefetch_tail); - SharedStats->queue_depth--; - - Assert(SharedStats->queue_depth >= 0); - } -} - -/* - * Check if the maximum allowed number of I/Os is already in flight. - */ -static inline bool -XLogPrefetcherSaturated(XLogPrefetcher *prefetcher) -{ - int next = XLogPrefetcherNext(prefetcher, prefetcher->prefetch_head); - - return next == prefetcher->prefetch_tail; -} - -void -assign_recovery_prefetch(bool new_value, void *extra) -{ - /* Reconfigure prefetching, because a setting it depends on changed. */ - recovery_prefetch = new_value; - if (AmStartupProcess()) - XLogPrefetchReconfigure(); -} - -void -assign_recovery_prefetch_fpw(bool new_value, void *extra) -{ - /* Reconfigure prefetching, because a setting it depends on changed. */ - recovery_prefetch_fpw = new_value; - if (AmStartupProcess()) - XLogPrefetchReconfigure(); -} diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 4277e92d7c..42738eb940 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -36,14 +36,11 @@ static void report_invalid_record(XLogReaderState *state, const char *fmt,...) pg_attribute_printf(2, 3); static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); -static bool XLogNeedData(XLogReaderState *state, XLogRecPtr pageptr, - int reqLen, bool header_inclusive); -size_t DecodeXLogRecordRequiredSpace(size_t xl_tot_len); -static XLogReadRecordResult XLogDecodeOneRecord(XLogReaderState *state, - bool allow_oversized); +static int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, + int reqLen); static void XLogReaderInvalReadState(XLogReaderState *state); static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, - XLogRecPtr PrevRecPtr, XLogRecord *record); + XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess); static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr); static void ResetDecoder(XLogReaderState *state); @@ -53,8 +50,6 @@ static void WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, /* size of the buffer allocated for error message. */ #define MAX_ERRORMSG_LEN 1000 -#define DEFAULT_DECODE_BUFFER_SIZE 0x10000 - /* * Construct a string in state->errormsg_buf explaining what's wrong with * the current record being read. @@ -69,8 +64,6 @@ report_invalid_record(XLogReaderState *state, const char *fmt,...) va_start(args, fmt); vsnprintf(state->errormsg_buf, MAX_ERRORMSG_LEN, fmt, args); va_end(args); - - state->errormsg_deferred = true; } /* @@ -80,7 +73,7 @@ report_invalid_record(XLogReaderState *state, const char *fmt,...) */ XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, - WALSegmentCleanupCB cleanup_cb) + XLogReaderRoutine *routine, void *private_data) { XLogReaderState *state; @@ -91,7 +84,9 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir, return NULL; /* initialize caller-provided support functions */ - state->cleanup_cb = cleanup_cb; + state->routine = *routine; + + state->max_block_id = -1; /* * Permanently allocate readBuf. We do it this way, rather than just @@ -112,7 +107,9 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir, WALOpenSegmentInit(&state->seg, &state->segcxt, wal_segment_size, waldir); - /* ReadRecPtr, EndRecPtr, reqLen and readLen initialized to zeroes above */ + /* system_identifier initialized to zeroes above */ + state->private_data = private_data; + /* ReadRecPtr, EndRecPtr and readLen initialized to zeroes above */ state->errormsg_buf = palloc_extended(MAX_ERRORMSG_LEN + 1, MCXT_ALLOC_NO_OOM); if (!state->errormsg_buf) @@ -141,11 +138,18 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir, void XLogReaderFree(XLogReaderState *state) { - if (state->seg.ws_file >= 0) - state->cleanup_cb(state); + int block_id; - if (state->decode_buffer && state->free_decode_buffer) - pfree(state->decode_buffer); + if (state->seg.ws_file != -1) + state->routine.segment_close(state); + + for (block_id = 0; block_id <= XLR_MAX_BLOCK_ID; block_id++) + { + if (state->blocks[block_id].data) + pfree(state->blocks[block_id].data); + } + if (state->main_data) + pfree(state->main_data); pfree(state->errormsg_buf); if (state->readRecordBuf) @@ -154,22 +158,6 @@ XLogReaderFree(XLogReaderState *state) pfree(state); } -/* - * Set the size of the decoding buffer. A pointer to a caller supplied memory - * region may also be passed in, in which case non-oversized records will be - * decoded there. - */ -void -XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size) -{ - Assert(state->decode_buffer == NULL); - - state->decode_buffer = buffer; - state->decode_buffer_size = size; - state->decode_buffer_head = buffer; - state->decode_buffer_tail = buffer; -} - /* * Allocate readRecordBuf to fit a record of at least the given length. * Returns true if successful, false if out of memory. @@ -257,799 +245,290 @@ XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) /* Begin at the passed-in record pointer. */ state->EndRecPtr = RecPtr; - state->NextRecPtr = RecPtr; state->ReadRecPtr = InvalidXLogRecPtr; - state->DecodeRecPtr = InvalidXLogRecPtr; - state->readRecordState = XLREAD_NEXT_RECORD; } /* - * See if we can release the last record that was returned by - * XLogReadRecord(), to free up space. + * Attempt to read an XLOG record. + * + * XLogBeginRead() or XLogFindNextRecord() must be called before the first call + * to XLogReadRecord(). + * + * If the page_read callback fails to read the requested data, NULL is + * returned. The callback is expected to have reported the error; errormsg + * is set to NULL. + * + * If the reading fails for some other reason, NULL is also returned, and + * *errormsg is set to a string with details of the failure. + * + * The returned pointer (or *errormsg) points to an internal buffer that's + * valid until the next call to XLogReadRecord. */ -static void -XLogReleasePreviousRecord(XLogReaderState *state) -{ - DecodedXLogRecord *record; - - /* - * Remove it from the decoded record queue. It must be the oldest - * item decoded, decode_queue_tail. - */ - record = state->record; - Assert(record == state->decode_queue_tail); - state->record = NULL; - state->decode_queue_tail = record->next; - - /* It might also be the newest item decoded, decode_queue_head. */ - if (state->decode_queue_head == record) - state->decode_queue_head = NULL; - - /* Release the space. */ - if (unlikely(record->oversized)) - { - /* It's not in the the decode buffer, so free it to release space. */ - pfree(record); - } - else - { - /* It must be the tail record in the decode buffer. */ - Assert(state->decode_buffer_tail == (char *) record); - - /* - * We need to update tail to point to the next record that is in the - * decode buffer, if any, being careful to skip oversized ones - * (they're not in the decode buffer). - */ - record = record->next; - while (unlikely(record && record->oversized)) - record = record->next; - - if (record) - { - /* Adjust tail to release space up to the next record. */ - state->decode_buffer_tail = (char *) record; - } - else if (state->decoding && !state->decoding->oversized) - { - /* - * We're releasing the last fully decoded record in - * XLogReadRecord(), but some time earlier we partially decoded a - * record in XLogReadAhead() and were unable to complete the job. - * We'll set the buffer head and tail to point to the record we - * started working on, so that we can continue (perhaps from a - * different source). - */ - state->decode_buffer_tail = (char *) state->decoding; - state->decode_buffer_head = (char *) state->decoding; - } - else - { - /* - * Otherwise we might as well just reset head and tail to the - * start of the buffer space, because we're empty. This means - * we'll keep overwriting the same piece of memory if we're not - * doing any prefetching. - */ - state->decode_buffer_tail = state->decode_buffer; - state->decode_buffer_head = state->decode_buffer; - } - } -} - -/* - * Similar to XLogNextRecord(), but this traditional interface is for code - * that just wants the header, not the decoded record. Callers can access the - * decoded record through the XLogRecGetXXX() macros. - */ -XLogReadRecordResult -XLogReadRecord(XLogReaderState *state, XLogRecord **record, char **errormsg) -{ - XLogReadRecordResult result; - DecodedXLogRecord *decoded; - - /* Consume the next decoded record. */ - result = XLogNextRecord(state, &decoded, errormsg); - if (result == XLREAD_SUCCESS) - { - /* - * The traditional interface just returns the header, not the decoded - * record. The caller will access the decoded record through the - * XLogRecGetXXX() macros. - */ - *record = &decoded->header; - } - else - *record = NULL; - return result; -} - -/* - * Consume the next record. XLogBeginRead() or XLogFindNextRecord() must be - * called before the first call to XLogNextRecord(). - * - * This function may return XLREAD_NEED_DATA several times before returning a - * result record. The caller shall read in some new data then call this - * function again with the same parameters. - * - * When a record is successfully read, returns XLREAD_SUCCESS with result - * record being stored in *record. Otherwise *record is set to NULL. - * - * Returns XLREAD_NEED_DATA if more data is needed to finish decoding the - * current record. In that case, state->readPagePtr and state->reqLen inform - * the desired position and minimum length of data needed. The caller shall - * read in the requested data and set state->readBuf to point to a buffer - * containing it. The caller must also set state->seg->ws_tli and - * state->readLen to indicate the timeline that it was read from, and the - * length of data that is now available (which must be >= given reqLen), - * respectively. - * - * Returns XLREAD_FULL if allow_oversized is true, and no space is available. - * This is intended for readahead. - * - * If invalid data is encountered, returns XLREAD_FAIL with *record being set - * to NULL. *errormsg is set to a string with details of the failure. The - * returned pointer (or *errormsg) points to an internal buffer that's valid - * until the next call to XLogReadRecord. - * - */ -XLogReadRecordResult -XLogNextRecord(XLogReaderState *state, - DecodedXLogRecord **record, - char **errormsg) -{ - /* Release the space occupied by the last record we returned. */ - if (state->record) - XLogReleasePreviousRecord(state); - - for (;;) - { - XLogReadRecordResult result; - - /* We can now return the oldest item in the queue, if there is one. */ - if (state->decode_queue_tail) - { - /* - * Record this as the most recent record returned, so that we'll - * release it next time. This also exposes it to the - * XLogRecXXX(decoder) macros, which pass in the decoder rather - * than the record for historical reasons. - */ - state->record = state->decode_queue_tail; - - /* - * It should be immediately after the last the record returned by - * XLogReadRecord(), or at the position set by XLogBeginRead() if - * XLogReadRecord() hasn't been called yet. It may be after a - * page header, though. - */ - Assert(state->record->lsn == state->EndRecPtr || - (state->EndRecPtr % XLOG_BLCKSZ == 0 && - (state->record->lsn == state->EndRecPtr + SizeOfXLogShortPHD || - state->record->lsn == state->EndRecPtr + SizeOfXLogLongPHD))); - - /* - * Set ReadRecPtr and EndRecPtr to correspond to that - * record. - * - * Calling code could access these through the returned decoded - * record, but for now we'll update them directly here, for the - * benefit of all the existing code that accesses these variables - * directly. - */ - state->ReadRecPtr = state->record->lsn; - state->EndRecPtr = state->record->next_lsn; - - *errormsg = NULL; - *record = state->record; - - return XLREAD_SUCCESS; - } - else if (state->errormsg_deferred) - { - /* - * If we've run out of records, but we have a deferred error, now - * is the time to report it. - */ - state->errormsg_deferred = false; - if (state->errormsg_buf[0] != '\0') - *errormsg = state->errormsg_buf; - else - *errormsg = NULL; - *record = NULL; - state->EndRecPtr = state->DecodeRecPtr; - - return XLREAD_FAIL; - } - - /* We need to get a decoded record into our queue first. */ - result = XLogDecodeOneRecord(state, true /* allow_oversized */ ); - switch(result) - { - case XLREAD_NEED_DATA: - *errormsg = NULL; - *record = NULL; - return result; - case XLREAD_SUCCESS: - Assert(state->decode_queue_tail != NULL); - break; - case XLREAD_FULL: - /* Not expected because we passed allow_oversized = true */ - Assert(false); - break; - case XLREAD_FAIL: - /* - * If that produced neither a queued record nor a queued error, - * then we're at the end (for example, archive recovery with no - * more files available). - */ - Assert(state->decode_queue_tail == NULL); - if (!state->errormsg_deferred) - { - state->EndRecPtr = state->DecodeRecPtr; - *errormsg = NULL; - *record = NULL; - return result; - } - break; - } - } - - /* unreachable */ - return XLREAD_FAIL; -} - -/* - * Try to decode the next available record. The next record will also be - * returned to XLogRecordRead(). - * - * In addition to the values that XLogReadRecord() can return, XLogReadAhead() - * can also return XLREAD_FULL to indicate that further readahead is not - * possible yet due to lack of space. - */ -XLogReadRecordResult -XLogReadAhead(XLogReaderState *state, DecodedXLogRecord **record, char **errormsg) -{ - XLogReadRecordResult result; - - /* We stop trying after encountering an error. */ - if (unlikely(state->errormsg_deferred)) - { - /* We only report the error message the first time, see below. */ - *errormsg = NULL; - return XLREAD_FAIL; - } - - /* - * Try to decode one more record, if we have space. Pass allow_oversized - * = false, so that this call returns fast if the decode buffer is full. - */ - result = XLogDecodeOneRecord(state, false); - switch (result) - { - case XLREAD_SUCCESS: - /* New record at head of decode record queue. */ - Assert(state->decode_queue_head != NULL); - *record = state->decode_queue_head; - return result; - case XLREAD_FULL: - /* No space in circular decode buffer. */ - return result; - case XLREAD_NEED_DATA: - /* The caller needs to insert more data. */ - return result; - case XLREAD_FAIL: - /* Report the error. XLogReadRecord() will also report it. */ - Assert(state->errormsg_deferred); - if (state->errormsg_buf[0] != '\0') - *errormsg = state->errormsg_buf; - return result; - } - - /* Unreachable. */ - return XLREAD_FAIL; -} - -/* - * Allocate space for a decoded record. The only member of the returned - * object that is initialized is the 'oversized' flag, indicating that the - * decoded record wouldn't fit in the decode buffer and must eventually be - * freed explicitly. - * - * Return NULL if there is no space in the decode buffer and allow_oversized - * is false, or if memory allocation fails for an oversized buffer. - */ -static DecodedXLogRecord * -XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized) -{ - size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len); - DecodedXLogRecord *decoded = NULL; - - /* Allocate a circular decode buffer if we don't have one already. */ - if (unlikely(state->decode_buffer == NULL)) - { - if (state->decode_buffer_size == 0) - state->decode_buffer_size = DEFAULT_DECODE_BUFFER_SIZE; - state->decode_buffer = palloc(state->decode_buffer_size); - state->decode_buffer_head = state->decode_buffer; - state->decode_buffer_tail = state->decode_buffer; - state->free_decode_buffer = true; - } - if (state->decode_buffer_head >= state->decode_buffer_tail) - { - /* Empty, or head is to the right of tail. */ - if (state->decode_buffer_head + required_space <= - state->decode_buffer + state->decode_buffer_size) - { - /* There is space between head and end. */ - decoded = (DecodedXLogRecord *) state->decode_buffer_head; - decoded->oversized = false; - return decoded; - } - else if (state->decode_buffer + required_space < - state->decode_buffer_tail) - { - /* There is space between start and tail. */ - decoded = (DecodedXLogRecord *) state->decode_buffer; - decoded->oversized = false; - return decoded; - } - } - else - { - /* Head is to the left of tail. */ - if (state->decode_buffer_head + required_space < - state->decode_buffer_tail) - { - /* There is space between head and tail. */ - decoded = (DecodedXLogRecord *) state->decode_buffer_head; - decoded->oversized = false; - return decoded; - } - } - - /* Not enough space in the decode buffer. Are we allowed to allocate? */ - if (allow_oversized) - { - decoded = palloc_extended(required_space, MCXT_ALLOC_NO_OOM); - if (decoded == NULL) - return NULL; - decoded->oversized = true; - return decoded; - } - - return decoded; -} - -/* - * Try to read and decode the next record and add it to the head of the - * decoded record queue. If 'allow_oversized' is false, then XLREAD_FULL can - * be returned to indicate the decoding buffer is full. XLogBeginRead() or - * XLogFindNextRecord() must be called before the first call to - * XLogReadRecord(). - * - * This function runs a state machine consisting of the following states. - * - * XLREAD_NEXT_RECORD: - * The initial state. If called with a valid XLogRecPtr, try to read a - * record at that position. If invalid RecPtr is given try to read a record - * just after the last one read. The next state is XLREAD_TOT_LEN. - * - * XLREAD_TOT_LEN: - * Examining record header. Ends after reading record length. - * recordRemainLen and recordGotLen are initialized. The next state is - * XLREAD_FIRST_FRAGMENT. - * - * XLREAD_FIRST_FRAGMENT: - * Reading the first fragment. Goes to XLREAD_NEXT_RECORD if that's all or - * XLREAD_CONTINUATION if we need more data. - - * XLREAD_CONTINUATION: - * Reading continuation of record. If the whole record is now decoded, goes - * to XLREAD_NEXT_RECORD. During this state, recordRemainLen indicates how - * much is left. - * - * If invalid data is found in any state, the state machine stays at the - * current state. This behavior allows us to continue reading a record - * after switching to a different source, during streaming replication. - */ -static XLogReadRecordResult -XLogDecodeOneRecord(XLogReaderState *state, bool allow_oversized) +XLogRecord * +XLogReadRecord(XLogReaderState *state, char **errormsg) { + XLogRecPtr RecPtr; XLogRecord *record; - char *errormsg; /* not used */ - XLogRecord *prec; + XLogRecPtr targetPagePtr; + bool randAccess; + uint32 len, + total_len; + uint32 targetRecOff; + uint32 pageHeaderSize; + bool gotheader; + int readOff; + + /* + * randAccess indicates whether to verify the previous-record pointer of + * the record we're reading. We only do this if we're reading + * sequentially, which is what we initially assume. + */ + randAccess = false; /* reset error state */ + *errormsg = NULL; state->errormsg_buf[0] = '\0'; - record = NULL; - switch (state->readRecordState) + ResetDecoder(state); + + RecPtr = state->EndRecPtr; + + if (state->ReadRecPtr != InvalidXLogRecPtr) { - case XLREAD_NEXT_RECORD: - Assert(!state->decoding); + /* read the record after the one we just read */ - if (state->DecodeRecPtr != InvalidXLogRecPtr) + /* + * EndRecPtr is pointing to end+1 of the previous WAL record. If + * we're at a page boundary, no more records can fit on the current + * page. We must skip over the page header, but we can't do that until + * we've read in the page, since the header size is variable. + */ + } + else + { + /* + * Caller supplied a position to start at. + * + * In this case, EndRecPtr should already be pointing to a valid + * record starting position. + */ + Assert(XRecOffIsValid(RecPtr)); + randAccess = true; + } + + state->currRecPtr = RecPtr; + + targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ); + targetRecOff = RecPtr % XLOG_BLCKSZ; + + /* + * Read the page containing the record into state->readBuf. Request enough + * byte to cover the whole record header, or at least the part of it that + * fits on the same page. + */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); + if (readOff < 0) + goto err; + + /* + * ReadPageInternal always returns at least the page header, so we can + * examine it now. + */ + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); + if (targetRecOff == 0) + { + /* + * At page start, so skip over page header. + */ + RecPtr += pageHeaderSize; + targetRecOff = pageHeaderSize; + } + else if (targetRecOff < pageHeaderSize) + { + report_invalid_record(state, "invalid record offset at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && + targetRecOff == pageHeaderSize) + { + report_invalid_record(state, "contrecord is requested by %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* ReadPageInternal has verified the page header */ + Assert(pageHeaderSize <= readOff); + + /* + * Read the record length. + * + * NB: Even though we use an XLogRecord pointer here, the whole record + * header might not fit on this page. xl_tot_len is the first field of the + * struct, so it must be on this page (the records are MAXALIGNed), but we + * cannot access any other fields until we've verified that we got the + * whole header. + */ + record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); + total_len = record->xl_tot_len; + + /* + * If the whole record header is on this page, validate it immediately. + * Otherwise do just a basic sanity check on xl_tot_len, and validate the + * rest of the header after reading it from the next page. The xl_tot_len + * check is necessary here to ensure that we enter the "Need to reassemble + * record" code path below; otherwise we might fail to apply + * ValidXLogRecordHeader at all. + */ + if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) + { + if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, + randAccess)) + goto err; + gotheader = true; + } + else + { + /* XXX: more validation should be done here */ + if (total_len < SizeOfXLogRecord) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + goto err; + } + gotheader = false; + } + + len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; + if (total_len > len) + { + /* Need to reassemble record */ + char *contdata; + XLogPageHeader pageHeader; + char *buffer; + uint32 gotlen; + + /* + * Enlarge readRecordBuf as needed. + */ + if (total_len > state->readRecordBufSize && + !allocate_recordbuf(state, total_len)) + { + /* We treat this as a "bogus data" condition */ + report_invalid_record(state, "record length %u at %X/%X too long", + total_len, LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* Copy the first fragment of the record from the first page. */ + memcpy(state->readRecordBuf, + state->readBuf + RecPtr % XLOG_BLCKSZ, len); + buffer = state->readRecordBuf + len; + gotlen = len; + + do + { + /* Calculate pointer to beginning of next page */ + targetPagePtr += XLOG_BLCKSZ; + + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + + if (readOff < 0) + goto err; + + Assert(SizeOfXLogShortPHD <= readOff); + + /* Check that the continuation on next page looks valid */ + pageHeader = (XLogPageHeader) state->readBuf; + if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - /* read the record after the one we just read */ - - /* - * NextRecPtr is pointing to end+1 of the previous WAL record. - * If we're at a page boundary, no more records can fit on the - * current page. We must skip over the page header, but we - * can't do that until we've read in the page, since the - * header size is variable. - */ - state->PrevRecPtr = state->DecodeRecPtr; - state->DecodeRecPtr = state->NextRecPtr; - } - else - { - /* - * Caller supplied a position to start at. - * - * In this case, EndRecPtr should already be pointing to a - * valid record starting position. - */ - Assert(XRecOffIsValid(state->NextRecPtr)); - state->DecodeRecPtr = state->NextRecPtr; - - /* - * We cannot verify the previous-record pointer when we're - * seeking to a particular record. Reset PrevRecPtr so that we - * won't try doing that. - */ - state->PrevRecPtr = InvalidXLogRecPtr; + report_invalid_record(state, + "there is no contrecord flag at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; } - state->record_verified = false; - state->readRecordState = XLREAD_TOT_LEN; - /* fall through */ - - case XLREAD_TOT_LEN: + /* + * Cross-check that xlp_rem_len agrees with how much of the record + * we expect there to be left. + */ + if (pageHeader->xlp_rem_len == 0 || + total_len != (pageHeader->xlp_rem_len + gotlen)) { - uint32 total_len; - uint32 pageHeaderSize; - XLogRecPtr targetPagePtr; - uint32 targetRecOff; - XLogPageHeader pageHeader; - - Assert(!state->decoding); - - targetPagePtr = - state->DecodeRecPtr - (state->DecodeRecPtr % XLOG_BLCKSZ); - targetRecOff = state->DecodeRecPtr % XLOG_BLCKSZ; - - /* - * Check if we have enough data. For the first record in the - * page, the requesting length doesn't contain page header. - */ - if (XLogNeedData(state, targetPagePtr, - Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ), - targetRecOff != 0)) - return XLREAD_NEED_DATA; - - /* error out if caller supplied bogus page */ - if (!state->page_verified) - goto err; - - /* examine page header now. */ - pageHeaderSize = - XLogPageHeaderSize((XLogPageHeader) state->readBuf); - if (targetRecOff == 0) - { - /* At page start, so skip over page header. */ - state->DecodeRecPtr += pageHeaderSize; - targetRecOff = pageHeaderSize; - } - else if (targetRecOff < pageHeaderSize) - { - report_invalid_record(state, "invalid record offset at %X/%X", - LSN_FORMAT_ARGS(state->DecodeRecPtr)); - goto err; - } - - pageHeader = (XLogPageHeader) state->readBuf; - if ((pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD) && - targetRecOff == pageHeaderSize) - { - report_invalid_record(state, "contrecord is requested by %X/%X", - LSN_FORMAT_ARGS(state->DecodeRecPtr)); - goto err; - } - - /* XLogNeedData has verified the page header */ - Assert(pageHeaderSize <= state->readLen); - - /* - * Read the record length. - * - * NB: Even though we use an XLogRecord pointer here, the - * whole record header might not fit on this page. xl_tot_len - * is the first field of the struct, so it must be on this - * page (the records are MAXALIGNed), but we cannot access any - * other fields until we've verified that we got the whole - * header. - */ - prec = (XLogRecord *) (state->readBuf + - state->DecodeRecPtr % XLOG_BLCKSZ); - total_len = prec->xl_tot_len; - - /* Find space to decode this record. */ - Assert(state->decoding == NULL); - state->decoding = XLogReadRecordAlloc(state, total_len, - allow_oversized); - if (state->decoding == NULL) - { - /* - * We couldn't get space. If allow_oversized was true, - * then palloc() must have failed. Otherwise, report that - * our decoding buffer is full. This means that weare - * trying to read too far ahead. - */ - if (allow_oversized) - goto err; - return XLREAD_FULL; - } - - /* - * If the whole record header is on this page, validate it - * immediately. Otherwise do just a basic sanity check on - * xl_tot_len, and validate the rest of the header after - * reading it from the next page. The xl_tot_len check is - * necessary here to ensure that we enter the - * XLREAD_CONTINUATION state below; otherwise we might fail to - * apply ValidXLogRecordHeader at all. - */ - if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) - { - if (!ValidXLogRecordHeader(state, state->DecodeRecPtr, - state->PrevRecPtr, prec)) - goto err; - - state->record_verified = true; - } - else - { - /* XXX: more validation should be done here */ - if (total_len < SizeOfXLogRecord) - { - report_invalid_record(state, - "invalid record length at %X/%X: wanted %u, got %u", - LSN_FORMAT_ARGS(state->DecodeRecPtr), - (uint32) SizeOfXLogRecord, total_len); - goto err; - } - } - - /* - * Wait for the rest of the record, or the part of the record - * that fit on the first page if crossed a page boundary, to - * become available. - */ - state->recordGotLen = 0; - state->recordRemainLen = total_len; - state->readRecordState = XLREAD_FIRST_FRAGMENT; + report_invalid_record(state, + "invalid contrecord length %u (expected %lld) at %X/%X", + pageHeader->xlp_rem_len, + ((long long) total_len) - gotlen, + LSN_FORMAT_ARGS(RecPtr)); + goto err; } - /* fall through */ - case XLREAD_FIRST_FRAGMENT: + /* Append the continuation from this page to the buffer */ + pageHeaderSize = XLogPageHeaderSize(pageHeader); + + if (readOff < pageHeaderSize) + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize); + + Assert(pageHeaderSize <= readOff); + + contdata = (char *) state->readBuf + pageHeaderSize; + len = XLOG_BLCKSZ - pageHeaderSize; + if (pageHeader->xlp_rem_len < len) + len = pageHeader->xlp_rem_len; + + if (readOff < pageHeaderSize + len) + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize + len); + + memcpy(buffer, (char *) contdata, len); + buffer += len; + gotlen += len; + + /* If we just reassembled the record header, validate it. */ + if (!gotheader) { - uint32 total_len = state->recordRemainLen; - uint32 request_len; - uint32 record_len; - XLogRecPtr targetPagePtr; - uint32 targetRecOff; - - Assert(state->decoding); - - /* - * Wait for the rest of the record on the first page to become - * available - */ - targetPagePtr = - state->DecodeRecPtr - (state->DecodeRecPtr % XLOG_BLCKSZ); - targetRecOff = state->DecodeRecPtr % XLOG_BLCKSZ; - - request_len = Min(targetRecOff + total_len, XLOG_BLCKSZ); - record_len = request_len - targetRecOff; - - /* ReadRecPtr contains page header */ - Assert(targetRecOff != 0); - if (XLogNeedData(state, targetPagePtr, request_len, true)) - return XLREAD_NEED_DATA; - - /* error out if caller supplied bogus page */ - if (!state->page_verified) + record = (XLogRecord *) state->readRecordBuf; + if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, + record, randAccess)) goto err; - - prec = (XLogRecord *) (state->readBuf + targetRecOff); - - /* validate record header if not yet */ - if (!state->record_verified && record_len >= SizeOfXLogRecord) - { - if (!ValidXLogRecordHeader(state, state->DecodeRecPtr, - state->PrevRecPtr, prec)) - goto err; - - state->record_verified = true; - } - - - if (total_len == record_len) - { - /* Record does not cross a page boundary */ - Assert(state->record_verified); - - if (!ValidXLogRecord(state, prec, state->DecodeRecPtr)) - goto err; - - state->record_verified = true; /* to be tidy */ - - /* We already checked the header earlier */ - state->NextRecPtr = state->DecodeRecPtr + MAXALIGN(record_len); - - record = prec; - state->readRecordState = XLREAD_NEXT_RECORD; - break; - } - - /* - * The record continues on the next page. Need to reassemble - * record - */ - Assert(total_len > record_len); - - /* Enlarge readRecordBuf as needed. */ - if (total_len > state->readRecordBufSize && - !allocate_recordbuf(state, total_len)) - { - /* We treat this as a "bogus data" condition */ - report_invalid_record(state, - "record length %u at %X/%X too long", - total_len, - LSN_FORMAT_ARGS(state->DecodeRecPtr)); - goto err; - } - - /* Copy the first fragment of the record from the first page. */ - memcpy(state->readRecordBuf, state->readBuf + targetRecOff, - record_len); - state->recordGotLen += record_len; - state->recordRemainLen -= record_len; - - /* Calculate pointer to beginning of next page */ - state->recordContRecPtr = state->DecodeRecPtr + record_len; - Assert(state->recordContRecPtr % XLOG_BLCKSZ == 0); - - state->readRecordState = XLREAD_CONTINUATION; + gotheader = true; } - /* fall through */ + } while (gotlen < total_len); - case XLREAD_CONTINUATION: - { - XLogPageHeader pageHeader = NULL; - uint32 pageHeaderSize; - XLogRecPtr targetPagePtr = InvalidXLogRecPtr; + Assert(gotheader); - /* - * we enter this state only if we haven't read the whole - * record. - */ - Assert(state->decoding); - Assert(state->recordRemainLen > 0); + record = (XLogRecord *) state->readRecordBuf; + if (!ValidXLogRecord(state, record, RecPtr)) + goto err; - while (state->recordRemainLen > 0) - { - char *contdata; - uint32 request_len PG_USED_FOR_ASSERTS_ONLY; - uint32 record_len; + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); + state->ReadRecPtr = RecPtr; + state->EndRecPtr = targetPagePtr + pageHeaderSize + + MAXALIGN(pageHeader->xlp_rem_len); + } + else + { + /* Wait for the record data to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(targetRecOff + total_len, XLOG_BLCKSZ)); + if (readOff < 0) + goto err; - /* Wait for the next page to become available */ - targetPagePtr = state->recordContRecPtr; + /* Record does not cross a page boundary */ + if (!ValidXLogRecord(state, record, RecPtr)) + goto err; - /* this request contains page header */ - Assert(targetPagePtr != 0); - if (XLogNeedData(state, targetPagePtr, - Min(state->recordRemainLen, XLOG_BLCKSZ), - false)) - return XLREAD_NEED_DATA; + state->EndRecPtr = RecPtr + MAXALIGN(total_len); - if (!state->page_verified) - goto err_continue; - - Assert(SizeOfXLogShortPHD <= state->readLen); - - /* Check that the continuation on next page looks valid */ - pageHeader = (XLogPageHeader) state->readBuf; - if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) - { - report_invalid_record( - state, - "there is no contrecord flag at %X/%X reading %X/%X", - LSN_FORMAT_ARGS(state->recordContRecPtr), - LSN_FORMAT_ARGS(state->DecodeRecPtr)); - goto err; - } - - /* - * Cross-check that xlp_rem_len agrees with how much of - * the record we expect there to be left. - */ - if (pageHeader->xlp_rem_len == 0 || - pageHeader->xlp_rem_len != state->recordRemainLen) - { - report_invalid_record( - state, - "invalid contrecord length %u at %X/%X reading %X/%X, expected %u", - pageHeader->xlp_rem_len, - LSN_FORMAT_ARGS(state->recordContRecPtr), - LSN_FORMAT_ARGS(state->DecodeRecPtr), - state->recordRemainLen); - goto err; - } - - /* Append the continuation from this page to the buffer */ - pageHeaderSize = XLogPageHeaderSize(pageHeader); - - /* - * XLogNeedData should have ensured that the whole page - * header was read - */ - Assert(pageHeaderSize <= state->readLen); - - contdata = (char *) state->readBuf + pageHeaderSize; - record_len = XLOG_BLCKSZ - pageHeaderSize; - if (pageHeader->xlp_rem_len < record_len) - record_len = pageHeader->xlp_rem_len; - - request_len = record_len + pageHeaderSize; - - /* - * XLogNeedData should have ensured all needed data was - * read - */ - Assert(request_len <= state->readLen); - - memcpy(state->readRecordBuf + state->recordGotLen, - (char *) contdata, record_len); - state->recordGotLen += record_len; - state->recordRemainLen -= record_len; - - /* If we just reassembled the record header, validate it. */ - if (!state->record_verified) - { - Assert(state->recordGotLen >= SizeOfXLogRecord); - if (!ValidXLogRecordHeader(state, state->DecodeRecPtr, - state->PrevRecPtr, - (XLogRecord *) state->readRecordBuf)) - goto err; - - state->record_verified = true; - } - - /* - * Calculate pointer to beginning of next page, and - * continue - */ - state->recordContRecPtr += XLOG_BLCKSZ; - } - - /* targetPagePtr is pointing the last-read page here */ - prec = (XLogRecord *) state->readRecordBuf; - if (!ValidXLogRecord(state, prec, state->DecodeRecPtr)) - goto err; - - pageHeaderSize = - XLogPageHeaderSize((XLogPageHeader) state->readBuf); - state->NextRecPtr = targetPagePtr + pageHeaderSize - + MAXALIGN(pageHeader->xlp_rem_len); - - record = prec; - state->readRecordState = XLREAD_NEXT_RECORD; - - break; - } + state->ReadRecPtr = RecPtr; } /* @@ -1059,195 +538,133 @@ XLogDecodeOneRecord(XLogReaderState *state, bool allow_oversized) (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) { /* Pretend it extends to end of segment */ - state->NextRecPtr += state->segcxt.ws_segsize - 1; - state->NextRecPtr -= XLogSegmentOffset(state->NextRecPtr, state->segcxt.ws_segsize); + state->EndRecPtr += state->segcxt.ws_segsize - 1; + state->EndRecPtr -= XLogSegmentOffset(state->EndRecPtr, state->segcxt.ws_segsize); } - Assert(!record || state->readLen >= 0); - if (DecodeXLogRecord(state, state->decoding, record, state->DecodeRecPtr, &errormsg)) - { - /* Record the location of the next record. */ - state->decoding->next_lsn = state->NextRecPtr; - - /* - * If it's in the decode buffer (not an "oversized" record allocated - * with palloc()), mark the decode buffer space as occupied. - */ - if (!state->decoding->oversized) - { - /* The new decode buffer head must be MAXALIGNed. */ - Assert(state->decoding->size == MAXALIGN(state->decoding->size)); - if ((char *) state->decoding == state->decode_buffer) - state->decode_buffer_head = state->decode_buffer + - state->decoding->size; - else - state->decode_buffer_head += state->decoding->size; - } - - /* Insert it into the queue of decoded records. */ - Assert(state->decode_queue_head != state->decoding); - if (state->decode_queue_head) - state->decode_queue_head->next = state->decoding; - state->decode_queue_head = state->decoding; - if (!state->decode_queue_tail) - state->decode_queue_tail = state->decoding; - state->decoding = NULL; - - return XLREAD_SUCCESS; - } + if (DecodeXLogRecord(state, record, errormsg)) + return record; + else + return NULL; err: - if (state->decoding && state->decoding->oversized) - pfree(state->decoding); - state->decoding = NULL; -err_continue: /* - * Invalidate the read page. We might read from a different source after + * Invalidate the read state. We might read from a different source after * failure. */ XLogReaderInvalReadState(state); - /* - * If an error was written to errmsg_buf, it'll be returned to the caller - * of XLogReadRecord() after all successfully decoded records from the - * read queue. - */ + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; - return XLREAD_FAIL; + return NULL; } /* - * Checks that an xlog page loaded in state->readBuf is including at least - * [pageptr, reqLen] and the page is valid. header_inclusive indicates that - * reqLen is calculated including page header length. + * Read a single xlog page including at least [pageptr, reqLen] of valid data + * via the page_read() callback. * - * Returns false if the buffer already contains the requested data, or found - * error. state->page_verified is set to true for the former and false for the - * latter. + * Returns -1 if the required page cannot be read for some reason; errormsg_buf + * is set in that case (unless the error occurs in the page_read callback). * - * Otherwise returns true and requests data loaded onto state->readBuf by - * state->readPagePtr and state->readLen. The caller shall call this function - * again after filling the buffer at least with that portion of data and set - * state->readLen to the length of actually loaded data. - * - * If header_inclusive is false, corrects reqLen internally by adding the - * actual page header length and may request caller for new data. + * We fetch the page from a reader-local cache if we know we have the required + * data and if there hasn't been any error since caching the data. */ -static bool -XLogNeedData(XLogReaderState *state, XLogRecPtr pageptr, int reqLen, - bool header_inclusive) +static int +ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) { + int readLen; uint32 targetPageOff; XLogSegNo targetSegNo; - uint32 addLen = 0; + XLogPageHeader hdr; - /* Some data is loaded, but page header is not verified yet. */ - if (!state->page_verified && - !XLogRecPtrIsInvalid(state->readPagePtr) && state->readLen >= 0) - { - uint32 pageHeaderSize; - - /* just loaded new data so needs to verify page header */ - - /* The caller must have loaded at least page header */ - Assert(state->readLen >= SizeOfXLogShortPHD); - - /* - * We have enough data to check the header length. Recheck the loaded - * length against the actual header length. - */ - pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); - - /* Request more data if we don't have the full header. */ - if (state->readLen < pageHeaderSize) - { - state->reqLen = pageHeaderSize; - return true; - } - - /* Now that we know we have the full header, validate it. */ - if (!XLogReaderValidatePageHeader(state, state->readPagePtr, - (char *) state->readBuf)) - { - /* That's bad. Force reading the page again. */ - XLogReaderInvalReadState(state); - - return false; - } - - state->page_verified = true; - - XLByteToSeg(state->readPagePtr, state->seg.ws_segno, - state->segcxt.ws_segsize); - } - - /* - * The loaded page may not be the one caller is supposing to read when we - * are verifying the first page of new segment. In that case, skip further - * verification and immediately load the target page. - */ - if (state->page_verified && pageptr == state->readPagePtr) - { - /* - * calculate additional length for page header keeping the total - * length within the block size. - */ - if (!header_inclusive) - { - uint32 pageHeaderSize = - XLogPageHeaderSize((XLogPageHeader) state->readBuf); - - addLen = pageHeaderSize; - if (reqLen + pageHeaderSize <= XLOG_BLCKSZ) - addLen = pageHeaderSize; - else - addLen = XLOG_BLCKSZ - reqLen; - } - - /* Return if we already have it. */ - if (reqLen + addLen <= state->readLen) - return false; - } - - /* Data is not in our buffer, request the caller for it. */ - XLByteToSeg(pageptr, targetSegNo, state->segcxt.ws_segsize); - targetPageOff = XLogSegmentOffset(pageptr, state->segcxt.ws_segsize); Assert((pageptr % XLOG_BLCKSZ) == 0); - /* - * Every time we request to load new data of a page to the caller, even if - * we looked at a part of it before, we need to do verification on the - * next invocation as the caller might now be rereading data from a - * different source. - */ - state->page_verified = false; + XLByteToSeg(pageptr, targetSegNo, state->segcxt.ws_segsize); + targetPageOff = XLogSegmentOffset(pageptr, state->segcxt.ws_segsize); + + /* check whether we have all the requested data already */ + if (targetSegNo == state->seg.ws_segno && + targetPageOff == state->segoff && reqLen <= state->readLen) + return state->readLen; /* + * Data is not in our buffer. + * + * Every time we actually read the segment, even if we looked at parts of + * it before, we need to do verification as the page_read callback might + * now be rereading data from a different source. + * * Whenever switching to a new WAL segment, we read the first page of the * file and validate its header, even if that's not where the target * record is. This is so that we can check the additional identification - * info that is present in the first page's "long" header. Don't do this - * if the caller requested the first page in the segment. + * info that is present in the first page's "long" header. */ if (targetSegNo != state->seg.ws_segno && targetPageOff != 0) { - /* - * Then we'll see that the targetSegNo now matches the ws_segno, and - * will not come back here, but will request the actual target page. - */ - state->readPagePtr = pageptr - targetPageOff; - state->reqLen = XLOG_BLCKSZ; - return true; + XLogRecPtr targetSegmentPtr = pageptr - targetPageOff; + + readLen = state->routine.page_read(state, targetSegmentPtr, XLOG_BLCKSZ, + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + + /* we can be sure to have enough WAL available, we scrolled back */ + Assert(readLen == XLOG_BLCKSZ); + + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, + state->readBuf)) + goto err; } /* - * Request the caller to load the page. We need at least a short page - * header so that we can validate it. + * First, read the requested data length, but at least a short page header + * so that we can validate it. */ - state->readPagePtr = pageptr; - state->reqLen = Max(reqLen + addLen, SizeOfXLogShortPHD); - return true; + readLen = state->routine.page_read(state, pageptr, Max(reqLen, SizeOfXLogShortPHD), + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + + Assert(readLen <= XLOG_BLCKSZ); + + /* Do we have enough data to check the header length? */ + if (readLen <= SizeOfXLogShortPHD) + goto err; + + Assert(readLen >= reqLen); + + hdr = (XLogPageHeader) state->readBuf; + + /* still not enough */ + if (readLen < XLogPageHeaderSize(hdr)) + { + readLen = state->routine.page_read(state, pageptr, XLogPageHeaderSize(hdr), + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + } + + /* + * Now that we know we have the full header, validate it. + */ + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + goto err; + + /* update read state information */ + state->seg.ws_segno = targetSegNo; + state->segoff = targetPageOff; + state->readLen = readLen; + + return readLen; + +err: + XLogReaderInvalReadState(state); + return -1; } /* @@ -1256,7 +673,9 @@ XLogNeedData(XLogReaderState *state, XLogRecPtr pageptr, int reqLen, static void XLogReaderInvalReadState(XLogReaderState *state) { - state->readPagePtr = InvalidXLogRecPtr; + state->seg.ws_segno = 0; + state->segoff = 0; + state->readLen = 0; } /* @@ -1264,12 +683,11 @@ XLogReaderInvalReadState(XLogReaderState *state) * * This is just a convenience subroutine to avoid duplicated code in * XLogReadRecord. It's not intended for use from anywhere else. - * - * If PrevRecPtr is valid, the xl_prev is is cross-checked with it. */ static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, - XLogRecPtr PrevRecPtr, XLogRecord *record) + XLogRecPtr PrevRecPtr, XLogRecord *record, + bool randAccess) { if (record->xl_tot_len < SizeOfXLogRecord) { @@ -1286,7 +704,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); return false; } - if (PrevRecPtr == InvalidXLogRecPtr) + if (randAccess) { /* * We can't exactly verify the prev-link, but surely it should be less @@ -1504,22 +922,6 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, * here. */ -XLogFindNextRecordState * -InitXLogFindNextRecord(XLogReaderState *reader_state, XLogRecPtr start_ptr) -{ - XLogFindNextRecordState *state = (XLogFindNextRecordState *) - palloc_extended(sizeof(XLogFindNextRecordState), - MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); - if (!state) - return NULL; - - state->reader_state = reader_state; - state->targetRecPtr = start_ptr; - state->currRecPtr = start_ptr; - - return state; -} - /* * Find the first record with an lsn >= RecPtr. * @@ -1531,25 +933,27 @@ InitXLogFindNextRecord(XLogReaderState *reader_state, XLogRecPtr start_ptr) * This positions the reader, like XLogBeginRead(), so that the next call to * XLogReadRecord() will read the next valid record. */ -bool -XLogFindNextRecord(XLogFindNextRecordState *state) +XLogRecPtr +XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) { + XLogRecPtr tmpRecPtr; + XLogRecPtr found = InvalidXLogRecPtr; XLogPageHeader header; - XLogRecord *record; - XLogReadRecordResult result; char *errormsg; - Assert(!XLogRecPtrIsInvalid(state->currRecPtr)); + Assert(!XLogRecPtrIsInvalid(RecPtr)); /* * skip over potential continuation data, keeping in mind that it may span * multiple pages */ + tmpRecPtr = RecPtr; while (true) { XLogRecPtr targetPagePtr; int targetRecOff; uint32 pageHeaderSize; + int readLen; /* * Compute targetRecOff. It should typically be equal or greater than @@ -1557,27 +961,27 @@ XLogFindNextRecord(XLogFindNextRecordState *state) * that, except when caller has explicitly specified the offset that * falls somewhere there or when we are skipping multi-page * continuation record. It doesn't matter though because - * XLogNeedData() is prepared to handle that and will read at least - * short page-header worth of data + * ReadPageInternal() is prepared to handle that and will read at + * least short page-header worth of data */ - targetRecOff = state->currRecPtr % XLOG_BLCKSZ; + targetRecOff = tmpRecPtr % XLOG_BLCKSZ; /* scroll back to page boundary */ - targetPagePtr = state->currRecPtr - targetRecOff; + targetPagePtr = tmpRecPtr - targetRecOff; - if (XLogNeedData(state->reader_state, targetPagePtr, targetRecOff, - targetRecOff != 0)) - return true; - - if (!state->reader_state->page_verified) + /* Read the page containing the record */ + readLen = ReadPageInternal(state, targetPagePtr, targetRecOff); + if (readLen < 0) goto err; - header = (XLogPageHeader) state->reader_state->readBuf; + header = (XLogPageHeader) state->readBuf; pageHeaderSize = XLogPageHeaderSize(header); - /* we should have read the page header */ - Assert(state->reader_state->readLen >= pageHeaderSize); + /* make sure we have enough data for the page header */ + readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize); + if (readLen < 0) + goto err; /* skip over potential continuation data */ if (header->xlp_info & XLP_FIRST_IS_CONTRECORD) @@ -1592,21 +996,21 @@ XLogFindNextRecord(XLogFindNextRecordState *state) * Note that record headers are MAXALIGN'ed */ if (MAXALIGN(header->xlp_rem_len) >= (XLOG_BLCKSZ - pageHeaderSize)) - state->currRecPtr = targetPagePtr + XLOG_BLCKSZ; + tmpRecPtr = targetPagePtr + XLOG_BLCKSZ; else { /* * The previous continuation record ends in this page. Set - * state->currRecPtr to point to the first valid record + * tmpRecPtr to point to the first valid record */ - state->currRecPtr = targetPagePtr + pageHeaderSize + tmpRecPtr = targetPagePtr + pageHeaderSize + MAXALIGN(header->xlp_rem_len); break; } } else { - state->currRecPtr = targetPagePtr + pageHeaderSize; + tmpRecPtr = targetPagePtr + pageHeaderSize; break; } } @@ -1616,36 +1020,31 @@ XLogFindNextRecord(XLogFindNextRecordState *state) * because either we're at the first record after the beginning of a page * or we just jumped over the remaining data of a continuation. */ - XLogBeginRead(state->reader_state, state->currRecPtr); - while ((result = XLogReadRecord(state->reader_state, &record, &errormsg)) != - XLREAD_FAIL) + XLogBeginRead(state, tmpRecPtr); + while (XLogReadRecord(state, &errormsg) != NULL) { - if (result == XLREAD_NEED_DATA) - return true; - /* past the record we've found, break out */ - if (state->targetRecPtr <= state->reader_state->ReadRecPtr) + if (RecPtr <= state->ReadRecPtr) { /* Rewind the reader to the beginning of the last record. */ - state->currRecPtr = state->reader_state->ReadRecPtr; - XLogBeginRead(state->reader_state, state->currRecPtr); - return false; + found = state->ReadRecPtr; + XLogBeginRead(state, found); + return found; } } err: - XLogReaderInvalReadState(state->reader_state); + XLogReaderInvalReadState(state); - state->currRecPtr = InvalidXLogRecPtr;; - return false; + return InvalidXLogRecPtr; } #endif /* FRONTEND */ /* - * Helper function to ease writing of routines that read raw WAL data. - * If this function is used, caller must supply a segment_open callback and - * segment_close callback as that is used here. + * Helper function to ease writing of XLogRoutine->page_read callbacks. + * If this function is used, caller must supply a segment_open callback in + * 'state', as that is used here. * * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL * fetched from timeline 'tli'. @@ -1658,7 +1057,6 @@ err: */ bool WALRead(XLogReaderState *state, - WALSegmentOpenCB segopenfn, WALSegmentCloseCB segclosefn, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, WALReadError *errinfo) { @@ -1690,10 +1088,10 @@ WALRead(XLogReaderState *state, XLogSegNo nextSegNo; if (state->seg.ws_file >= 0) - segclosefn(state); + state->routine.segment_close(state); XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); - segopenfn(state, nextSegNo, &tli); + state->routine.segment_open(state, nextSegNo, &tli); /* This shouldn't happen -- indicates a bug in segment_open */ Assert(state->seg.ws_file >= 0); @@ -1745,84 +1143,34 @@ WALRead(XLogReaderState *state, * ---------------------------------------- */ -/* - * Private function to reset the state, forgetting all decoded records, if we - * are asked to move to a new read position. - */ +/* private function to reset the state between records */ static void ResetDecoder(XLogReaderState *state) { - DecodedXLogRecord *r; + int block_id; - /* Reset the decoded record queue, freeing any oversized records. */ - while ((r = state->decode_queue_tail)) + state->decoded_record = NULL; + + state->main_data_len = 0; + + for (block_id = 0; block_id <= state->max_block_id; block_id++) { - state->decode_queue_tail = r->next; - if (r->oversized) - pfree(r); + state->blocks[block_id].in_use = false; + state->blocks[block_id].has_image = false; + state->blocks[block_id].has_data = false; + state->blocks[block_id].apply_image = false; } - state->decode_queue_head = NULL; - state->decode_queue_tail = NULL; - state->record = NULL; - state->decoding = NULL; - - /* Reset the decode buffer to empty. */ - state->decode_buffer_head = state->decode_buffer; - state->decode_buffer_tail = state->decode_buffer; - - /* Clear error state. */ - state->errormsg_buf[0] = '\0'; - state->errormsg_deferred = false; + state->max_block_id = -1; } /* - * Compute the maximum possible amount of padding that could be required to - * decode a record, given xl_tot_len from the record's header. This is the - * amount of output buffer space that we need to decode a record, though we - * might not finish up using it all. - * - * This computation is pessimistic and assumes the maximum possible number of - * blocks, due to lack of better information. - */ -size_t -DecodeXLogRecordRequiredSpace(size_t xl_tot_len) -{ - size_t size = 0; - - /* Account for the fixed size part of the decoded record struct. */ - size += offsetof(DecodedXLogRecord, blocks[0]); - /* Account for the flexible blocks array of maximum possible size. */ - size += sizeof(DecodedBkpBlock) * (XLR_MAX_BLOCK_ID + 1); - /* Account for all the raw main and block data. */ - size += xl_tot_len; - /* We might insert padding before main_data. */ - size += (MAXIMUM_ALIGNOF - 1); - /* We might insert padding before each block's data. */ - size += (MAXIMUM_ALIGNOF - 1) * (XLR_MAX_BLOCK_ID + 1); - /* We might insert padding at the end. */ - size += (MAXIMUM_ALIGNOF - 1); - - return size; -} - -/* - * Decode a record. "decoded" must point to a MAXALIGNed memory area that has - * space for at least DecodeXLogRecordRequiredSpace(record) bytes. On - * success, decoded->size contains the actual space occupied by the decoded - * record, which may turn out to be less. - * - * Only decoded->oversized member must be initialized already, and will not be - * modified. Other members will be initialized as required. + * Decode the previously read record. * * On error, a human-readable error message is returned in *errormsg, and * the return value is false. */ bool -DecodeXLogRecord(XLogReaderState *state, - DecodedXLogRecord *decoded, - XLogRecord *record, - XLogRecPtr lsn, - char **errormsg) +DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) { /* * read next _size bytes from record buffer, but check for overrun first. @@ -1837,20 +1185,17 @@ DecodeXLogRecord(XLogReaderState *state, } while(0) char *ptr; - char *out; uint32 remaining; uint32 datatotal; RelFileNode *rnode = NULL; uint8 block_id; - decoded->header = *record; - decoded->lsn = lsn; - decoded->next = NULL; - decoded->record_origin = InvalidRepOriginId; - decoded->toplevel_xid = InvalidTransactionId; - decoded->main_data = NULL; - decoded->main_data_len = 0; - decoded->max_block_id = -1; + ResetDecoder(state); + + state->decoded_record = record; + state->record_origin = InvalidRepOriginId; + state->toplevel_xid = InvalidTransactionId; + ptr = (char *) record; ptr += SizeOfXLogRecord; remaining = record->xl_tot_len - SizeOfXLogRecord; @@ -1868,7 +1213,7 @@ DecodeXLogRecord(XLogReaderState *state, COPY_HEADER_FIELD(&main_data_len, sizeof(uint8)); - decoded->main_data_len = main_data_len; + state->main_data_len = main_data_len; datatotal += main_data_len; break; /* by convention, the main data fragment is * always last */ @@ -1879,18 +1224,18 @@ DecodeXLogRecord(XLogReaderState *state, uint32 main_data_len; COPY_HEADER_FIELD(&main_data_len, sizeof(uint32)); - decoded->main_data_len = main_data_len; + state->main_data_len = main_data_len; datatotal += main_data_len; break; /* by convention, the main data fragment is * always last */ } else if (block_id == XLR_BLOCK_ID_ORIGIN) { - COPY_HEADER_FIELD(&decoded->record_origin, sizeof(RepOriginId)); + COPY_HEADER_FIELD(&state->record_origin, sizeof(RepOriginId)); } else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) { - COPY_HEADER_FIELD(&decoded->toplevel_xid, sizeof(TransactionId)); + COPY_HEADER_FIELD(&state->toplevel_xid, sizeof(TransactionId)); } else if (block_id <= XLR_MAX_BLOCK_ID) { @@ -1898,11 +1243,7 @@ DecodeXLogRecord(XLogReaderState *state, DecodedBkpBlock *blk; uint8 fork_flags; - /* mark any intervening block IDs as not in use */ - for (int i = decoded->max_block_id + 1; i < block_id; ++i) - decoded->blocks[i].in_use = false; - - if (block_id <= decoded->max_block_id) + if (block_id <= state->max_block_id) { report_invalid_record(state, "out-of-order block_id %u at %X/%X", @@ -1910,9 +1251,9 @@ DecodeXLogRecord(XLogReaderState *state, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } - decoded->max_block_id = block_id; + state->max_block_id = block_id; - blk = &decoded->blocks[block_id]; + blk = &state->blocks[block_id]; blk->in_use = true; blk->apply_image = false; @@ -1922,8 +1263,6 @@ DecodeXLogRecord(XLogReaderState *state, blk->has_image = ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0); blk->has_data = ((fork_flags & BKPBLOCK_HAS_DATA) != 0); - blk->recent_buffer = InvalidBuffer; - COPY_HEADER_FIELD(&blk->data_len, sizeof(uint16)); /* cross-check that the HAS_DATA flag is set iff data_length > 0 */ if (blk->has_data && blk->data_len == 0) @@ -2058,18 +1397,17 @@ DecodeXLogRecord(XLogReaderState *state, /* * Ok, we've parsed the fragment headers, and verified that the total * length of the payload in the fragments is equal to the amount of data - * left. Copy the data of each fragment to contiguous space after the - * blocks array, inserting alignment padding before the data fragments so - * they can be cast to struct pointers by REDO routines. + * left. Copy the data of each fragment to a separate buffer. + * + * We could just set up pointers into readRecordBuf, but we want to align + * the data for the convenience of the callers. Backup images are not + * copied, however; they don't need alignment. */ - out = ((char *) decoded) + - offsetof(DecodedXLogRecord, blocks) + - sizeof(decoded->blocks[0]) * (decoded->max_block_id + 1); /* block data first */ - for (block_id = 0; block_id <= decoded->max_block_id; block_id++) + for (block_id = 0; block_id <= state->max_block_id; block_id++) { - DecodedBkpBlock *blk = &decoded->blocks[block_id]; + DecodedBkpBlock *blk = &state->blocks[block_id]; if (!blk->in_use) continue; @@ -2078,36 +1416,57 @@ DecodeXLogRecord(XLogReaderState *state, if (blk->has_image) { - /* no need to align image */ - blk->bkp_image = out; - memcpy(out, ptr, blk->bimg_len); + blk->bkp_image = ptr; ptr += blk->bimg_len; - out += blk->bimg_len; } if (blk->has_data) { - out = (char *) MAXALIGN(out); - blk->data = out; + if (!blk->data || blk->data_len > blk->data_bufsz) + { + if (blk->data) + pfree(blk->data); + + /* + * Force the initial request to be BLCKSZ so that we don't + * waste time with lots of trips through this stanza as a + * result of WAL compression. + */ + blk->data_bufsz = MAXALIGN(Max(blk->data_len, BLCKSZ)); + blk->data = palloc(blk->data_bufsz); + } memcpy(blk->data, ptr, blk->data_len); ptr += blk->data_len; - out += blk->data_len; } } /* and finally, the main data */ - if (decoded->main_data_len > 0) + if (state->main_data_len > 0) { - out = (char *) MAXALIGN(out); - decoded->main_data = out; - memcpy(decoded->main_data, ptr, decoded->main_data_len); - ptr += decoded->main_data_len; - out += decoded->main_data_len; - } + if (!state->main_data || state->main_data_len > state->main_data_bufsz) + { + if (state->main_data) + pfree(state->main_data); - /* Report the actual size we used. */ - decoded->size = MAXALIGN(out - (char *) decoded); - Assert(DecodeXLogRecordRequiredSpace(record->xl_tot_len) >= - decoded->size); + /* + * main_data_bufsz must be MAXALIGN'ed. In many xlog record + * types, we omit trailing struct padding on-disk to save a few + * bytes; but compilers may generate accesses to the xlog struct + * that assume that padding bytes are present. If the palloc + * request is not large enough to include such padding bytes then + * we'll get valgrind complaints due to otherwise-harmless fetches + * of the padding bytes. + * + * In addition, force the initial request to be reasonably large + * so that we don't waste time with lots of trips through this + * stanza. BLCKSZ / 2 seems like a good compromise choice. + */ + state->main_data_bufsz = MAXALIGN(Max(state->main_data_len, + BLCKSZ / 2)); + state->main_data = palloc(state->main_data_bufsz); + } + memcpy(state->main_data, ptr, state->main_data_len); + ptr += state->main_data_len; + } return true; @@ -2131,31 +1490,19 @@ err: bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum) -{ - return XLogRecGetRecentBuffer(record, block_id, rnode, forknum, blknum, - NULL); -} - -bool -XLogRecGetRecentBuffer(XLogReaderState *record, uint8 block_id, - RelFileNode *rnode, ForkNumber *forknum, - BlockNumber *blknum, Buffer *recent_buffer) { DecodedBkpBlock *bkpb; - if (block_id > record->record->max_block_id || - !record->record->blocks[block_id].in_use) + if (!record->blocks[block_id].in_use) return false; - bkpb = &record->record->blocks[block_id]; + bkpb = &record->blocks[block_id]; if (rnode) *rnode = bkpb->rnode; if (forknum) *forknum = bkpb->forknum; if (blknum) *blknum = bkpb->blkno; - if (recent_buffer) - *recent_buffer = bkpb->recent_buffer; return true; } @@ -2169,11 +1516,10 @@ XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len) { DecodedBkpBlock *bkpb; - if (block_id > record->record->max_block_id || - !record->record->blocks[block_id].in_use) + if (!record->blocks[block_id].in_use) return NULL; - bkpb = &record->record->blocks[block_id]; + bkpb = &record->blocks[block_id]; if (!bkpb->has_data) { @@ -2201,13 +1547,12 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) char *ptr; PGAlignedBlock tmp; - if (block_id > record->record->max_block_id || - !record->record->blocks[block_id].in_use) + if (!record->blocks[block_id].in_use) return false; - if (!record->record->blocks[block_id].has_image) + if (!record->blocks[block_id].has_image) return false; - bkpb = &record->record->blocks[block_id]; + bkpb = &record->blocks[block_id]; ptr = bkpb->bkp_image; if (bkpb->bimg_info & BKPIMAGE_IS_COMPRESSED) diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 4d5c9bb08f..d17d660f46 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -335,13 +335,11 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; - Buffer recent_buffer; Page page; bool zeromode; bool willinit; - if (!XLogRecGetRecentBuffer(record, block_id, &rnode, &forknum, &blkno, - &recent_buffer)) + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) { /* Caller specified a bogus block_id */ elog(PANIC, "failed to locate backup block with ID %d", block_id); @@ -352,7 +350,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, * going to initialize it. And vice versa. */ zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); - willinit = (record->record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; + willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; if (willinit && !zeromode) elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); if (!willinit && zeromode) @@ -363,8 +361,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, { Assert(XLogRecHasBlockImage(record, block_id)); *buf = XLogReadBufferExtended(rnode, forknum, blkno, - get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK, - recent_buffer); + get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); page = BufferGetPage(*buf); if (!RestoreBlockImage(record, block_id, page)) elog(ERROR, "failed to restore block image"); @@ -393,8 +390,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, } else { - *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode, - recent_buffer); + *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode); if (BufferIsValid(*buf)) { if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) @@ -441,8 +437,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, */ Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, - BlockNumber blkno, ReadBufferMode mode, - Buffer recent_buffer) + BlockNumber blkno, ReadBufferMode mode) { BlockNumber lastblock; Buffer buffer; @@ -450,15 +445,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, Assert(blkno != P_NEW); - /* Do we have a clue where the buffer might be already? */ - if (BufferIsValid(recent_buffer) && - mode == RBM_NORMAL && - ReadRecentBuffer(rnode, forknum, blkno, recent_buffer)) - { - buffer = recent_buffer; - goto recent_buffer_fast_path; - } - /* Open the relation at smgr level */ smgr = smgropen(rnode, InvalidBackendId); @@ -517,7 +503,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, } } -recent_buffer_fast_path: if (mode == RBM_NORMAL) { /* check that page has been initialized */ @@ -701,7 +686,8 @@ XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, void XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength) { - const XLogRecPtr lastReadPage = state->readPagePtr; + const XLogRecPtr lastReadPage = (state->seg.ws_segno * + state->segcxt.ws_segsize + state->segoff); Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); Assert(wantLength <= XLOG_BLCKSZ); @@ -716,7 +702,7 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wa * current TLI has since become historical. */ if (lastReadPage == wantPage && - state->page_verified && + state->readLen != 0 && lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1)) return; @@ -838,12 +824,10 @@ wal_segment_close(XLogReaderState *state) * exists for normal backends, so we have to do a check/sleep/repeat style of * loop for now. */ -bool -read_local_xlog_page(XLogReaderState *state) +int +read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *cur_page) { - XLogRecPtr targetPagePtr = state->readPagePtr; - int reqLen = state->reqLen; - char *cur_page = state->readBuf; XLogRecPtr read_upto, loc; TimeLineID tli; @@ -942,8 +926,7 @@ read_local_xlog_page(XLogReaderState *state) else if (targetPagePtr + reqLen > read_upto) { /* not enough data there */ - XLogReaderSetInputData(state, -1); - return false; + return -1; } else { @@ -956,14 +939,12 @@ read_local_xlog_page(XLogReaderState *state) * as 'count', read the whole page anyway. It's guaranteed to be * zero-padded up to the page boundary if it's incomplete. */ - if (!WALRead(state, wal_segment_open, wal_segment_close, - cur_page, targetPagePtr, XLOG_BLCKSZ, tli, &errinfo)) + if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli, + &errinfo)) WALReadRaiseError(&errinfo); /* number of valid bytes in the buffer */ - state->readPagePtr = targetPagePtr; - XLogReaderSetInputData(state, count); - return true; + return count; } /* diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 08f95c43ca..5c84d758bb 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -899,20 +899,6 @@ CREATE VIEW pg_stat_wal_receiver AS FROM pg_stat_get_wal_receiver() s WHERE s.pid IS NOT NULL; -CREATE VIEW pg_stat_prefetch_recovery AS - SELECT - s.stats_reset, - s.prefetch, - s.skip_hit, - s.skip_new, - s.skip_fpw, - s.skip_seq, - s.distance, - s.queue_depth, - s.avg_distance, - s.avg_queue_depth - FROM pg_stat_get_prefetch_recovery() s; - CREATE VIEW pg_stat_subscription AS SELECT su.oid AS subid, diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index ba335fd342..e94f5f55c7 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -38,7 +38,6 @@ #include "access/transam.h" #include "access/twophase_rmgr.h" #include "access/xact.h" -#include "access/xlogprefetch.h" #include "catalog/partition.h" #include "catalog/pg_database.h" #include "catalog/pg_proc.h" @@ -280,7 +279,6 @@ static PgStat_GlobalStats globalStats; static PgStat_WalStats walStats; static PgStat_SLRUStats slruStats[SLRU_NUM_ELEMENTS]; static HTAB *replSlotStatHash = NULL; -static PgStat_RecoveryPrefetchStats recoveryPrefetchStats; /* * List of OIDs of databases we need to write out. If an entry is InvalidOid, @@ -352,7 +350,6 @@ static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len); static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len); static void pgstat_recv_wal(PgStat_MsgWal *msg, int len); static void pgstat_recv_slru(PgStat_MsgSLRU *msg, int len); -static void pgstat_recv_recoveryprefetch(PgStat_MsgRecoveryPrefetch *msg, int len); static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len); static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); @@ -1446,20 +1443,11 @@ pgstat_reset_shared_counters(const char *target) msg.m_resettarget = RESET_BGWRITER; else if (strcmp(target, "wal") == 0) msg.m_resettarget = RESET_WAL; - else if (strcmp(target, "prefetch_recovery") == 0) - { - /* - * We can't ask the stats collector to do this for us as it is not - * attached to shared memory. - */ - XLogPrefetchRequestResetStats(); - return; - } else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized reset target: \"%s\"", target), - errhint("Target must be \"archiver\", \"bgwriter\", \"wal\" or \"prefetch_recovery\"."))); + errhint("Target must be \"archiver\", \"bgwriter\" or \"wal\"."))); pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER); pgstat_send(&msg, sizeof(msg)); @@ -2897,22 +2885,6 @@ pgstat_fetch_replslot(NameData slotname) return pgstat_get_replslot_entry(slotname, false); } -/* - * --------- - * pgstat_fetch_recoveryprefetch() - - * - * Support function for restoring the counters managed by xlogprefetch.c. - * --------- - */ -PgStat_RecoveryPrefetchStats * -pgstat_fetch_recoveryprefetch(void) -{ - backend_read_statsfile(); - - return &recoveryPrefetchStats; -} - - /* * Shut down a single backend's statistics reporting at process exit. * @@ -3188,23 +3160,6 @@ pgstat_send_slru(void) } -/* ---------- - * pgstat_send_recoveryprefetch() - - * - * Send recovery prefetch statistics to the collector - * ---------- - */ -void -pgstat_send_recoveryprefetch(PgStat_RecoveryPrefetchStats *stats) -{ - PgStat_MsgRecoveryPrefetch msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYPREFETCH); - msg.m_stats = *stats; - pgstat_send(&msg, sizeof(msg)); -} - - /* ---------- * PgstatCollectorMain() - * @@ -3422,10 +3377,6 @@ PgstatCollectorMain(int argc, char *argv[]) pgstat_recv_slru(&msg.msg_slru, len); break; - case PGSTAT_MTYPE_RECOVERYPREFETCH: - pgstat_recv_recoveryprefetch(&msg.msg_recoveryprefetch, len); - break; - case PGSTAT_MTYPE_FUNCSTAT: pgstat_recv_funcstat(&msg.msg_funcstat, len); break; @@ -3718,13 +3669,6 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) rc = fwrite(slruStats, sizeof(slruStats), 1, fpout); (void) rc; /* we'll check for error with ferror */ - /* - * Write recovery prefetch stats struct - */ - rc = fwrite(&recoveryPrefetchStats, sizeof(recoveryPrefetchStats), 1, - fpout); - (void) rc; /* we'll check for error with ferror */ - /* * Walk through the database table. */ @@ -4000,7 +3944,6 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) memset(&archiverStats, 0, sizeof(archiverStats)); memset(&walStats, 0, sizeof(walStats)); memset(&slruStats, 0, sizeof(slruStats)); - memset(&recoveryPrefetchStats, 0, sizeof(recoveryPrefetchStats)); /* * Set the current timestamp (will be kept only in case we can't load an @@ -4100,18 +4043,6 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) goto done; } - /* - * Read recoveryPrefetchStats struct - */ - if (fread(&recoveryPrefetchStats, 1, sizeof(recoveryPrefetchStats), - fpin) != sizeof(recoveryPrefetchStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&recoveryPrefetchStats, 0, sizeof(recoveryPrefetchStats)); - goto done; - } - /* * We found an existing collector stats file. Read it and put all the * hashtable entries into place. @@ -4452,7 +4383,6 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, PgStat_WalStats myWalStats; PgStat_SLRUStats mySLRUStats[SLRU_NUM_ELEMENTS]; PgStat_StatReplSlotEntry myReplSlotStats; - PgStat_RecoveryPrefetchStats myRecoveryPrefetchStats; FILE *fpin; int32 format_id; const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; @@ -4529,18 +4459,6 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, return false; } - /* - * Read recovery prefetch stats struct - */ - if (fread(&myRecoveryPrefetchStats, 1, sizeof(myRecoveryPrefetchStats), - fpin) != sizeof(myRecoveryPrefetchStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - /* By default, we're going to return the timestamp of the global file. */ *ts = myGlobalStats.stats_timestamp; @@ -4724,13 +4642,6 @@ backend_read_statsfile(void) if (ok && file_ts >= min_ts) break; - /* - * If we're in crash recovery, the collector may not even be running, - * so work with what we have. - */ - if (InRecovery) - break; - /* Not there or too old, so kick the collector and wait a bit */ if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); @@ -5470,18 +5381,6 @@ pgstat_recv_slru(PgStat_MsgSLRU *msg, int len) slruStats[msg->m_index].truncate += msg->m_truncate; } -/* ---------- - * pgstat_recv_recoveryprefetch() - - * - * Process a recovery prefetch message. - * ---------- - */ -static void -pgstat_recv_recoveryprefetch(PgStat_MsgRecoveryPrefetch *msg, int len) -{ - recoveryPrefetchStats = msg->m_stats; -} - /* ---------- * pgstat_recv_recoveryconflict() - * diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 888e064ec0..70670169ac 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -123,7 +123,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor { ReorderBufferAssignChild(ctx->reorder, txid, - XLogRecGetXid(record), + record->decoded_record->xl_xid, buf.origptr); } diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 00543ede45..ffc6160e9f 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -148,8 +148,7 @@ StartupDecodingContext(List *output_plugin_options, TransactionId xmin_horizon, bool need_full_snapshot, bool fast_forward, - LogicalDecodingXLogPageReadCB page_read, - WALSegmentCleanupCB cleanup_cb, + XLogReaderRoutine *xl_routine, LogicalOutputPluginWriterPrepareWrite prepare_write, LogicalOutputPluginWriterWrite do_write, LogicalOutputPluginWriterUpdateProgress update_progress) @@ -199,12 +198,11 @@ StartupDecodingContext(List *output_plugin_options, ctx->slot = slot; - ctx->reader = XLogReaderAllocate(wal_segment_size, NULL, cleanup_cb); + ctx->reader = XLogReaderAllocate(wal_segment_size, NULL, xl_routine, ctx); if (!ctx->reader) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); - ctx->page_read = page_read; ctx->reorder = ReorderBufferAllocate(); ctx->snapshot_builder = @@ -321,8 +319,7 @@ CreateInitDecodingContext(const char *plugin, List *output_plugin_options, bool need_full_snapshot, XLogRecPtr restart_lsn, - LogicalDecodingXLogPageReadCB page_read, - WALSegmentCleanupCB cleanup_cb, + XLogReaderRoutine *xl_routine, LogicalOutputPluginWriterPrepareWrite prepare_write, LogicalOutputPluginWriterWrite do_write, LogicalOutputPluginWriterUpdateProgress update_progress) @@ -425,7 +422,7 @@ CreateInitDecodingContext(const char *plugin, ctx = StartupDecodingContext(NIL, restart_lsn, xmin_horizon, need_full_snapshot, false, - page_read, cleanup_cb, prepare_write, do_write, + xl_routine, prepare_write, do_write, update_progress); /* call output plugin initialization callback */ @@ -479,8 +476,7 @@ LogicalDecodingContext * CreateDecodingContext(XLogRecPtr start_lsn, List *output_plugin_options, bool fast_forward, - LogicalDecodingXLogPageReadCB page_read, - WALSegmentCleanupCB cleanup_cb, + XLogReaderRoutine *xl_routine, LogicalOutputPluginWriterPrepareWrite prepare_write, LogicalOutputPluginWriterWrite do_write, LogicalOutputPluginWriterUpdateProgress update_progress) @@ -532,8 +528,8 @@ CreateDecodingContext(XLogRecPtr start_lsn, ctx = StartupDecodingContext(output_plugin_options, start_lsn, InvalidTransactionId, false, - fast_forward, page_read, cleanup_cb, - prepare_write, do_write, update_progress); + fast_forward, xl_routine, prepare_write, + do_write, update_progress); /* call output plugin initialization callback */ old_context = MemoryContextSwitchTo(ctx->context); @@ -589,13 +585,7 @@ DecodingContextFindStartpoint(LogicalDecodingContext *ctx) char *err = NULL; /* the read_page callback waits for new WAL */ - while (XLogReadRecord(ctx->reader, &record, &err) == - XLREAD_NEED_DATA) - { - if (!ctx->page_read(ctx->reader)) - break; - } - + record = XLogReadRecord(ctx->reader, &err); if (err) elog(ERROR, "%s", err); if (!record) diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c index 8f8c129620..01d354829b 100644 --- a/src/backend/replication/logical/logicalfuncs.c +++ b/src/backend/replication/logical/logicalfuncs.c @@ -233,8 +233,9 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin ctx = CreateDecodingContext(InvalidXLogRecPtr, options, false, - read_local_xlog_page, - wal_segment_close, + XL_ROUTINE(.page_read = read_local_xlog_page, + .segment_open = wal_segment_open, + .segment_close = wal_segment_close), LogicalOutputPrepareWrite, LogicalOutputWrite, NULL); @@ -283,13 +284,7 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin XLogRecord *record; char *errm = NULL; - while (XLogReadRecord(ctx->reader, &record, &errm) == - XLREAD_NEED_DATA) - { - if (!ctx->page_read(ctx->reader)) - break; - } - + record = XLogReadRecord(ctx->reader, &errm); if (errm) elog(ERROR, "%s", errm); diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 7ab0b804e4..d9d36879ed 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -153,8 +153,9 @@ create_logical_replication_slot(char *name, char *plugin, ctx = CreateInitDecodingContext(plugin, NIL, false, /* just catalogs is OK */ restart_lsn, - read_local_xlog_page, - wal_segment_close, + XL_ROUTINE(.page_read = read_local_xlog_page, + .segment_open = wal_segment_open, + .segment_close = wal_segment_close), NULL, NULL, NULL); /* @@ -511,8 +512,9 @@ pg_logical_replication_slot_advance(XLogRecPtr moveto) ctx = CreateDecodingContext(InvalidXLogRecPtr, NIL, true, /* fast_forward */ - read_local_xlog_page, - wal_segment_close, + XL_ROUTINE(.page_read = read_local_xlog_page, + .segment_open = wal_segment_open, + .segment_close = wal_segment_close), NULL, NULL, NULL); /* @@ -534,13 +536,7 @@ pg_logical_replication_slot_advance(XLogRecPtr moveto) * Read records. No changes are generated in fast_forward mode, * but snapbuilder/slot statuses are updated properly. */ - while (XLogReadRecord(ctx->reader, &record, &errm) == - XLREAD_NEED_DATA) - { - if (!ctx->page_read(ctx->reader)) - break; - } - + record = XLogReadRecord(ctx->reader, &errm); if (errm) elog(ERROR, "%s", errm); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 6fefc3bedc..628c8d49d9 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -580,7 +580,10 @@ StartReplication(StartReplicationCmd *cmd) /* create xlogreader for physical replication */ xlogreader = - XLogReaderAllocate(wal_segment_size, NULL, wal_segment_close); + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), + NULL); if (!xlogreader) ereport(ERROR, @@ -803,12 +806,10 @@ StartReplication(StartReplicationCmd *cmd) * which has to do a plain sleep/busy loop, because the walsender's latch gets * set every time WAL is flushed. */ -static bool -logical_read_xlog_page(XLogReaderState *state) +static int +logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *cur_page) { - XLogRecPtr targetPagePtr = state->readPagePtr; - int reqLen = state->reqLen; - char *cur_page = state->readBuf; XLogRecPtr flushptr; int count; WALReadError errinfo; @@ -825,10 +826,7 @@ logical_read_xlog_page(XLogReaderState *state) /* fail if not (implies we are going to shut down) */ if (flushptr < targetPagePtr + reqLen) - { - XLogReaderSetInputData(state, -1); - return false; - } + return -1; if (targetPagePtr + XLOG_BLCKSZ <= flushptr) count = XLOG_BLCKSZ; /* more than one block available */ @@ -836,7 +834,7 @@ logical_read_xlog_page(XLogReaderState *state) count = flushptr - targetPagePtr; /* part of the page available */ /* now actually read the data, we know it's there */ - if (!WALRead(state, WalSndSegmentOpen, wal_segment_close, + if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, @@ -856,8 +854,7 @@ logical_read_xlog_page(XLogReaderState *state) XLByteToSeg(targetPagePtr, segno, state->segcxt.ws_segsize); CheckXLogRemoved(segno, state->seg.ws_tli); - XLogReaderSetInputData(state, count); - return true; + return count; } /* @@ -1010,8 +1007,9 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) ctx = CreateInitDecodingContext(cmd->plugin, NIL, need_full_snapshot, InvalidXLogRecPtr, - logical_read_xlog_page, - wal_segment_close, + XL_ROUTINE(.page_read = logical_read_xlog_page, + .segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), WalSndPrepareWrite, WalSndWriteData, WalSndUpdateProgress); @@ -1169,8 +1167,9 @@ StartLogicalReplication(StartReplicationCmd *cmd) */ logical_decoding_ctx = CreateDecodingContext(cmd->startpoint, cmd->options, false, - logical_read_xlog_page, - wal_segment_close, + XL_ROUTINE(.page_read = logical_read_xlog_page, + .segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), WalSndPrepareWrite, WalSndWriteData, WalSndUpdateProgress); xlogreader = logical_decoding_ctx->reader; @@ -2763,7 +2762,7 @@ XLogSendPhysical(void) enlargeStringInfo(&output_message, nbytes); retry: - if (!WALRead(xlogreader, WalSndSegmentOpen, wal_segment_close, + if (!WALRead(xlogreader, &output_message.data[output_message.len], startptr, nbytes, @@ -2861,12 +2860,7 @@ XLogSendLogical(void) */ WalSndCaughtUp = false; - while (XLogReadRecord(logical_decoding_ctx->reader, &record, &errm) == - XLREAD_NEED_DATA) - { - if (!logical_decoding_ctx->page_read(logical_decoding_ctx->reader)) - break; - } + record = XLogReadRecord(logical_decoding_ctx->reader, &errm); /* xlog record was invalid */ if (errm != NULL) diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index cfa0414e5a..8c12dda238 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -210,8 +210,7 @@ XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, blkno = fsm_logical_to_physical(addr); /* If the page doesn't exist already, extend */ - buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, - InvalidBuffer); + buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 47847563ef..3e4ec53a97 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -22,7 +22,6 @@ #include "access/subtrans.h" #include "access/syncscan.h" #include "access/twophase.h" -#include "access/xlogprefetch.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -127,7 +126,6 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, PredicateLockShmemSize()); size = add_size(size, ProcGlobalShmemSize()); size = add_size(size, XLOGShmemSize()); - size = add_size(size, XLogPrefetchShmemSize()); size = add_size(size, CLOGShmemSize()); size = add_size(size, CommitTsShmemSize()); size = add_size(size, SUBTRANSShmemSize()); @@ -219,7 +217,6 @@ CreateSharedMemoryAndSemaphores(void) * Set up xlog, clog, and buffers */ XLOGShmemInit(); - XLogPrefetchShmemInit(); CLOGShmemInit(); CommitTsShmemInit(); SUBTRANSShmemInit(); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 9db40b134a..0a180341c2 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -41,7 +41,6 @@ #include "access/twophase.h" #include "access/xact.h" #include "access/xlog_internal.h" -#include "access/xlogprefetch.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" #include "catalog/storage.h" @@ -210,7 +209,6 @@ static bool check_effective_io_concurrency(int *newval, void **extra, GucSource static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source); static bool check_huge_page_size(int *newval, void **extra, GucSource source); static bool check_client_connection_check_interval(int *newval, void **extra, GucSource source); -static void assign_maintenance_io_concurrency(int newval, void *extra); static void assign_pgstat_temp_directory(const char *newval, void *extra); static bool check_application_name(char **newval, void **extra, GucSource source); static void assign_application_name(const char *newval, void *extra); @@ -727,8 +725,6 @@ const char *const config_group_names[] = gettext_noop("Write-Ahead Log / Checkpoints"), /* WAL_ARCHIVING */ gettext_noop("Write-Ahead Log / Archiving"), - /* WAL_RECOVERY */ - gettext_noop("Write-Ahead Log / Recovery"), /* WAL_ARCHIVE_RECOVERY */ gettext_noop("Write-Ahead Log / Archive Recovery"), /* WAL_RECOVERY_TARGET */ @@ -1280,27 +1276,6 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, - { - {"recovery_prefetch", PGC_SIGHUP, WAL_RECOVERY, - gettext_noop("Prefetch referenced blocks during recovery."), - gettext_noop("Read ahead of the current replay position to find uncached blocks.") - }, - &recovery_prefetch, - false, - NULL, assign_recovery_prefetch, NULL - }, - { - {"recovery_prefetch_fpw", PGC_SIGHUP, WAL_RECOVERY, - gettext_noop("Prefetch blocks that have full page images in the WAL."), - gettext_noop("On some systems, there is no benefit to prefetching pages that will be " - "entirely overwritten, but if the logical page size of the filesystem is " - "larger than PostgreSQL's, this can be beneficial. This option has no " - "effect unless recovery_prefetch is enabled.") - }, - &recovery_prefetch_fpw, - false, - NULL, assign_recovery_prefetch_fpw, NULL - }, { {"wal_log_hints", PGC_POSTMASTER, WAL_SETTINGS, @@ -2755,17 +2730,6 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, - { - {"wal_decode_buffer_size", PGC_POSTMASTER, WAL_RECOVERY, - gettext_noop("Maximum buffer size for reading ahead in the WAL during recovery."), - gettext_noop("This controls the maximum distance we can read ahead in the WAL to prefetch referenced blocks."), - GUC_UNIT_BYTE - }, - &wal_decode_buffer_size, - 512 * 1024, 64 * 1024, INT_MAX, - NULL, NULL, NULL - }, - { {"wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the size of WAL files held for standby servers."), @@ -3086,8 +3050,7 @@ static struct config_int ConfigureNamesInt[] = 0, #endif 0, MAX_IO_CONCURRENCY, - check_maintenance_io_concurrency, assign_maintenance_io_concurrency, - NULL + check_maintenance_io_concurrency, NULL, NULL }, { @@ -12091,20 +12054,6 @@ check_client_connection_check_interval(int *newval, void **extra, GucSource sour return true; } -static void -assign_maintenance_io_concurrency(int newval, void *extra) -{ -#ifdef USE_PREFETCH - /* - * Reconfigure recovery prefetching, because a setting it depends on - * changed. - */ - maintenance_io_concurrency = newval; - if (AmStartupProcess()) - XLogPrefetchReconfigure(); -#endif -} - static void assign_pgstat_temp_directory(const char *newval, void *extra) { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 3307d3a635..efde01ee56 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -250,12 +250,6 @@ #archive_timeout = 0 # force a logfile segment switch after this # number of seconds; 0 disables -# - Recovery - - -#recovery_prefetch = off # prefetch pages referenced in the WAL? -#recovery_prefetch_fpw = off # even pages logged with full page? -#wal_decode_buffer_size = 512kB # lookahead window used for prefetching - # - Archive Recovery - # These are only used in recovery mode. diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 81e186270a..59ebac7d6a 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -41,9 +41,15 @@ static int xlogreadfd = -1; static XLogSegNo xlogreadsegno = -1; static char xlogfpath[MAXPGPATH]; -static bool SimpleXLogPageRead(XLogReaderState *xlogreader, - const char *datadir, int *tliIndex, - const char *restoreCommand); +typedef struct XLogPageReadPrivate +{ + const char *restoreCommand; + int tliIndex; +} XLogPageReadPrivate; + +static int SimpleXLogPageRead(XLogReaderState *xlogreader, + XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *readBuf); /* * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline @@ -60,22 +66,20 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogRecord *record; XLogReaderState *xlogreader; char *errormsg; + XLogPageReadPrivate private; - xlogreader = XLogReaderAllocate(WalSegSz, datadir, NULL); - + private.tliIndex = tliIndex; + private.restoreCommand = restoreCommand; + xlogreader = XLogReaderAllocate(WalSegSz, datadir, + XL_ROUTINE(.page_read = &SimpleXLogPageRead), + &private); if (xlogreader == NULL) pg_fatal("out of memory"); XLogBeginRead(xlogreader, startpoint); do { - while (XLogReadRecord(xlogreader, &record, &errormsg) == - XLREAD_NEED_DATA) - { - if (!SimpleXLogPageRead(xlogreader, datadir, - &tliIndex, restoreCommand)) - break; - } + record = XLogReadRecord(xlogreader, &errormsg); if (record == NULL) { @@ -119,19 +123,19 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, XLogRecord *record; XLogReaderState *xlogreader; char *errormsg; + XLogPageReadPrivate private; XLogRecPtr endptr; - xlogreader = XLogReaderAllocate(WalSegSz, datadir, NULL); + private.tliIndex = tliIndex; + private.restoreCommand = restoreCommand; + xlogreader = XLogReaderAllocate(WalSegSz, datadir, + XL_ROUTINE(.page_read = &SimpleXLogPageRead), + &private); if (xlogreader == NULL) pg_fatal("out of memory"); XLogBeginRead(xlogreader, ptr); - while (XLogReadRecord(xlogreader, &record, &errormsg) == - XLREAD_NEED_DATA) - { - if (!SimpleXLogPageRead(xlogreader, datadir, &tliIndex, restoreCommand)) - break; - } + record = XLogReadRecord(xlogreader, &errormsg); if (record == NULL) { if (errormsg) @@ -166,6 +170,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, XLogRecPtr searchptr; XLogReaderState *xlogreader; char *errormsg; + XLogPageReadPrivate private; /* * The given fork pointer points to the end of the last common record, @@ -181,7 +186,11 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, forkptr += SizeOfXLogShortPHD; } - xlogreader = XLogReaderAllocate(WalSegSz, datadir, NULL); + private.tliIndex = tliIndex; + private.restoreCommand = restoreCommand; + xlogreader = XLogReaderAllocate(WalSegSz, datadir, + XL_ROUTINE(.page_read = &SimpleXLogPageRead), + &private); if (xlogreader == NULL) pg_fatal("out of memory"); @@ -191,13 +200,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, uint8 info; XLogBeginRead(xlogreader, searchptr); - while (XLogReadRecord(xlogreader, &record, &errormsg) == - XLREAD_NEED_DATA) - { - if (!SimpleXLogPageRead(xlogreader, datadir, - &tliIndex, restoreCommand)) - break; - } + record = XLogReadRecord(xlogreader, &errormsg); if (record == NULL) { @@ -243,19 +246,16 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, } /* XLogReader callback function, to read a WAL page */ -static bool -SimpleXLogPageRead(XLogReaderState *xlogreader, const char *datadir, - int *tliIndex, const char *restoreCommand) +static int +SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *readBuf) { - XLogRecPtr targetPagePtr = xlogreader->readPagePtr; - char *readBuf = xlogreader->readBuf; + XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; uint32 targetPageOff; XLogRecPtr targetSegEnd; XLogSegNo targetSegNo; int r; - Assert(xlogreader->reqLen <= XLOG_BLCKSZ); - XLByteToSeg(targetPagePtr, targetSegNo, WalSegSz); XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, WalSegSz, targetSegEnd); targetPageOff = XLogSegmentOffset(targetPagePtr, WalSegSz); @@ -283,14 +283,14 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, const char *datadir, * be done both forward and backward, consider also switching timeline * accordingly. */ - while (*tliIndex < targetNentries - 1 && - targetHistory[*tliIndex].end < targetSegEnd) - (*tliIndex)++; - while (*tliIndex > 0 && - targetHistory[*tliIndex].begin >= targetSegEnd) - (*tliIndex)--; + while (private->tliIndex < targetNentries - 1 && + targetHistory[private->tliIndex].end < targetSegEnd) + private->tliIndex++; + while (private->tliIndex > 0 && + targetHistory[private->tliIndex].begin >= targetSegEnd) + private->tliIndex--; - XLogFileName(xlogfname, targetHistory[*tliIndex].tli, + XLogFileName(xlogfname, targetHistory[private->tliIndex].tli, xlogreadsegno, WalSegSz); snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", @@ -303,11 +303,10 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, const char *datadir, /* * If we have no restore_command to execute, then exit. */ - if (restoreCommand == NULL) + if (private->restoreCommand == NULL) { pg_log_error("could not open file \"%s\": %m", xlogfpath); - XLogReaderSetInputData(xlogreader, -1); - return false; + return -1; } /* @@ -317,13 +316,10 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, const char *datadir, xlogreadfd = RestoreArchivedFile(xlogreader->segcxt.ws_dir, xlogfname, WalSegSz, - restoreCommand); + private->restoreCommand); if (xlogreadfd < 0) - { - XLogReaderSetInputData(xlogreader, -1); - return false; - } + return -1; else pg_log_debug("using file \"%s\" restored from archive", xlogfpath); @@ -339,8 +335,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, const char *datadir, if (lseek(xlogreadfd, (off_t) targetPageOff, SEEK_SET) < 0) { pg_log_error("could not seek in file \"%s\": %m", xlogfpath); - XLogReaderSetInputData(xlogreader, -1); - return false; + return -1; } @@ -353,15 +348,13 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, const char *datadir, pg_log_error("could not read file \"%s\": read %d of %zu", xlogfpath, r, (Size) XLOG_BLCKSZ); - XLogReaderSetInputData(xlogreader, -1); - return false; + return -1; } Assert(targetSegNo == xlogreadsegno); - xlogreader->seg.ws_tli = targetHistory[*tliIndex].tli; - XLogReaderSetInputData(xlogreader, XLOG_BLCKSZ); - return true; + xlogreader->seg.ws_tli = targetHistory[private->tliIndex].tli; + return XLOG_BLCKSZ; } /* @@ -439,7 +432,7 @@ extractPageInfo(XLogReaderState *record) RmgrNames[rmid], info); } - for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (block_id = 0; block_id <= record->max_block_id; block_id++) { RelFileNode rnode; ForkNumber forknum; diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 4ec273e6d2..f8b8afe4a7 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -29,6 +29,14 @@ static const char *progname; static int WalSegSz; +typedef struct XLogDumpPrivate +{ + TimeLineID timeline; + XLogRecPtr startptr; + XLogRecPtr endptr; + bool endptr_reached; +} XLogDumpPrivate; + typedef struct XLogDumpConfig { /* display options */ @@ -322,41 +330,30 @@ WALDumpCloseSegment(XLogReaderState *state) state->seg.ws_file = -1; } -/* - * pg_waldump's WAL page reader - * - * timeline and startptr specifies the LSN, and reads up to endptr. - */ -static bool -WALDumpReadPage(XLogReaderState *state, TimeLineID timeline, - XLogRecPtr startptr, XLogRecPtr endptr) +/* pg_waldump's XLogReaderRoutine->page_read callback */ +static int +WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetPtr, char *readBuff) { - XLogRecPtr targetPagePtr = state->readPagePtr; - int reqLen = state->reqLen; - char *readBuff = state->readBuf; + XLogDumpPrivate *private = state->private_data; int count = XLOG_BLCKSZ; WALReadError errinfo; - /* determine the number of bytes to read on the page */ - if (endptr != InvalidXLogRecPtr) + if (private->endptr != InvalidXLogRecPtr) { - if (targetPagePtr + XLOG_BLCKSZ <= endptr) + if (targetPagePtr + XLOG_BLCKSZ <= private->endptr) count = XLOG_BLCKSZ; - else if (targetPagePtr + reqLen <= endptr) - count = endptr - targetPagePtr; + else if (targetPagePtr + reqLen <= private->endptr) + count = private->endptr - targetPagePtr; else { - /* Notify xlogreader that we didn't read at all */ - XLogReaderSetInputData(state, -1); - return false; + private->endptr_reached = true; + return -1; } } - /* We should read more than requested by xlogreader */ - Assert(count >= state->readLen); - - if (!WALRead(state, WALDumpOpenSegment, WALDumpCloseSegment, - readBuff, targetPagePtr, count, timeline, &errinfo)) + if (!WALRead(state, readBuff, targetPagePtr, count, private->timeline, + &errinfo)) { WALOpenSegment *seg = &errinfo.wre_seg; char fname[MAXPGPATH]; @@ -376,9 +373,7 @@ WALDumpReadPage(XLogReaderState *state, TimeLineID timeline, (Size) errinfo.wre_req); } - /* Notify xlogreader of how many bytes we have read */ - XLogReaderSetInputData(state, count); - return true; + return count; } /* @@ -397,10 +392,10 @@ XLogDumpRecordLen(XLogReaderState *record, uint32 *rec_len, uint32 *fpi_len) * add an accessor macro for this. */ *fpi_len = 0; - for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (block_id = 0; block_id <= record->max_block_id; block_id++) { if (XLogRecHasBlockImage(record, block_id)) - *fpi_len += record->record->blocks[block_id].bimg_len; + *fpi_len += record->blocks[block_id].bimg_len; } /* @@ -498,7 +493,7 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) if (!config->bkp_details) { /* print block references (short format) */ - for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (block_id = 0; block_id <= record->max_block_id; block_id++) { if (!XLogRecHasBlockRef(record, block_id)) continue; @@ -529,7 +524,7 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) { /* print block references (detailed format) */ putchar('\n'); - for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + for (block_id = 0; block_id <= record->max_block_id; block_id++) { if (!XLogRecHasBlockRef(record, block_id)) continue; @@ -542,26 +537,26 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) blk); if (XLogRecHasBlockImage(record, block_id)) { - if (record->record->blocks[block_id].bimg_info & + if (record->blocks[block_id].bimg_info & BKPIMAGE_IS_COMPRESSED) { printf(" (FPW%s); hole: offset: %u, length: %u, " "compression saved: %u", XLogRecBlockImageApply(record, block_id) ? "" : " for WAL verification", - record->record->blocks[block_id].hole_offset, - record->record->blocks[block_id].hole_length, + record->blocks[block_id].hole_offset, + record->blocks[block_id].hole_length, BLCKSZ - - record->record->blocks[block_id].hole_length - - record->record->blocks[block_id].bimg_len); + record->blocks[block_id].hole_length - + record->blocks[block_id].bimg_len); } else { printf(" (FPW%s); hole: offset: %u, length: %u", XLogRecBlockImageApply(record, block_id) ? "" : " for WAL verification", - record->record->blocks[block_id].hole_offset, - record->record->blocks[block_id].hole_length); + record->blocks[block_id].hole_offset, + record->blocks[block_id].hole_length); } } putchar('\n'); @@ -759,10 +754,7 @@ main(int argc, char **argv) uint32 xlogid; uint32 xrecoff; XLogReaderState *xlogreader_state; - XLogFindNextRecordState *findnext_state; - TimeLineID timeline; - XLogRecPtr startptr; - XLogRecPtr endptr; + XLogDumpPrivate private; XLogDumpConfig config; XLogDumpStats stats; XLogRecord *record; @@ -808,12 +800,14 @@ main(int argc, char **argv) } } + memset(&private, 0, sizeof(XLogDumpPrivate)); memset(&config, 0, sizeof(XLogDumpConfig)); memset(&stats, 0, sizeof(XLogDumpStats)); - timeline = 1; - startptr = InvalidXLogRecPtr; - endptr = InvalidXLogRecPtr; + private.timeline = 1; + private.startptr = InvalidXLogRecPtr; + private.endptr = InvalidXLogRecPtr; + private.endptr_reached = false; config.quiet = false; config.bkp_details = false; @@ -847,7 +841,7 @@ main(int argc, char **argv) optarg); goto bad_argument; } - endptr = (uint64) xlogid << 32 | xrecoff; + private.endptr = (uint64) xlogid << 32 | xrecoff; break; case 'f': config.follow = true; @@ -900,10 +894,10 @@ main(int argc, char **argv) goto bad_argument; } else - startptr = (uint64) xlogid << 32 | xrecoff; + private.startptr = (uint64) xlogid << 32 | xrecoff; break; case 't': - if (sscanf(optarg, "%d", &timeline) != 1) + if (sscanf(optarg, "%d", &private.timeline) != 1) { pg_log_error("could not parse timeline \"%s\"", optarg); goto bad_argument; @@ -980,21 +974,21 @@ main(int argc, char **argv) close(fd); /* parse position from file */ - XLogFromFileName(fname, &timeline, &segno, WalSegSz); + XLogFromFileName(fname, &private.timeline, &segno, WalSegSz); - if (XLogRecPtrIsInvalid(startptr)) - XLogSegNoOffsetToRecPtr(segno, 0, WalSegSz, startptr); - else if (!XLByteInSeg(startptr, segno, WalSegSz)) + if (XLogRecPtrIsInvalid(private.startptr)) + XLogSegNoOffsetToRecPtr(segno, 0, WalSegSz, private.startptr); + else if (!XLByteInSeg(private.startptr, segno, WalSegSz)) { pg_log_error("start WAL location %X/%X is not inside file \"%s\"", - LSN_FORMAT_ARGS(startptr), + LSN_FORMAT_ARGS(private.startptr), fname); goto bad_argument; } /* no second file specified, set end position */ - if (!(optind + 1 < argc) && XLogRecPtrIsInvalid(endptr)) - XLogSegNoOffsetToRecPtr(segno + 1, 0, WalSegSz, endptr); + if (!(optind + 1 < argc) && XLogRecPtrIsInvalid(private.endptr)) + XLogSegNoOffsetToRecPtr(segno + 1, 0, WalSegSz, private.endptr); /* parse ENDSEG if passed */ if (optind + 1 < argc) @@ -1010,26 +1004,26 @@ main(int argc, char **argv) close(fd); /* parse position from file */ - XLogFromFileName(fname, &timeline, &endsegno, WalSegSz); + XLogFromFileName(fname, &private.timeline, &endsegno, WalSegSz); if (endsegno < segno) fatal_error("ENDSEG %s is before STARTSEG %s", argv[optind + 1], argv[optind]); - if (XLogRecPtrIsInvalid(endptr)) + if (XLogRecPtrIsInvalid(private.endptr)) XLogSegNoOffsetToRecPtr(endsegno + 1, 0, WalSegSz, - endptr); + private.endptr); /* set segno to endsegno for check of --end */ segno = endsegno; } - if (!XLByteInSeg(endptr, segno, WalSegSz) && - endptr != (segno + 1) * WalSegSz) + if (!XLByteInSeg(private.endptr, segno, WalSegSz) && + private.endptr != (segno + 1) * WalSegSz) { pg_log_error("end WAL location %X/%X is not inside file \"%s\"", - LSN_FORMAT_ARGS(endptr), + LSN_FORMAT_ARGS(private.endptr), argv[argc - 1]); goto bad_argument; } @@ -1038,7 +1032,7 @@ main(int argc, char **argv) waldir = identify_target_directory(waldir, NULL); /* we don't know what to print */ - if (XLogRecPtrIsInvalid(startptr)) + if (XLogRecPtrIsInvalid(private.startptr)) { pg_log_error("no start WAL location given"); goto bad_argument; @@ -1048,56 +1042,42 @@ main(int argc, char **argv) /* we have everything we need, start reading */ xlogreader_state = - XLogReaderAllocate(WalSegSz, waldir, WALDumpCloseSegment); - + XLogReaderAllocate(WalSegSz, waldir, + XL_ROUTINE(.page_read = WALDumpReadPage, + .segment_open = WALDumpOpenSegment, + .segment_close = WALDumpCloseSegment), + &private); if (!xlogreader_state) fatal_error("out of memory"); - findnext_state = - InitXLogFindNextRecord(xlogreader_state, startptr); - - if (!findnext_state) - fatal_error("out of memory"); - /* first find a valid recptr to start from */ - while (XLogFindNextRecord(findnext_state)) - { - if (!WALDumpReadPage(xlogreader_state, timeline, startptr, endptr)) - break; - } + first_record = XLogFindNextRecord(xlogreader_state, private.startptr); - first_record = findnext_state->currRecPtr; if (first_record == InvalidXLogRecPtr) fatal_error("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(startptr)); + LSN_FORMAT_ARGS(private.startptr)); /* * Display a message that we're skipping data if `from` wasn't a pointer * to the start of a record and also wasn't a pointer to the beginning of * a segment (e.g. we were used in file mode). */ - if (first_record != startptr && - XLogSegmentOffset(startptr, WalSegSz) != 0) + if (first_record != private.startptr && + XLogSegmentOffset(private.startptr, WalSegSz) != 0) printf(ngettext("first record is after %X/%X, at %X/%X, skipping over %u byte\n", "first record is after %X/%X, at %X/%X, skipping over %u bytes\n", - (first_record - startptr)), - LSN_FORMAT_ARGS(startptr), + (first_record - private.startptr)), + LSN_FORMAT_ARGS(private.startptr), LSN_FORMAT_ARGS(first_record), - (uint32) (first_record - startptr)); + (uint32) (first_record - private.startptr)); for (;;) { /* try to read the next record */ - while (XLogReadRecord(xlogreader_state, &record, &errormsg) == - XLREAD_NEED_DATA) - { - if (!WALDumpReadPage(xlogreader_state, timeline, startptr, endptr)) - break; - } - + record = XLogReadRecord(xlogreader_state, &errormsg); if (!record) { - if (!config.follow) + if (!config.follow || private.endptr_reached) break; else { diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index f542af0a26..77187c12be 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -132,7 +132,6 @@ extern char *PrimaryConnInfo; extern char *PrimarySlotName; extern bool wal_receiver_create_temp_slot; extern bool track_wal_io_timing; -extern int wal_decode_buffer_size; /* indirectly set via GUC system */ extern TransactionId recoveryTargetXid; diff --git a/src/include/access/xlogprefetch.h b/src/include/access/xlogprefetch.h deleted file mode 100644 index 0a7902ee47..0000000000 --- a/src/include/access/xlogprefetch.h +++ /dev/null @@ -1,82 +0,0 @@ -/*------------------------------------------------------------------------- - * - * xlogprefetch.h - * Declarations for the recovery prefetching module. - * - * Portions Copyright (c) 2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * IDENTIFICATION - * src/include/access/xlogprefetch.h - *------------------------------------------------------------------------- - */ -#ifndef XLOGPREFETCH_H -#define XLOGPREFETCH_H - -#include "access/xlogreader.h" - -/* GUCs */ -extern bool recovery_prefetch; -extern bool recovery_prefetch_fpw; - -struct XLogPrefetcher; -typedef struct XLogPrefetcher XLogPrefetcher; - -extern int XLogPrefetchReconfigureCount; - -typedef struct XLogPrefetchState -{ - XLogReaderState *reader; - XLogPrefetcher *prefetcher; - int reconfigure_count; -} XLogPrefetchState; - -extern size_t XLogPrefetchShmemSize(void); -extern void XLogPrefetchShmemInit(void); - -extern void XLogPrefetchReconfigure(void); -extern void XLogPrefetchRequestResetStats(void); - -extern void XLogPrefetchBegin(XLogPrefetchState *state, XLogReaderState *reader); -extern void XLogPrefetchEnd(XLogPrefetchState *state); - -/* Functions exposed only for the use of XLogPrefetch(). */ -extern XLogPrefetcher *XLogPrefetcherAllocate(XLogReaderState *reader); -extern void XLogPrefetcherFree(XLogPrefetcher *prefetcher); -extern bool XLogPrefetcherReadAhead(XLogPrefetcher *prefetch, - XLogRecPtr replaying_lsn); - -/* - * Tell the prefetching module that we are now replaying a given LSN, so that - * it can decide how far ahead to read in the WAL, if configured. Return - * true if more data is needed by the reader. - */ -static inline bool -XLogPrefetch(XLogPrefetchState *state, XLogRecPtr replaying_lsn) -{ - /* - * Handle any configuration changes. Rather than trying to deal with - * various parameter changes, we just tear down and set up a new - * prefetcher if anything we depend on changes. - */ - if (unlikely(state->reconfigure_count != XLogPrefetchReconfigureCount)) - { - /* If we had a prefetcher, tear it down. */ - if (state->prefetcher) - { - XLogPrefetcherFree(state->prefetcher); - state->prefetcher = NULL; - } - /* If we want a prefetcher, set it up. */ - if (recovery_prefetch) - state->prefetcher = XLogPrefetcherAllocate(state->reader); - state->reconfigure_count = XLogPrefetchReconfigureCount; - } - - if (state->prefetcher) - return XLogPrefetcherReadAhead(state->prefetcher, replaying_lsn); - - return false; -} - -#endif diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 3b8af31a8f..21d200d3df 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -39,7 +39,6 @@ #endif #include "access/xlogrecord.h" -#include "storage/buf.h" /* WALOpenSegment represents a WAL segment being read. */ typedef struct WALOpenSegment @@ -57,17 +56,65 @@ typedef struct WALSegmentContext } WALSegmentContext; typedef struct XLogReaderState XLogReaderState; -typedef struct XLogFindNextRecordState XLogFindNextRecordState; -/* Function type definition for the segment cleanup callback */ -typedef void (*WALSegmentCleanupCB) (XLogReaderState *xlogreader); - -/* Function type definition for the open/close callbacks for WALRead() */ +/* Function type definitions for various xlogreader interactions */ +typedef int (*XLogPageReadCB) (XLogReaderState *xlogreader, + XLogRecPtr targetPagePtr, + int reqLen, + XLogRecPtr targetRecPtr, + char *readBuf); typedef void (*WALSegmentOpenCB) (XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p); typedef void (*WALSegmentCloseCB) (XLogReaderState *xlogreader); +typedef struct XLogReaderRoutine +{ + /* + * Data input callback + * + * This callback shall read at least reqLen valid bytes of the xlog page + * starting at targetPagePtr, and store them in readBuf. The callback + * shall return the number of bytes read (never more than XLOG_BLCKSZ), or + * -1 on failure. The callback shall sleep, if necessary, to wait for the + * requested bytes to become available. The callback will not be invoked + * again for the same page unless more than the returned number of bytes + * are needed. + * + * targetRecPtr is the position of the WAL record we're reading. Usually + * it is equal to targetPagePtr + reqLen, but sometimes xlogreader needs + * to read and verify the page or segment header, before it reads the + * actual WAL record it's interested in. In that case, targetRecPtr can + * be used to determine which timeline to read the page from. + * + * The callback shall set ->seg.ws_tli to the TLI of the file the page was + * read from. + */ + XLogPageReadCB page_read; + + /* + * Callback to open the specified WAL segment for reading. ->seg.ws_file + * shall be set to the file descriptor of the opened segment. In case of + * failure, an error shall be raised by the callback and it shall not + * return. + * + * "nextSegNo" is the number of the segment to be opened. + * + * "tli_p" is an input/output argument. WALRead() uses it to pass the + * timeline in which the new segment should be found, but the callback can + * use it to return the TLI that it actually opened. + */ + WALSegmentOpenCB segment_open; + + /* + * WAL segment close callback. ->seg.ws_file shall be set to a negative + * number. + */ + WALSegmentCloseCB segment_close; +} XLogReaderRoutine; + +#define XL_ROUTINE(...) &(XLogReaderRoutine){__VA_ARGS__} + typedef struct { /* Is this block ref in use? */ @@ -78,9 +125,6 @@ typedef struct ForkNumber forknum; BlockNumber blkno; - /* Workspace for remembering last known buffer holding this block. */ - Buffer recent_buffer; - /* copy of the fork_flags field from the XLogRecordBlockHeader */ uint8 flags; @@ -100,61 +144,12 @@ typedef struct uint16 data_bufsz; } DecodedBkpBlock; -/* Return code from XLogReadRecord */ -typedef enum XLogReadRecordResult -{ - XLREAD_SUCCESS, /* record is successfully read */ - XLREAD_NEED_DATA, /* need more data. see XLogReadRecord. */ - XLREAD_FULL, /* cannot hold more data while reading ahead */ - XLREAD_FAIL /* failed during reading a record */ -} XLogReadRecordResult; - -/* - * internal state of XLogReadRecord - * - * XLogReadState runs a state machine while reading a record. Theses states - * are not seen outside the function. Each state may repeat several times - * exiting requesting caller for new data. See the comment of XLogReadRecrod - * for details. - */ -typedef enum XLogReadRecordState -{ - XLREAD_NEXT_RECORD, - XLREAD_TOT_LEN, - XLREAD_FIRST_FRAGMENT, - XLREAD_CONTINUATION -} XLogReadRecordState; - -/* - * The decoded contents of a record. This occupies a contiguous region of - * memory, with main_data and blocks[n].data pointing to memory after the - * members declared here. - */ -typedef struct DecodedXLogRecord -{ - /* Private member used for resource management. */ - size_t size; /* total size of decoded record */ - bool oversized; /* outside the regular decode buffer? */ - struct DecodedXLogRecord *next; /* decoded record queue link */ - - /* Public members. */ - XLogRecPtr lsn; /* location */ - XLogRecPtr next_lsn; /* location of next record */ - XLogRecord header; /* header */ - RepOriginId record_origin; - TransactionId toplevel_xid; /* XID of top-level transaction */ - char *main_data; /* record's main data portion */ - uint32 main_data_len; /* main data portion's length */ - int max_block_id; /* highest block_id in use (-1 if none) */ - DecodedBkpBlock blocks[FLEXIBLE_ARRAY_MEMBER]; -} DecodedXLogRecord; - struct XLogReaderState { /* * Operational callbacks */ - WALSegmentCleanupCB cleanup_cb; + XLogReaderRoutine routine; /* ---------------------------------------- * Public parameters @@ -167,33 +162,19 @@ struct XLogReaderState */ uint64 system_identifier; + /* + * Opaque data for callbacks to use. Not used by XLogReader. + */ + void *private_data; + /* * Start and end point of last record read. EndRecPtr is also used as the * position to read next. Calling XLogBeginRead() sets EndRecPtr to the * starting position and ReadRecPtr to invalid. - * - * Start and end point of last record returned by XLogReadRecord(). These - * are also available as record->lsn and record->next_lsn. */ - XLogRecPtr ReadRecPtr; /* start of last record read or being read */ + XLogRecPtr ReadRecPtr; /* start of last record read */ XLogRecPtr EndRecPtr; /* end+1 of last record read */ - /* ---------------------------------------- - * Communication with page reader - * readBuf is XLOG_BLCKSZ bytes, valid up to at least reqLen bytes. - * ---------------------------------------- - */ - /* variables the clients of xlogreader can examine */ - XLogRecPtr readPagePtr; /* page pointer to read */ - int32 reqLen; /* bytes requested to the caller */ - char *readBuf; /* buffer to store data */ - bool page_verified; /* is the page header on the buffer verified? */ - bool record_verified;/* is the current record header verified? */ - - /* variables set by the client of xlogreader */ - int32 readLen; /* actual bytes copied into readBuf by client, - * which should be >= reqLen. Client should - * use XLogReaderSetInputData() to set. */ /* ---------------------------------------- * Decoded representation of current record @@ -201,17 +182,21 @@ struct XLogReaderState * Use XLogRecGet* functions to investigate the record; these fields * should not be accessed directly. * ---------------------------------------- - * Start and end point of the last record read and decoded by - * XLogReadRecordInternal(). NextRecPtr is also used as the position to - * decode next. Calling XLogBeginRead() sets NextRecPtr and EndRecPtr to - * the requested starting position. */ - XLogRecPtr DecodeRecPtr; /* start of last record decoded */ - XLogRecPtr NextRecPtr; /* end+1 of last record decoded */ - XLogRecPtr PrevRecPtr; /* start of previous record decoded */ + XLogRecord *decoded_record; /* currently decoded record */ - /* Last record returned by XLogReadRecord(). */ - DecodedXLogRecord *record; + char *main_data; /* record's main data portion */ + uint32 main_data_len; /* main data portion's length */ + uint32 main_data_bufsz; /* allocated size of the buffer */ + + RepOriginId record_origin; + + TransactionId toplevel_xid; /* XID of top-level transaction */ + + /* information about blocks referenced by the record. */ + DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1]; + + int max_block_id; /* highest block_id in use (-1 if none) */ /* ---------------------------------------- * private/internal state @@ -219,24 +204,11 @@ struct XLogReaderState */ /* - * Buffer for decoded records. This is a circular buffer, though - * individual records can't be split in the middle, so some space is often - * wasted at the end. Oversized records that don't fit in this space are - * allocated separately. + * Buffer for currently read page (XLOG_BLCKSZ bytes, valid up to at least + * readLen bytes) */ - char *decode_buffer; - size_t decode_buffer_size; - bool free_decode_buffer; /* need to free? */ - char *decode_buffer_head; /* write head */ - char *decode_buffer_tail; /* read head */ - - /* - * Queue of records that have been decoded. This is a linked list that - * usually consists of consecutive records in decode_buffer, but may also - * contain oversized records allocated with palloc(). - */ - DecodedXLogRecord *decode_queue_head; /* newest decoded record */ - DecodedXLogRecord *decode_queue_tail; /* oldest decoded record */ + char *readBuf; + uint32 readLen; /* last read XLOG position for data currently in readBuf */ WALSegmentContext segcxt; @@ -250,6 +222,8 @@ struct XLogReaderState XLogRecPtr latestPagePtr; TimeLineID latestPageTLI; + /* beginning of the WAL record being read. */ + XLogRecPtr currRecPtr; /* timeline to read it from, 0 if a lookup is required */ TimeLineID currTLI; @@ -276,70 +250,29 @@ struct XLogReaderState char *readRecordBuf; uint32 readRecordBufSize; - /* - * XLogReadRecordInternal() state - */ - XLogReadRecordState readRecordState; /* state machine state */ - int recordGotLen; /* amount of current record that has already - * been read */ - int recordRemainLen; /* length of current record that remains */ - XLogRecPtr recordContRecPtr; /* where the current record continues */ - - DecodedXLogRecord *decoding; /* record currently being decoded */ - /* Buffer to hold error message */ char *errormsg_buf; - bool errormsg_deferred; }; -struct XLogFindNextRecordState -{ - XLogReaderState *reader_state; - XLogRecPtr targetRecPtr; - XLogRecPtr currRecPtr; -}; - -/* Report that data is available for decoding. */ -static inline void -XLogReaderSetInputData(XLogReaderState *state, int32 len) -{ - state->readLen = len; -} - /* Get a new XLogReader */ extern XLogReaderState *XLogReaderAllocate(int wal_segment_size, const char *waldir, - WALSegmentCleanupCB cleanup_cb); + XLogReaderRoutine *routine, + void *private_data); +extern XLogReaderRoutine *LocalXLogReaderRoutine(void); /* Free an XLogReader */ extern void XLogReaderFree(XLogReaderState *state); -/* Optionally provide a circular decoding buffer to allow readahead. */ -extern void XLogReaderSetDecodeBuffer(XLogReaderState *state, - void *buffer, - size_t size); - /* Position the XLogReader to given record */ extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr); #ifdef FRONTEND -extern XLogFindNextRecordState *InitXLogFindNextRecord(XLogReaderState *reader_state, XLogRecPtr start_ptr); -extern bool XLogFindNextRecord(XLogFindNextRecordState *state); +extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr); #endif /* FRONTEND */ -/* Read the next record's header. Returns NULL on end-of-WAL or failure. */ -extern XLogReadRecordResult XLogReadRecord(XLogReaderState *state, - XLogRecord **record, - char **errormsg); - -/* Read the next decoded record. Returns NULL on end-of-WAL or failure. */ -extern XLogReadRecordResult XLogNextRecord(XLogReaderState *state, - DecodedXLogRecord **record, - char **errormsg); - -/* Try to read ahead, if there is space in the decoding buffer. */ -extern XLogReadRecordResult XLogReadAhead(XLogReaderState *state, - DecodedXLogRecord **record, - char **errormsg); +/* Read the next XLog record. Returns NULL on end-of-WAL or failure */ +extern struct XLogRecord *XLogReadRecord(XLogReaderState *state, + char **errormsg); /* Validate a page */ extern bool XLogReaderValidatePageHeader(XLogReaderState *state, @@ -359,38 +292,30 @@ typedef struct WALReadError } WALReadError; extern bool WALRead(XLogReaderState *state, - WALSegmentOpenCB segopenfn, WALSegmentCloseCB sgclosefn, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, WALReadError *errinfo); /* Functions for decoding an XLogRecord */ -extern size_t DecodeXLogRecordRequiredSpace(size_t xl_tot_len); -extern bool DecodeXLogRecord(XLogReaderState *state, - DecodedXLogRecord *decoded, - XLogRecord *record, - XLogRecPtr lsn, +extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errmsg); -#define XLogRecGetTotalLen(decoder) ((decoder)->record->header.xl_tot_len) -#define XLogRecGetPrev(decoder) ((decoder)->record->header.xl_prev) -#define XLogRecGetInfo(decoder) ((decoder)->record->header.xl_info) -#define XLogRecGetRmid(decoder) ((decoder)->record->header.xl_rmid) -#define XLogRecGetXid(decoder) ((decoder)->record->header.xl_xid) -#define XLogRecGetOrigin(decoder) ((decoder)->record->record_origin) -#define XLogRecGetTopXid(decoder) ((decoder)->record->toplevel_xid) -#define XLogRecGetData(decoder) ((decoder)->record->main_data) -#define XLogRecGetDataLen(decoder) ((decoder)->record->main_data_len) -#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->record->max_block_id >= 0) -#define XLogRecMaxBlockId(decoder) ((decoder)->record->max_block_id) -#define XLogRecGetBlock(decoder, i) (&(decoder)->record->blocks[(i)]) +#define XLogRecGetTotalLen(decoder) ((decoder)->decoded_record->xl_tot_len) +#define XLogRecGetPrev(decoder) ((decoder)->decoded_record->xl_prev) +#define XLogRecGetInfo(decoder) ((decoder)->decoded_record->xl_info) +#define XLogRecGetRmid(decoder) ((decoder)->decoded_record->xl_rmid) +#define XLogRecGetXid(decoder) ((decoder)->decoded_record->xl_xid) +#define XLogRecGetOrigin(decoder) ((decoder)->record_origin) +#define XLogRecGetTopXid(decoder) ((decoder)->toplevel_xid) +#define XLogRecGetData(decoder) ((decoder)->main_data) +#define XLogRecGetDataLen(decoder) ((decoder)->main_data_len) +#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->max_block_id >= 0) #define XLogRecHasBlockRef(decoder, block_id) \ - ((decoder)->record->max_block_id >= (block_id) && \ - (decoder)->record->blocks[block_id].in_use) + ((decoder)->blocks[block_id].in_use) #define XLogRecHasBlockImage(decoder, block_id) \ - ((decoder)->record->blocks[block_id].has_image) + ((decoder)->blocks[block_id].has_image) #define XLogRecBlockImageApply(decoder, block_id) \ - ((decoder)->record->blocks[block_id].apply_image) + ((decoder)->blocks[block_id].apply_image) #ifndef FRONTEND extern FullTransactionId XLogRecGetFullXid(XLogReaderState *record); @@ -401,8 +326,5 @@ extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size * extern bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum); -extern bool XLogRecGetRecentBuffer(XLogReaderState *record, uint8 block_id, - RelFileNode *rnode, ForkNumber *forknum, - BlockNumber *blknum, Buffer *recent_buffer); #endif /* XLOGREADER_H */ diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index bbc6085130..9ac602b674 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -42,13 +42,14 @@ extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, Buffer *buf); extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, - BlockNumber blkno, ReadBufferMode mode, - Buffer recent_buffer); + BlockNumber blkno, ReadBufferMode mode); extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); -extern bool read_local_xlog_page(XLogReaderState *state); +extern int read_local_xlog_page(XLogReaderState *state, + XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *cur_page); extern void wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index a54be88d7f..c8d445e4d9 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202105051 +#define CATALOG_VERSION_NO 202105091 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 91f0ea2212..26c3fc0f6b 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -6287,14 +6287,6 @@ prorettype => 'text', proargtypes => '', prosrc => 'pg_get_wal_replay_pause_state' }, -{ oid => '9085', descr => 'statistics: information about WAL prefetching', - proname => 'pg_stat_get_prefetch_recovery', prorows => '1', provolatile => 'v', - proretset => 't', prorettype => 'record', proargtypes => '', - proallargtypes => '{timestamptz,int8,int8,int8,int8,int8,int4,int4,float4,float4}', - proargmodes => '{o,o,o,o,o,o,o,o,o,o}', - proargnames => '{stats_reset,prefetch,skip_hit,skip_new,skip_fpw,skip_seq,distance,queue_depth,avg_distance,avg_queue_depth}', - prosrc => 'pg_stat_get_prefetch_recovery' }, - { oid => '2621', descr => 'reload configuration files', proname => 'pg_reload_conf', provolatile => 'v', prorettype => 'bool', proargtypes => '', prosrc => 'pg_reload_conf' }, diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 1ce363e7d1..72ff4a06d6 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -74,7 +74,6 @@ typedef enum StatMsgType PGSTAT_MTYPE_BGWRITER, PGSTAT_MTYPE_WAL, PGSTAT_MTYPE_SLRU, - PGSTAT_MTYPE_RECOVERYPREFETCH, PGSTAT_MTYPE_FUNCSTAT, PGSTAT_MTYPE_FUNCPURGE, PGSTAT_MTYPE_RECOVERYCONFLICT, @@ -198,19 +197,6 @@ typedef struct PgStat_TableXactStatus struct PgStat_TableXactStatus *next; /* next of same subxact */ } PgStat_TableXactStatus; -/* - * Recovery prefetching statistics persisted on disk by pgstat.c, but kept in - * shared memory by xlogprefetch.c. - */ -typedef struct PgStat_RecoveryPrefetchStats -{ - PgStat_Counter prefetch; - PgStat_Counter skip_hit; - PgStat_Counter skip_new; - PgStat_Counter skip_fpw; - PgStat_Counter skip_seq; - TimestampTz stat_reset_timestamp; -} PgStat_RecoveryPrefetchStats; /* ------------------------------------------------------------ * Message formats follow @@ -553,15 +539,6 @@ typedef struct PgStat_MsgReplSlot PgStat_Counter m_total_bytes; } PgStat_MsgReplSlot; -/* ---------- - * PgStat_MsgRecoveryPrefetch Sent by XLogPrefetch to save statistics. - * ---------- - */ -typedef struct PgStat_MsgRecoveryPrefetch -{ - PgStat_MsgHdr m_hdr; - PgStat_RecoveryPrefetchStats m_stats; -} PgStat_MsgRecoveryPrefetch; /* ---------- * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict @@ -725,7 +702,6 @@ typedef union PgStat_Msg PgStat_MsgBgWriter msg_bgwriter; PgStat_MsgWal msg_wal; PgStat_MsgSLRU msg_slru; - PgStat_MsgRecoveryPrefetch msg_recoveryprefetch; PgStat_MsgFuncstat msg_funcstat; PgStat_MsgFuncpurge msg_funcpurge; PgStat_MsgRecoveryConflict msg_recoveryconflict; @@ -1115,7 +1091,6 @@ extern void pgstat_twophase_postabort(TransactionId xid, uint16 info, extern void pgstat_send_archiver(const char *xlog, bool failed); extern void pgstat_send_bgwriter(void); -extern void pgstat_send_recoveryprefetch(PgStat_RecoveryPrefetchStats *stats); extern void pgstat_report_wal(void); extern bool pgstat_send_wal(bool force); @@ -1132,7 +1107,6 @@ extern PgStat_GlobalStats *pgstat_fetch_global(void); extern PgStat_WalStats *pgstat_fetch_stat_wal(void); extern PgStat_SLRUStats *pgstat_fetch_slru(void); extern PgStat_StatReplSlotEntry *pgstat_fetch_replslot(NameData slotname); -extern PgStat_RecoveryPrefetchStats *pgstat_fetch_recoveryprefetch(void); extern void pgstat_count_slru_page_zeroed(int slru_idx); extern void pgstat_count_slru_page_hit(int slru_idx); diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index 7dfcb7be18..af551d6f4e 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -29,8 +29,6 @@ typedef void (*LogicalOutputPluginWriterUpdateProgress) (struct LogicalDecodingC TransactionId xid ); -typedef bool (*LogicalDecodingXLogPageReadCB)(XLogReaderState *ctx); - typedef struct LogicalDecodingContext { /* memory context this is all allocated in */ @@ -41,7 +39,6 @@ typedef struct LogicalDecodingContext /* infrastructure pieces for decoding */ XLogReaderState *reader; - LogicalDecodingXLogPageReadCB page_read; struct ReorderBuffer *reorder; struct SnapBuild *snapshot_builder; @@ -108,16 +105,14 @@ extern LogicalDecodingContext *CreateInitDecodingContext(const char *plugin, List *output_plugin_options, bool need_full_snapshot, XLogRecPtr restart_lsn, - LogicalDecodingXLogPageReadCB page_read, - WALSegmentCleanupCB cleanup_cb, + XLogReaderRoutine *xl_routine, LogicalOutputPluginWriterPrepareWrite prepare_write, LogicalOutputPluginWriterWrite do_write, LogicalOutputPluginWriterUpdateProgress update_progress); extern LogicalDecodingContext *CreateDecodingContext(XLogRecPtr start_lsn, List *output_plugin_options, bool fast_forward, - LogicalDecodingXLogPageReadCB page_read, - WALSegmentCleanupCB cleanup_cb, + XLogReaderRoutine *xl_routine, LogicalOutputPluginWriterPrepareWrite prepare_write, LogicalOutputPluginWriterWrite do_write, LogicalOutputPluginWriterUpdateProgress update_progress); diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 7894940741..24a5d9d3fb 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -442,8 +442,4 @@ extern void assign_search_path(const char *newval, void *extra); extern bool check_wal_buffers(int *newval, void **extra, GucSource source); extern void assign_xlog_sync_method(int new_sync_method, void *extra); -/* in access/transam/xlogprefetch.c */ -extern void assign_recovery_prefetch(bool new_value, void *extra); -extern void assign_recovery_prefetch_fpw(bool new_value, void *extra); - #endif /* GUC_H */ diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 35aac5bbc7..6b40f1eeb8 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -67,7 +67,6 @@ enum config_group WAL_SETTINGS, WAL_CHECKPOINTS, WAL_ARCHIVING, - WAL_RECOVERY, WAL_ARCHIVE_RECOVERY, WAL_RECOVERY_TARGET, REPLICATION_SENDING, diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 572bc2057c..e5ab11275d 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1879,17 +1879,6 @@ pg_stat_gssapi| SELECT s.pid, s.gss_enc AS encrypted FROM pg_stat_get_activity(NULL::integer) s(datid, pid, usesysid, application_name, state, query, wait_event_type, wait_event, xact_start, query_start, backend_start, state_change, client_addr, client_hostname, client_port, backend_xid, backend_xmin, backend_type, ssl, sslversion, sslcipher, sslbits, ssl_client_dn, ssl_client_serial, ssl_issuer_dn, gss_auth, gss_princ, gss_enc, leader_pid, query_id) WHERE (s.client_port IS NOT NULL); -pg_stat_prefetch_recovery| SELECT s.stats_reset, - s.prefetch, - s.skip_hit, - s.skip_new, - s.skip_fpw, - s.skip_seq, - s.distance, - s.queue_depth, - s.avg_distance, - s.avg_queue_depth - FROM pg_stat_get_prefetch_recovery() s(stats_reset, prefetch, skip_hit, skip_new, skip_fpw, skip_seq, distance, queue_depth, avg_distance, avg_queue_depth); pg_stat_progress_analyze| SELECT s.pid, s.datid, d.datname, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 0f197a9c8d..1196febfa2 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2804,10 +2804,6 @@ XLogPageHeader XLogPageHeaderData XLogPageReadCB XLogPageReadPrivate -XLogPrefetcher -XLogPrefetcherFilter -XLogPrefetchState -XLogPrefetchStats XLogReaderRoutine XLogReaderState XLogRecData