From a507b86900f695aacc8d52b7d2cfcb65f58862a2 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 8 Feb 2017 15:45:30 -0500 Subject: [PATCH] Add WAL consistency checking facility. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the new GUC wal_consistency_checking is set to a non-empty value, it triggers recording of additional full-page images, which are compared on the standby against the results of applying the WAL record (without regard to those full-page images). Allowable differences such as hints are masked out, and the resulting pages are compared; any difference results in a FATAL error on the standby. Kuntal Ghosh, based on earlier patches by Michael Paquier and Heikki Linnakangas. Extensively reviewed and revised by Michael Paquier and by me, with additional reviews and comments from Amit Kapila, Álvaro Herrera, Simon Riggs, and Peter Eisentraut. --- doc/src/sgml/config.sgml | 32 ++++++ src/backend/access/brin/brin_xlog.c | 20 ++++ src/backend/access/common/Makefile | 4 +- src/backend/access/common/bufmask.c | 128 ++++++++++++++++++++++ src/backend/access/gin/ginxlog.c | 32 ++++++ src/backend/access/gist/gistxlog.c | 43 ++++++++ src/backend/access/heap/heapam.c | 79 +++++++++++++ src/backend/access/nbtree/nbtxlog.c | 50 +++++++++ src/backend/access/rmgrdesc/gindesc.c | 14 ++- src/backend/access/spgist/spgxlog.c | 21 ++++ src/backend/access/transam/generic_xlog.c | 12 ++ src/backend/access/transam/rmgr.c | 4 +- src/backend/access/transam/xlog.c | 120 ++++++++++++++++++++ src/backend/access/transam/xloginsert.c | 38 ++++++- src/backend/access/transam/xlogreader.c | 8 ++ src/backend/access/transam/xlogutils.c | 11 +- src/backend/commands/sequence.c | 12 ++ src/backend/utils/misc/guc.c | 97 ++++++++++++++++ src/bin/pg_rewind/parsexlog.c | 2 +- src/bin/pg_xlogdump/pg_xlogdump.c | 16 ++- src/bin/pg_xlogdump/rmgrdesc.c | 2 +- src/include/access/brin_xlog.h | 1 + src/include/access/bufmask.h | 33 ++++++ 
src/include/access/generic_xlog.h | 1 + src/include/access/gin.h | 1 + src/include/access/gist_private.h | 1 + src/include/access/heapam_xlog.h | 1 + src/include/access/nbtree.h | 1 + src/include/access/rmgr.h | 2 +- src/include/access/rmgrlist.h | 44 ++++---- src/include/access/spgist.h | 1 + src/include/access/xlog.h | 2 + src/include/access/xlog_internal.h | 6 +- src/include/access/xlogreader.h | 5 +- src/include/access/xlogrecord.h | 14 ++- src/include/commands/sequence.h | 1 + 36 files changed, 811 insertions(+), 48 deletions(-) create mode 100644 src/backend/access/common/bufmask.c create mode 100644 src/include/access/bufmask.h diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index fb5d6473ef..dc63d7d5e4 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -8184,6 +8184,38 @@ LOG: CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1) + + wal_consistency_checking (string) + + wal_consistency_checking configuration parameter + + + + + This parameter is intended to be used to check for bugs in the WAL + redo routines. When enabled, full-page images of any buffers modified + in conjunction with the WAL record are added to the record. + If the record is subsequently replayed, the system will first apply + each record and then test whether the buffers modified by the record + match the stored images. In certain cases (such as hint bits), minor + variations are acceptable, and will be ignored. Any unexpected + differences will result in a fatal error, terminating recovery. + + + + The default value of this setting is the empty string, which disables + the feature. It can be set to all to check all + records, or to a comma-separated list of resource managers to check + only records originating from those resource managers. Currently, + the supported resource managers are heap, + heap2, btree, gin, + gist, sequence, spgist, + brin, and generic. Only + superusers can change this setting. 
+ + + + wal_debug (boolean) diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index b698c9b58c..f416bacc3f 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -13,6 +13,7 @@ #include "access/brin_page.h" #include "access/brin_pageops.h" #include "access/brin_xlog.h" +#include "access/bufmask.h" #include "access/xlogutils.h" @@ -279,3 +280,22 @@ brin_redo(XLogReaderState *record) elog(PANIC, "brin_redo: unknown op code %u", info); } } + +/* + * Mask a BRIN page before doing consistency checks. + */ +void +brin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + + if (BRIN_IS_REGULAR_PAGE(page)) + { + /* Regular brin pages contain unused space which needs to be masked. */ + mask_unused_space(page); + } +} diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile index d4b8132a97..fb27944b89 100644 --- a/src/backend/access/common/Makefile +++ b/src/backend/access/common/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/access/common top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heaptuple.o indextuple.o printsimple.o printtup.o reloptions.o \ - scankey.o tupconvert.o tupdesc.o +OBJS = bufmask.o heaptuple.o indextuple.o printsimple.o printtup.o \ + reloptions.o scankey.o tupconvert.o tupdesc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c new file mode 100644 index 0000000000..3b06115e03 --- /dev/null +++ b/src/backend/access/common/bufmask.c @@ -0,0 +1,128 @@ +/*------------------------------------------------------------------------- + * + * bufmask.c + * Routines for buffer masking. Used to mask certain bits + * in a page which can be different when the WAL is generated + * and when the WAL is applied. 
+ * + * Portions Copyright (c) 2016, PostgreSQL Global Development Group + * + * Contains common routines required for masking a page. + * + * IDENTIFICATION + * src/backend/access/common/bufmask.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/bufmask.h" + +/* + * mask_page_lsn + * + * In consistency checks, the LSN of the two pages compared will likely be + * different because of concurrent operations when the WAL is generated + * and the state of the page when WAL is applied. + */ +void +mask_page_lsn(Page page) +{ + PageHeader phdr = (PageHeader) page; + + PageXLogRecPtrSet(phdr->pd_lsn, (uint64) MASK_MARKER); +} + +/* + * mask_page_hint_bits + * + * Mask hint bits in PageHeader. We want to ignore differences in hint bits, + * since they can be set without emitting any WAL. + */ +void +mask_page_hint_bits(Page page) +{ + PageHeader phdr = (PageHeader) page; + + /* Ignore prune_xid (it's like a hint-bit) */ + phdr->pd_prune_xid = MASK_MARKER; + + /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints. */ + PageClearFull(page); + PageClearHasFreeLinePointers(page); + + /* + * During replay, if the page LSN has advanced past our XLOG record's LSN, + * we don't mark the page all-visible. See heap_xlog_visible() for + * details. + */ + PageClearAllVisible(page); +} + +/* + * mask_unused_space + * + * Mask the unused space of a page between pd_lower and pd_upper. 
+ */ +void +mask_unused_space(Page page) +{ + int pd_lower = ((PageHeader) page)->pd_lower; + int pd_upper = ((PageHeader) page)->pd_upper; + int pd_special = ((PageHeader) page)->pd_special; + + /* Sanity check */ + if (pd_lower > pd_upper || pd_special < pd_upper || + pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + { + elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u\n", + pd_lower, pd_upper, pd_special); + } + + memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower); +} + +/* + * mask_lp_flags + * + * In some index AMs, line pointer flags can be modified in master without + * emitting any WAL record. + */ +void +mask_lp_flags(Page page) +{ + OffsetNumber offnum, + maxoff; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsUsed(itemId)) + itemId->lp_flags = LP_UNUSED; + } +} + +/* + * mask_page_content + * + * In some index AMs, the contents of deleted pages need to be almost + * completely ignored. + */ +void +mask_page_content(Page page) +{ + /* Mask Page Content */ + memset(page + SizeOfPageHeaderData, MASK_MARKER, + BLCKSZ - SizeOfPageHeaderData); + + /* Mask pd_lower and pd_upper */ + memset(&((PageHeader) page)->pd_lower, MASK_MARKER, + sizeof(uint16)); + memset(&((PageHeader) page)->pd_upper, MASK_MARKER, + sizeof(uint16)); +} diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 8468fe825c..2995e7b06a 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/gin_private.h" #include "access/xlogutils.h" #include "utils/memutils.h" @@ -758,3 +759,34 @@ gin_xlog_cleanup(void) MemoryContextDelete(opCtx); opCtx = NULL; } + +/* + * Mask a GIN page before running consistency checks on it. 
+ */ +void +gin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + GinPageOpaque opaque; + + mask_page_lsn(page); + opaque = GinPageGetOpaque(page); + + mask_page_hint_bits(page); + + /* + * GIN metapage doesn't use pd_lower/pd_upper. Other page types do. Hence, + * we need to apply masking for those pages. + */ + if (opaque->flags != GIN_META) + { + /* + * For GIN_DELETED page, the page is initialized to empty. Hence, mask + * the page content. + */ + if (opaque->flags & GIN_DELETED) + mask_page_content(page); + else + mask_unused_space(page); + } +} diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 88b97a4e48..cbda9e705c 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/gist_private.h" #include "access/xloginsert.h" #include "access/xlogutils.h" @@ -342,6 +343,48 @@ gist_xlog_cleanup(void) MemoryContextDelete(opCtx); } +/* + * Mask a Gist page before running consistency checks on it. + */ +void +gist_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + /* + * NSN is nothing but a special purpose LSN. Hence, mask it for the same + * reason as mask_page_lsn. + */ + GistPageSetNSN(page, (uint64) MASK_MARKER); + + /* + * We update F_FOLLOW_RIGHT flag on the left child after writing WAL + * record. Hence, mask this flag. See gistplacetopage() for details. + */ + GistMarkFollowRight(page); + + if (GistPageIsLeaf(page)) + { + /* + * In gist leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * gistkillitems() for details. + */ + mask_lp_flags(page); + } + + /* + * During gist redo, we never mark a page as garbage. Hence, mask it to + * ignore any differences. 
+ */ + GistClearPageHasGarbage(page); +} + /* * Write WAL record of a page split. */ diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5fd7f1e1a2..0be48fb3ee 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -38,6 +38,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/heapam.h" #include "access/heapam_xlog.h" #include "access/hio.h" @@ -9142,3 +9143,81 @@ heap_sync(Relation rel) heap_close(toastrel, AccessShareLock); } } + +/* + * Mask a heap page before performing consistency checks on it. + */ +void +heap_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + OffsetNumber off; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + /* + * If xmin of a tuple is not yet frozen, we should ignore + * differences in hint bits, since they can be set without + * emitting WAL. + */ + if (!HeapTupleHeaderXminFrozen(page_htup)) + page_htup->t_infomask &= ~HEAP_XACT_MASK; + else + { + /* Still we need to mask xmax hint bits. */ + page_htup->t_infomask &= ~HEAP_XMAX_INVALID; + page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; + } + + /* + * During replay, we set Command Id to FirstCommandId. Hence, mask + * it. See heap_xlog_insert() for details. + */ + page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; + + /* + * For a speculative tuple, heap_insert() does not set ctid in the + * caller-passed heap tuple itself, leaving the ctid field to + * contain a speculative token value - a per-backend monotonically + * increasing identifier. Besides, it does not WAL-log ctid under + * any circumstances. 
+ * + * During redo, heap_xlog_insert() sets t_ctid to current block + * number and self offset number. It doesn't care about any + * speculative insertions in master. Hence, we set t_ctid to + * current block number and self offset number to ignore any + * inconsistency. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, blkno, off); + } + + /* + * Ignore any padding bytes after the tuple, when the length of the + * item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index efad745c57..a9ca279d81 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/heapam_xlog.h" #include "access/nbtree.h" #include "access/transam.h" @@ -1028,3 +1029,52 @@ btree_redo(XLogReaderState *record) elog(PANIC, "btree_redo: unknown op code %u", info); } } + +/* + * Mask a btree page before performing consistency checks on it. + */ +void +btree_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + BTPageOpaque maskopaq; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + maskopaq = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_ISDELETED(maskopaq)) + { + /* + * Mask page content on a DELETED page since it will be re-initialized + * during replay. See btree_xlog_unlink_page() for details. + */ + mask_page_content(page); + } + else if (P_ISLEAF(maskopaq)) + { + /* + * In btree leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * _bt_killitems(), _bt_check_unique() for details. 
+ */ + mask_lp_flags(page); + } + + /* + * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See + * _bt_killitems(), _bt_check_unique() for details. + */ + maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE; + + /* + * During replay of a btree page split, we don't set the BTP_SPLIT_END + * flag of the right sibling and initialize the cycle_id to 0 for the same + * page. See btree_xlog_split() for details. + */ + maskopaq->btpo_flags &= ~BTP_SPLIT_END; + maskopaq->btpo_cycleid = 0; +} diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c index 9e488b359a..d4ed7f9c0a 100644 --- a/src/backend/access/rmgrdesc/gindesc.c +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -105,7 +105,12 @@ gin_desc(StringInfo buf, XLogReaderState *record) leftChildBlkno, rightChildBlkno); } if (XLogRecHasBlockImage(record, 0)) - appendStringInfoString(buf, " (full page image)"); + { + if (XLogRecBlockImageApply(record, 0)) + appendStringInfoString(buf, " (full page image)"); + else + appendStringInfoString(buf, " (full page image, for WAL verification)"); + } else { char *payload = XLogRecGetBlockData(record, 0, NULL); @@ -145,7 +150,12 @@ gin_desc(StringInfo buf, XLogReaderState *record) case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: { if (XLogRecHasBlockImage(record, 0)) - appendStringInfoString(buf, " (full page image)"); + { + if (XLogRecBlockImageApply(record, 0)) + appendStringInfoString(buf, " (full page image)"); + else + appendStringInfoString(buf, " (full page image, for WAL verification)"); + } else { ginxlogVacuumDataLeafPage *xlrec = diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 3dc6a5ab88..596b266ba6 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/spgist_private.h" #include "access/transam.h" #include "access/xlog.h" @@ -1023,3 +1024,23 @@ spg_xlog_cleanup(void) 
MemoryContextDelete(opCtx); opCtx = NULL; } + +/* + * Mask a SpGist page before performing consistency checks on it. + */ +void +spg_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + + /* + * Any SpGist page other than meta contains unused space which needs to be + * masked. + */ + if (!SpGistPageIsMeta(page)) + mask_unused_space(page); +} diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index eddec9bc54..fbc6810c2f 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/generic_xlog.h" #include "access/xlogutils.h" #include "miscadmin.h" @@ -533,3 +534,14 @@ generic_redo(XLogReaderState *record) UnlockReleaseBuffer(buffers[block_id]); } } + +/* + * Mask a generic page before performing consistency checks on it. + */ +void +generic_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn(page); + + mask_unused_space(page); +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 9bb136218d..eae75242fe 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -30,8 +30,8 @@ #include "utils/relmapper.h" /* must be kept in sync with RmgrData definition in xlog_internal.h */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ - { name, redo, desc, identify, startup, cleanup }, +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ + { name, redo, desc, identify, startup, cleanup, mask }, const RmgrData RmgrTable[RM_MAX_ID + 1] = { #include "access/rmgrlist.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2f5d603066..cc8b83fa8d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -95,6 +95,8 @@ bool EnableHotStandby = false; bool 
fullPageWrites = true; bool wal_log_hints = false; bool wal_compression = false; +char *wal_consistency_checking_string = NULL; +bool *wal_consistency_checking = NULL; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; @@ -245,6 +247,10 @@ bool InArchiveRecovery = false; /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; +/* Buffers dedicated to consistency checks of size BLCKSZ */ +static char *replay_image_masked = NULL; +static char *master_image_masked = NULL; + /* options taken from recovery.conf for archive recovery */ char *recoveryRestoreCommand = NULL; static char *recoveryEndCommand = NULL; @@ -903,6 +909,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr); static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); +static void checkXLogConsistency(XLogReaderState *record); static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); @@ -1314,6 +1321,103 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) return true; } +/* + * Checks whether the current buffer page and backup page stored in the + * WAL record are consistent or not. Before comparing the two pages, a + * masking can be applied to the pages to ignore certain areas like hint bits, + * unused space between pd_lower and pd_upper among other things. This + * function should be called once WAL replay has been completed for a + * given record. + */ +static void +checkXLogConsistency(XLogReaderState *record) +{ + RmgrId rmid = XLogRecGetRmid(record); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + int block_id; + + /* Records with no backup blocks have no need for consistency checks. 
*/ + if (!XLogRecHasAnyBlockRefs(record)) + return; + + Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); + + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + Buffer buf; + Page page; + + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + { + /* + * WAL record doesn't contain a block reference with the given id. + * Do nothing. + */ + continue; + } + + Assert(XLogRecHasBlockImage(record, block_id)); + + /* + * Read the contents from the current buffer and store it in a + * temporary page. + */ + buf = XLogReadBufferExtended(rnode, forknum, blkno, + RBM_NORMAL_NO_LOG); + if (!BufferIsValid(buf)) + continue; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + /* + * Take a copy of the local page where WAL has been applied to have a + * comparison base before masking it... + */ + memcpy(replay_image_masked, page, BLCKSZ); + + /* No need for this page anymore now that a copy is in. */ + UnlockReleaseBuffer(buf); + + /* + * If the block LSN is already ahead of this WAL record, we can't + * expect contents to match. This can happen if recovery is restarted. + */ + if (PageGetLSN(replay_image_masked) > record->EndRecPtr) + continue; + + /* + * Read the contents from the backup copy, stored in WAL record and + * store it in a temporary page. There is no need to allocate a new + * page here, a local buffer is fine to hold its contents and a mask + * can be directly applied on it. + */ + if (!RestoreBlockImage(record, block_id, master_image_masked)) + elog(ERROR, "failed to restore block image"); + + /* + * If masking function is defined, mask both the master and replay + * images + */ + if (RmgrTable[rmid].rm_mask != NULL) + { + RmgrTable[rmid].rm_mask(replay_image_masked, blkno); + RmgrTable[rmid].rm_mask(master_image_masked, blkno); + } + + /* Time to compare the master and replay images. 
*/ + if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0) + { + elog(FATAL, + "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + } + } +} + /* * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved * area in the WAL. @@ -6200,6 +6304,13 @@ StartupXLOG(void) errdetail("Failed while allocating an XLog reading processor."))); xlogreader->system_identifier = ControlFile->system_identifier; + /* + * Allocate pages dedicated to WAL consistency checks, those had better + * be aligned. + */ + replay_image_masked = (char *) palloc(BLCKSZ); + master_image_masked = (char *) palloc(BLCKSZ); + if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)) { @@ -7000,6 +7111,15 @@ StartupXLOG(void) /* Now apply the WAL record itself */ RmgrTable[record->xl_rmid].rm_redo(xlogreader); + /* + * After redo, check whether the backup pages associated with + * the WAL record are consistent with the existing pages. This + * check is done only if consistency check is enabled for this + * record. + */ + if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) + checkXLogConsistency(xlogreader); + /* Pop the error context stack */ error_context_stack = errcallback.previous; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index a5aa58d845..797e68cd90 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -421,10 +421,12 @@ XLogInsert(RmgrId rmid, uint8 info) elog(ERROR, "XLogBeginInsert was not called"); /* - * The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are - * reserved for use by me. + * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and + * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me. 
*/ - if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0) + if ((info & ~(XLR_RMGR_INFO_MASK | + XLR_SPECIAL_REL_UPDATE | + XLR_CHECK_CONSISTENCY)) != 0) elog(PANIC, "invalid xlog info mask %02X", info); TRACE_POSTGRESQL_XLOG_INSERT(rmid, info); @@ -504,6 +506,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rdt_datas_last = &hdr_rdt; hdr_rdt.data = hdr_scratch; + /* + * Enforce consistency checks for this record if user is looking for + * it. Do this at the beginning of this routine to give the + * possibility for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY + * directly for a record. + */ + if (wal_consistency_checking[rmid]) + info |= XLR_CHECK_CONSISTENCY; + /* * Make an rdata chain containing all the data portions of all block * references. This includes the data for full-page images. Also append @@ -520,6 +531,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecordBlockCompressHeader cbimg = {0}; bool samerel; bool is_compressed = false; + bool include_image; if (!regbuf->in_use) continue; @@ -563,7 +575,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) bkpb.fork_flags |= BKPBLOCK_WILL_INIT; - if (needs_backup) + /* + * If needs_backup is true or WAL checking is enabled for + * current resource manager, log a full-page write for the current + * block. + */ + include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0; + + if (include_image) { Page page = regbuf->page; uint16 compressed_len; @@ -625,6 +644,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE; + /* + * If WAL consistency checking is enabled for the resource manager of + * this WAL record, a full-page image is included in the record + * for the block modified. During redo, the full-page is replayed + * only if BKPIMAGE_APPLY is set. 
+ */ + if (needs_backup) + bimg.bimg_info |= BKPIMAGE_APPLY; + if (is_compressed) { bimg.length = compressed_len; @@ -687,7 +715,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, /* Ok, copy the header to the scratch buffer */ memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); scratch += SizeOfXLogRecordBlockHeader; - if (needs_backup) + if (include_image) { memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); scratch += SizeOfXLogRecordBlockImageHeader; diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index b528745fe8..f077662946 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -997,6 +997,7 @@ ResetDecoder(XLogReaderState *state) state->blocks[block_id].in_use = false; state->blocks[block_id].has_image = false; state->blocks[block_id].has_data = false; + state->blocks[block_id].apply_image = false; } state->max_block_id = -1; } @@ -1089,6 +1090,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) blk = &state->blocks[block_id]; blk->in_use = true; + blk->apply_image = false; COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; @@ -1120,6 +1122,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16)); COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8)); + + blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0); + if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED) { if (blk->bimg_info & BKPIMAGE_HAS_HOLE) @@ -1243,6 +1248,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (!blk->in_use) continue; + + Assert(blk->has_image || !blk->apply_image); + if (blk->has_image) { blk->bkp_image = ptr; diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 0de2419e54..6627f5498b 100644 --- 
a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -275,9 +275,9 @@ XLogCheckInvalidPages(void) * will complain if we don't have the lock. In hot standby mode it's * definitely necessary.) * - * Note: when a backup block is available in XLOG, we restore it - * unconditionally, even if the page in the database appears newer. This is - * to protect ourselves against database pages that were partially or + * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag + * set, we restore it, even if the page in the database appears newer. This + * is to protect ourselves against database pages that were partially or * incorrectly written during a crash. We assume that the XLOG data must be * good because it has passed a CRC check, while the database page might not * be. This will force us to replay all subsequent modifications of the page @@ -352,9 +352,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, if (!willinit && zeromode) elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); - /* If it's a full-page image, restore it. */ - if (XLogRecHasBlockImage(record, block_id)) + /* If it has a full-page image and it should be restored, do it. */ + if (XLogRecBlockImageApply(record, block_id)) { + Assert(XLogRecHasBlockImage(record, block_id)); *buf = XLogReadBufferExtended(rnode, forknum, blkno, get_cleanup_lock ? 
RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); page = BufferGetPage(*buf); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index c148b09cd7..e6f87543df 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/htup_details.h" #include "access/multixact.h" #include "access/transam.h" @@ -1740,3 +1741,14 @@ ResetSequenceCaches(void) last_used_seq = NULL; } + +/* + * Mask a Sequence page before performing consistency checks on it. + */ +void +seq_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn(page); + + mask_unused_space(page); +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c53aededcb..de85eca6a8 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -28,9 +28,11 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/rmgr.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "commands/async.h" #include "commands/prepare.h" @@ -147,6 +149,10 @@ static bool call_enum_check_hook(struct config_enum * conf, int *newval, static bool check_log_destination(char **newval, void **extra, GucSource source); static void assign_log_destination(const char *newval, void *extra); +static bool check_wal_consistency_checking(char **newval, void **extra, + GucSource source); +static void assign_wal_consistency_checking(const char *newval, void *extra); + #ifdef HAVE_SYSLOG static int syslog_facility = LOG_LOCAL0; #else @@ -3572,6 +3578,17 @@ static struct config_string ConfigureNamesString[] = check_cluster_name, NULL, NULL }, + { + {"wal_consistency_checking", PGC_SUSET, DEVELOPER_OPTIONS, + gettext_noop("Sets the WAL resource managers for which WAL consistency checks are done."), + gettext_noop("Full-page images will be logged for all data blocks and 
cross-checked against the results of WAL replay."), + GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE + }, + &wal_consistency_checking_string, + "", + check_wal_consistency_checking, assign_wal_consistency_checking, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL @@ -9888,6 +9905,86 @@ call_enum_check_hook(struct config_enum * conf, int *newval, void **extra, * check_hook, assign_hook and show_hook subroutines */ +static bool +check_wal_consistency_checking(char **newval, void **extra, GucSource source) +{ + char *rawstring; + List *elemlist; + ListCell *l; + bool newwalconsistency[RM_MAX_ID + 1]; + + /* Start with checking disabled for every rmgr */ + MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool)); + + /* Need a modifiable copy of string */ + rawstring = pstrdup(*newval); + + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + GUC_check_errdetail("List syntax is invalid."); + pfree(rawstring); + list_free(elemlist); + return false; + } + + foreach(l, elemlist) + { + char *tok = (char *) lfirst(l); + bool found = false; + RmgrId rmid; + + /* 'all' enables every rmgr that has a masking routine. */ + if (pg_strcasecmp(tok, "all") == 0) + { + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + if (RmgrTable[rmid].rm_mask != NULL) + newwalconsistency[rmid] = true; + found = true; + } + else + { + /* + * Otherwise the token must name (case-insensitively) a resource + * manager that provides a masking routine. + */ + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (pg_strcasecmp(tok, RmgrTable[rmid].rm_name) == 0 && + RmgrTable[rmid].rm_mask != NULL) + { + newwalconsistency[rmid] = true; + found = true; + } + } + } + + /* Reject the whole setting if the token matched no maskable rmgr. 
*/ + if (!found) + { + GUC_check_errdetail("Unrecognized key word: \"%s\".", tok); + pfree(rawstring); + list_free(elemlist); + return false; + } + } + + pfree(rawstring); + list_free(elemlist); + + /* Pass the parsed per-rmgr flags to the assign hook through *extra */ + *extra = guc_malloc(ERROR, (RM_MAX_ID + 1) * sizeof(bool)); + memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool)); + return true; +} + +static void +assign_wal_consistency_checking(const char *newval, void *extra) +{ + wal_consistency_checking = (bool *) extra; +} + static bool check_log_destination(char **newval, void **extra, GucSource source) { diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index cb433819e4..a7f6fe2df3 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -29,7 +29,7 @@ * RmgrNames is an array of resource manager names, to make error messages * a bit nicer. */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ name, static const char *RmgrNames[RM_MAX_ID + 1] = { diff --git a/src/bin/pg_xlogdump/pg_xlogdump.c b/src/bin/pg_xlogdump/pg_xlogdump.c index 590d2ad587..679aead895 100644 --- a/src/bin/pg_xlogdump/pg_xlogdump.c +++ b/src/bin/pg_xlogdump/pg_xlogdump.c @@ -465,7 +465,12 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) rnode.spcNode, rnode.dbNode, rnode.relNode, blk); if (XLogRecHasBlockImage(record, block_id)) - printf(" FPW"); + { + if (XLogRecBlockImageApply(record, block_id)) + printf(" FPW"); + else + printf(" FPW for WAL verification"); + } } putchar('\n'); } @@ -489,7 +494,10 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) if (record->blocks[block_id].bimg_info & BKPIMAGE_IS_COMPRESSED) { - printf(" (FPW); hole: offset: %u, length: %u, compression saved: %u\n", + printf(" (FPW%s); hole: offset: %u, length: %u, " + "compression saved: %u\n", + XLogRecBlockImageApply(record, block_id) ? 
+ "" : " for WAL verification", record->blocks[block_id].hole_offset, record->blocks[block_id].hole_length, BLCKSZ - @@ -498,7 +506,9 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) } else { - printf(" (FPW); hole: offset: %u, length: %u\n", + printf(" (FPW%s); hole: offset: %u, length: %u\n", + XLogRecBlockImageApply(record, block_id) ? + "" : " for WAL verification", record->blocks[block_id].hole_offset, record->blocks[block_id].hole_length); } diff --git a/src/bin/pg_xlogdump/rmgrdesc.c b/src/bin/pg_xlogdump/rmgrdesc.c index 8fe20ce97e..5d19a4af72 100644 --- a/src/bin/pg_xlogdump/rmgrdesc.c +++ b/src/bin/pg_xlogdump/rmgrdesc.c @@ -32,7 +32,7 @@ #include "storage/standbydefs.h" #include "utils/relmapper.h" -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ { name, desc, identify}, const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = { diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h index 527b2f1a22..33ceb34ea5 100644 --- a/src/include/access/brin_xlog.h +++ b/src/include/access/brin_xlog.h @@ -128,5 +128,6 @@ typedef struct xl_brin_revmap_extend extern void brin_redo(XLogReaderState *record); extern void brin_desc(StringInfo buf, XLogReaderState *record); extern const char *brin_identify(uint8 info); +extern void brin_mask(char *pagedata, BlockNumber blkno); #endif /* BRIN_XLOG_H */ diff --git a/src/include/access/bufmask.h b/src/include/access/bufmask.h new file mode 100644 index 0000000000..add2dc0cd1 --- /dev/null +++ b/src/include/access/bufmask.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * bufmask.h + * Definitions for buffer masking routines, used to mask certain bits + * in a page which can be different when the WAL is generated + * and when the WAL is applied. 
This is really the job of each + * individual rmgr, but we make things easier by providing some + * common routines to handle cases which occur in multiple rmgrs. + * + * Portions Copyright (c) 2016, PostgreSQL Global Development Group + * + * src/include/access/bufmask.h + * + *------------------------------------------------------------------------- + */ + +#ifndef BUFMASK_H +#define BUFMASK_H + +#include "postgres.h" +#include "storage/block.h" +#include "storage/bufmgr.h" + +/* Marker used to mask pages consistently */ +#define MASK_MARKER 0 + +extern void mask_page_lsn(Page page); +extern void mask_page_hint_bits(Page page); +extern void mask_unused_space(Page page); +extern void mask_lp_flags(Page page); +extern void mask_page_content(Page page); + +#endif /* BUFMASK_H */ diff --git a/src/include/access/generic_xlog.h b/src/include/access/generic_xlog.h index 187d68b3e1..0dc17f55f2 100644 --- a/src/include/access/generic_xlog.h +++ b/src/include/access/generic_xlog.h @@ -40,5 +40,6 @@ extern void GenericXLogAbort(GenericXLogState *state); extern void generic_redo(XLogReaderState *record); extern const char *generic_identify(uint8 info); extern void generic_desc(StringInfo buf, XLogReaderState *record); +extern void generic_mask(char *pagedata, BlockNumber blkno); #endif /* GENERIC_XLOG_H */ diff --git a/src/include/access/gin.h b/src/include/access/gin.h index 5629c8add7..e5d67305d9 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -79,5 +79,6 @@ extern void gin_desc(StringInfo buf, XLogReaderState *record); extern const char *gin_identify(uint8 info); extern void gin_xlog_startup(void); extern void gin_xlog_cleanup(void); +extern void gin_mask(char *pagedata, BlockNumber blkno); #endif /* GIN_H */ diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 60a770aa30..f4beeb9209 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -459,6 +459,7 @@ extern void gist_desc(StringInfo buf, 
XLogReaderState *record); extern const char *gist_identify(uint8 info); extern void gist_xlog_startup(void); extern void gist_xlog_cleanup(void); +extern void gist_mask(char *pagedata, BlockNumber blkno); extern XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 52f28b86cb..b285f172aa 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -373,6 +373,7 @@ extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, extern void heap_redo(XLogReaderState *record); extern void heap_desc(StringInfo buf, XLogReaderState *record); extern const char *heap_identify(uint8 info); +extern void heap_mask(char *pagedata, BlockNumber blkno); extern void heap2_redo(XLogReaderState *record); extern void heap2_desc(StringInfo buf, XLogReaderState *record); extern const char *heap2_identify(uint8 info); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 011a72ecf7..b2517623aa 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -774,5 +774,6 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); extern void btree_redo(XLogReaderState *record); extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); +extern void btree_mask(char *pagedata, BlockNumber blkno); #endif /* NBTREE_H */ diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h index ff7fe62c05..64b92ff33a 100644 --- a/src/include/access/rmgr.h +++ b/src/include/access/rmgr.h @@ -19,7 +19,7 @@ typedef uint8 RmgrId; * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG * file format. 
*/ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ symname, typedef enum RmgrIds diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 5f76749dbd..b892aea370 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -25,25 +25,25 @@ */ -/* symbol name, textual name, redo, desc, identify, startup, cleanup */ +/* symbol name, textual name, redo, desc, identify, startup, cleanup, mask */ -PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL) -PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL) -PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL) -PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL) -PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL) -PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL) -PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL) -PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL) -PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL) -PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL) -PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL) -PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL) -PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup) -PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup) -PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL) -PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup) -PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, 
brin_identify, NULL, NULL) -PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL) -PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL) -PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL) -PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL) +PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL) +PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL) +PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL) +PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL) +PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL) +PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL) +PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL) +PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL) +PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL) +PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask) +PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask) +PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, NULL) +PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask) +PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask) +PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask) +PG_RMGR(RM_SPGIST_ID, "SPGist", 
spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask) +PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask) +PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL) +PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) +PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask) +PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index aaf78bca97..6f59c0bbc5 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -219,5 +219,6 @@ extern void spg_desc(StringInfo buf, XLogReaderState *record); extern const char *spg_identify(uint8 info); extern void spg_xlog_startup(void); extern void spg_xlog_cleanup(void); +extern void spg_mask(char *pagedata, BlockNumber blkno); #endif /* SPGIST_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a4255723b7..9f036c72d8 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -105,6 +105,8 @@ extern bool EnableHotStandby; extern bool fullPageWrites; extern bool wal_log_hints; extern bool wal_compression; +extern bool *wal_consistency_checking; +extern char *wal_consistency_checking_string; extern bool log_checkpoints; extern int CheckPointSegments; diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 8ad4d47d12..3005b98aaa 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD094 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD095 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { @@ -266,6 
+266,9 @@ typedef enum * "VACUUM". rm_desc can then be called to obtain additional detail for the * record, if available (e.g. the last block). * + * rm_mask takes as input a page modified by the resource manager and masks + * out bits that shouldn't be flagged by wal_consistency_checking. + * * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h). */ typedef struct RmgrData @@ -276,6 +279,7 @@ typedef struct RmgrData const char *(*rm_identify) (uint8 info); void (*rm_startup) (void); void (*rm_cleanup) (void); + void (*rm_mask) (char *pagedata, BlockNumber blkno); } RmgrData; extern const RmgrData RmgrTable[]; diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 00102e8e0b..663d3e7890 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -51,7 +51,8 @@ typedef struct uint8 flags; /* Information on full-page image, if any */ - bool has_image; + bool has_image; /* has image, even for consistency checking */ + bool apply_image; /* has image that should be restored */ char *bkp_image; uint16 hole_offset; uint16 hole_length; @@ -205,6 +206,8 @@ extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, ((decoder)->blocks[block_id].in_use) #define XLogRecHasBlockImage(decoder, block_id) \ ((decoder)->blocks[block_id].has_image) +#define XLogRecBlockImageApply(decoder, block_id) \ + ((decoder)->blocks[block_id].apply_image) extern bool RestoreBlockImage(XLogReaderState *recoder, uint8 block_id, char *dst); extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len); diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 0162f93e82..eeb6a30c1c 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -56,8 +56,8 @@ typedef struct XLogRecord /* * The high 4 bits in xl_info may be used freely by rmgr. The - * XLR_SPECIAL_REL_UPDATE bit can be passed by XLogInsert caller. 
The rest - * are set internally by XLogInsert. + * XLR_SPECIAL_REL_UPDATE and XLR_CHECK_CONSISTENCY bits can be passed by + * XLogInsert caller. The rest are set internally by XLogInsert. */ #define XLR_INFO_MASK 0x0F #define XLR_RMGR_INFO_MASK 0xF0 @@ -70,6 +70,15 @@ typedef struct XLogRecord */ #define XLR_SPECIAL_REL_UPDATE 0x01 +/* + * Enforces consistency checks of replayed WAL at recovery. If enabled, + * each record will log a full-page write for each block modified by the + * record and will reuse it afterwards for consistency checks. The caller + * of XLogInsert can use this value if necessary, but if + * wal_consistency_checking is enabled for a rmgr this is set unconditionally. + */ +#define XLR_CHECK_CONSISTENCY 0x02 + /* * Header info for block data appended to an XLOG record. * @@ -137,6 +146,7 @@ typedef struct XLogRecordBlockImageHeader /* Information stored in bimg_info */ #define BKPIMAGE_HAS_HOLE 0x01 /* page image has "hole" */ #define BKPIMAGE_IS_COMPRESSED 0x02 /* page image is compressed */ +#define BKPIMAGE_APPLY 0x04 /* page image should be restored during replay */ /* * Extra header information used when page image has "hole" and diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 144c3c2e6f..49a77c42fc 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -62,5 +62,6 @@ extern void ResetSequenceCaches(void); extern void seq_redo(XLogReaderState *rptr); extern void seq_desc(StringInfo buf, XLogReaderState *rptr); extern const char *seq_identify(uint8 info); +extern void seq_mask(char *pagedata, BlockNumber blkno); #endif /* SEQUENCE_H */