Move BKP_REMOVABLE bit from individual WAL records to WAL page headers.

Removing this bit from xl_info allows us to restore the old limit of four (not three) separate pages touched by a WAL record, which is needed for the upcoming SP-GiST feature, and will likely be useful elsewhere in future. When we implemented XLR_BKP_REMOVABLE in 2007, we had to do it like that because no special WAL-visible action was taken when starting a backup. However, now we force a segment switch when starting a backup, so a compressing WAL archiver (such as pglesslog) that uses the state shown in the current page header will not be fooled as to removability of backup blocks. The only downside is that the archiver will not return to compressing mode for up to one WAL page after the backup is over, which is a small price to pay for getting back the extra xl_info bit. In any case the archiver could look for XLOG_BACKUP_END records if it thought it was worth the trouble to do so. Bump XLOG_PAGE_MAGIC since this is effectively a change in WAL format.
2011-12-12 16:22:14 -05:00 · 2011-12-12 16:22:14 -05:00 · 2dd9322ba6
parent 8409b60476
commit 2dd9322ba6
4 changed files with 44 additions and 42 deletions
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -473,7 +473,7 @@ the same page, only BKP(1) would have been set.
 For this reason as well as the risk of deadlocking on buffer locks, it's best
 to design WAL records so that they reflect small atomic actions involving just
 one or a few pages.  The current XLOG infrastructure cannot handle WAL records
-involving references to more than three shared buffers, anyway.
+involving references to more than four shared buffers, anyway.

 In the case where the WAL record contains enough information to re-generate
 the entire contents of a page, do *not* show that page's buffer ID in the
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -970,19 +970,6 @@ begin:;
 		}
 	}

-	/*
-	 * If we backed up any full blocks and online backup is not in progress,
-	 * mark the backup blocks as removable.  This allows the WAL archiver to
-	 * know whether it is safe to compress archived WAL data by transforming
-	 * full-block records into the non-full-block format.
-	 *
-	 * Note: we could just set the flag whenever !forcePageWrites, but
-	 * defining it like this leaves the info bit free for some potential other
-	 * use in records without any backup blocks.
-	 */
-	if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
-		info |= XLR_BKP_REMOVABLE;
-
 	/*
 	 * If there isn't enough space on the current XLOG page for a record
 	 * header, advance to the next page (leaving the unused space as zeroes).
@@ -1601,6 +1588,21 @@ AdvanceXLInsertBuffer(bool new_segment)
 	NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
 	NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;

+	/*
+	 * If online backup is not in progress, mark the header to indicate that
+	 * WAL records beginning in this page have removable backup blocks.  This
+	 * allows the WAL archiver to know whether it is safe to compress archived
+	 * WAL data by transforming full-block records into the non-full-block
+	 * format.  It is sufficient to record this at the page level because we
+	 * force a page switch (in fact a segment switch) when starting a backup,
+	 * so the flag will be off before any records can be written during the
+	 * backup.  At the end of a backup, the last page will be marked as all
+	 * unsafe when perhaps only part is unsafe, but at worst the archiver
+	 * would miss the opportunity to compress a few records.
+	 */
+	if (!Insert->forcePageWrites)
+		NewPage->xlp_info |= XLP_BKP_REMOVABLE;
+
 	/*
 	 * If first page of an XLOG segment file, make it a long header.
 	 */
@@ -8849,19 +8851,6 @@ do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
 				 errmsg("backup label too long (max %d bytes)",
 						MAXPGPATH)));

-	/*
-	 * Force an XLOG file switch before the checkpoint, to ensure that the WAL
-	 * segment the checkpoint is written to doesn't contain pages with old
-	 * timeline IDs. That would otherwise happen if you called
-	 * pg_start_backup() right after restoring from a PITR archive: the first
-	 * WAL segment containing the startup checkpoint has pages in the
-	 * beginning with the old timeline ID. That can cause trouble at recovery:
-	 * we won't have a history file covering the old timeline if pg_xlog
-	 * directory was not included in the base backup and the WAL archive was
-	 * cleared too before starting the backup.
-	 */
-	RequestXLogSwitch();
-
 	/*
 	 * Mark backup active in shared memory.  We must do full-page WAL writes
 	 * during an on-line backup even if not doing so at other times, because
@@ -8902,6 +8891,25 @@ do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
 	{
 		bool		gotUniqueStartpoint = false;

+		/*
+		 * Force an XLOG file switch before the checkpoint, to ensure that the
+		 * WAL segment the checkpoint is written to doesn't contain pages with
+		 * old timeline IDs.  That would otherwise happen if you called
+		 * pg_start_backup() right after restoring from a PITR archive: the
+		 * first WAL segment containing the startup checkpoint has pages in
+		 * the beginning with the old timeline ID.  That can cause trouble at
+		 * recovery: we won't have a history file covering the old timeline if
+		 * pg_xlog directory was not included in the base backup and the WAL
+		 * archive was cleared too before starting the backup.
+		 *
+		 * This also ensures that we have emitted a WAL page header that has
+		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
+		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
+		 * compress out removable backup blocks, it won't remove any that
+		 * occur after this point.
+		 */
+		RequestXLogSwitch();
+
 		do
 		{
 			/*
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -29,7 +29,7 @@
 *		backup block data
 *		...
 *
- * where there can be zero to three backup blocks (as signaled by xl_info flag
+ * where there can be zero to four backup blocks (as signaled by xl_info flag
 * bits).  XLogRecord structs always start on MAXALIGN boundaries in the WAL
 * files, and we round up SizeOfXLogRecord so that the rmgr data is also
 * guaranteed to begin on a MAXALIGN boundary.	However, no padding is added
@@ -66,24 +66,16 @@ typedef struct XLogRecord

 /*
 * If we backed up any disk blocks with the XLOG record, we use flag bits in
- * xl_info to signal it.  We support backup of up to 3 disk blocks per XLOG
+ * xl_info to signal it.  We support backup of up to 4 disk blocks per XLOG
 * record.
 */
-#define XLR_BKP_BLOCK_MASK		0x0E	/* all info bits used for bkp blocks */
-#define XLR_MAX_BKP_BLOCKS		3
+#define XLR_BKP_BLOCK_MASK		0x0F	/* all info bits used for bkp blocks */
+#define XLR_MAX_BKP_BLOCKS		4
 #define XLR_SET_BKP_BLOCK(iblk) (0x08 >> (iblk))
 #define XLR_BKP_BLOCK_1			XLR_SET_BKP_BLOCK(0)	/* 0x08 */
 #define XLR_BKP_BLOCK_2			XLR_SET_BKP_BLOCK(1)	/* 0x04 */
 #define XLR_BKP_BLOCK_3			XLR_SET_BKP_BLOCK(2)	/* 0x02 */
-
-/*
- * Bit 0 of xl_info is set if the backed-up blocks could safely be removed
- * from a compressed version of XLOG (that is, they are backed up only to
- * prevent partial-page-write problems, and not to ensure consistency of PITR
- * recovery).  The compression algorithm would need to extract data from the
- * blocks to create an equivalent non-full-page XLOG record.
- */
-#define XLR_BKP_REMOVABLE		0x01
+#define XLR_BKP_BLOCK_4			XLR_SET_BKP_BLOCK(3)	/* 0x01 */

 /* Sync methods */
 #define SYNC_METHOD_FSYNC		0
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -71,7 +71,7 @@ typedef struct XLogContRecord
 /*
 * Each page of XLOG file has a header like this:
 */
-#define XLOG_PAGE_MAGIC 0xD068	/* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD069	/* can be used as WAL version indicator */

 typedef struct XLogPageHeaderData
 {
@@ -106,8 +106,10 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
 #define XLP_FIRST_IS_CONTRECORD		0x0001
 /* This flag indicates a "long" page header */
 #define XLP_LONG_HEADER				0x0002
+/* This flag indicates backup blocks starting in this page are optional */
+#define XLP_BKP_REMOVABLE			0x0004
 /* All defined flag bits in xlp_info (used for validity checking of header) */
-#define XLP_ALL_FLAGS				0x0003
+#define XLP_ALL_FLAGS				0x0007

 #define XLogPageHeaderSize(hdr)		\
 	(((hdr)->xlp_info & XLP_LONG_HEADER) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD)