diff --git a/contrib/pg_xlogdump/pg_xlogdump.c b/contrib/pg_xlogdump/pg_xlogdump.c index c1bfbc261a..c471267fde 100644 --- a/contrib/pg_xlogdump/pg_xlogdump.c +++ b/contrib/pg_xlogdump/pg_xlogdump.c @@ -359,18 +359,17 @@ XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats, rec_len = XLogRecGetDataLen(record) + SizeOfXLogRecord; /* - * Calculate the amount of FPI data in the record. Each backup block - * takes up BLCKSZ bytes, minus the "hole" length. + * Calculate the amount of FPI data in the record. * * XXX: We peek into xlogreader's private decoded backup blocks for the - * hole_length. It doesn't seem worth it to add an accessor macro for - * this. + * bimg_len indicating the length of FPI data. It doesn't seem worth it to + * add an accessor macro for this. */ fpi_len = 0; for (block_id = 0; block_id <= record->max_block_id; block_id++) { if (XLogRecHasBlockImage(record, block_id)) - fpi_len += BLCKSZ - record->blocks[block_id].hole_length; + fpi_len += record->blocks[block_id].bimg_len; } /* Update per-rmgr statistics */ @@ -465,9 +464,22 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) blk); if (XLogRecHasBlockImage(record, block_id)) { - printf(" (FPW); hole: offset: %u, length: %u\n", - record->blocks[block_id].hole_offset, - record->blocks[block_id].hole_length); + if (record->blocks[block_id].bimg_info & + BKPIMAGE_IS_COMPRESSED) + { + printf(" (FPW); hole: offset: %u, length: %u, compression saved: %u\n", + record->blocks[block_id].hole_offset, + record->blocks[block_id].hole_length, + BLCKSZ - + record->blocks[block_id].hole_length - + record->blocks[block_id].bimg_len); + } + else + { + printf(" (FPW); hole: offset: %u, length: %u\n", + record->blocks[block_id].hole_offset, + record->blocks[block_id].hole_length); + } } putchar('\n'); } diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0ad838345a..07214bfd76 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2282,6 
+2282,30 @@ include_dir 'conf.d' + + wal_compression (boolean) + + wal_compression configuration parameter + + + + + When this parameter is on, the PostgreSQL + server compresses a full page image written to WAL when + is on or during a base backup. + A compressed page image will be decompressed during WAL replay. + The default value is off. + + + + Turning this parameter on can reduce the WAL volume without + increasing the risk of unrecoverable data corruption, + but at the cost of some extra CPU spent on the compression during + WAL logging and on the decompression during WAL replay. + + + + wal_buffers (integer) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 71cbe0ef68..554491b9a4 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -89,6 +89,7 @@ char *XLogArchiveCommand = NULL; bool EnableHotStandby = false; bool fullPageWrites = true; bool wal_log_hints = false; +bool wal_compression = false; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index e77a491635..fb39e708f0 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -24,12 +24,16 @@ #include "access/xlog_internal.h" #include "access/xloginsert.h" #include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" #include "pg_trace.h" +/* Buffer size required to store a compressed version of backup block image */ +#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) + /* * For each block reference registered with XLogRegisterBuffer, we fill in * a registered_buffer struct. 
@@ -50,6 +54,9 @@ typedef struct XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to * backup block data in XLogRecordAssemble() */ + + /* buffer to store a compressed version of backup block image */ + char compressed_page[PGLZ_MAX_BLCKSZ]; } registered_buffer; static registered_buffer *registered_buffers; @@ -96,6 +103,8 @@ static MemoryContext xloginsert_cxt; static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, XLogRecPtr *fpw_lsn); +static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, + uint16 hole_length, char *dest, uint16 *dlen); /* * Begin constructing a WAL record. This must be called before the @@ -482,7 +491,11 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, bool needs_data; XLogRecordBlockHeader bkpb; XLogRecordBlockImageHeader bimg; + XLogRecordBlockCompressHeader cbimg; bool samerel; + bool is_compressed = false; + uint16 hole_length; + uint16 hole_offset; if (!regbuf->in_use) continue; @@ -529,9 +542,11 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if (needs_backup) { Page page = regbuf->page; + uint16 compressed_len; /* - * The page needs to be backed up, so set up *bimg + * The page needs to be backed up, so calculate its hole length + * and offset. 
*/ if (regbuf->flags & REGBUF_STANDARD) { @@ -543,50 +558,81 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, upper > lower && upper <= BLCKSZ) { - bimg.hole_offset = lower; - bimg.hole_length = upper - lower; + hole_offset = lower; + hole_length = upper - lower; } else { /* No "hole" to compress out */ - bimg.hole_offset = 0; - bimg.hole_length = 0; + hole_offset = 0; + hole_length = 0; } } else { /* Not a standard page header, don't try to eliminate "hole" */ - bimg.hole_offset = 0; - bimg.hole_length = 0; + hole_offset = 0; + hole_length = 0; + } + + /* + * Try to compress a block image if wal_compression is enabled + */ + if (wal_compression) + { + is_compressed = + XLogCompressBackupBlock(page, hole_offset, hole_length, + regbuf->compressed_page, + &compressed_len); } /* Fill in the remaining fields in the XLogRecordBlockHeader struct */ bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; - total_len += BLCKSZ - bimg.hole_length; - /* * Construct XLogRecData entries for the page content. */ rdt_datas_last->next = ®buf->bkp_rdatas[0]; rdt_datas_last = rdt_datas_last->next; - if (bimg.hole_length == 0) + + bimg.bimg_info = (hole_length == 0) ? 
0 : BKPIMAGE_HAS_HOLE; + + if (is_compressed) { - rdt_datas_last->data = page; - rdt_datas_last->len = BLCKSZ; + bimg.length = compressed_len; + bimg.hole_offset = hole_offset; + bimg.bimg_info |= BKPIMAGE_IS_COMPRESSED; + if (hole_length != 0) + cbimg.hole_length = hole_length; + + rdt_datas_last->data = regbuf->compressed_page; + rdt_datas_last->len = compressed_len; } else { - /* must skip the hole */ - rdt_datas_last->data = page; - rdt_datas_last->len = bimg.hole_offset; + bimg.length = BLCKSZ - hole_length; + bimg.hole_offset = hole_offset; - rdt_datas_last->next = ®buf->bkp_rdatas[1]; - rdt_datas_last = rdt_datas_last->next; + if (hole_length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = BLCKSZ; + } + else + { + /* must skip the hole */ + rdt_datas_last->data = page; + rdt_datas_last->len = hole_offset; - rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length); - rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length); + rdt_datas_last->next = ®buf->bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; + + rdt_datas_last->data = page + (hole_offset + hole_length); + rdt_datas_last->len = BLCKSZ - (hole_offset + hole_length); + } } + + total_len += bimg.length; } if (needs_data) @@ -619,6 +665,12 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, { memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); scratch += SizeOfXLogRecordBlockImageHeader; + if (hole_length != 0 && is_compressed) + { + memcpy(scratch, &cbimg, + SizeOfXLogRecordBlockCompressHeader); + scratch += SizeOfXLogRecordBlockCompressHeader; + } } if (!samerel) { @@ -680,6 +732,57 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, return &hdr_rdt; } +/* + * Create a compressed version of a backup block image. + * + * Returns FALSE if compression fails (i.e., compressed result is actually + * bigger than original). Otherwise, returns TRUE and sets 'dlen' to + * the length of compressed block image. 
+ */ +static bool +XLogCompressBackupBlock(char * page, uint16 hole_offset, uint16 hole_length, + char *dest, uint16 *dlen) +{ + int32 orig_len = BLCKSZ - hole_length; + int32 len; + int32 extra_bytes = 0; + char *source; + char tmp[BLCKSZ]; + + if (hole_length != 0) + { + /* must skip the hole */ + source = tmp; + memcpy(source, page, hole_offset); + memcpy(source + hole_offset, + page + (hole_offset + hole_length), + BLCKSZ - (hole_length + hole_offset)); + + /* + * Extra data needs to be stored in WAL record for the compressed + * version of block image if the hole exists. + */ + extra_bytes = SizeOfXLogRecordBlockCompressHeader; + } + else + source = page; + + /* + * We recheck the actual size even if pglz_compress() reports success + * and see if the number of bytes saved by compression is larger than + * the length of extra data needed for the compressed version of block + * image. + */ + len = pglz_compress(source, orig_len, dest, PGLZ_strategy_default); + if (len >= 0 && + len + extra_bytes < orig_len) + { + *dlen = (uint16) len; /* successful compression */ + return true; + } + return false; +} + /* * Determine whether the buffer referenced has to be backed up. 
* diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 5bd07e381d..fb4a2ddfcf 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -20,6 +20,7 @@ #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); @@ -1037,9 +1038,78 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (blk->has_image) { + COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16)); COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); - COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); - datatotal += BLCKSZ - blk->hole_length; + COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8)); + if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED) + { + if (blk->bimg_info & BKPIMAGE_HAS_HOLE) + COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); + else + blk->hole_length = 0; + } + else + blk->hole_length = BLCKSZ - blk->bimg_len; + datatotal += blk->bimg_len; + + /* + * cross-check that hole_offset > 0, hole_length > 0 and + * bimg_len < BLCKSZ if the HAS_HOLE flag is set. + */ + if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) && + (blk->hole_offset == 0 || + blk->hole_length == 0 || + blk->bimg_len == BLCKSZ)) + { + report_invalid_record(state, + "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + } + /* + * cross-check that hole_offset == 0 and hole_length == 0 + * if the HAS_HOLE flag is not set. 
+ */ + if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && + (blk->hole_offset != 0 || blk->hole_length != 0)) + { + report_invalid_record(state, + "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + } + /* + * cross-check that bimg_len < BLCKSZ + * if the IS_COMPRESSED flag is set. + */ + if ((blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && + blk->bimg_len == BLCKSZ) + { + report_invalid_record(state, + "BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + } + /* + * cross-check that bimg_len = BLCKSZ if neither + * HAS_HOLE nor IS_COMPRESSED flag is set. + */ + if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && + !(blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && + blk->bimg_len != BLCKSZ) + { + report_invalid_record(state, + "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + } } if (!(fork_flags & BKPBLOCK_SAME_REL)) { @@ -1094,7 +1164,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (blk->has_image) { blk->bkp_image = ptr; - ptr += BLCKSZ - blk->hole_length; + ptr += blk->bimg_len; } if (blk->has_data) { @@ -1200,6 +1270,8 @@ bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) { DecodedBkpBlock *bkpb; + char *ptr; + char tmp[BLCKSZ]; if (!record->blocks[block_id].in_use) return false; @@ -1207,18 +1279,35 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) return false; bkpb = &record->blocks[block_id]; + ptr = bkpb->bkp_image; + if (bkpb->bimg_info & BKPIMAGE_IS_COMPRESSED) + { + /* If a backup block image is compressed, decompress it */ + if
(pglz_decompress(ptr, bkpb->bimg_len, tmp, + BLCKSZ - bkpb->hole_length) < 0) + { + report_invalid_record(record, "invalid compressed image at %X/%X, block %d", + (uint32) (record->ReadRecPtr >> 32), + (uint32) record->ReadRecPtr, + block_id); + return false; + } + ptr = tmp; + } + + /* generate page, taking into account hole if necessary */ if (bkpb->hole_length == 0) { - memcpy(page, bkpb->bkp_image, BLCKSZ); + memcpy(page, ptr, BLCKSZ); } else { - memcpy(page, bkpb->bkp_image, bkpb->hole_offset); + memcpy(page, ptr, bkpb->hole_offset); /* must zero-fill the hole */ MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); memcpy(page + (bkpb->hole_offset + bkpb->hole_length), - bkpb->bkp_image + bkpb->hole_offset, + ptr + bkpb->hole_offset, BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d84dba7732..0da1981e38 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -996,6 +996,16 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"wal_compression", PGC_USERSET, WAL_SETTINGS, + gettext_noop("Compresses full-page writes written in WAL file."), + NULL + }, + &wal_compression, + false, + NULL, NULL, NULL + }, + { {"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT, gettext_noop("Logs each checkpoint."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index f8f9ce18ec..7590a6f056 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -186,6 +186,7 @@ # fsync_writethrough # open_sync #full_page_writes = on # recover from partial page writes +#wal_compression = off # enable compression of full-page writes #wal_log_hints = off # also do full page writes of non-critical updates # (change requires restart) #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers diff --git a/src/include/access/xlog.h 
b/src/include/access/xlog.h index 0e8e5873cc..2b1f42389c 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -100,6 +100,7 @@ extern char *XLogArchiveCommand; extern bool EnableHotStandby; extern bool fullPageWrites; extern bool wal_log_hints; +extern bool wal_compression; extern bool log_checkpoints; extern int CheckPointSegments; diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 6071c6dd8f..f1be598d8f 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD081 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD082 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 74bec208b1..609bfe3e40 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -55,6 +55,8 @@ typedef struct char *bkp_image; uint16 hole_offset; uint16 hole_length; + uint16 bimg_len; + uint8 bimg_info; /* Buffer holding the rmgr-specific data associated with this block */ bool has_data; diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 8741c32345..09bbcb1488 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -100,18 +100,55 @@ typedef struct XLogRecordBlockHeader * * As a trivial form of data compression, the XLOG code is aware that * PG data pages usually contain an unused "hole" in the middle, which - * contains only zero bytes. If hole_length > 0 then we have removed + * contains only zero bytes. If the length of "hole" > 0 then we have removed * such a "hole" from the stored data (and it's not counted in the * XLOG record's CRC, either). Hence, the amount of block data actually - * present is BLCKSZ - hole_length bytes. + * present is BLCKSZ - the length of "hole" bytes. 
+ * + * When wal_compression is enabled, a full page image whose "hole" was + * removed is additionally compressed using the PGLZ compression algorithm. + * This can reduce the WAL volume, but at some extra cost of CPU spent + * on the compression during WAL logging. In this case, since the "hole" + * length cannot be calculated by subtracting the number of page image bytes + * from BLCKSZ, it needs to be stored as extra information. + * But when no "hole" exists, we can assume that the "hole" length is zero + * and no such extra information needs to be stored. Note that + * the original version of page image is stored in WAL instead of the + * compressed one if the number of bytes saved by compression is not larger + * than the length of extra information. Hence, when a page image is + * successfully compressed, the amount of block data actually present is less + * than BLCKSZ - the length of "hole" bytes - the length of extra information. */ typedef struct XLogRecordBlockImageHeader { - uint16 hole_offset; /* number of bytes before "hole" */ - uint16 hole_length; /* number of bytes in "hole" */ + uint16 length; /* number of page image bytes */ + uint16 hole_offset; /* number of bytes before "hole" */ + uint8 bimg_info; /* flag bits, see below */ + + /* + * If BKPIMAGE_HAS_HOLE and BKPIMAGE_IS_COMPRESSED, + * an XLogRecordBlockCompressHeader struct follows. + */ } XLogRecordBlockImageHeader; -#define SizeOfXLogRecordBlockImageHeader sizeof(XLogRecordBlockImageHeader) +#define SizeOfXLogRecordBlockImageHeader \ + (offsetof(XLogRecordBlockImageHeader, bimg_info) + sizeof(uint8)) + +/* Information stored in bimg_info */ +#define BKPIMAGE_HAS_HOLE 0x01 /* page image has "hole" */ +#define BKPIMAGE_IS_COMPRESSED 0x02 /* page image is compressed */ + +/* + * Extra header information used when page image has "hole" and + * is compressed.
+ */ +typedef struct XLogRecordBlockCompressHeader +{ + uint16 hole_length; /* number of bytes in "hole" */ +} XLogRecordBlockCompressHeader; + +#define SizeOfXLogRecordBlockCompressHeader \ + sizeof(XLogRecordBlockCompressHeader) /* * Maximum size of the header for a block reference. This is used to size a @@ -120,6 +157,7 @@ typedef struct XLogRecordBlockImageHeader #define MaxSizeOfXLogRecordBlockHeader \ (SizeOfXLogRecordBlockHeader + \ SizeOfXLogRecordBlockImageHeader + \ + SizeOfXLogRecordBlockCompressHeader + \ sizeof(RelFileNode) + \ sizeof(BlockNumber))