diff --git a/contrib/pg_xlogdump/pg_xlogdump.c b/contrib/pg_xlogdump/pg_xlogdump.c index 7f151f961c..26556dc82d 100644 --- a/contrib/pg_xlogdump/pg_xlogdump.c +++ b/contrib/pg_xlogdump/pg_xlogdump.c @@ -17,6 +17,7 @@ #include "access/xlogreader.h" #include "access/xlogrecord.h" +#include "access/xlog_internal.h" #include "access/transam.h" #include "common/fe_memutils.h" #include "getopt_long.h" @@ -343,90 +344,117 @@ XLogDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, * Store per-rmgr and per-record statistics for a given record. */ static void -XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats, XLogRecPtr ReadRecPtr, XLogRecord *record) +XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats, + XLogReaderState *record) { RmgrId rmid; uint8 recid; + uint32 rec_len; + uint32 fpi_len; stats->count++; /* Update per-rmgr statistics */ - rmid = record->xl_rmid; + rmid = XLogRecGetRmid(record); + rec_len = XLogRecGetDataLen(record) + SizeOfXLogRecord; + fpi_len = record->decoded_record->xl_tot_len - rec_len; stats->rmgr_stats[rmid].count++; - stats->rmgr_stats[rmid].rec_len += - record->xl_len + SizeOfXLogRecord; - stats->rmgr_stats[rmid].fpi_len += - record->xl_tot_len - (record->xl_len + SizeOfXLogRecord); + stats->rmgr_stats[rmid].rec_len += rec_len; + stats->rmgr_stats[rmid].fpi_len += fpi_len; /* * Update per-record statistics, where the record is identified by a - * combination of the RmgrId and the four bits of the xl_info field - * that are the rmgr's domain (resulting in sixteen possible entries - * per RmgrId). + * combination of the RmgrId and the four bits of the xl_info field that + * are the rmgr's domain (resulting in sixteen possible entries per + * RmgrId). 
*/ - recid = record->xl_info >> 4; + recid = XLogRecGetInfo(record) >> 4; stats->record_stats[rmid][recid].count++; - stats->record_stats[rmid][recid].rec_len += - record->xl_len + SizeOfXLogRecord; - stats->record_stats[rmid][recid].fpi_len += - record->xl_tot_len - (record->xl_len + SizeOfXLogRecord); + stats->record_stats[rmid][recid].rec_len += rec_len; + stats->record_stats[rmid][recid].fpi_len += fpi_len; } /* * Print a record to stdout */ static void -XLogDumpDisplayRecord(XLogDumpConfig *config, XLogRecPtr ReadRecPtr, XLogRecord *record) +XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) { - const char *id; - const RmgrDescData *desc = &RmgrDescTable[record->xl_rmid]; + const char *id; + const RmgrDescData *desc = &RmgrDescTable[XLogRecGetRmid(record)]; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blk; + int block_id; + uint8 info = XLogRecGetInfo(record); + XLogRecPtr xl_prev = XLogRecGetPrev(record); - id = desc->rm_identify(record->xl_info); + id = desc->rm_identify(info); if (id == NULL) - id = psprintf("UNKNOWN (%x)", record->xl_info & ~XLR_INFO_MASK); + id = psprintf("UNKNOWN (%x)", info & ~XLR_INFO_MASK); - printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, bkp: %u%u%u%u, desc: %s ", + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", desc->rm_name, - record->xl_len, record->xl_tot_len, - record->xl_xid, - (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr, - (uint32) (record->xl_prev >> 32), (uint32) record->xl_prev, - !!(XLR_BKP_BLOCK(0) & record->xl_info), - !!(XLR_BKP_BLOCK(1) & record->xl_info), - !!(XLR_BKP_BLOCK(2) & record->xl_info), - !!(XLR_BKP_BLOCK(3) & record->xl_info), - id); + XLogRecGetDataLen(record), XLogRecGetTotalLen(record), + XLogRecGetXid(record), + (uint32) (record->ReadRecPtr >> 32), (uint32) record->ReadRecPtr, + (uint32) (xl_prev >> 32), (uint32) xl_prev); + printf("desc: %s ", id); /* the desc routine will printf the 
description directly to stdout */ desc->rm_desc(NULL, record); - putchar('\n'); - - if (config->bkp_details) + if (!config->bkp_details) { - int bkpnum; - char *blk = (char *) XLogRecGetData(record) + record->xl_len; - - for (bkpnum = 0; bkpnum < XLR_MAX_BKP_BLOCKS; bkpnum++) + /* print block references (short format) */ + for (block_id = 0; block_id <= record->max_block_id; block_id++) { - BkpBlock bkpb; - - if (!(XLR_BKP_BLOCK(bkpnum) & record->xl_info)) + if (!XLogRecHasBlockRef(record, block_id)) continue; - memcpy(&bkpb, blk, sizeof(BkpBlock)); - blk += sizeof(BkpBlock); - blk += BLCKSZ - bkpb.hole_length; + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); + if (forknum != MAIN_FORKNUM) + printf(", blkref #%u: rel %u/%u/%u fork %s blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forkNames[forknum], + blk); + else + printf(", blkref #%u: rel %u/%u/%u blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + blk); + if (XLogRecHasBlockImage(record, block_id)) + printf(" FPW"); + } + putchar('\n'); + } + else + { + /* print block references (detailed format) */ + putchar('\n'); + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + if (!XLogRecHasBlockRef(record, block_id)) + continue; - printf("\tbackup bkp #%u; rel %u/%u/%u; fork: %s; block: %u; hole: offset: %u, length: %u\n", - bkpnum, - bkpb.node.spcNode, bkpb.node.dbNode, bkpb.node.relNode, - forkNames[bkpb.fork], - bkpb.block, bkpb.hole_offset, bkpb.hole_length); + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); + printf("\tblkref #%u: rel %u/%u/%u fork %s blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forkNames[forknum], + blk); + if (XLogRecHasBlockImage(record, block_id)) + { + printf(" (FPW); hole: offset: %u, length: %u\n", + record->blocks[block_id].hole_offset, + record->blocks[block_id].hole_length); + } + putchar('\n'); } } } @@ -924,9 +952,9 @@ main(int argc, char **argv) /* process the record */ if 
(config.stats == true) - XLogDumpCountRecord(&config, &stats, xlogreader_state->ReadRecPtr, record); + XLogDumpCountRecord(&config, &stats, xlogreader_state); else - XLogDumpDisplayRecord(&config, xlogreader_state->ReadRecPtr, record); + XLogDumpDisplayRecord(&config, xlogreader_state); /* check whether we printed enough */ config.already_displayed_records++; diff --git a/contrib/pg_xlogdump/rmgrdesc.h b/contrib/pg_xlogdump/rmgrdesc.h index da805c53ca..aec4418303 100644 --- a/contrib/pg_xlogdump/rmgrdesc.h +++ b/contrib/pg_xlogdump/rmgrdesc.h @@ -13,7 +13,7 @@ typedef struct RmgrDescData { const char *rm_name; - void (*rm_desc) (StringInfo buf, XLogRecord *record); + void (*rm_desc) (StringInfo buf, XLogReaderState *record); const char *(*rm_identify) (uint8 info); } RmgrDescData; diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index bd35cf6696..cb645e3d45 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -666,19 +666,16 @@ brinbuild(PG_FUNCTION_ARGS) { xl_brin_createidx xlrec; XLogRecPtr recptr; - XLogRecData rdata; Page page; - xlrec.node = index->rd_node; xlrec.version = BRIN_CURRENT_VERSION; xlrec.pagesPerRange = BrinGetPagesPerRange(index); - rdata.buffer = InvalidBuffer; - rdata.data = (char *) &xlrec; - rdata.len = SizeOfBrinCreateIdx; - rdata.next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx); + XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT); - recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX, &rdata); + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX); page = BufferGetPage(meta); PageSetLSN(page, recptr); diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index 50f1dec163..0b6fbeb603 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -140,27 +140,19 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, /* XLOG stuff */ if 
(RelationNeedsWAL(idxrel)) { - BlockNumber blk = BufferGetBlockNumber(oldbuf); xl_brin_samepage_update xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; - xlrec.node = idxrel->rd_node; - ItemPointerSetBlockNumber(&xlrec.tid, blk); - ItemPointerSetOffsetNumber(&xlrec.tid, oldoff); - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBrinSamepageUpdate; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + xlrec.offnum = oldoff; - rdata[1].data = (char *) newtup; - rdata[1].len = newsz; - rdata[1].buffer = oldbuf; - rdata[1].buffer_std = true; - rdata[1].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate); - recptr = XLogInsert(RM_BRIN_ID, info, rdata); + XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) newtup, newsz); + + recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); } @@ -211,43 +203,30 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, { xl_brin_update xlrec; XLogRecPtr recptr; - XLogRecData rdata[4]; uint8 info; info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); - xlrec.insert.node = idxrel->rd_node; - ItemPointerSet(&xlrec.insert.tid, BufferGetBlockNumber(newbuf), newoff); + xlrec.insert.offnum = newoff; xlrec.insert.heapBlk = heapBlk; - xlrec.insert.tuplen = newsz; - xlrec.insert.revmapBlk = BufferGetBlockNumber(revmapbuf); xlrec.insert.pagesPerRange = pagesPerRange; - ItemPointerSet(&xlrec.oldtid, BufferGetBlockNumber(oldbuf), oldoff); + xlrec.oldOffnum = oldoff; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBrinUpdate; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); - rdata[1].data = (char *) newtup; - rdata[1].len = newsz; - rdata[1].buffer = extended ? 
InvalidBuffer : newbuf; - rdata[1].buffer_std = true; - rdata[1].next = &(rdata[2]); + /* new page */ + XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate); - rdata[2].data = (char *) NULL; - rdata[2].len = 0; - rdata[2].buffer = revmapbuf; - rdata[2].buffer_std = true; - rdata[2].next = &(rdata[3]); + XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); + XLogRegisterBufData(0, (char *) newtup, newsz); - rdata[3].data = (char *) NULL; - rdata[3].len = 0; - rdata[3].buffer = oldbuf; - rdata[3].buffer_std = true; - rdata[3].next = NULL; + /* revmap page */ + XLogRegisterBuffer(1, revmapbuf, REGBUF_STANDARD); - recptr = XLogInsert(RM_BRIN_ID, info, rdata); + /* old page */ + XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); PageSetLSN(newpage, recptr); @@ -354,36 +333,22 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, { xl_brin_insert xlrec; XLogRecPtr recptr; - XLogRecData rdata[3]; uint8 info; info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0); - xlrec.node = idxrel->rd_node; xlrec.heapBlk = heapBlk; xlrec.pagesPerRange = pagesPerRange; - xlrec.revmapBlk = BufferGetBlockNumber(revmapbuf); - xlrec.tuplen = itemsz; - ItemPointerSet(&xlrec.tid, blk, off); + xlrec.offnum = off; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBrinInsert; - rdata[0].buffer = InvalidBuffer; - rdata[0].buffer_std = false; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinInsert); - rdata[1].data = (char *) tup; - rdata[1].len = itemsz; - rdata[1].buffer = extended ? InvalidBuffer : *buffer; - rdata[1].buffer_std = true; - rdata[1].next = &(rdata[2]); + XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? 
REGBUF_WILL_INIT : 0)); + XLogRegisterBufData(0, (char *) tup, itemsz); - rdata[2].data = (char *) NULL; - rdata[2].len = 0; - rdata[2].buffer = revmapbuf; - rdata[2].buffer_std = false; - rdata[2].next = NULL; + XLogRegisterBuffer(1, revmapbuf, 0); - recptr = XLogInsert(RM_BRIN_ID, info, rdata); + recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index 272c74e6b6..adc7d0b847 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -477,23 +477,16 @@ revmap_physical_extend(BrinRevmap *revmap) { xl_brin_revmap_extend xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; - xlrec.node = revmap->rm_irel->rd_node; xlrec.targetBlk = mapBlk; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBrinRevmapExtend; - rdata[0].buffer = InvalidBuffer; - rdata[0].buffer_std = false; - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) NULL; - rdata[1].len = 0; - rdata[1].buffer = revmap->rm_metaBuf; - rdata[1].buffer_std = false; - rdata[1].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinRevmapExtend); + XLogRegisterBuffer(0, revmap->rm_metaBuf, 0); - recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata); + XLogRegisterBuffer(1, buf, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND); PageSetLSN(metapage, recptr); PageSetLSN(page, recptr); } diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 29370689a7..e6a1750975 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -20,17 +20,15 @@ * xlog replay routines */ static void -brin_xlog_createidx(XLogRecPtr lsn, XLogRecord *record) +brin_xlog_createidx(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record); 
Buffer buf; Page page; - /* Backup blocks are not used in create_index records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - /* create the index' metapage */ - buf = XLogReadBuffer(xlrec->node, BRIN_METAPAGE_BLKNO, true); + buf = XLogInitBufferForRedo(record, 0); Assert(BufferIsValid(buf)); page = (Page) BufferGetPage(buf); brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version); @@ -44,51 +42,47 @@ brin_xlog_createidx(XLogRecPtr lsn, XLogRecord *record) * revmap. */ static void -brin_xlog_insert_update(XLogRecPtr lsn, XLogRecord *record, - xl_brin_insert *xlrec, BrinTuple *tuple) +brin_xlog_insert_update(XLogReaderState *record, + xl_brin_insert *xlrec) { - BlockNumber blkno; + XLogRecPtr lsn = record->EndRecPtr; Buffer buffer; Page page; XLogRedoAction action; - blkno = ItemPointerGetBlockNumber(&xlrec->tid); - /* * If we inserted the first and only tuple on the page, re-initialize the * page from scratch. */ - if (record->xl_info & XLOG_BRIN_INIT_PAGE) + if (XLogRecGetInfo(record) & XLOG_BRIN_INIT_PAGE) { - /* - * No full-page image here. Don't try to read it, because there - * might be one for the revmap buffer, below. 
- */ - buffer = XLogReadBuffer(xlrec->node, blkno, true); + buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); brin_page_init(page, BRIN_PAGETYPE_REGULAR); action = BLK_NEEDS_REDO; } else { - action = XLogReadBufferForRedo(lsn, record, 0, - xlrec->node, blkno, &buffer); + action = XLogReadBufferForRedo(record, 0, &buffer); } /* insert the index item into the page */ if (action == BLK_NEEDS_REDO) { OffsetNumber offnum; + BrinTuple *tuple; + Size tuplen; + + tuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); Assert(tuple->bt_blkno == xlrec->heapBlk); page = (Page) BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(&(xlrec->tid)); + offnum = xlrec->offnum; if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "brin_xlog_insert_update: invalid max offset number"); - offnum = PageAddItem(page, (Item) tuple, xlrec->tuplen, offnum, true, - false); + offnum = PageAddItem(page, (Item) tuple, tuplen, offnum, true, false); if (offnum == InvalidOffsetNumber) elog(PANIC, "brin_xlog_insert_update: failed to add tuple"); @@ -99,16 +93,17 @@ brin_xlog_insert_update(XLogRecPtr lsn, XLogRecord *record, UnlockReleaseBuffer(buffer); /* update the revmap */ - action = XLogReadBufferForRedo(lsn, record, - record->xl_info & XLOG_BRIN_INIT_PAGE ? 
0 : 1, - xlrec->node, - xlrec->revmapBlk, &buffer); + action = XLogReadBufferForRedo(record, 1, &buffer); if (action == BLK_NEEDS_REDO) { + ItemPointerData tid; + BlockNumber blkno = BufferGetBlockNumber(buffer); + + ItemPointerSet(&tid, blkno, xlrec->offnum); page = (Page) BufferGetPage(buffer); brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, - xlrec->tid); + tid); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -122,34 +117,26 @@ brin_xlog_insert_update(XLogRecPtr lsn, XLogRecord *record, * replay a BRIN index insertion */ static void -brin_xlog_insert(XLogRecPtr lsn, XLogRecord *record) +brin_xlog_insert(XLogReaderState *record) { xl_brin_insert *xlrec = (xl_brin_insert *) XLogRecGetData(record); - BrinTuple *newtup; - newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinInsert); - - brin_xlog_insert_update(lsn, record, xlrec, newtup); + brin_xlog_insert_update(record, xlrec); } /* * replay a BRIN index update */ static void -brin_xlog_update(XLogRecPtr lsn, XLogRecord *record) +brin_xlog_update(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record); - BlockNumber blkno; Buffer buffer; - BrinTuple *newtup; XLogRedoAction action; - newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinUpdate); - /* First remove the old tuple */ - blkno = ItemPointerGetBlockNumber(&(xlrec->oldtid)); - action = XLogReadBufferForRedo(lsn, record, 2, xlrec->insert.node, - blkno, &buffer); + action = XLogReadBufferForRedo(record, 2, &buffer); if (action == BLK_NEEDS_REDO) { Page page; @@ -157,7 +144,7 @@ brin_xlog_update(XLogRecPtr lsn, XLogRecord *record) page = (Page) BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(&(xlrec->oldtid)); + offnum = xlrec->oldOffnum; if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "brin_xlog_update: invalid max offset number"); @@ -168,7 +155,7 @@ brin_xlog_update(XLogRecPtr lsn, XLogRecord *record) } /* Then insert the new tuple and 
update revmap, like in an insertion. */ - brin_xlog_insert_update(lsn, record, &xlrec->insert, newtup); + brin_xlog_insert_update(record, &xlrec->insert); if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); @@ -178,30 +165,27 @@ brin_xlog_update(XLogRecPtr lsn, XLogRecord *record) * Update a tuple on a single page. */ static void -brin_xlog_samepage_update(XLogRecPtr lsn, XLogRecord *record) +brin_xlog_samepage_update(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_brin_samepage_update *xlrec; - BlockNumber blkno; Buffer buffer; XLogRedoAction action; xlrec = (xl_brin_samepage_update *) XLogRecGetData(record); - blkno = ItemPointerGetBlockNumber(&(xlrec->tid)); - action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node, blkno, - &buffer); + action = XLogReadBufferForRedo(record, 0, &buffer); if (action == BLK_NEEDS_REDO) { - int tuplen; + Size tuplen; BrinTuple *mmtuple; Page page; OffsetNumber offnum; - tuplen = record->xl_len - SizeOfBrinSamepageUpdate; - mmtuple = (BrinTuple *) ((char *) xlrec + SizeOfBrinSamepageUpdate); + mmtuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); page = (Page) BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(&(xlrec->tid)); + offnum = xlrec->offnum; if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "brin_xlog_samepage_update: invalid max offset number"); @@ -223,18 +207,23 @@ brin_xlog_samepage_update(XLogRecPtr lsn, XLogRecord *record) * Replay a revmap page extension */ static void -brin_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record) +brin_xlog_revmap_extend(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_brin_revmap_extend *xlrec; Buffer metabuf; Buffer buf; Page page; + BlockNumber targetBlk; XLogRedoAction action; xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record); + + XLogRecGetBlockTag(record, 1, NULL, NULL, &targetBlk); + Assert(xlrec->targetBlk == targetBlk); + /* Update the metapage */ - action = XLogReadBufferForRedo(lsn, record, 
0, xlrec->node, - BRIN_METAPAGE_BLKNO, &metabuf); + action = XLogReadBufferForRedo(record, 0, &metabuf); if (action == BLK_NEEDS_REDO) { Page metapg; @@ -255,7 +244,7 @@ brin_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record) * image here. */ - buf = XLogReadBuffer(xlrec->node, xlrec->targetBlk, true); + buf = XLogInitBufferForRedo(record, 1); page = (Page) BufferGetPage(buf); brin_page_init(page, BRIN_PAGETYPE_REVMAP); @@ -268,26 +257,26 @@ brin_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record) } void -brin_redo(XLogRecPtr lsn, XLogRecord *record) +brin_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info & XLOG_BRIN_OPMASK) { case XLOG_BRIN_CREATE_INDEX: - brin_xlog_createidx(lsn, record); + brin_xlog_createidx(record); break; case XLOG_BRIN_INSERT: - brin_xlog_insert(lsn, record); + brin_xlog_insert(record); break; case XLOG_BRIN_UPDATE: - brin_xlog_update(lsn, record); + brin_xlog_update(record); break; case XLOG_BRIN_SAMEPAGE_UPDATE: - brin_xlog_samepage_update(lsn, record); + brin_xlog_samepage_update(record); break; case XLOG_BRIN_REVMAP_EXTEND: - brin_xlog_revmap_extend(lsn, record); + brin_xlog_revmap_extend(record); break; default: elog(PANIC, "brin_redo: unknown op code %u", info); diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 5365477000..99f40a871f 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -326,7 +326,6 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, Buffer childbuf, GinStatsData *buildStats) { Page page = BufferGetPage(stack->buffer); - XLogRecData *payloadrdata; GinPlaceToPageRC rc; uint16 xlflags = 0; Page childpage = NULL; @@ -351,12 +350,36 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, /* * Try to put the incoming tuple on the page. placeToPage will decide if * the page needs to be split. 
+ * + * WAL-logging this operation is a bit funny: + * + * We're responsible for calling XLogBeginInsert() and XLogInsert(). + * XLogBeginInsert() must be called before placeToPage, because + * placeToPage can register some data to the WAL record. + * + * If placeToPage returns INSERTED, placeToPage has already called + * START_CRIT_SECTION(), and we're responsible for calling + * END_CRIT_SECTION. When it returns INSERTED, it is also responsible for + * registering any data required to replay the operation with + * XLogRegisterData(0, ...). It may only add data to block index 0; the + * main data of the WAL record is reserved for this function. + * + * If placeToPage returns SPLIT, we're wholly responsible for WAL logging. + * Splits happen infrequently, so we just make a full-page image of all + * the pages involved. */ + + if (RelationNeedsWAL(btree->index)) + XLogBeginInsert(); + rc = btree->placeToPage(btree, stack->buffer, stack, insertdata, updateblkno, - &payloadrdata, &newlpage, &newrpage); + &newlpage, &newrpage); if (rc == UNMODIFIED) + { + XLogResetInsertion(); return true; + } else if (rc == INSERTED) { /* placeToPage did START_CRIT_SECTION() */ @@ -372,17 +395,18 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; - XLogRecData rdata[3]; ginxlogInsert xlrec; BlockIdData childblknos[2]; - xlrec.node = btree->index->rd_node; - xlrec.blkno = BufferGetBlockNumber(stack->buffer); + /* + * placetopage already registered stack->buffer as block 0. 
+ */ xlrec.flags = xlflags; - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(ginxlogInsert); + if (childbuf != InvalidBuffer) + XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD); + + XLogRegisterData((char *) &xlrec, sizeof(ginxlogInsert)); /* * Log information about child if this was an insertion of a @@ -390,26 +414,13 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, */ if (childbuf != InvalidBuffer) { - rdata[0].next = &rdata[1]; - BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf)); BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink); - - rdata[1].buffer = InvalidBuffer; - rdata[1].data = (char *) childblknos; - rdata[1].len = sizeof(BlockIdData) * 2; - rdata[1].next = &rdata[2]; - - rdata[2].buffer = childbuf; - rdata[2].buffer_std = false; - rdata[2].data = NULL; - rdata[2].len = 0; - rdata[2].next = payloadrdata; + XLogRegisterData((char *) childblknos, + sizeof(BlockIdData) * 2); } - else - rdata[0].next = payloadrdata; - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT); PageSetLSN(page, recptr); if (childbuf != InvalidBuffer) PageSetLSN(childpage, recptr); @@ -421,10 +432,9 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, } else if (rc == SPLIT) { - /* Didn't fit, have to split */ + /* Didn't fit, had to split */ Buffer rbuffer; BlockNumber savedRightLink; - XLogRecData rdata[2]; ginxlogSplit data; Buffer lbuffer = InvalidBuffer; Page newrootpg = NULL; @@ -448,7 +458,6 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, */ data.node = btree->index->rd_node; - data.rblkno = BufferGetBlockNumber(rbuffer); data.flags = xlflags; if (childbuf != InvalidBuffer) { @@ -462,23 +471,6 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, else data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber; - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogSplit); - - if 
(childbuf != InvalidBuffer) - { - rdata[0].next = &rdata[1]; - - rdata[1].buffer = childbuf; - rdata[1].buffer_std = false; - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].next = payloadrdata; - } - else - rdata[0].next = payloadrdata; - if (stack->parent == NULL) { /* @@ -496,12 +488,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, buildStats->nEntryPages++; } - /* - * root never has a right-link, so we borrow the rrlink field to - * store the root block number. - */ - data.rrlink = BufferGetBlockNumber(stack->buffer); - data.lblkno = BufferGetBlockNumber(lbuffer); + data.rrlink = InvalidBlockNumber; data.flags |= GIN_SPLIT_ROOT; GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber; @@ -524,7 +511,6 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, { /* split non-root page */ data.rrlink = savedRightLink; - data.lblkno = BufferGetBlockNumber(stack->buffer); GinPageGetOpaque(newrpage)->rightlink = savedRightLink; GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT; @@ -572,7 +558,28 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, { XLogRecPtr recptr; - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); + /* + * We just take full page images of all the split pages. Splits + * are uncommon enough that it's not worth complicating the code + * to be more efficient. 
+ */ + if (stack->parent == NULL) + { + XLogRegisterBuffer(0, lbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(2, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + else + { + XLogRegisterBuffer(0, stack->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + if (BufferIsValid(childbuf)) + XLogRegisterBuffer(3, childbuf, 0); + + XLogRegisterData((char *) &data, sizeof(ginxlogSplit)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT); PageSetLSN(BufferGetPage(stack->buffer), recptr); PageSetLSN(BufferGetPage(rbuffer), recptr); if (stack->parent == NULL) diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 97cd706c08..012225eaa3 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -98,20 +98,19 @@ static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage); + Page *newlpage, Page *newrpage); static disassembledLeaf *disassembleLeaf(Page page); static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems); -static XLogRecData *constructLeafRecompressWALData(Buffer buf, - disassembledLeaf *leaf); +static void registerLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf); static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); static void dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, - XLogRecData **prdata, Page lpage, Page rpage); + Page lpage, Page rpage); /* * Read TIDs from leaf data page to single uncompressed array. 
The TIDs are @@ -428,8 +427,7 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset) */ static GinPlaceToPageRC dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, XLogRecData **prdata, - Page *newlpage, Page *newrpage) + void *insertdata, Page *newlpage, Page *newrpage) { GinBtreeDataLeafInsertData *items = insertdata; ItemPointer newItems = &items->items[items->curitem]; @@ -602,9 +600,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, */ MemoryContextSwitchTo(oldCxt); if (RelationNeedsWAL(btree->index)) - *prdata = constructLeafRecompressWALData(buf, leaf); - else - *prdata = NULL; + registerLeafRecompressWALData(buf, leaf); START_CRIT_SECTION(); dataPlaceToPageLeafRecompress(buf, leaf); @@ -685,7 +681,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ); dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound, - prdata, *newlpage, *newrpage); + *newlpage, *newrpage); Assert(GinPageRightMost(page) || ginCompareItemPointers(GinDataPageGetRightBound(*newlpage), @@ -791,7 +787,6 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) */ if (removedsomething) { - XLogRecData *payloadrdata = NULL; bool modified; /* @@ -818,7 +813,10 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) } if (RelationNeedsWAL(indexrel)) - payloadrdata = constructLeafRecompressWALData(buffer, leaf); + { + XLogBeginInsert(); + registerLeafRecompressWALData(buffer, leaf); + } START_CRIT_SECTION(); dataPlaceToPageLeafRecompress(buffer, leaf); @@ -827,18 +825,8 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) if (RelationNeedsWAL(indexrel)) { XLogRecPtr recptr; - XLogRecData rdata; - ginxlogVacuumDataLeafPage xlrec; - xlrec.node = indexrel->rd_node; - xlrec.blkno = BufferGetBlockNumber(buffer); - - rdata.buffer = InvalidBuffer; - rdata.data = (char *) &xlrec; - rdata.len = 
offsetof(ginxlogVacuumDataLeafPage, data); - rdata.next = payloadrdata; - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE); PageSetLSN(page, recptr); } @@ -850,13 +838,12 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) * Construct a ginxlogRecompressDataLeaf record representing the changes * in *leaf. */ -static XLogRecData * -constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) +static void +registerLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) { int nmodified = 0; char *walbufbegin; char *walbufend; - XLogRecData *rdata; dlist_iter iter; int segno; ginxlogRecompressDataLeaf *recompress_xlog; @@ -871,12 +858,11 @@ constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) nmodified++; } - walbufbegin = palloc( - sizeof(ginxlogRecompressDataLeaf) + - BLCKSZ + /* max size needed to hold the segment - * data */ - nmodified * 2 + /* (segno + action) per action */ - sizeof(XLogRecData)); + walbufbegin = + palloc(sizeof(ginxlogRecompressDataLeaf) + + BLCKSZ + /* max size needed to hold the segment data */ + nmodified * 2 /* (segno + action) per action */ + ); walbufend = walbufbegin; recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend; @@ -944,14 +930,10 @@ constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) segno++; } - rdata = (XLogRecData *) MAXALIGN(walbufend); - rdata->buffer = buf; - rdata->buffer_std = TRUE; - rdata->data = walbufbegin; - rdata->len = walbufend - walbufbegin; - rdata->next = NULL; - return rdata; + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBufData(0, walbufbegin, walbufend - walbufbegin); + } /* @@ -1024,7 +1006,7 @@ dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) static void dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, - XLogRecData **prdata, Page lpage, Page rpage) + 
Page lpage, Page rpage) { char *ptr; int segsize; @@ -1034,10 +1016,6 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, dlist_node *firstright; leafSegmentInfo *seginfo; - /* these must be static so they can be returned to caller */ - static ginxlogSplitDataLeaf split_xlog; - static XLogRecData rdata[3]; - /* Initialize temporary pages to hold the new left and right pages */ GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); @@ -1092,29 +1070,6 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, Assert(rsize == leaf->rsize); GinDataPageSetDataSize(rpage, rsize); *GinDataPageGetRightBound(rpage) = rbound; - - /* Create WAL record */ - split_xlog.lsize = lsize; - split_xlog.rsize = rsize; - split_xlog.lrightbound = lbound; - split_xlog.rrightbound = rbound; - - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &split_xlog; - rdata[0].len = sizeof(ginxlogSplitDataLeaf); - rdata[0].next = &rdata[1]; - - rdata[1].buffer = InvalidBuffer; - rdata[1].data = (char *) GinDataLeafPageGetPostingList(lpage); - rdata[1].len = lsize; - rdata[1].next = &rdata[2]; - - rdata[2].buffer = InvalidBuffer; - rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage); - rdata[2].len = rsize; - rdata[2].next = NULL; - - *prdata = rdata; } /* @@ -1124,29 +1079,30 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, * * In addition to inserting the given item, the downlink of the existing item * at 'off' is updated to point to 'updateblkno'. + * + * On INSERTED, registers the buffer as buffer ID 0, with data. + * On SPLIT, returns rdata that represents the split pages in *prdata. 
*/ static GinPlaceToPageRC dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) + Page *newlpage, Page *newrpage) { Page page = BufferGetPage(buf); OffsetNumber off = stack->off; PostingItem *pitem; - /* these must be static so they can be returned to caller */ - static XLogRecData rdata; + /* this must be static so it can be returned to caller */ static ginxlogInsertDataInternal data; /* split if we have to */ if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) { dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, - prdata, newlpage, newrpage); + newlpage, newrpage); return SPLIT; } - *prdata = &rdata; Assert(GinPageIsData(page)); START_CRIT_SECTION(); @@ -1159,14 +1115,15 @@ dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, pitem = (PostingItem *) insertdata; GinDataPageAddPostingItem(page, pitem, off); - data.offset = off; - data.newitem = *pitem; + if (RelationNeedsWAL(btree->index)) + { + data.offset = off; + data.newitem = *pitem; - rdata.buffer = buf; - rdata.buffer_std = TRUE; - rdata.data = (char *) &data; - rdata.len = sizeof(ginxlogInsertDataInternal); - rdata.next = NULL; + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) &data, + sizeof(ginxlogInsertDataInternal)); + } return INSERTED; } @@ -1178,7 +1135,6 @@ dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, static GinPlaceToPageRC dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) { Page page = BufferGetPage(buf); @@ -1187,11 +1143,11 @@ dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (GinPageIsLeaf(page)) return dataPlaceToPageLeaf(btree, buf, stack, insertdata, - prdata, newlpage, newrpage); + newlpage, newrpage); else return 
dataPlaceToPageInternal(btree, buf, stack, insertdata, updateblkno, - prdata, newlpage, newrpage); + newlpage, newrpage); } /* @@ -1202,7 +1158,7 @@ static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) + Page *newlpage, Page *newrpage) { Page oldpage = BufferGetPage(origbuf); OffsetNumber off = stack->off; @@ -1215,19 +1171,13 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, Page lpage; Page rpage; OffsetNumber separator; - - /* these must be static so they can be returned to caller */ - static ginxlogSplitDataInternal data; - static XLogRecData rdata[4]; - static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; + PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; lpage = PageGetTempPage(oldpage); rpage = PageGetTempPage(oldpage); GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize); GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize); - *prdata = rdata; - /* * First construct a new list of PostingItems, which includes all the old * items, and the new item. 
@@ -1277,20 +1227,6 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, /* set up right bound for right page */ *GinDataPageGetRightBound(rpage) = oldbound; - data.separator = separator; - data.nitem = nitems; - data.rightbound = oldbound; - - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogSplitDataInternal); - rdata[0].next = &rdata[1]; - - rdata[1].buffer = InvalidBuffer; - rdata[1].data = (char *) allitems; - rdata[1].len = nitems * sizeof(PostingItem); - rdata[1].next = NULL; - *newlpage = lpage; *newrpage = rpage; } @@ -1797,24 +1733,18 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, if (RelationNeedsWAL(index)) { XLogRecPtr recptr; - XLogRecData rdata[2]; ginxlogCreatePostingTree data; - data.node = index->rd_node; - data.blkno = blkno; data.size = rootsize; - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogCreatePostingTree); - rdata[0].next = &rdata[1]; + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogCreatePostingTree)); - rdata[1].buffer = InvalidBuffer; - rdata[1].data = (char *) GinDataLeafPageGetPostingList(page); - rdata[1].len = rootsize; - rdata[1].next = NULL; + XLogRegisterData((char *) GinDataLeafPageGetPostingList(page), + rootsize); + XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE); PageSetLSN(page, recptr); } diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index 84dc1e228c..2dae7b9549 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -22,7 +22,7 @@ static void entrySplitPage(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertPayload, - BlockNumber updateblkno, XLogRecData **prdata, + BlockNumber updateblkno, Page *newlpage, Page *newrpage); /* @@ -515,33 +515,33 @@ 
entryPreparePage(GinBtree btree, Page page, OffsetNumber off, * On insertion to an internal node, in addition to inserting the given item, * the downlink of the existing item at 'off' is updated to point to * 'updateblkno'. + * + * On INSERTED, registers the buffer as buffer ID 0, with data. + * On SPLIT, returns rdata that represents the split pages in *prdata. */ static GinPlaceToPageRC entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertPayload, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) + Page *newlpage, Page *newrpage) { GinBtreeEntryInsertData *insertData = insertPayload; Page page = BufferGetPage(buf); OffsetNumber off = stack->off; OffsetNumber placed; - int cnt = 0; - /* these must be static so they can be returned to caller */ - static XLogRecData rdata[3]; + /* this must be static so it can be returned to caller. */ static ginxlogInsertEntry data; /* quick exit if it doesn't fit */ if (!entryIsEnoughSpace(btree, buf, off, insertData)) { entrySplitPage(btree, buf, stack, insertPayload, updateblkno, - prdata, newlpage, newrpage); + newlpage, newrpage); return SPLIT; } START_CRIT_SECTION(); - *prdata = rdata; entryPreparePage(btree, page, off, insertData, updateblkno); placed = PageAddItem(page, @@ -552,21 +552,17 @@ entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); - data.isDelete = insertData->isDelete; - data.offset = off; + if (RelationNeedsWAL(btree->index)) + { + data.isDelete = insertData->isDelete; + data.offset = off; - rdata[cnt].buffer = buf; - rdata[cnt].buffer_std = true; - rdata[cnt].data = (char *) &data; - rdata[cnt].len = offsetof(ginxlogInsertEntry, tuple); - rdata[cnt].next = &rdata[cnt + 1]; - cnt++; - - rdata[cnt].buffer = buf; - rdata[cnt].buffer_std = true; - rdata[cnt].data = (char *) insertData->entry; - rdata[cnt].len = IndexTupleSize(insertData->entry); 
- rdata[cnt].next = NULL; + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) &data, + offsetof(ginxlogInsertEntry, tuple)); + XLogRegisterBufData(0, (char *) insertData->entry, + IndexTupleSize(insertData->entry)); + } return INSERTED; } @@ -581,7 +577,7 @@ static void entrySplitPage(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertPayload, - BlockNumber updateblkno, XLogRecData **prdata, + BlockNumber updateblkno, Page *newlpage, Page *newrpage) { GinBtreeEntryInsertData *insertData = insertPayload; @@ -590,7 +586,6 @@ entrySplitPage(GinBtree btree, Buffer origbuf, maxoff, separator = InvalidOffsetNumber; Size totalsize = 0; - Size tupstoresize; Size lsize = 0, size; char *ptr; @@ -599,13 +594,8 @@ entrySplitPage(GinBtree btree, Buffer origbuf, Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf)); Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf)); Size pageSize = PageGetPageSize(lpage); + char tupstore[2 * BLCKSZ]; - /* these must be static so they can be returned to caller */ - static XLogRecData rdata[2]; - static ginxlogSplitEntry data; - static char tupstore[2 * BLCKSZ]; - - *prdata = rdata; entryPreparePage(btree, lpage, off, insertData, updateblkno); /* @@ -638,7 +628,6 @@ entrySplitPage(GinBtree btree, Buffer origbuf, ptr += size; totalsize += size + sizeof(ItemIdData); } - tupstoresize = ptr - tupstore; /* * Initialize the left and right pages, and copy all the tuples back to @@ -673,19 +662,6 @@ entrySplitPage(GinBtree btree, Buffer origbuf, ptr += MAXALIGN(IndexTupleSize(itup)); } - data.separator = separator; - data.nitem = maxoff; - - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogSplitEntry); - rdata[0].next = &rdata[1]; - - rdata[1].buffer = InvalidBuffer; - rdata[1].data = tupstore; - rdata[1].len = tupstoresize; - rdata[1].next = NULL; - *newlpage = lpage; *newrpage = rpage; } diff --git a/src/backend/access/gin/ginfast.c 
b/src/backend/access/gin/ginfast.c index 25746995b5..fd81d67557 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -108,26 +108,19 @@ writeListPage(Relation index, Buffer buffer, if (RelationNeedsWAL(index)) { - XLogRecData rdata[2]; ginxlogInsertListPage data; XLogRecPtr recptr; - data.node = index->rd_node; - data.blkno = BufferGetBlockNumber(buffer); data.rightlink = rightlink; data.ntuples = ntuples; - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogInsertListPage); - rdata[0].next = rdata + 1; + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage)); - rdata[1].buffer = InvalidBuffer; - rdata[1].data = workspace; - rdata[1].len = size; - rdata[1].next = NULL; + XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); + XLogRegisterBufData(0, workspace, size); - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE, rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE); PageSetLSN(page, recptr); } @@ -224,26 +217,23 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) Buffer metabuffer; Page metapage; GinMetaPageData *metadata = NULL; - XLogRecData rdata[2]; Buffer buffer = InvalidBuffer; Page page = NULL; ginxlogUpdateMeta data; bool separateList = false; bool needCleanup = false; int cleanupSize; + bool needWal; if (collector->ntuples == 0) return; + needWal = RelationNeedsWAL(index); + data.node = index->rd_node; data.ntuples = 0; data.newRightlink = data.prevTail = InvalidBlockNumber; - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogUpdateMeta); - rdata[0].next = NULL; - metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); @@ -283,6 +273,9 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) memset(&sublist, 0, sizeof(GinMetaPageData)); makeSublist(index, collector->tuples, collector->ntuples, &sublist); 
+ if (needWal) + XLogBeginInsert(); + /* * metapage was unlocked, see above */ @@ -315,14 +308,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); - rdata[0].next = rdata + 1; - - rdata[1].buffer = buffer; - rdata[1].buffer_std = true; - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].next = NULL; - Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); START_CRIT_SECTION(); @@ -336,6 +321,9 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) metadata->nPendingPages += sublist.nPendingPages; metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; + + if (needWal) + XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); } } else @@ -348,6 +336,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) int i, tupsize; char *ptr; + char *collectordata; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); @@ -356,16 +345,13 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); - rdata[0].next = rdata + 1; - - rdata[1].buffer = buffer; - rdata[1].buffer_std = true; - ptr = rdata[1].data = (char *) palloc(collector->sumsize); - rdata[1].len = collector->sumsize; - rdata[1].next = NULL; + collectordata = ptr = (char *) palloc(collector->sumsize); data.ntuples = collector->ntuples; + if (needWal) + XLogBeginInsert(); + START_CRIT_SECTION(); /* @@ -390,7 +376,12 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) off++; } - Assert((ptr - rdata[1].data) <= collector->sumsize); + Assert((ptr - collectordata) <= collector->sumsize); + if (needWal) + { + XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); + XLogRegisterBufData(1, collectordata, collector->sumsize); + } metadata->tailFreeSize = PageGetExactFreeSpace(page); @@ -402,13 +393,16 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) */ MarkBufferDirty(metabuffer); - if (RelationNeedsWAL(index)) + if (needWal) { XLogRecPtr recptr; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, rdata); + XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT); + XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); PageSetLSN(metapage, recptr); if (buffer != InvalidBuffer) @@ -526,20 +520,11 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, int i; int64 nDeletedHeapTuples = 0; ginxlogDeleteListPages data; - XLogRecData rdata[1]; Buffer buffers[GIN_NDELETE_AT_ONCE]; - data.node = index->rd_node; - - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogDeleteListPages); - rdata[0].next = NULL; - data.ndeleted = 0; while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead) { - data.toDelete[data.ndeleted] = blknoToDelete; buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete); 
LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE); page = BufferGetPage(buffers[data.ndeleted]); @@ -562,6 +547,13 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, if (stats) stats->pages_deleted += data.ndeleted; + /* + * This operation touches an unusually large number of pages, so + * prepare the XLogInsert machinery for that before entering the + * critical section. + */ + XLogEnsureRecordSpace(data.ndeleted, 0); + START_CRIT_SECTION(); metadata->head = blknoToDelete; @@ -592,9 +584,17 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, { XLogRecPtr recptr; + XLogBeginInsert(); + XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT); + for (i = 0; i < data.ndeleted; i++) + XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT); + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE, rdata); + XLogRegisterData((char *) &data, + sizeof(ginxlogDeleteListPages)); + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE); PageSetLSN(metapage, recptr); for (i = 0; i < data.ndeleted; i++) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 370884ed17..c1ad0fd8c4 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -347,15 +347,13 @@ ginbuild(PG_FUNCTION_ARGS) if (RelationNeedsWAL(index)) { XLogRecPtr recptr; - XLogRecData rdata; Page page; - rdata.buffer = InvalidBuffer; - rdata.data = (char *) &(index->rd_node); - rdata.len = sizeof(RelFileNode); - rdata.next = NULL; + XLogBeginInsert(); + XLogRegisterBuffer(0, MetaBuffer, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, RootBuffer, REGBUF_WILL_INIT); - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index d0458cfd0c..f593a7224f 100644 
--- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -605,19 +605,17 @@ ginUpdateStats(Relation index, const GinStatsData *stats) { XLogRecPtr recptr; ginxlogUpdateMeta data; - XLogRecData rdata; data.node = index->rd_node; data.ntuples = 0; data.newRightlink = data.prevTail = InvalidBlockNumber; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); - rdata.buffer = InvalidBuffer; - rdata.data = (char *) &data; - rdata.len = sizeof(ginxlogUpdateMeta); - rdata.next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); + XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT); - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, &rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); PageSetLSN(metapage, recptr); } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 3a61321a83..6f32600ed7 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -89,10 +89,6 @@ xlogVacuumPage(Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); XLogRecPtr recptr; - XLogRecData rdata[3]; - ginxlogVacuumPage xlrec; - uint16 lower; - uint16 upper; /* This is only used for entry tree leaf pages. */ Assert(!GinPageIsData(page)); @@ -101,57 +97,14 @@ xlogVacuumPage(Relation index, Buffer buffer) if (!RelationNeedsWAL(index)) return; - xlrec.node = index->rd_node; - xlrec.blkno = BufferGetBlockNumber(buffer); + /* + * Always create a full image, we don't track the changes on the page at + * any more fine-grained level. This could obviously be improved... 
+ */ + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); - /* Assume we can omit data between pd_lower and pd_upper */ - lower = ((PageHeader) page)->pd_lower; - upper = ((PageHeader) page)->pd_upper; - - Assert(lower < BLCKSZ); - Assert(upper < BLCKSZ); - - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) - { - xlrec.hole_offset = lower; - xlrec.hole_length = upper - lower; - } - else - { - /* No "hole" to compress out */ - xlrec.hole_offset = 0; - xlrec.hole_length = 0; - } - - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(ginxlogVacuumPage); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &rdata[1]; - - if (xlrec.hole_length == 0) - { - rdata[1].data = (char *) page; - rdata[1].len = BLCKSZ; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; - } - else - { - /* must skip the hole */ - rdata[1].data = (char *) page; - rdata[1].len = xlrec.hole_offset; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = &rdata[2]; - - rdata[2].data = (char *) page + (xlrec.hole_offset + xlrec.hole_length); - rdata[2].len = BLCKSZ - (xlrec.hole_offset + xlrec.hole_length); - rdata[2].buffer = InvalidBuffer; - rdata[2].next = NULL; - } - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE, rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE); PageSetLSN(page, recptr); } @@ -292,48 +245,27 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn if (RelationNeedsWAL(gvs->index)) { XLogRecPtr recptr; - XLogRecData rdata[4]; ginxlogDeletePage data; - data.node = gvs->index->rd_node; - data.blkno = deleteBlkno; - data.parentBlkno = parentBlkno; + /* + * We can't pass REGBUF_STANDARD for the deleted page, because we + * didn't set pd_lower on pre-9.4 versions. The page might've been + * binary-upgraded from an older version, and hence not have pd_lower + * set correctly. 
Ditto for the left page, but removing the item from + * the parent updated its pd_lower, so we know that's OK at this + * point. + */ + XLogBeginInsert(); + XLogRegisterBuffer(0, dBuffer, 0); + XLogRegisterBuffer(1, pBuffer, REGBUF_STANDARD); + XLogRegisterBuffer(2, lBuffer, 0); + data.parentOffset = myoff; - data.leftBlkno = leftBlkno; data.rightLink = GinPageGetOpaque(page)->rightlink; - /* - * We can't pass buffer_std = TRUE, because we didn't set pd_lower on - * pre-9.4 versions. The page might've been binary-upgraded from an - * older version, and hence not have pd_lower set correctly. Ditto for - * the left page, but removing the item from the parent updated its - * pd_lower, so we know that's OK at this point. - */ - rdata[0].buffer = dBuffer; - rdata[0].buffer_std = FALSE; - rdata[0].data = NULL; - rdata[0].len = 0; - rdata[0].next = rdata + 1; + XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage)); - rdata[1].buffer = pBuffer; - rdata[1].buffer_std = TRUE; - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].next = rdata + 2; - - rdata[2].buffer = lBuffer; - rdata[2].buffer_std = FALSE; - rdata[2].data = NULL; - rdata[2].len = 0; - rdata[2].next = rdata + 3; - - rdata[3].buffer = InvalidBuffer; - rdata[3].buffer_std = FALSE; - rdata[3].len = sizeof(ginxlogDeletePage); - rdata[3].data = (char *) &data; - rdata[3].next = NULL; - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE, rdata); + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE); PageSetLSN(page, recptr); PageSetLSN(parentPage, recptr); PageSetLSN(BufferGetPage(lBuffer), recptr); diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index d0553bb8f7..6c0042bd79 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -20,18 +20,15 @@ static MemoryContext opCtx; /* working memory for operations */ static void -ginRedoClearIncompleteSplit(XLogRecPtr lsn, XLogRecord *record, - int block_index, - RelFileNode node, BlockNumber 
blkno) +ginRedoClearIncompleteSplit(XLogReaderState *record, uint8 block_id) { + XLogRecPtr lsn = record->EndRecPtr; Buffer buffer; Page page; - if (XLogReadBufferForRedo(lsn, record, block_index, node, blkno, &buffer) - == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, block_id, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); - GinPageGetOpaque(page)->flags &= ~GIN_INCOMPLETE_SPLIT; PageSetLSN(page, lsn); @@ -42,18 +39,15 @@ ginRedoClearIncompleteSplit(XLogRecPtr lsn, XLogRecord *record, } static void -ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) +ginRedoCreateIndex(XLogReaderState *record) { - RelFileNode *node = (RelFileNode *) XLogRecGetData(record); + XLogRecPtr lsn = record->EndRecPtr; Buffer RootBuffer, MetaBuffer; Page page; - /* Backup blocks are not used in create_index records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - - MetaBuffer = XLogReadBuffer(*node, GIN_METAPAGE_BLKNO, true); - Assert(BufferIsValid(MetaBuffer)); + MetaBuffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(MetaBuffer) == GIN_METAPAGE_BLKNO); page = (Page) BufferGetPage(MetaBuffer); GinInitMetabuffer(MetaBuffer); @@ -61,8 +55,8 @@ ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) PageSetLSN(page, lsn); MarkBufferDirty(MetaBuffer); - RootBuffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true); - Assert(BufferIsValid(RootBuffer)); + RootBuffer = XLogInitBufferForRedo(record, 1); + Assert(BufferGetBlockNumber(RootBuffer) == GIN_ROOT_BLKNO); page = (Page) BufferGetPage(RootBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); @@ -75,18 +69,15 @@ ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) } static void -ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record) +ginRedoCreatePTree(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record); char *ptr; Buffer buffer; Page page; - /* Backup blocks are not used in create_ptree records */ 
- Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - - buffer = XLogReadBuffer(data->node, data->blkno, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED); @@ -328,35 +319,40 @@ ginRedoInsertData(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdat } static void -ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) +ginRedoInsert(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record); Buffer buffer; - char *payload; +#ifdef NOT_USED BlockNumber leftChildBlkno = InvalidBlockNumber; +#endif BlockNumber rightChildBlkno = InvalidBlockNumber; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; - payload = XLogRecGetData(record) + sizeof(ginxlogInsert); - /* * First clear incomplete-split flag on child page if this finishes a * split. */ if (!isLeaf) { + char *payload = XLogRecGetData(record) + sizeof(ginxlogInsert); + +#ifdef NOT_USED leftChildBlkno = BlockIdGetBlockNumber((BlockId) payload); +#endif payload += sizeof(BlockIdData); rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload); payload += sizeof(BlockIdData); - ginRedoClearIncompleteSplit(lsn, record, 0, data->node, leftChildBlkno); + ginRedoClearIncompleteSplit(record, 1); } - if (XLogReadBufferForRedo(lsn, record, isLeaf ? 
0 : 1, data->node, - data->blkno, &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); + Size len; + char *payload = XLogRecGetBlockData(record, 0, &len); /* How to insert the payload is tree-type specific */ if (data->flags & GIN_INSERT_ISDATA) @@ -378,161 +374,33 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) } static void -ginRedoSplitEntry(Page lpage, Page rpage, void *rdata) -{ - ginxlogSplitEntry *data = (ginxlogSplitEntry *) rdata; - IndexTuple itup = (IndexTuple) ((char *) rdata + sizeof(ginxlogSplitEntry)); - OffsetNumber i; - - for (i = 0; i < data->separator; i++) - { - if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) - elog(ERROR, "failed to add item to gin index page"); - itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); - } - - for (i = data->separator; i < data->nitem; i++) - { - if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) - elog(ERROR, "failed to add item to gin index page"); - itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); - } -} - -static void -ginRedoSplitData(Page lpage, Page rpage, void *rdata) -{ - bool isleaf = GinPageIsLeaf(lpage); - - if (isleaf) - { - ginxlogSplitDataLeaf *data = (ginxlogSplitDataLeaf *) rdata; - Pointer lptr = (Pointer) rdata + sizeof(ginxlogSplitDataLeaf); - Pointer rptr = lptr + data->lsize; - - Assert(data->lsize > 0 && data->lsize <= GinDataPageMaxDataSize); - Assert(data->rsize > 0 && data->rsize <= GinDataPageMaxDataSize); - - memcpy(GinDataLeafPageGetPostingList(lpage), lptr, data->lsize); - memcpy(GinDataLeafPageGetPostingList(rpage), rptr, data->rsize); - - GinDataPageSetDataSize(lpage, data->lsize); - GinDataPageSetDataSize(rpage, data->rsize); - *GinDataPageGetRightBound(lpage) = data->lrightbound; - *GinDataPageGetRightBound(rpage) = 
data->rrightbound; - } - else - { - ginxlogSplitDataInternal *data = (ginxlogSplitDataInternal *) rdata; - PostingItem *items = (PostingItem *) ((char *) rdata + sizeof(ginxlogSplitDataInternal)); - OffsetNumber i; - OffsetNumber maxoff; - - for (i = 0; i < data->separator; i++) - GinDataPageAddPostingItem(lpage, &items[i], InvalidOffsetNumber); - for (i = data->separator; i < data->nitem; i++) - GinDataPageAddPostingItem(rpage, &items[i], InvalidOffsetNumber); - - /* set up right key */ - maxoff = GinPageGetOpaque(lpage)->maxoff; - *GinDataPageGetRightBound(lpage) = GinDataPageGetPostingItem(lpage, maxoff)->key; - *GinDataPageGetRightBound(rpage) = data->rightbound; - } -} - -static void -ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) +ginRedoSplit(XLogReaderState *record) { ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record); Buffer lbuffer, - rbuffer; - Page lpage, - rpage; - uint32 flags; - uint32 lflags, - rflags; - char *payload; + rbuffer, + rootbuf; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; - bool isData = (data->flags & GIN_INSERT_ISDATA) != 0; bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; - payload = XLogRecGetData(record) + sizeof(ginxlogSplit); - /* * First clear incomplete-split flag on child page if this finishes a * split */ if (!isLeaf) - ginRedoClearIncompleteSplit(lsn, record, 0, data->node, data->leftChildBlkno); + ginRedoClearIncompleteSplit(record, 3); - flags = 0; - if (isLeaf) - flags |= GIN_LEAF; - if (isData) - flags |= GIN_DATA; - if (isLeaf && isData) - flags |= GIN_COMPRESSED; + if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a full-page image of left page"); - lflags = rflags = flags; - if (!isRoot) - lflags |= GIN_INCOMPLETE_SPLIT; - - lbuffer = XLogReadBuffer(data->node, data->lblkno, true); - Assert(BufferIsValid(lbuffer)); - lpage = (Page) BufferGetPage(lbuffer); - GinInitBuffer(lbuffer, lflags); - - rbuffer = XLogReadBuffer(data->node, 
data->rblkno, true); - Assert(BufferIsValid(rbuffer)); - rpage = (Page) BufferGetPage(rbuffer); - GinInitBuffer(rbuffer, rflags); - - GinPageGetOpaque(lpage)->rightlink = BufferGetBlockNumber(rbuffer); - GinPageGetOpaque(rpage)->rightlink = isRoot ? InvalidBlockNumber : data->rrlink; - - /* Do the tree-type specific portion to restore the page contents */ - if (isData) - ginRedoSplitData(lpage, rpage, payload); - else - ginRedoSplitEntry(lpage, rpage, payload); - - PageSetLSN(rpage, lsn); - MarkBufferDirty(rbuffer); - - PageSetLSN(lpage, lsn); - MarkBufferDirty(lbuffer); + if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a full-page image of right page"); if (isRoot) { - BlockNumber rootBlkno = data->rrlink; - Buffer rootBuf = XLogReadBuffer(data->node, rootBlkno, true); - Page rootPage = BufferGetPage(rootBuf); - - GinInitBuffer(rootBuf, flags & ~GIN_LEAF & ~GIN_COMPRESSED); - - if (isData) - { - Assert(rootBlkno != GIN_ROOT_BLKNO); - ginDataFillRoot(NULL, BufferGetPage(rootBuf), - BufferGetBlockNumber(lbuffer), - BufferGetPage(lbuffer), - BufferGetBlockNumber(rbuffer), - BufferGetPage(rbuffer)); - } - else - { - Assert(rootBlkno == GIN_ROOT_BLKNO); - ginEntryFillRoot(NULL, BufferGetPage(rootBuf), - BufferGetBlockNumber(lbuffer), - BufferGetPage(lbuffer), - BufferGetBlockNumber(rbuffer), - BufferGetPage(rbuffer)); - } - - PageSetLSN(rootPage, lsn); - - MarkBufferDirty(rootBuf); - UnlockReleaseBuffer(rootBuf); + if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + elog(ERROR, "GIN split record did not contain a full-page image of root page"); + UnlockReleaseBuffer(rootbuf); } UnlockReleaseBuffer(rbuffer); @@ -544,54 +412,30 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) * a XLOG_FPI record. 
*/ static void -ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record) +ginRedoVacuumPage(XLogReaderState *record) { - ginxlogVacuumPage *xlrec = (ginxlogVacuumPage *) XLogRecGetData(record); - char *blk = ((char *) xlrec) + sizeof(ginxlogVacuumPage); Buffer buffer; - Page page; - Assert(xlrec->hole_offset < BLCKSZ); - Assert(xlrec->hole_length < BLCKSZ); - - /* Backup blocks are not used, we'll re-initialize the page always. */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - - buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, true); - if (!BufferIsValid(buffer)) - return; - page = (Page) BufferGetPage(buffer); - - if (xlrec->hole_length == 0) + if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) { - memcpy((char *) page, blk, BLCKSZ); + elog(ERROR, "replay of gin entry tree page vacuum did not restore the page"); } - else - { - memcpy((char *) page, blk, xlrec->hole_offset); - /* must zero-fill the hole */ - MemSet((char *) page + xlrec->hole_offset, 0, xlrec->hole_length); - memcpy((char *) page + (xlrec->hole_offset + xlrec->hole_length), - blk + xlrec->hole_offset, - BLCKSZ - (xlrec->hole_offset + xlrec->hole_length)); - } - - PageSetLSN(page, lsn); - - MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } static void -ginRedoVacuumDataLeafPage(XLogRecPtr lsn, XLogRecord *record) +ginRedoVacuumDataLeafPage(XLogReaderState *record) { - ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetData(record); + XLogRecPtr lsn = record->EndRecPtr; Buffer buffer; - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->node, xlrec->blkno, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); + Size len; + ginxlogVacuumDataLeafPage *xlrec; + + xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, &len); Assert(GinPageIsLeaf(page)); Assert(GinPageIsData(page)); @@ -605,30 +449,27 @@ ginRedoVacuumDataLeafPage(XLogRecPtr lsn, XLogRecord *record) } 
static void -ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record) +ginRedoDeletePage(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; ginxlogDeletePage *data = (ginxlogDeletePage *) XLogRecGetData(record); Buffer dbuffer; Buffer pbuffer; Buffer lbuffer; Page page; - if (XLogReadBufferForRedo(lsn, record, 0, data->node, data->blkno, &dbuffer) - == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &dbuffer) == BLK_NEEDS_REDO) { page = BufferGetPage(dbuffer); - Assert(GinPageIsData(page)); GinPageGetOpaque(page)->flags = GIN_DELETED; PageSetLSN(page, lsn); MarkBufferDirty(dbuffer); } - if (XLogReadBufferForRedo(lsn, record, 1, data->node, data->parentBlkno, - &pbuffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &pbuffer) == BLK_NEEDS_REDO) { page = BufferGetPage(pbuffer); - Assert(GinPageIsData(page)); Assert(!GinPageIsLeaf(page)); GinPageDeletePostingItem(page, data->parentOffset); @@ -636,11 +477,9 @@ ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record) MarkBufferDirty(pbuffer); } - if (XLogReadBufferForRedo(lsn, record, 2, data->node, data->leftBlkno, - &lbuffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 2, &lbuffer) == BLK_NEEDS_REDO) { page = BufferGetPage(lbuffer); - Assert(GinPageIsData(page)); GinPageGetOpaque(page)->rightlink = data->rightLink; PageSetLSN(page, lsn); @@ -656,8 +495,9 @@ ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record) } static void -ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) +ginRedoUpdateMetapage(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record); Buffer metabuffer; Page metapage; @@ -668,9 +508,8 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) * image, so restore the metapage unconditionally without looking at the * LSN, to avoid torn page hazards. 
*/ - metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); - if (!BufferIsValid(metabuffer)) - return; /* assume index was deleted, nothing to do */ + metabuffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); @@ -682,17 +521,18 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) /* * insert into tail page */ - if (XLogReadBufferForRedo(lsn, record, 0, data->node, - data->metadata.tail, &buffer) - == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); OffsetNumber off; int i; Size tupsize; + char *payload; IndexTuple tuples; + Size totaltupsize; - tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta)); + payload = XLogRecGetBlockData(record, 1, &totaltupsize); + tuples = (IndexTuple) payload; if (PageIsEmpty(page)) off = FirstOffsetNumber; @@ -711,6 +551,7 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) off++; } + Assert(payload + totaltupsize == (char *) tuples); /* * Increase counter of heap tuples @@ -728,8 +569,7 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) /* * New tail */ - if (XLogReadBufferForRedo(lsn, record, 0, data->node, data->prevTail, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); @@ -746,8 +586,9 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) } static void -ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) +ginRedoInsertListPage(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; ginxlogInsertListPage *data = (ginxlogInsertListPage *) XLogRecGetData(record); Buffer buffer; Page page; @@ -755,15 +596,12 @@ ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) off = FirstOffsetNumber; int i, tupsize; - IndexTuple tuples = 
(IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsertListPage)); + char *payload; + IndexTuple tuples; + Size totaltupsize; - /* - * Backup blocks are not used, we always re-initialize the page. - */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - - buffer = XLogReadBuffer(data->node, data->blkno, true); - Assert(BufferIsValid(buffer)); + /* We always re-initialize the page. */ + buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); GinInitBuffer(buffer, GIN_LIST); @@ -779,6 +617,9 @@ ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) GinPageGetOpaque(page)->maxoff = 0; } + payload = XLogRecGetBlockData(record, 0, &totaltupsize); + + tuples = (IndexTuple) payload; for (i = 0; i < data->ntuples; i++) { tupsize = IndexTupleSize(tuples); @@ -791,6 +632,7 @@ ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) tuples = (IndexTuple) (((char *) tuples) + tupsize); off++; } + Assert((char *) tuples == payload + totaltupsize); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -799,21 +641,20 @@ ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) } static void -ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) +ginRedoDeleteListPages(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; ginxlogDeleteListPages *data = (ginxlogDeleteListPages *) XLogRecGetData(record); Buffer metabuffer; Page metapage; int i; - /* Backup blocks are not used in delete_listpage records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - - metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); - if (!BufferIsValid(metabuffer)) - return; /* assume index was deleted, nothing to do */ + metabuffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); + GinInitPage(metapage, GIN_META, BufferGetPageSize(metabuffer)); + memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); PageSetLSN(metapage, lsn); 
MarkBufferDirty(metabuffer); @@ -838,7 +679,7 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; - buffer = XLogReadBuffer(data->node, data->toDelete[i], true); + buffer = XLogInitBufferForRedo(record, i + 1); page = BufferGetPage(buffer); GinInitBuffer(buffer, GIN_DELETED); @@ -851,9 +692,9 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) } void -gin_redo(XLogRecPtr lsn, XLogRecord *record) +gin_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; MemoryContext oldCtx; /* @@ -866,34 +707,34 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record) switch (info) { case XLOG_GIN_CREATE_INDEX: - ginRedoCreateIndex(lsn, record); + ginRedoCreateIndex(record); break; case XLOG_GIN_CREATE_PTREE: - ginRedoCreatePTree(lsn, record); + ginRedoCreatePTree(record); break; case XLOG_GIN_INSERT: - ginRedoInsert(lsn, record); + ginRedoInsert(record); break; case XLOG_GIN_SPLIT: - ginRedoSplit(lsn, record); + ginRedoSplit(record); break; case XLOG_GIN_VACUUM_PAGE: - ginRedoVacuumPage(lsn, record); + ginRedoVacuumPage(record); break; case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: - ginRedoVacuumDataLeafPage(lsn, record); + ginRedoVacuumDataLeafPage(record); break; case XLOG_GIN_DELETE_PAGE: - ginRedoDeletePage(lsn, record); + ginRedoDeletePage(record); break; case XLOG_GIN_UPDATE_META_PAGE: - ginRedoUpdateMetapage(lsn, record); + ginRedoUpdateMetapage(record); break; case XLOG_GIN_INSERT_LISTPAGE: - ginRedoInsertListPage(lsn, record); + ginRedoInsertListPage(record); break; case XLOG_GIN_DELETE_LISTPAGE: - ginRedoDeleteListPages(lsn, record); + ginRedoDeleteListPages(record); break; default: elog(PANIC, "gin_redo: unknown op code %u", info); diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 644b882b7d..2141045f99 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -16,6 +16,7 @@ #include "access/genam.h" 
#include "access/gist_private.h" +#include "access/xloginsert.h" #include "catalog/index.h" #include "catalog/pg_collation.h" #include "miscadmin.h" @@ -394,6 +395,14 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, GistPageSetNSN(ptr->page, oldnsn); } + /* + * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL + * insertion for that. NB: The number of pages and data segments + * specified here must match the calculations in gistXLogSplit()! + */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(npage, 1 + npage * 2); + START_CRIT_SECTION(); /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 2143096c66..5acc986585 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -183,14 +183,11 @@ gistbuild(PG_FUNCTION_ARGS) if (RelationNeedsWAL(index)) { XLogRecPtr recptr; - XLogRecData rdata; - rdata.data = (char *) &(index->rd_node); - rdata.len = sizeof(RelFileNode); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata); + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX); PageSetLSN(page, recptr); } else diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 2999d21191..0a4f04810f 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -18,18 +18,6 @@ #include "access/xlogutils.h" #include "utils/memutils.h" -typedef struct -{ - gistxlogPage *header; - IndexTuple *itup; -} NewPage; - -typedef struct -{ - gistxlogPageSplit *data; - NewPage *page; -} PageSplitRecord; - static MemoryContext opCtx; /* working memory for operations */ /* @@ -44,9 +32,9 @@ static MemoryContext opCtx; /* working memory for operations */ * action.) 
*/ static void -gistRedoClearFollowRight(XLogRecPtr lsn, XLogRecord *record, int block_index, - RelFileNode node, BlockNumber childblkno) +gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id) { + XLogRecPtr lsn = record->EndRecPtr; Buffer buffer; Page page; XLogRedoAction action; @@ -55,8 +43,7 @@ gistRedoClearFollowRight(XLogRecPtr lsn, XLogRecord *record, int block_index, * Note that we still update the page even if it was restored from a full * page image, because the updated NSN is not included in the image. */ - action = XLogReadBufferForRedo(lsn, record, block_index, node, childblkno, - &buffer); + action = XLogReadBufferForRedo(record, block_id, &buffer); if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) { page = BufferGetPage(buffer); @@ -75,20 +62,23 @@ gistRedoClearFollowRight(XLogRecPtr lsn, XLogRecord *record, int block_index, * redo any page update (except page split) */ static void -gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) +gistRedoPageUpdateRecord(XLogReaderState *record) { - char *begin = XLogRecGetData(record); - gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin; + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); Buffer buffer; Page page; - char *data; - if (XLogReadBufferForRedo(lsn, record, 0, xldata->node, xldata->blkno, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + char *begin; + char *data; + Size datalen; + int ninserted = 0; - data = begin + sizeof(gistxlogPageUpdate); + data = begin = XLogRecGetBlockData(record, 0, &datalen); + + page = (Page) BufferGetPage(buffer); /* Delete old tuples */ if (xldata->ntodelete > 0) @@ -105,12 +95,12 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) } /* add tuples */ - if (data - begin < record->xl_len) + if (data - begin < datalen) { OffsetNumber off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); - while (data - begin < record->xl_len) + while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); @@ -123,9 +113,12 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; + ninserted++; } } + Assert(ninserted == xldata->ntoinsert); + PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -137,58 +130,51 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) * that even if the target page no longer exists, we still attempt to * replay the change on the child page. */ - if (BlockNumberIsValid(xldata->leftchild)) - gistRedoClearFollowRight(lsn, record, 1, - xldata->node, xldata->leftchild); + if (XLogRecHasBlockRef(record, 1)) + gistRedoClearFollowRight(record, 1); if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } -static void -decodePageSplitRecord(PageSplitRecord *decoded, XLogRecord *record) +/* + * Returns an array of index pointers. 
+ */ +static IndexTuple * +decodePageSplitRecord(char *begin, int len, int *n) { - char *begin = XLogRecGetData(record), - *ptr; - int j, - i = 0; + char *ptr; + int i = 0; + IndexTuple *tuples; - decoded->data = (gistxlogPageSplit *) begin; - decoded->page = (NewPage *) palloc(sizeof(NewPage) * decoded->data->npage); + /* extract the number of tuples */ + memcpy(n, begin, sizeof(int)); + ptr = begin + sizeof(int); - ptr = begin + sizeof(gistxlogPageSplit); - for (i = 0; i < decoded->data->npage; i++) + tuples = palloc(*n * sizeof(IndexTuple)); + + for (i = 0; i < *n; i++) { - Assert(ptr - begin < record->xl_len); - decoded->page[i].header = (gistxlogPage *) ptr; - ptr += sizeof(gistxlogPage); - - decoded->page[i].itup = (IndexTuple *) - palloc(sizeof(IndexTuple) * decoded->page[i].header->num); - j = 0; - while (j < decoded->page[i].header->num) - { - Assert(ptr - begin < record->xl_len); - decoded->page[i].itup[j] = (IndexTuple) ptr; - ptr += IndexTupleSize((IndexTuple) ptr); - j++; - } + Assert(ptr - begin < len); + tuples[i] = (IndexTuple) ptr; + ptr += IndexTupleSize((IndexTuple) ptr); } + Assert(ptr - begin == len); + + return tuples; } static void -gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) +gistRedoPageSplitRecord(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); - PageSplitRecord xlrec; Buffer firstbuffer = InvalidBuffer; Buffer buffer; Page page; int i; bool isrootsplit = false; - decodePageSplitRecord(&xlrec, record); - /* * We must hold lock on the first-listed page throughout the action, * including while updating the left child page (if any). 
We can unlock @@ -198,32 +184,39 @@ gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) */ /* loop around all pages */ - for (i = 0; i < xlrec.data->npage; i++) + for (i = 0; i < xldata->npage; i++) { - NewPage *newpage = xlrec.page + i; int flags; + char *data; + Size datalen; + int num; + BlockNumber blkno; + IndexTuple *tuples; - if (newpage->header->blkno == GIST_ROOT_BLKNO) + XLogRecGetBlockTag(record, i + 1, NULL, NULL, &blkno); + if (blkno == GIST_ROOT_BLKNO) { Assert(i == 0); isrootsplit = true; } - buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, i + 1); page = (Page) BufferGetPage(buffer); + data = XLogRecGetBlockData(record, i + 1, &datalen); + + tuples = decodePageSplitRecord(data, datalen, &num); /* ok, clear buffer */ - if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO) + if (xldata->origleaf && blkno != GIST_ROOT_BLKNO) flags = F_LEAF; else flags = 0; GISTInitBuffer(buffer, flags); /* and fill it */ - gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber); + gistfillbuffer(page, tuples, num, FirstOffsetNumber); - if (newpage->header->blkno == GIST_ROOT_BLKNO) + if (blkno == GIST_ROOT_BLKNO) { GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; GistPageSetNSN(page, xldata->orignsn); @@ -231,12 +224,17 @@ gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) } else { - if (i < xlrec.data->npage - 1) - GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno; + if (i < xldata->npage - 1) + { + BlockNumber nextblkno; + + XLogRecGetBlockTag(record, i + 2, NULL, NULL, &nextblkno); + GistPageGetOpaque(page)->rightlink = nextblkno; + } else GistPageGetOpaque(page)->rightlink = xldata->origrlink; GistPageSetNSN(page, xldata->orignsn); - if (i < xlrec.data->npage - 1 && !isrootsplit && + if (i < xldata->npage - 1 && !isrootsplit && xldata->markfollowright) GistMarkFollowRight(page); else @@ 
-253,26 +251,22 @@ gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) } /* Fix follow-right data on left child page, if any */ - if (BlockNumberIsValid(xldata->leftchild)) - gistRedoClearFollowRight(lsn, record, 0, - xldata->node, xldata->leftchild); + if (XLogRecHasBlockRef(record, 0)) + gistRedoClearFollowRight(record, 0); /* Finally, release lock on the first page */ UnlockReleaseBuffer(firstbuffer); } static void -gistRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) +gistRedoCreateIndex(XLogReaderState *record) { - RelFileNode *node = (RelFileNode *) XLogRecGetData(record); + XLogRecPtr lsn = record->EndRecPtr; Buffer buffer; Page page; - /* Backup blocks are not used in create_index records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - - buffer = XLogReadBuffer(*node, GIST_ROOT_BLKNO, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); page = (Page) BufferGetPage(buffer); GISTInitBuffer(buffer, F_LEAF); @@ -284,9 +278,9 @@ gistRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) } void -gist_redo(XLogRecPtr lsn, XLogRecord *record) +gist_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; MemoryContext oldCxt; /* @@ -299,13 +293,13 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) switch (info) { case XLOG_GIST_PAGE_UPDATE: - gistRedoPageUpdateRecord(lsn, record); + gistRedoPageUpdateRecord(record); break; case XLOG_GIST_PAGE_SPLIT: - gistRedoPageSplitRecord(lsn, record); + gistRedoPageSplitRecord(record); break; case XLOG_GIST_CREATE_INDEX: - gistRedoCreateIndex(lsn, record); + gistRedoCreateIndex(record); break; default: elog(PANIC, "gist_redo: unknown op code %u", info); @@ -336,70 +330,49 @@ gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, BlockNumber origrlink, GistNSN orignsn, Buffer leftchildbuf, bool markfollowright) { - XLogRecData 
rdata[GIST_MAX_SPLIT_PAGES * 2 + 2]; gistxlogPageSplit xlrec; SplitedPageLayout *ptr; - int npage = 0, - cur; + int npage = 0; XLogRecPtr recptr; + int i; for (ptr = dist; ptr; ptr = ptr->next) npage++; - /* - * the caller should've checked this already, but doesn't hurt to check - * again. - */ - if (npage > GIST_MAX_SPLIT_PAGES) - elog(ERROR, "GiST page split into too many halves"); - - xlrec.node = node; - xlrec.origblkno = blkno; xlrec.origrlink = origrlink; xlrec.orignsn = orignsn; xlrec.origleaf = page_is_leaf; xlrec.npage = (uint16) npage; - xlrec.leftchild = - BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber; xlrec.markfollowright = markfollowright; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(gistxlogPageSplit); - rdata[0].buffer = InvalidBuffer; - - cur = 1; + XLogBeginInsert(); /* * Include a full page image of the child buf. (only necessary if a * checkpoint happened since the child page was split) */ if (BufferIsValid(leftchildbuf)) - { - rdata[cur - 1].next = &(rdata[cur]); - rdata[cur].data = NULL; - rdata[cur].len = 0; - rdata[cur].buffer = leftchildbuf; - rdata[cur].buffer_std = true; - cur++; - } + XLogRegisterBuffer(0, leftchildbuf, REGBUF_STANDARD); + /* + * NOTE: We register a lot of data. The caller must've called + * XLogEnsureRecordSpace() to prepare for that. We cannot do it here, + * because we're already in a critical section. If you change the number + * of buffer or data registrations here, make sure you modify the + * XLogEnsureRecordSpace() calls accordingly! 
+ */ + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageSplit)); + + i = 1; for (ptr = dist; ptr; ptr = ptr->next) { - rdata[cur - 1].next = &(rdata[cur]); - rdata[cur].buffer = InvalidBuffer; - rdata[cur].data = (char *) &(ptr->block); - rdata[cur].len = sizeof(gistxlogPage); - cur++; - - rdata[cur - 1].next = &(rdata[cur]); - rdata[cur].buffer = InvalidBuffer; - rdata[cur].data = (char *) (ptr->list); - rdata[cur].len = ptr->lenlist; - cur++; + XLogRegisterBuffer(i, ptr->buffer, REGBUF_WILL_INIT); + XLogRegisterBufData(i, (char *) &(ptr->block.num), sizeof(int)); + XLogRegisterBufData(i, (char *) ptr->list, ptr->lenlist); + i++; } - rdata[cur - 1].next = NULL; - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata); + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT); return recptr; } @@ -413,9 +386,7 @@ gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, * * Note that both the todelete array and the tuples are marked as belonging * to the target buffer; they need not be stored in XLOG if XLogInsert decides - * to log the whole buffer contents instead. Also, we take care that there's - * at least one rdata item referencing the buffer, even when ntodelete and - * ituplen are both zero; this ensures that XLogInsert knows about the buffer. + * to log the whole buffer contents instead. */ XLogRecPtr gistXLogUpdate(RelFileNode node, Buffer buffer, @@ -423,57 +394,31 @@ gistXLogUpdate(RelFileNode node, Buffer buffer, IndexTuple *itup, int ituplen, Buffer leftchildbuf) { - XLogRecData rdata[MaxIndexTuplesPerPage + 3]; gistxlogPageUpdate xlrec; - int cur, - i; + int i; XLogRecPtr recptr; - xlrec.node = node; - xlrec.blkno = BufferGetBlockNumber(buffer); xlrec.ntodelete = ntodelete; - xlrec.leftchild = - BufferIsValid(leftchildbuf) ? 
BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber; + xlrec.ntoinsert = ituplen; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(gistxlogPageUpdate); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate)); - rdata[1].data = (char *) todelete; - rdata[1].len = sizeof(OffsetNumber) * ntodelete; - rdata[1].buffer = buffer; - rdata[1].buffer_std = true; - - cur = 2; + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete); /* new tuples */ for (i = 0; i < ituplen; i++) - { - rdata[cur - 1].next = &(rdata[cur]); - rdata[cur].data = (char *) (itup[i]); - rdata[cur].len = IndexTupleSize(itup[i]); - rdata[cur].buffer = buffer; - rdata[cur].buffer_std = true; - cur++; - } + XLogRegisterBufData(0, (char *) (itup[i]), IndexTupleSize(itup[i])); /* * Include a full page image of the child buf. (only necessary if a * checkpoint happened since the child page was split) */ if (BufferIsValid(leftchildbuf)) - { - rdata[cur - 1].next = &(rdata[cur]); - rdata[cur].data = NULL; - rdata[cur].len = 0; - rdata[cur].buffer = leftchildbuf; - rdata[cur].buffer_std = true; - cur++; - } - rdata[cur - 1].next = NULL; + XLogRegisterBuffer(1, leftchildbuf, REGBUF_STANDARD); - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE); return recptr; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 925a58f4f6..673459fd6c 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -700,7 +700,7 @@ hashvacuumcleanup(PG_FUNCTION_ARGS) void -hash_redo(XLogRecPtr lsn, XLogRecord *record) +hash_redo(XLogReaderState *record) { elog(PANIC, "hash_redo: unimplemented"); } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 1763b70631..c6e1eb79b2 100644 --- 
a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2132,71 +2132,17 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, xl_heap_insert xlrec; xl_heap_header xlhdr; XLogRecPtr recptr; - XLogRecData rdata[4]; Page page = BufferGetPage(buffer); uint8 info = XLOG_HEAP_INSERT; - bool need_tuple_data; + int bufflags = 0; /* - * For logical decoding, we need the tuple even if we're doing a full - * page write, so make sure to log it separately. (XXX We could - * alternatively store a pointer into the FPW). - * - * Also, if this is a catalog, we need to transmit combocids to - * properly decode, so log that as well. + * If this is a catalog, we need to transmit combocids to properly + * decode, so log that as well. */ - need_tuple_data = RelationIsLogicallyLogged(relation); if (RelationIsAccessibleInLogicalDecoding(relation)) log_heap_new_cid(relation, heaptup); - xlrec.flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0; - xlrec.target.node = relation->rd_node; - xlrec.target.tid = heaptup->t_self; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapInsert; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - - xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; - xlhdr.t_infomask = heaptup->t_data->t_infomask; - xlhdr.t_hoff = heaptup->t_data->t_hoff; - - /* - * note we mark rdata[1] as belonging to buffer; if XLogInsert decides - * to write the whole page to the xlog, we don't need to store - * xl_heap_header in the xlog. - */ - rdata[1].data = (char *) &xlhdr; - rdata[1].len = SizeOfHeapHeader; - rdata[1].buffer = need_tuple_data ? InvalidBuffer : buffer; - rdata[1].buffer_std = true; - rdata[1].next = &(rdata[2]); - - /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ - rdata[2].data = (char *) heaptup->t_data + offsetof(HeapTupleHeaderData, t_bits); - rdata[2].len = heaptup->t_len - offsetof(HeapTupleHeaderData, t_bits); - rdata[2].buffer = need_tuple_data ? 
InvalidBuffer : buffer; - rdata[2].buffer_std = true; - rdata[2].next = NULL; - - /* - * Make a separate rdata entry for the tuple's buffer if we're doing - * logical decoding, so that an eventual FPW doesn't remove the - * tuple's data. - */ - if (need_tuple_data) - { - rdata[2].next = &(rdata[3]); - - rdata[3].data = NULL; - rdata[3].len = 0; - rdata[3].buffer = buffer; - rdata[3].buffer_std = true; - rdata[3].next = NULL; - - xlrec.flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE; - } - /* * If this is the single and first tuple on page, we can reinit the * page instead of restoring the whole thing. Set flag, and hide @@ -2206,10 +2152,44 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, PageGetMaxOffsetNumber(page) == FirstOffsetNumber) { info |= XLOG_HEAP_INIT_PAGE; - rdata[1].buffer = rdata[2].buffer = rdata[3].buffer = InvalidBuffer; + bufflags |= REGBUF_WILL_INIT; } - recptr = XLogInsert(RM_HEAP_ID, info, rdata); + xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self); + xlrec.flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0; + Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer)); + + /* + * For logical decoding, we need the tuple even if we're doing a full + * page write, so make sure it's included even if we take a full-page + * image. (XXX We could alternatively store a pointer into the FPW). + */ + if (RelationIsLogicallyLogged(relation)) + { + xlrec.flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE; + bufflags |= REGBUF_KEEP_DATA; + } + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapInsert); + + xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; + xlhdr.t_infomask = heaptup->t_data->t_infomask; + xlhdr.t_hoff = heaptup->t_data->t_hoff; + + /* + * note we mark xlhdr as belonging to buffer; if XLogInsert decides to + * write the whole page to the xlog, we don't need to store + * xl_heap_header in the xlog. 
+ */ + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ + XLogRegisterBufData(0, + (char *) heaptup->t_data + offsetof(HeapTupleHeaderData, t_bits), + heaptup->t_len - offsetof(HeapTupleHeaderData, t_bits)); + + recptr = XLogInsert(RM_HEAP_ID, info); PageSetLSN(page, recptr); } @@ -2397,6 +2377,13 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, break; RelationPutHeapTuple(relation, buffer, heaptup); + + /* + * We don't use heap_multi_insert for catalog tuples yet, but + * better be prepared... + */ + if (needwal && need_cids) + log_heap_new_cid(relation, heaptup); } if (PageIsAllVisible(page)) @@ -2419,12 +2406,12 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, { XLogRecPtr recptr; xl_heap_multi_insert *xlrec; - XLogRecData rdata[3]; uint8 info = XLOG_HEAP2_MULTI_INSERT; char *tupledata; int totaldatalen; char *scratchptr = scratch; bool init; + int bufflags = 0; /* * If the page was previously empty, we can reinit the page @@ -2450,8 +2437,6 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, tupledata = scratchptr; xlrec->flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0; - xlrec->node = relation->rd_node; - xlrec->blkno = BufferGetBlockNumber(buffer); xlrec->ntuples = nthispage; /* @@ -2481,54 +2466,12 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, datalen); tuphdr->datalen = datalen; scratchptr += datalen; - - /* - * We don't use heap_multi_insert for catalog tuples yet, but - * better be prepared... 
- */ - if (need_cids) - log_heap_new_cid(relation, heaptup); } totaldatalen = scratchptr - tupledata; Assert((scratchptr - scratch) < BLCKSZ); - rdata[0].data = (char *) xlrec; - rdata[0].len = tupledata - scratch; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &rdata[1]; - - rdata[1].data = tupledata; - rdata[1].len = totaldatalen; - rdata[1].buffer = need_tuple_data ? InvalidBuffer : buffer; - rdata[1].buffer_std = true; - rdata[1].next = NULL; - - /* - * Make a separate rdata entry for the tuple's buffer if we're - * doing logical decoding, so that an eventual FPW doesn't remove - * the tuple's data. - */ if (need_tuple_data) - { - rdata[1].next = &(rdata[2]); - - rdata[2].data = NULL; - rdata[2].len = 0; - rdata[2].buffer = buffer; - rdata[2].buffer_std = true; - rdata[2].next = NULL; xlrec->flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE; - } - - /* - * If we're going to reinitialize the whole page using the WAL - * record, hide buffer reference from XLogInsert. - */ - if (init) - { - rdata[1].buffer = rdata[2].buffer = InvalidBuffer; - info |= XLOG_HEAP_INIT_PAGE; - } /* * Signal that this is the last xl_heap_multi_insert record @@ -2538,7 +2481,25 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, if (ndone + nthispage == ntuples) xlrec->flags |= XLOG_HEAP_LAST_MULTI_INSERT; - recptr = XLogInsert(RM_HEAP2_ID, info, rdata); + if (init) + { + info |= XLOG_HEAP_INIT_PAGE; + bufflags |= REGBUF_WILL_INIT; + } + + /* + * If we're doing logical decoding, include the new tuple data + * even if we take a full-page image of the page. 
+ */ + if (need_tuple_data) + bufflags |= REGBUF_KEEP_DATA; + + XLogBeginInsert(); + XLogRegisterData((char *) xlrec, tupledata - scratch); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); + + XLogRegisterBufData(0, tupledata, totaldatalen); + recptr = XLogInsert(RM_HEAP2_ID, info); PageSetLSN(page, recptr); } @@ -2909,7 +2870,6 @@ l1: { xl_heap_delete xlrec; XLogRecPtr recptr; - XLogRecData rdata[4]; /* For logical decode we need combocids to properly decode the catalog */ if (RelationIsAccessibleInLogicalDecoding(relation)) @@ -2918,19 +2878,21 @@ l1: xlrec.flags = all_visible_cleared ? XLOG_HEAP_ALL_VISIBLE_CLEARED : 0; xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, tp.t_data->t_infomask2); - xlrec.target.node = relation->rd_node; - xlrec.target.tid = tp.t_self; + xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapDelete; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].buffer = buffer; - rdata[1].buffer_std = true; - rdata[1].next = NULL; + if (old_key_tuple != NULL) + { + if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_TUPLE; + else + xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_KEY; + } + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); /* * Log replica identity of the deleted tuple if there is one @@ -2943,27 +2905,14 @@ l1: xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; - rdata[1].next = &(rdata[2]); - rdata[2].data = (char *) &xlhdr; - rdata[2].len = SizeOfHeapHeader; - rdata[2].buffer = InvalidBuffer; - rdata[2].next = NULL; - - rdata[2].next = &(rdata[3]); - rdata[3].data = (char *) old_key_tuple->t_data - + offsetof(HeapTupleHeaderData, t_bits); - rdata[3].len = old_key_tuple->t_len - - 
offsetof(HeapTupleHeaderData, t_bits); - rdata[3].buffer = InvalidBuffer; - rdata[3].next = NULL; - - if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) - xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_TUPLE; - else - xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_KEY; + XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterData((char *) old_key_tuple->t_data + + offsetof(HeapTupleHeaderData, t_bits), + old_key_tuple->t_len + - offsetof(HeapTupleHeaderData, t_bits)); } - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE, rdata); + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); PageSetLSN(page, recptr); } @@ -4735,25 +4684,17 @@ failed: { xl_heap_lock xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; - xlrec.target.node = relation->rd_node; - xlrec.target.tid = tuple->t_self; + XLogBeginInsert(); + XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); xlrec.locking_xid = xid; xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapLock; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogRegisterData((char *) &xlrec, SizeOfHeapLock); - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].buffer = *buffer; - rdata[1].buffer_std = true; - rdata[1].next = NULL; - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK, rdata); + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); PageSetLSN(page, recptr); } @@ -5342,26 +5283,18 @@ l4: { xl_heap_lock_updated xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; Page page = BufferGetPage(buf); - xlrec.target.node = rel->rd_node; - xlrec.target.tid = mytup.t_self; + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + + xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self); xlrec.xmax = new_xmax; xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2); - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapLockUpdated; - 
rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated); - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].buffer = buf; - rdata[1].buffer_std = true; - rdata[1].next = NULL; - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED, rdata); + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED); PageSetLSN(page, recptr); } @@ -5489,23 +5422,16 @@ heap_inplace_update(Relation relation, HeapTuple tuple) { xl_heap_inplace xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; - xlrec.target.node = relation->rd_node; - xlrec.target.tid = tuple->t_self; + xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapInplace; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapInplace); - rdata[1].data = (char *) htup + htup->t_hoff; - rdata[1].len = newlen; - rdata[1].buffer = buffer; - rdata[1].buffer_std = true; - rdata[1].next = NULL; + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE, rdata); + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE); PageSetLSN(page, recptr); } @@ -6507,17 +6433,14 @@ log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid) { xl_heap_cleanup_info xlrec; XLogRecPtr recptr; - XLogRecData rdata; xlrec.node = rnode; xlrec.latestRemovedXid = latestRemovedXid; - rdata.data = (char *) &xlrec; - rdata.len = SizeOfHeapCleanupInfo; - rdata.buffer = InvalidBuffer; - rdata.next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo); - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata); + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO); return recptr; } @@ -6542,23 +6465,19 @@ log_heap_clean(Relation reln, Buffer buffer, TransactionId latestRemovedXid) 
{ xl_heap_clean xlrec; - uint8 info; XLogRecPtr recptr; - XLogRecData rdata[4]; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); - xlrec.node = reln->rd_node; - xlrec.block = BufferGetBlockNumber(buffer); xlrec.latestRemovedXid = latestRemovedXid; xlrec.nredirected = nredirected; xlrec.ndead = ndead; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapClean; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapClean); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); /* * The OffsetNumber arrays are not actually in the buffer, but we pretend @@ -6569,49 +6488,18 @@ log_heap_clean(Relation reln, Buffer buffer, * even if no item pointers changed state. */ if (nredirected > 0) - { - rdata[1].data = (char *) redirected; - rdata[1].len = nredirected * sizeof(OffsetNumber) * 2; - } - else - { - rdata[1].data = NULL; - rdata[1].len = 0; - } - rdata[1].buffer = buffer; - rdata[1].buffer_std = true; - rdata[1].next = &(rdata[2]); + XLogRegisterBufData(0, (char *) redirected, + nredirected * sizeof(OffsetNumber) * 2); if (ndead > 0) - { - rdata[2].data = (char *) nowdead; - rdata[2].len = ndead * sizeof(OffsetNumber); - } - else - { - rdata[2].data = NULL; - rdata[2].len = 0; - } - rdata[2].buffer = buffer; - rdata[2].buffer_std = true; - rdata[2].next = &(rdata[3]); + XLogRegisterBufData(0, (char *) nowdead, + ndead * sizeof(OffsetNumber)); if (nunused > 0) - { - rdata[3].data = (char *) nowunused; - rdata[3].len = nunused * sizeof(OffsetNumber); - } - else - { - rdata[3].data = NULL; - rdata[3].len = 0; - } - rdata[3].buffer = buffer; - rdata[3].buffer_std = true; - rdata[3].next = NULL; + XLogRegisterBufData(0, (char *) nowunused, + nunused * sizeof(OffsetNumber)); - info = XLOG_HEAP2_CLEAN; - recptr = XLogInsert(RM_HEAP2_ID, info, rdata); + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEAN); return recptr; } @@ -6626,35 +6514,28 @@ 
log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, { xl_heap_freeze_page xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); /* nor when there are no tuples to freeze */ Assert(ntuples > 0); - xlrec.node = reln->rd_node; - xlrec.block = BufferGetBlockNumber(buffer); xlrec.cutoff_xid = cutoff_xid; xlrec.ntuples = ntuples; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapFreezePage; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage); /* * The freeze plan array is not actually in the buffer, but pretend that * it is. When XLogInsert stores the whole buffer, the freeze plan need * not be stored too. */ - rdata[1].data = (char *) tuples; - rdata[1].len = ntuples * sizeof(xl_heap_freeze_tuple); - rdata[1].buffer = buffer; - rdata[1].buffer_std = true; - rdata[1].next = NULL; + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) tuples, + ntuples * sizeof(xl_heap_freeze_tuple)); - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE, rdata); + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE); return recptr; } @@ -6665,8 +6546,8 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, * corresponding visibility map block. Both should have already been modified * and dirtied. * - * If checksums are enabled, we also add the heap_buffer to the chain to - * protect it from being torn. + * If checksums are enabled, we also generate a full-page image of + * heap_buffer, if necessary. 
*/ XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, @@ -6674,38 +6555,23 @@ log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer, { xl_heap_visible xlrec; XLogRecPtr recptr; - XLogRecData rdata[3]; + uint8 flags; Assert(BufferIsValid(heap_buffer)); Assert(BufferIsValid(vm_buffer)); - xlrec.node = rnode; - xlrec.block = BufferGetBlockNumber(heap_buffer); xlrec.cutoff_xid = cutoff_xid; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapVisible); - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapVisible; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogRegisterBuffer(0, vm_buffer, 0); - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].buffer = vm_buffer; - rdata[1].buffer_std = false; - rdata[1].next = NULL; + flags = REGBUF_STANDARD; + if (!XLogHintBitIsNeeded()) + flags |= REGBUF_NO_IMAGE; + XLogRegisterBuffer(1, heap_buffer, flags); - if (XLogHintBitIsNeeded()) - { - rdata[1].next = &(rdata[2]); - - rdata[2].data = NULL; - rdata[2].len = 0; - rdata[2].buffer = heap_buffer; - rdata[2].buffer_std = true; - rdata[2].next = NULL; - } - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE, rdata); + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE); return recptr; } @@ -6721,22 +6587,23 @@ log_heap_update(Relation reln, Buffer oldbuf, bool all_visible_cleared, bool new_all_visible_cleared) { xl_heap_update xlrec; - xl_heap_header_len xlhdr; - xl_heap_header_len xlhdr_idx; + xl_heap_header xlhdr; + xl_heap_header xlhdr_idx; uint8 info; uint16 prefix_suffix[2]; uint16 prefixlen = 0, suffixlen = 0; XLogRecPtr recptr; - XLogRecData rdata[9]; Page page = BufferGetPage(newbuf); bool need_tuple_data = RelationIsLogicallyLogged(reln); - int nr; - Buffer newbufref; + bool init; + int bufflags; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); + XLogBeginInsert(); + if (HeapTupleIsHeapOnly(newtup)) info = 
XLOG_HEAP_HOT_UPDATE; else @@ -6794,103 +6661,97 @@ log_heap_update(Relation reln, Buffer oldbuf, suffixlen = 0; } - xlrec.target.node = reln->rd_node; - xlrec.target.tid = oldtup->t_self; - xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); - xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, - oldtup->t_data->t_infomask2); - xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); + /* Prepare main WAL data chain */ xlrec.flags = 0; if (all_visible_cleared) xlrec.flags |= XLOG_HEAP_ALL_VISIBLE_CLEARED; - xlrec.newtid = newtup->t_self; if (new_all_visible_cleared) xlrec.flags |= XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED; if (prefixlen > 0) xlrec.flags |= XLOG_HEAP_PREFIX_FROM_OLD; if (suffixlen > 0) xlrec.flags |= XLOG_HEAP_SUFFIX_FROM_OLD; + if (need_tuple_data) + { + xlrec.flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE; + if (old_key_tuple) + { + if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_TUPLE; + else + xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_KEY; + } + } /* If new tuple is the single and first tuple on page... 
*/ if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && PageGetMaxOffsetNumber(page) == FirstOffsetNumber) { info |= XLOG_HEAP_INIT_PAGE; - newbufref = InvalidBuffer; + init = true; } else - newbufref = newbuf; + init = false; - rdata[0].data = NULL; - rdata[0].len = 0; - rdata[0].buffer = oldbuf; - rdata[0].buffer_std = true; - rdata[0].next = &(rdata[1]); + /* Prepare WAL data for the old page */ + xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self); + xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); + xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, + oldtup->t_data->t_infomask2); - rdata[1].data = (char *) &xlrec; - rdata[1].len = SizeOfHeapUpdate; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = &(rdata[2]); + /* Prepare WAL data for the new page */ + xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); + xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); - /* prefix and/or suffix length fields */ + bufflags = REGBUF_STANDARD; + if (init) + bufflags |= REGBUF_WILL_INIT; + if (need_tuple_data) + bufflags |= REGBUF_KEEP_DATA; + + XLogRegisterBuffer(0, newbuf, bufflags); + if (oldbuf != newbuf) + XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); + + XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate); + + /* + * Prepare WAL data for the new tuple. 
+ */ if (prefixlen > 0 || suffixlen > 0) { if (prefixlen > 0 && suffixlen > 0) { prefix_suffix[0] = prefixlen; prefix_suffix[1] = suffixlen; - rdata[2].data = (char *) &prefix_suffix; - rdata[2].len = 2 * sizeof(uint16); + XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2); } else if (prefixlen > 0) { - rdata[2].data = (char *) &prefixlen; - rdata[2].len = sizeof(uint16); + XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16)); } else { - rdata[2].data = (char *) &suffixlen; - rdata[2].len = sizeof(uint16); + XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16)); } - rdata[2].buffer = newbufref; - rdata[2].buffer_std = true; - rdata[2].next = &(rdata[3]); - nr = 3; } - else - nr = 2; - xlhdr.header.t_infomask2 = newtup->t_data->t_infomask2; - xlhdr.header.t_infomask = newtup->t_data->t_infomask; - xlhdr.header.t_hoff = newtup->t_data->t_hoff; - Assert(offsetof(HeapTupleHeaderData, t_bits) +prefixlen + suffixlen <= newtup->t_len); - xlhdr.t_len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits) -prefixlen - suffixlen; - - /* - * As with insert records, we need not store this rdata segment if we - * decide to store the whole buffer instead, unless we're doing logical - * decoding. - */ - rdata[nr].data = (char *) &xlhdr; - rdata[nr].len = SizeOfHeapHeaderLen; - rdata[nr].buffer = need_tuple_data ? InvalidBuffer : newbufref; - rdata[nr].buffer_std = true; - rdata[nr].next = &(rdata[nr + 1]); - nr++; + xlhdr.t_infomask2 = newtup->t_data->t_infomask2; + xlhdr.t_infomask = newtup->t_data->t_infomask; + xlhdr.t_hoff = newtup->t_data->t_hoff; + Assert(offsetof(HeapTupleHeaderData, t_bits) + prefixlen + suffixlen <= newtup->t_len); /* * PG73FORMAT: write bitmap [+ padding] [+ oid] + data * * The 'data' doesn't include the common prefix or suffix. 
*/ + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); if (prefixlen == 0) { - rdata[nr].data = ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits); - rdata[nr].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits) -suffixlen; - rdata[nr].buffer = need_tuple_data ? InvalidBuffer : newbufref; - rdata[nr].buffer_std = true; - rdata[nr].next = NULL; - nr++; + XLogRegisterBufData(0, + ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits), + newtup->t_len - offsetof(HeapTupleHeaderData, t_bits) -suffixlen); } else { @@ -6901,75 +6762,33 @@ log_heap_update(Relation reln, Buffer oldbuf, /* bitmap [+ padding] [+ oid] */ if (newtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits) >0) { - rdata[nr - 1].next = &(rdata[nr]); - rdata[nr].data = ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits); - rdata[nr].len = newtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits); - rdata[nr].buffer = need_tuple_data ? InvalidBuffer : newbufref; - rdata[nr].buffer_std = true; - rdata[nr].next = NULL; - nr++; + XLogRegisterBufData(0, + ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits), + newtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits)); } /* data after common prefix */ - rdata[nr - 1].next = &(rdata[nr]); - rdata[nr].data = ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen; - rdata[nr].len = newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen; - rdata[nr].buffer = need_tuple_data ? InvalidBuffer : newbufref; - rdata[nr].buffer_std = true; - rdata[nr].next = NULL; - nr++; + XLogRegisterBufData(0, + ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen, + newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen); } - /* - * Separate storage for the FPW buffer reference of the new page in the - * wal_level >= logical case. 
- */ - if (need_tuple_data) + /* We need to log a tuple identity */ + if (need_tuple_data && old_key_tuple) { - rdata[nr - 1].next = &(rdata[nr]); + /* don't really need this, but its more comfy to decode */ + xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; + xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; - rdata[nr].data = NULL, - rdata[nr].len = 0; - rdata[nr].buffer = newbufref; - rdata[nr].buffer_std = true; - rdata[nr].next = NULL; - nr++; + XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); - xlrec.flags |= XLOG_HEAP_CONTAINS_NEW_TUPLE; - - /* We need to log a tuple identity */ - if (old_key_tuple) - { - /* don't really need this, but its more comfy to decode */ - xlhdr_idx.header.t_infomask2 = old_key_tuple->t_data->t_infomask2; - xlhdr_idx.header.t_infomask = old_key_tuple->t_data->t_infomask; - xlhdr_idx.header.t_hoff = old_key_tuple->t_data->t_hoff; - xlhdr_idx.t_len = old_key_tuple->t_len; - - rdata[nr - 1].next = &(rdata[nr]); - rdata[nr].data = (char *) &xlhdr_idx; - rdata[nr].len = SizeOfHeapHeaderLen; - rdata[nr].buffer = InvalidBuffer; - rdata[nr].next = &(rdata[nr + 1]); - nr++; - - /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ - rdata[nr].data = (char *) old_key_tuple->t_data - + offsetof(HeapTupleHeaderData, t_bits); - rdata[nr].len = old_key_tuple->t_len - - offsetof(HeapTupleHeaderData, t_bits); - rdata[nr].buffer = InvalidBuffer; - rdata[nr].next = NULL; - nr++; - - if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL) - xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_TUPLE; - else - xlrec.flags |= XLOG_HEAP_CONTAINS_OLD_KEY; - } + /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ + XLogRegisterData((char *) old_key_tuple->t_data + offsetof(HeapTupleHeaderData, t_bits), + old_key_tuple->t_len - offsetof(HeapTupleHeaderData, t_bits)); } - recptr = XLogInsert(RM_HEAP_ID, info, rdata); + recptr = XLogInsert(RM_HEAP_ID, info); return recptr; } @@ -6986,15 
+6805,14 @@ log_heap_new_cid(Relation relation, HeapTuple tup) xl_heap_new_cid xlrec; XLogRecPtr recptr; - XLogRecData rdata[1]; HeapTupleHeader hdr = tup->t_data; Assert(ItemPointerIsValid(&tup->t_self)); Assert(tup->t_tableOid != InvalidOid); xlrec.top_xid = GetTopTransactionId(); - xlrec.target.node = relation->rd_node; - xlrec.target.tid = tup->t_self; + xlrec.target_node = relation->rd_node; + xlrec.target_tid = tup->t_self; /* * If the tuple got inserted & deleted in the same TX we definitely have a @@ -7035,12 +6853,15 @@ log_heap_new_cid(Relation relation, HeapTuple tup) xlrec.combocid = InvalidCommandId; } - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfHeapNewCid; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = NULL; + /* + * Note that we don't need to register the buffer here, because this + * operation does not modify the page. The insert/update/delete that + * called us certainly did, but that's WAL-logged separately. + */ + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid); - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID, rdata); + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID); return recptr; } @@ -7165,7 +6986,7 @@ ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed, bool * * Handles CLEANUP_INFO */ static void -heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_cleanup_info(XLogReaderState *record) { xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record); @@ -7179,15 +7000,16 @@ heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record) */ /* Backup blocks are not used in cleanup_info records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); } /* * Handles HEAP2_CLEAN record type */ static void -heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_clean(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record); 
Buffer buffer; Size freespace = 0; @@ -7195,8 +7017,7 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) BlockNumber blkno; XLogRedoAction action; - rnode = xlrec->node; - blkno = xlrec->block; + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); /* * We're about to remove tuples. In Hot Standby mode, ensure that there's @@ -7213,9 +7034,8 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) * If we have a full-page image, restore it (using a cleanup lock) and * we're done. */ - action = XLogReadBufferForRedoExtended(lsn, record, 0, - rnode, MAIN_FORKNUM, blkno, - RBM_NORMAL, true, &buffer); + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, + &buffer); if (action == BLK_NEEDS_REDO) { Page page = (Page) BufferGetPage(buffer); @@ -7226,11 +7046,13 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) int nredirected; int ndead; int nunused; + Size datalen; + + redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); nredirected = xlrec->nredirected; ndead = xlrec->ndead; - end = (OffsetNumber *) ((char *) xlrec + record->xl_len); - redirected = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean); + end = (OffsetNumber *) ((char *) redirected + datalen); nowdead = redirected + (nredirected * 2); nowunused = nowdead + ndead; nunused = (end - nowunused); @@ -7263,7 +7085,7 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) * totally accurate anyway. */ if (action == BLK_NEEDS_REDO) - XLogRecordPageWithFreeSpace(xlrec->node, xlrec->block, freespace); + XLogRecordPageWithFreeSpace(rnode, blkno, freespace); } /* @@ -7275,17 +7097,18 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) * page modification would fail to clear the visibility map bit. 
*/ static void -heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_visible(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record); + Buffer vmbuffer = InvalidBuffer; Buffer buffer; Page page; RelFileNode rnode; BlockNumber blkno; XLogRedoAction action; - rnode = xlrec->node; - blkno = xlrec->block; + XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno); /* * If there are any Hot Standby transactions running that have an xmin @@ -7304,7 +7127,7 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) * truncated later in recovery, we don't need to update the page, but we'd * better still update the visibility map. */ - action = XLogReadBufferForRedo(lsn, record, 1, rnode, blkno, &buffer); + action = XLogReadBufferForRedo(record, 1, &buffer); if (action == BLK_NEEDS_REDO) { /* @@ -7341,12 +7164,21 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) * the visibility map bit does so before checking the page LSN, so any * bits that need to be cleared will still be cleared. */ - if (record->xl_info & XLR_BKP_BLOCK(0)) - (void) RestoreBackupBlock(lsn, record, 0, false, false); - else + if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false, + &vmbuffer) == BLK_NEEDS_REDO) { + Page vmpage = BufferGetPage(vmbuffer); Relation reln; - Buffer vmbuffer = InvalidBuffer; + + /* initialize the page if it was read as zeros */ + if (PageIsNew(vmpage)) + PageInit(vmpage, BLCKSZ, 0); + + /* + * XLogReplayBufferExtended locked the buffer. But visibilitymap_set + * will handle locking itself. + */ + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); reln = CreateFakeRelcacheEntry(rnode); visibilitymap_pin(reln, blkno, &vmbuffer); @@ -7362,25 +7194,27 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) * we did for the heap page. If this results in a dropped bit, no * real harm is done; and the next VACUUM will fix it. 
*/ - if (lsn > PageGetLSN(BufferGetPage(vmbuffer))) + if (lsn > PageGetLSN(vmpage)) visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, xlrec->cutoff_xid); ReleaseBuffer(vmbuffer); FreeFakeRelcacheEntry(reln); } + else if (BufferIsValid(vmbuffer)) + UnlockReleaseBuffer(vmbuffer); } /* * Replay XLOG_HEAP2_FREEZE_PAGE records */ static void -heap_xlog_freeze_page(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_freeze_page(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record); TransactionId cutoff_xid = xlrec->cutoff_xid; Buffer buffer; - Page page; int ntup; /* @@ -7388,12 +7222,19 @@ heap_xlog_freeze_page(XLogRecPtr lsn, XLogRecord *record) * consider the frozen xids as running. */ if (InHotStandby) - ResolveRecoveryConflictWithSnapshot(cutoff_xid, xlrec->node); - - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->node, xlrec->block, - &buffer) == BLK_NEEDS_REDO) { - page = BufferGetPage(buffer); + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + ResolveRecoveryConflictWithSnapshot(cutoff_xid, rnode); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Page page = BufferGetPage(buffer); + xl_heap_freeze_tuple *tuples; + + tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL); /* now execute freeze plan for each frozen tuple */ for (ntup = 0; ntup < xlrec->ntuples; ntup++) @@ -7402,7 +7243,7 @@ heap_xlog_freeze_page(XLogRecPtr lsn, XLogRecord *record) ItemId lp; HeapTupleHeader tuple; - xlrec_tp = &xlrec->tuples[ntup]; + xlrec_tp = &tuples[ntup]; lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */ tuple = (HeapTupleHeader) PageGetItem(page, lp); @@ -7444,19 +7285,21 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) } static void -heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_delete(XLogReaderState *record) { + XLogRecPtr lsn = 
record->EndRecPtr; xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record); Buffer buffer; Page page; - OffsetNumber offnum; ItemId lp = NULL; HeapTupleHeader htup; BlockNumber blkno; RelFileNode target_node; + ItemPointerData target_tid; - blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid)); - target_node = xlrec->target.node; + XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); /* * The visibility map may need to be fixed even if the heap page is @@ -7473,16 +7316,14 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) FreeFakeRelcacheEntry(reln); } - if (XLogReadBufferForRedo(lsn, record, 0, target_node, blkno, &buffer) - == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + page = BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); - if (PageGetMaxOffsetNumber(page) >= offnum) - lp = PageGetItemId(page, offnum); + if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) + lp = PageGetItemId(page, xlrec->offnum); - if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) elog(PANIC, "heap_delete_redo: invalid lp"); htup = (HeapTupleHeader) PageGetItem(page, lp); @@ -7496,13 +7337,13 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Mark the page as a candidate for pruning */ - PageSetPrunable(page, record->xl_xid); + PageSetPrunable(page, XLogRecGetXid(record)); if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); /* Make sure there is no forward chain link in t_ctid */ - htup->t_ctid = xlrec->target.tid; + htup->t_ctid = target_tid; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -7511,12 +7352,12 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) } static 
void -heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_insert(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record); Buffer buffer; Page page; - OffsetNumber offnum; struct { HeapTupleHeaderData hdr; @@ -7528,10 +7369,12 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) Size freespace = 0; RelFileNode target_node; BlockNumber blkno; + ItemPointerData target_tid; XLogRedoAction action; - target_node = xlrec->target.node; - blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); /* * The visibility map may need to be fixed even if the heap page is @@ -7549,51 +7392,51 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) } /* - * If we inserted the first and only tuple on the page, re-initialize - * the page from scratch. + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. 
*/ - if (record->xl_info & XLOG_HEAP_INIT_PAGE) + if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) { - XLogReadBufferForRedoExtended(lsn, record, 0, - target_node, MAIN_FORKNUM, blkno, - RBM_ZERO_AND_LOCK, false, &buffer); + buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); PageInit(page, BufferGetPageSize(buffer), 0); action = BLK_NEEDS_REDO; } else - action = XLogReadBufferForRedo(lsn, record, 0, target_node, blkno, - &buffer); - + action = XLogReadBufferForRedo(record, 0, &buffer); if (action == BLK_NEEDS_REDO) { + Size datalen; + char *data; + page = BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); - if (PageGetMaxOffsetNumber(page) + 1 < offnum) + if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum) elog(PANIC, "heap_insert_redo: invalid max offset number"); - newlen = record->xl_len - SizeOfHeapInsert - SizeOfHeapHeader; - Assert(newlen <= MaxHeapTupleSize); - memcpy((char *) &xlhdr, - (char *) xlrec + SizeOfHeapInsert, - SizeOfHeapHeader); + data = XLogRecGetBlockData(record, 0, &datalen); + + newlen = datalen - SizeOfHeapHeader; + Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize); + memcpy((char *) &xlhdr, data, SizeOfHeapHeader); + data += SizeOfHeapHeader; + htup = &tbuf.hdr; MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData)); /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits), - (char *) xlrec + SizeOfHeapInsert + SizeOfHeapHeader, + data, newlen); newlen += offsetof(HeapTupleHeaderData, t_bits); htup->t_infomask2 = xlhdr.t_infomask2; htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; - HeapTupleHeaderSetXmin(htup, record->xl_xid); + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); HeapTupleHeaderSetCmin(htup, FirstCommandId); - htup->t_ctid = xlrec->target.tid; + htup->t_ctid = target_tid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); - if (offnum == 
InvalidOffsetNumber) + if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, + true, true) == InvalidOffsetNumber) elog(PANIC, "heap_insert_redo: failed to add tuple"); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ @@ -7618,16 +7461,16 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) * totally accurate anyway. */ if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(xlrec->target.node, blkno, freespace); + XLogRecordPageWithFreeSpace(target_node, blkno, freespace); } /* * Handles MULTI_INSERT record type. */ static void -heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_multi_insert(XLogReaderState *record) { - char *recdata = XLogRecGetData(record); + XLogRecPtr lsn = record->EndRecPtr; xl_heap_multi_insert *xlrec; RelFileNode rnode; BlockNumber blkno; @@ -7642,27 +7485,16 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) uint32 newlen; Size freespace = 0; int i; - bool isinit = (record->xl_info & XLOG_HEAP_INIT_PAGE) != 0; + bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; XLogRedoAction action; /* * Insertion doesn't overwrite MVCC data, so no conflict processing is * required. */ + xlrec = (xl_heap_multi_insert *) XLogRecGetData(record); - xlrec = (xl_heap_multi_insert *) recdata; - recdata += SizeOfHeapMultiInsert; - - rnode = xlrec->node; - blkno = xlrec->blkno; - - /* - * If we're reinitializing the page, the tuples are stored in order from - * FirstOffsetNumber. Otherwise there's an array of offsets in the WAL - * record. 
- */ - if (!isinit) - recdata += sizeof(OffsetNumber) * xlrec->ntuples; + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); /* * The visibility map may need to be fixed even if the heap page is @@ -7681,24 +7513,35 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) if (isinit) { - XLogReadBufferForRedoExtended(lsn, record, 0, - rnode, MAIN_FORKNUM, blkno, - RBM_ZERO_AND_LOCK, false, &buffer); + buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); PageInit(page, BufferGetPageSize(buffer), 0); action = BLK_NEEDS_REDO; } else - action = XLogReadBufferForRedo(lsn, record, 0, rnode, blkno, &buffer); - + action = XLogReadBufferForRedo(record, 0, &buffer); if (action == BLK_NEEDS_REDO) { - page = BufferGetPage(buffer); + char *tupdata; + char *endptr; + Size len; + + /* Tuples are stored as block data */ + tupdata = XLogRecGetBlockData(record, 0, &len); + endptr = tupdata + len; + + page = (Page) BufferGetPage(buffer); + for (i = 0; i < xlrec->ntuples; i++) { OffsetNumber offnum; xl_multi_insert_tuple *xlhdr; + /* + * If we're reinitializing the page, the tuples are stored in + * order from FirstOffsetNumber. Otherwise there's an array of + * offsets in the WAL record, and the tuples come after that. 
+ */ if (isinit) offnum = FirstOffsetNumber + i; else @@ -7706,8 +7549,8 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "heap_multi_insert_redo: invalid max offset number"); - xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(recdata); - recdata = ((char *) xlhdr) + SizeOfMultiInsertTuple; + xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata); + tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple; newlen = xlhdr->datalen; Assert(newlen <= MaxHeapTupleSize); @@ -7715,15 +7558,15 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData)); /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits), - (char *) recdata, + (char *) tupdata, newlen); - recdata += newlen; + tupdata += newlen; newlen += offsetof(HeapTupleHeaderData, t_bits); htup->t_infomask2 = xlhdr->t_infomask2; htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; - HeapTupleHeaderSetXmin(htup, record->xl_xid); + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); HeapTupleHeaderSetCmin(htup, FirstCommandId); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); @@ -7732,6 +7575,8 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_multi_insert_redo: failed to add tuple"); } + if (tupdata != endptr) + elog(PANIC, "heap_multi_insert_redo: total tuple length mismatch"); freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ @@ -7755,19 +7600,21 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) * totally accurate anyway. 
*/ if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(xlrec->node, blkno, freespace); + XLogRecordPageWithFreeSpace(rnode, blkno, freespace); } /* * Handles UPDATE and HOT_UPDATE */ static void -heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) +heap_xlog_update(XLogReaderState *record, bool hot_update) { + XLogRecPtr lsn = record->EndRecPtr; xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record); RelFileNode rnode; BlockNumber oldblk; BlockNumber newblk; + ItemPointerData newtid; Buffer obuffer, nbuffer; Page page; @@ -7775,7 +7622,6 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) ItemId lp = NULL; HeapTupleData oldtup; HeapTupleHeader htup; - char *recdata; uint16 prefixlen = 0, suffixlen = 0; char *newp; @@ -7784,7 +7630,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) HeapTupleHeaderData hdr; char data[MaxHeapTupleSize]; } tbuf; - xl_heap_header_len xlhdr; + xl_heap_header xlhdr; uint32 newlen; Size freespace = 0; XLogRedoAction oldaction; @@ -7794,9 +7640,16 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) oldtup.t_data = NULL; oldtup.t_len = 0; - rnode = xlrec->target.node; - newblk = ItemPointerGetBlockNumber(&xlrec->newtid); - oldblk = ItemPointerGetBlockNumber(&xlrec->target.tid); + XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk); + if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk)) + { + /* HOT updates are never done across pages */ + Assert(!hot_update); + } + else + oldblk = newblk; + + ItemPointerSet(&newtid, newblk, xlrec->new_offnum); /* * The visibility map may need to be fixed even if the heap page is @@ -7824,12 +7677,12 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) */ /* Deal with old tuple version */ - oldaction = XLogReadBufferForRedo(lsn, record, 0, rnode, oldblk, &obuffer); + oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 
0 : 1, + &obuffer); if (oldaction == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(obuffer); - - offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); + page = BufferGetPage(obuffer); + offnum = xlrec->old_offnum; if (PageGetMaxOffsetNumber(page) >= offnum) lp = PageGetItemId(page, offnum); @@ -7852,10 +7705,10 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Set forward chain link in t_ctid */ - htup->t_ctid = xlrec->newtid; + htup->t_ctid = newtid; /* Mark the page as a candidate for pruning */ - PageSetPrunable(page, record->xl_xid); + PageSetPrunable(page, XLogRecGetXid(record)); if (xlrec->flags & XLOG_HEAP_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); @@ -7872,18 +7725,15 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) nbuffer = obuffer; newaction = oldaction; } - else if (record->xl_info & XLOG_HEAP_INIT_PAGE) + else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) { - XLogReadBufferForRedoExtended(lsn, record, 1, - rnode, MAIN_FORKNUM, newblk, - RBM_ZERO_AND_LOCK, false, &nbuffer); + nbuffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(nbuffer); PageInit(page, BufferGetPageSize(nbuffer), 0); newaction = BLK_NEEDS_REDO; } else - newaction = XLogReadBufferForRedo(lsn, record, 1, rnode, newblk, - &nbuffer); + newaction = XLogReadBufferForRedo(record, 0, &nbuffer); /* * The visibility map may need to be fixed even if the heap page is @@ -7891,7 +7741,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) */ if (xlrec->flags & XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED) { - Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + Relation reln = CreateFakeRelcacheEntry(rnode); Buffer vmbuffer = InvalidBuffer; visibilitymap_pin(reln, newblk, &vmbuffer); @@ -7903,14 +7753,20 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) /* Deal with new tuple 
*/ if (newaction == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(nbuffer); + char *recdata; + char *recdata_end; + Size datalen; + Size tuplen; - offnum = ItemPointerGetOffsetNumber(&(xlrec->newtid)); + recdata = XLogRecGetBlockData(record, 0, &datalen); + recdata_end = recdata + datalen; + + page = BufferGetPage(nbuffer); + + offnum = xlrec->new_offnum; if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "heap_update_redo: invalid max offset number"); - recdata = (char *) xlrec + SizeOfHeapUpdate; - if (xlrec->flags & XLOG_HEAP_PREFIX_FROM_OLD) { Assert(newblk == oldblk); @@ -7924,10 +7780,12 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) recdata += sizeof(uint16); } - memcpy((char *) &xlhdr, recdata, SizeOfHeapHeaderLen); - recdata += SizeOfHeapHeaderLen; + memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader); + recdata += SizeOfHeapHeader; + + tuplen = recdata_end - recdata; + Assert(tuplen <= MaxHeapTupleSize); - Assert(xlhdr.t_len + prefixlen + suffixlen <= MaxHeapTupleSize); htup = &tbuf.hdr; MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData)); @@ -7941,7 +7799,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) int len; /* copy bitmap [+ padding] [+ oid] from WAL record */ - len = xlhdr.header.t_hoff - offsetof(HeapTupleHeaderData, t_bits); + len = xlhdr.t_hoff - offsetof(HeapTupleHeaderData, t_bits); memcpy(newp, recdata, len); recdata += len; newp += len; @@ -7951,7 +7809,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) newp += prefixlen; /* copy new tuple data from WAL record */ - len = xlhdr.t_len - (xlhdr.header.t_hoff - offsetof(HeapTupleHeaderData, t_bits)); + len = tuplen - (xlhdr.t_hoff - offsetof(HeapTupleHeaderData, t_bits)); memcpy(newp, recdata, len); recdata += len; newp += len; @@ -7962,24 +7820,26 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) * copy bitmap [+ padding] [+ oid] + data from record, all in one * go */ - memcpy(newp, 
recdata, xlhdr.t_len); - recdata += xlhdr.t_len; - newp += xlhdr.t_len; + memcpy(newp, recdata, tuplen); + recdata += tuplen; + newp += tuplen; } + Assert(recdata == recdata_end); + /* copy suffix from old tuple */ if (suffixlen > 0) memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen); - newlen = offsetof(HeapTupleHeaderData, t_bits) + xlhdr.t_len + prefixlen + suffixlen; - htup->t_infomask2 = xlhdr.header.t_infomask2; - htup->t_infomask = xlhdr.header.t_infomask; - htup->t_hoff = xlhdr.header.t_hoff; + newlen = offsetof(HeapTupleHeaderData, t_bits) + tuplen + prefixlen + suffixlen; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; - HeapTupleHeaderSetXmin(htup, record->xl_xid); + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); HeapTupleHeaderSetCmin(htup, FirstCommandId); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ - htup->t_ctid = xlrec->newtid; + htup->t_ctid = newtid; offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) @@ -7993,6 +7853,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) PageSetLSN(page, lsn); MarkBufferDirty(nbuffer); } + if (BufferIsValid(nbuffer) && nbuffer != obuffer) UnlockReleaseBuffer(nbuffer); if (BufferIsValid(obuffer)) @@ -8014,14 +7875,13 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) * totally accurate anyway. 
*/ if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->newtid)), - freespace); + XLogRecordPageWithFreeSpace(rnode, newblk, freespace); } static void -heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_lock(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record); Buffer buffer; Page page; @@ -8029,13 +7889,11 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) ItemId lp = NULL; HeapTupleHeader htup; - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->target.node, - ItemPointerGetBlockNumber(&xlrec->target.tid), - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); + offnum = xlrec->offnum; if (PageGetMaxOffsetNumber(page) >= offnum) lp = PageGetItemId(page, offnum); @@ -8055,7 +7913,9 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) { HeapTupleHeaderClearHotUpdated(htup); /* Make sure there is no forward chain link in t_ctid */ - htup->t_ctid = xlrec->target.tid; + ItemPointerSet(&htup->t_ctid, + BufferGetBlockNumber(buffer), + offnum); } HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); @@ -8067,22 +7927,23 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) } static void -heap_xlog_lock_updated(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_lock_updated(XLogReaderState *record) { - xl_heap_lock_updated *xlrec = - (xl_heap_lock_updated *) XLogRecGetData(record); + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_lock_updated *xlrec; Buffer buffer; Page page; OffsetNumber offnum; ItemId lp = NULL; HeapTupleHeader htup; - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - &buffer) == BLK_NEEDS_REDO) + xlrec = 
(xl_heap_lock_updated *) XLogRecGetData(record); + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); + + offnum = xlrec->offnum; if (PageGetMaxOffsetNumber(page) >= offnum) lp = PageGetItemId(page, offnum); @@ -8103,8 +7964,9 @@ heap_xlog_lock_updated(XLogRecPtr lsn, XLogRecord *record) } static void -heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_inplace(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record); Buffer buffer; Page page; @@ -8112,15 +7974,15 @@ heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record) ItemId lp = NULL; HeapTupleHeader htup; uint32 oldlen; - uint32 newlen; + Size newlen; - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { + char *newtup = XLogRecGetBlockData(record, 0, &newlen); + page = BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); + offnum = xlrec->offnum; if (PageGetMaxOffsetNumber(page) >= offnum) lp = PageGetItemId(page, offnum); @@ -8130,13 +7992,10 @@ heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record) htup = (HeapTupleHeader) PageGetItem(page, lp); oldlen = ItemIdGetLength(lp) - htup->t_hoff; - newlen = record->xl_len - SizeOfHeapInplace; if (oldlen != newlen) elog(PANIC, "heap_inplace_redo: wrong tuple length"); - memcpy((char *) htup + htup->t_hoff, - (char *) xlrec + SizeOfHeapInplace, - newlen); + memcpy((char *) htup + htup->t_hoff, newtup, newlen); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -8146,9 +8005,9 @@ heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record) } void -heap_redo(XLogRecPtr lsn, XLogRecord *record) +heap_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = 
XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* * These operations don't overwrite MVCC data so no conflict processing is @@ -8158,22 +8017,22 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record) switch (info & XLOG_HEAP_OPMASK) { case XLOG_HEAP_INSERT: - heap_xlog_insert(lsn, record); + heap_xlog_insert(record); break; case XLOG_HEAP_DELETE: - heap_xlog_delete(lsn, record); + heap_xlog_delete(record); break; case XLOG_HEAP_UPDATE: - heap_xlog_update(lsn, record, false); + heap_xlog_update(record, false); break; case XLOG_HEAP_HOT_UPDATE: - heap_xlog_update(lsn, record, true); + heap_xlog_update(record, true); break; case XLOG_HEAP_LOCK: - heap_xlog_lock(lsn, record); + heap_xlog_lock(record); break; case XLOG_HEAP_INPLACE: - heap_xlog_inplace(lsn, record); + heap_xlog_inplace(record); break; default: elog(PANIC, "heap_redo: unknown op code %u", info); @@ -8181,29 +8040,29 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record) } void -heap2_redo(XLogRecPtr lsn, XLogRecord *record) +heap2_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info & XLOG_HEAP_OPMASK) { case XLOG_HEAP2_CLEAN: - heap_xlog_clean(lsn, record); + heap_xlog_clean(record); break; case XLOG_HEAP2_FREEZE_PAGE: - heap_xlog_freeze_page(lsn, record); + heap_xlog_freeze_page(record); break; case XLOG_HEAP2_CLEANUP_INFO: - heap_xlog_cleanup_info(lsn, record); + heap_xlog_cleanup_info(record); break; case XLOG_HEAP2_VISIBLE: - heap_xlog_visible(lsn, record); + heap_xlog_visible(record); break; case XLOG_HEAP2_MULTI_INSERT: - heap_xlog_multi_insert(lsn, record); + heap_xlog_multi_insert(record); break; case XLOG_HEAP2_LOCK_UPDATED: - heap_xlog_lock_updated(lsn, record); + heap_xlog_lock_updated(record); break; case XLOG_HEAP2_NEW_CID: @@ -8213,7 +8072,7 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) */ break; case XLOG_HEAP2_REWRITE: - heap_xlog_logical_rewrite(lsn, record); + heap_xlog_logical_rewrite(record); break; 
default: elog(PANIC, "heap2_redo: unknown op code %u", info); diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index bea52460a0..4b132b7d01 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -865,7 +865,6 @@ logical_heap_rewrite_flush_mappings(RewriteState state) hash_seq_init(&seq_status, state->rs_logical_mappings); while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL) { - XLogRecData rdata[2]; char *waldata; char *waldata_start; xl_heap_rewrite_mapping xlrec; @@ -889,11 +888,6 @@ logical_heap_rewrite_flush_mappings(RewriteState state) xlrec.offset = src->off; xlrec.start_lsn = state->rs_begin_lsn; - rdata[0].data = (char *) (&xlrec); - rdata[0].len = sizeof(xlrec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - /* write all mappings consecutively */ len = src->num_mappings * sizeof(LogicalRewriteMappingData); waldata_start = waldata = palloc(len); @@ -934,13 +928,12 @@ logical_heap_rewrite_flush_mappings(RewriteState state) written, len))); src->off += len; - rdata[1].data = waldata_start; - rdata[1].len = len; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + XLogRegisterData(waldata_start, len); /* write xlog record */ - XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE, rdata); + XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE); pfree(waldata_start); } @@ -1123,7 +1116,7 @@ logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, * Replay XLOG_HEAP2_REWRITE records */ void -heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r) +heap_xlog_logical_rewrite(XLogReaderState *r) { char path[MAXPGPATH]; int fd; @@ -1138,7 +1131,7 @@ heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r) xlrec->mapped_db, xlrec->mapped_rel, (uint32) (xlrec->start_lsn >> 32), (uint32) xlrec->start_lsn, - xlrec->mapped_xid, r->xl_xid); + xlrec->mapped_xid, 
XLogRecGetXid(r)); fd = OpenTransientFile(path, O_CREAT | O_WRONLY | PG_BINARY, diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index bcaba7e5e8..2c4f9904e1 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -837,37 +837,25 @@ _bt_insertonpg(Relation rel, if (RelationNeedsWAL(rel)) { xl_btree_insert xlrec; - BlockNumber xlleftchild; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; - XLogRecData rdata[4]; - XLogRecData *nextrdata; IndexTupleData trunctuple; - xlrec.target.node = rel->rd_node; - ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off); + xlrec.offnum = itup_off; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeInsert; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = nextrdata = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); if (P_ISLEAF(lpageop)) xlinfo = XLOG_BTREE_INSERT_LEAF; else { /* - * Include the block number of the left child, whose - * INCOMPLETE_SPLIT flag was cleared. + * Register the left child whose INCOMPLETE_SPLIT flag was + * cleared. 
*/ - xlleftchild = BufferGetBlockNumber(cbuf); - nextrdata->data = (char *) &xlleftchild; - nextrdata->len = sizeof(BlockNumber); - nextrdata->buffer = cbuf; - nextrdata->buffer_std = true; - nextrdata->next = nextrdata + 1; - nextrdata++; + XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD); xlinfo = XLOG_BTREE_INSERT_UPPER; } @@ -879,33 +867,25 @@ _bt_insertonpg(Relation rel, xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; - nextrdata->data = (char *) &xlmeta; - nextrdata->len = sizeof(xl_btree_metadata); - nextrdata->buffer = InvalidBuffer; - nextrdata->next = nextrdata + 1; - nextrdata++; + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT); + XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_INSERT_META; } /* Read comments in _bt_pgaddtup */ + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop)) { trunctuple = *itup; trunctuple.t_info = sizeof(IndexTupleData); - nextrdata->data = (char *) &trunctuple; - nextrdata->len = sizeof(IndexTupleData); + XLogRegisterBufData(0, (char *) &trunctuple, + sizeof(IndexTupleData)); } else - { - nextrdata->data = (char *) itup; - nextrdata->len = IndexTupleDSize(*itup); - } - nextrdata->buffer = buf; - nextrdata->buffer_std = true; - nextrdata->next = NULL; + XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup)); - recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); + recptr = XLogInsert(RM_BTREE_ID, xlinfo); if (BufferIsValid(metabuf)) { @@ -1260,56 +1240,37 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; - XLogRecData rdata[7]; - XLogRecData *lastrdata; - BlockNumber cblkno; - xlrec.node = rel->rd_node; - xlrec.leftsib = origpagenumber; - xlrec.rightsib = rightpagenumber; - xlrec.rnext = ropaque->btpo_next; xlrec.level = ropaque->btpo.level; xlrec.firstright = firstright; + xlrec.newitemoff = newitemoff; - rdata[0].data 
= (char *) &xlrec; - rdata[0].len = SizeOfBtreeSplit; - rdata[0].buffer = InvalidBuffer; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); - lastrdata = &rdata[0]; + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); + /* Log the right sibling, because we've changed its prev-pointer. */ + if (!P_RIGHTMOST(ropaque)) + XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); + if (BufferIsValid(cbuf)) + XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); /* - * Log the new item and its offset, if it was inserted on the left - * page. (If it was put on the right page, we don't need to explicitly - * WAL log it because it's included with all the other items on the - * right page.) Show the new item as belonging to the left page - * buffer, so that it is not stored if XLogInsert decides it needs a - * full-page image of the left page. We store the offset anyway, - * though, to support archive compression of these records. + * Log the new item, if it was inserted on the left page. (If it was + * put on the right page, we don't need to explicitly WAL log it + * because it's included with all the other items on the right page.) + * Show the new item as belonging to the left page buffer, so that it + * is not stored if XLogInsert decides it needs a full-page image of + * the left page. We store the offset anyway, though, to support + * archive compression of these records. 
*/ if (newitemonleft) - { - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = (char *) &newitemoff; - lastrdata->len = sizeof(OffsetNumber); - lastrdata->buffer = InvalidBuffer; - - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = (char *) newitem; - lastrdata->len = MAXALIGN(newitemsz); - lastrdata->buffer = buf; /* backup block 0 */ - lastrdata->buffer_std = true; - } + XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); /* Log left page */ if (!isleaf) { - lastrdata->next = lastrdata + 1; - lastrdata++; - /* * We must also log the left page's high key, because the right * page's leftmost key is suppressed on non-leaf levels. Show it @@ -1319,43 +1280,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, */ itemid = PageGetItemId(origpage, P_HIKEY); item = (IndexTuple) PageGetItem(origpage, itemid); - lastrdata->data = (char *) item; - lastrdata->len = MAXALIGN(IndexTupleSize(item)); - lastrdata->buffer = buf; /* backup block 0 */ - lastrdata->buffer_std = true; - } - - if (isleaf && !newitemonleft) - { - lastrdata->next = lastrdata + 1; - lastrdata++; - - /* - * Although we don't need to WAL-log anything on the left page, we - * still need XLogInsert to consider storing a full-page image of - * the left page, so make an empty entry referencing that buffer. - * This also ensures that the left page is always backup block 0. - */ - lastrdata->data = NULL; - lastrdata->len = 0; - lastrdata->buffer = buf; /* backup block 0 */ - lastrdata->buffer_std = true; - } - - /* - * Log block number of left child, whose INCOMPLETE_SPLIT flag this - * insertion clears. 
- */ - if (!isleaf) - { - lastrdata->next = lastrdata + 1; - lastrdata++; - - cblkno = BufferGetBlockNumber(cbuf); - lastrdata->data = (char *) &cblkno; - lastrdata->len = sizeof(BlockNumber); - lastrdata->buffer = cbuf; /* backup block 1 */ - lastrdata->buffer_std = true; + XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); } /* @@ -1370,35 +1295,16 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, * and so the item pointers can be reconstructed. See comments for * _bt_restore_page(). */ - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = (char *) rightpage + - ((PageHeader) rightpage)->pd_upper; - lastrdata->len = ((PageHeader) rightpage)->pd_special - - ((PageHeader) rightpage)->pd_upper; - lastrdata->buffer = InvalidBuffer; - - /* Log the right sibling, because we've changed its' prev-pointer. */ - if (!P_RIGHTMOST(ropaque)) - { - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = NULL; - lastrdata->len = 0; - lastrdata->buffer = sbuf; /* bkp block 1 (leaf) or 2 (non-leaf) */ - lastrdata->buffer_std = true; - } - - lastrdata->next = NULL; + XLogRegisterBufData(1, + (char *) rightpage + ((PageHeader) rightpage)->pd_upper, + ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); if (isroot) xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT; else xlinfo = newitemonleft ? 
XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; - recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); + recptr = XLogInsert(RM_BTREE_ID, xlinfo); PageSetLSN(origpage, recptr); PageSetLSN(rightpage, recptr); @@ -2090,34 +1996,35 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) { xl_btree_newroot xlrec; XLogRecPtr recptr; - XLogRecData rdata[3]; + xl_btree_metadata md; - xlrec.node = rel->rd_node; xlrec.rootblk = rootblknum; xlrec.level = metad->btm_level; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeNewroot; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); + + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT); + + md.root = rootblknum; + md.level = metad->btm_level; + md.fastroot = rootblknum; + md.fastlevel = metad->btm_level; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); /* * Direct access to page is not good but faster - we should implement * some new func in page API. 
*/ - rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper; - rdata[1].len = ((PageHeader) rootpage)->pd_special - - ((PageHeader) rootpage)->pd_upper; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = &(rdata[2]); + XLogRegisterBufData(0, + (char *) rootpage + ((PageHeader) rootpage)->pd_upper, + ((PageHeader) rootpage)->pd_special - + ((PageHeader) rootpage)->pd_upper); - /* Make a full-page image of the left child if needed */ - rdata[2].data = NULL; - rdata[2].len = 0; - rdata[2].buffer = lbuf; - rdata[2].next = NULL; - - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata); + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); PageSetLSN(lpage, recptr); PageSetLSN(rootpage, recptr); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index ea95ce6e1e..a25dafeb40 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -236,18 +236,25 @@ _bt_getroot(Relation rel, int access) { xl_btree_newroot xlrec; XLogRecPtr recptr; - XLogRecData rdata; + xl_btree_metadata md; + + XLogBeginInsert(); + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT); + + md.root = rootblkno; + md.level = 0; + md.fastroot = rootblkno; + md.fastlevel = 0; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); - xlrec.node = rel->rd_node; xlrec.rootblk = rootblkno; xlrec.level = 0; - rdata.data = (char *) &xlrec; - rdata.len = SizeOfBtreeNewroot; - rdata.buffer = InvalidBuffer; - rdata.next = NULL; + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata); + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); PageSetLSN(rootpage, recptr); PageSetLSN(metapg, recptr); @@ -528,39 +535,23 @@ _bt_checkpage(Relation rel, Buffer buf) static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid) { - if (!RelationNeedsWAL(rel)) - return; - - /* 
No ereport(ERROR) until changes are logged */ - START_CRIT_SECTION(); + xl_btree_reuse_page xlrec_reuse; /* - * We don't do MarkBufferDirty here because we're about to initialise the - * page, and nobody else can see it yet. + * Note that we don't register the buffer with the record, because this + * operation doesn't modify the page. This record only exists to provide a + * conflict point for Hot Standby. */ /* XLOG stuff */ - { - XLogRecData rdata[1]; - xl_btree_reuse_page xlrec_reuse; + xlrec_reuse.node = rel->rd_node; + xlrec_reuse.block = blkno; + xlrec_reuse.latestRemovedXid = latestRemovedXid; - xlrec_reuse.node = rel->rd_node; - xlrec_reuse.block = blkno; - xlrec_reuse.latestRemovedXid = latestRemovedXid; - rdata[0].data = (char *) &xlrec_reuse; - rdata[0].len = SizeOfBtreeReusePage; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage); - XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata); - - /* - * We don't do PageSetLSN here because we're about to initialise the - * page, so no need. - */ - } - - END_CRIT_SECTION(); + XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE); } /* @@ -633,7 +624,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) * WAL record that will allow us to conflict with queries * running on standby. 
*/ - if (XLogStandbyInfoActive()) + if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) { BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -830,17 +821,13 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; - XLogRecData rdata[2]; xl_btree_vacuum xlrec_vacuum; - xlrec_vacuum.node = rel->rd_node; - xlrec_vacuum.block = BufferGetBlockNumber(buf); - xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; - rdata[0].data = (char *) &xlrec_vacuum; - rdata[0].len = SizeOfBtreeVacuum; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum); /* * The target-offsets array is not in the buffer, but pretend that it @@ -848,20 +835,9 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * need not be stored too. */ if (nitems > 0) - { - rdata[1].data = (char *) itemnos; - rdata[1].len = nitems * sizeof(OffsetNumber); - } - else - { - rdata[1].data = NULL; - rdata[1].len = 0; - } - rdata[1].buffer = buf; - rdata[1].buffer_std = true; - rdata[1].next = NULL; + XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber)); - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata); + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); PageSetLSN(page, recptr); } @@ -919,36 +895,23 @@ _bt_delitems_delete(Relation rel, Buffer buf, if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; - XLogRecData rdata[3]; xl_btree_delete xlrec_delete; - xlrec_delete.node = rel->rd_node; xlrec_delete.hnode = heapRel->rd_node; - xlrec_delete.block = BufferGetBlockNumber(buf); xlrec_delete.nitems = nitems; - rdata[0].data = (char *) &xlrec_delete; - rdata[0].len = SizeOfBtreeDelete; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete); /* * We need the 
target-offsets array whether or not we store the whole * buffer, to allow us to find the latestRemovedXid on a standby * server. */ - rdata[1].data = (char *) itemnos; - rdata[1].len = nitems * sizeof(OffsetNumber); - rdata[1].buffer = InvalidBuffer; - rdata[1].next = &(rdata[2]); + XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber)); - rdata[2].data = NULL; - rdata[2].len = 0; - rdata[2].buffer = buf; - rdata[2].buffer_std = true; - rdata[2].next = NULL; - - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE); PageSetLSN(page, recptr); } @@ -1493,33 +1456,26 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) { xl_btree_mark_page_halfdead xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; - xlrec.target.node = rel->rd_node; - ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(topparent), topoff); + xlrec.poffset = topoff; xlrec.leafblk = leafblkno; if (target != leafblkno) xlrec.topparent = target; else xlrec.topparent = InvalidBlockNumber; + XLogBeginInsert(); + XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, topparent, REGBUF_STANDARD); + page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); xlrec.leftblk = opaque->btpo_prev; xlrec.rightblk = opaque->btpo_next; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeMarkPageHalfDead; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead); - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].buffer = topparent; - rdata[1].buffer_std = true; - rdata[1].next = NULL; - - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD, rdata); + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD); page = BufferGetPage(topparent); PageSetLSN(page, recptr); @@ -1826,63 +1782,44 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) 
xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; - XLogRecData rdata[4]; - XLogRecData *nextrdata; - xlrec.node = rel->rd_node; + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); + if (BufferIsValid(lbuf)) + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + if (target != leafblkno) + XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT); /* information on the unlinked block */ - xlrec.deadblk = target; xlrec.leftsib = leftsib; xlrec.rightsib = rightsib; xlrec.btpo_xact = opaque->btpo.xact; /* information needed to recreate the leaf block (if not the target) */ - xlrec.leafblk = leafblkno; xlrec.leafleftsib = leafleftsib; xlrec.leafrightsib = leafrightsib; xlrec.topparent = nextchild; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeUnlinkPage; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = nextrdata = &(rdata[1]); + XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage); if (BufferIsValid(metabuf)) { + XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT); + xlmeta.root = metad->btm_root; xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; - nextrdata->data = (char *) &xlmeta; - nextrdata->len = sizeof(xl_btree_metadata); - nextrdata->buffer = InvalidBuffer; - nextrdata->next = nextrdata + 1; - nextrdata++; + XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; } else xlinfo = XLOG_BTREE_UNLINK_PAGE; - nextrdata->data = NULL; - nextrdata->len = 0; - nextrdata->buffer = rbuf; - nextrdata->buffer_std = true; - nextrdata->next = NULL; - - if (BufferIsValid(lbuf)) - { - nextrdata->next = nextrdata + 1; - nextrdata++; - nextrdata->data = NULL; - nextrdata->len = 0; - nextrdata->buffer = lbuf; - nextrdata->buffer_std = true; - nextrdata->next = NULL; - } - - recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); + recptr = XLogInsert(RM_BTREE_ID, xlinfo); if 
(BufferIsValid(metabuf)) { diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 13951be62a..52aef9b983 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -72,17 +72,23 @@ _bt_restore_page(Page page, char *from, int len) } static void -_bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn, - BlockNumber root, uint32 level, - BlockNumber fastroot, uint32 fastlevel) +_bt_restore_meta(XLogReaderState *record, uint8 block_id) { + XLogRecPtr lsn = record->EndRecPtr; Buffer metabuf; Page metapg; BTMetaPageData *md; BTPageOpaque pageop; + xl_btree_metadata *xlrec; + char *ptr; + Size len; - metabuf = XLogReadBuffer(rnode, BTREE_METAPAGE, true); - Assert(BufferIsValid(metabuf)); + metabuf = XLogInitBufferForRedo(record, block_id); + ptr = XLogRecGetBlockData(record, block_id, &len); + + Assert(len == sizeof(xl_btree_metadata)); + Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE); + xlrec = (xl_btree_metadata *) ptr; metapg = BufferGetPage(metabuf); _bt_pageinit(metapg, BufferGetPageSize(metabuf)); @@ -90,10 +96,10 @@ _bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn, md = BTPageGetMeta(metapg); md->btm_magic = BTREE_MAGIC; md->btm_version = BTREE_VERSION; - md->btm_root = root; - md->btm_level = level; - md->btm_fastroot = fastroot; - md->btm_fastlevel = fastlevel; + md->btm_root = xlrec->root; + md->btm_level = xlrec->level; + md->btm_fastroot = xlrec->fastroot; + md->btm_fastlevel = xlrec->fastlevel; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -117,14 +123,12 @@ _bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn, * types that can insert a downlink: insert, split, and newroot. 
*/ static void -_bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record, - int block_index, - RelFileNode rnode, BlockNumber cblock) +_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) { + XLogRecPtr lsn = record->EndRecPtr; Buffer buf; - if (XLogReadBufferForRedo(lsn, record, block_index, rnode, cblock, &buf) - == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO) { Page page = (Page) BufferGetPage(buf); BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -140,38 +144,12 @@ _bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record, } static void -btree_xlog_insert(bool isleaf, bool ismeta, - XLogRecPtr lsn, XLogRecord *record) +btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); Buffer buffer; Page page; - char *datapos; - int datalen; - xl_btree_metadata md; - BlockNumber cblkno = 0; - int main_blk_index; - - datapos = (char *) xlrec + SizeOfBtreeInsert; - datalen = record->xl_len - SizeOfBtreeInsert; - - /* - * if this insert finishes a split at lower level, extract the block - * number of the (left) child. - */ - if (!isleaf && (record->xl_info & XLR_BKP_BLOCK(0)) == 0) - { - memcpy(&cblkno, datapos, sizeof(BlockNumber)); - Assert(cblkno != 0); - datapos += sizeof(BlockNumber); - datalen -= sizeof(BlockNumber); - } - if (ismeta) - { - memcpy(&md, datapos, sizeof(xl_btree_metadata)); - datapos += sizeof(xl_btree_metadata); - datalen -= sizeof(xl_btree_metadata); - } /* * Insertion to an internal page finishes an incomplete split at the child @@ -183,21 +161,15 @@ btree_xlog_insert(bool isleaf, bool ismeta, * cannot be updates happening. 
*/ if (!isleaf) + _bt_clear_incomplete_split(record, 1); + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { - _bt_clear_incomplete_split(lsn, record, 0, xlrec->target.node, cblkno); - main_blk_index = 1; - } - else - main_blk_index = 0; + Size datalen; + char *datapos = XLogRecGetBlockData(record, 0, &datalen); - if (XLogReadBufferForRedo(lsn, record, main_blk_index, xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - &buffer) == BLK_NEEDS_REDO) - { page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, - ItemPointerGetOffsetNumber(&(xlrec->target.tid)), + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber) elog(PANIC, "btree_insert_redo: failed to add item"); @@ -215,15 +187,13 @@ btree_xlog_insert(bool isleaf, bool ismeta, * obsolete link from the metapage. */ if (ismeta) - _bt_restore_meta(xlrec->target.node, lsn, - md.root, md.level, - md.fastroot, md.fastlevel); + _bt_restore_meta(record, 2); } static void -btree_xlog_split(bool onleft, bool isroot, - XLogRecPtr lsn, XLogRecord *record) +btree_xlog_split(bool onleft, bool isroot, XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); bool isleaf = (xlrec->level == 0); Buffer lbuf; @@ -231,56 +201,17 @@ btree_xlog_split(bool onleft, bool isroot, Page rpage; BTPageOpaque ropaque; char *datapos; - int datalen; - OffsetNumber newitemoff = 0; - Item newitem = NULL; - Size newitemsz = 0; + Size datalen; Item left_hikey = NULL; Size left_hikeysz = 0; - BlockNumber cblkno = InvalidBlockNumber; + BlockNumber leftsib; + BlockNumber rightsib; + BlockNumber rnext; - datapos = (char *) xlrec + SizeOfBtreeSplit; - datalen = record->xl_len - SizeOfBtreeSplit; - - /* Extract newitemoff and newitem, if present */ - if (onleft) - { - memcpy(&newitemoff, datapos, sizeof(OffsetNumber)); - datapos += sizeof(OffsetNumber); - datalen -= 
sizeof(OffsetNumber); - } - if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0))) - { - /* - * We assume that 16-bit alignment is enough to apply IndexTupleSize - * (since it's fetching from a uint16 field) and also enough for - * PageAddItem to insert the tuple. - */ - newitem = (Item) datapos; - newitemsz = MAXALIGN(IndexTupleSize(newitem)); - datapos += newitemsz; - datalen -= newitemsz; - } - - /* Extract left hikey and its size (still assuming 16-bit alignment) */ - if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(0))) - { - left_hikey = (Item) datapos; - left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); - datapos += left_hikeysz; - datalen -= left_hikeysz; - } - - /* - * If this insertion finishes an incomplete split, get the block number of - * the child. - */ - if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(1))) - { - memcpy(&cblkno, datapos, sizeof(BlockNumber)); - datapos += sizeof(BlockNumber); - datalen -= sizeof(BlockNumber); - } + XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib); + XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib); + if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext)) + rnext = P_NONE; /* * Clear the incomplete split flag on the left sibling of the child page @@ -288,18 +219,18 @@ btree_xlog_split(bool onleft, bool isroot, * before locking the other pages) */ if (!isleaf) - _bt_clear_incomplete_split(lsn, record, 1, xlrec->node, cblkno); + _bt_clear_incomplete_split(record, 3); /* Reconstruct right (new) sibling page from scratch */ - rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true); - Assert(BufferIsValid(rbuf)); + rbuf = XLogInitBufferForRedo(record, 1); + datapos = XLogRecGetBlockData(record, 1, &datalen); rpage = (Page) BufferGetPage(rbuf); _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); - ropaque->btpo_prev = xlrec->leftsib; - ropaque->btpo_next = xlrec->rnext; + ropaque->btpo_prev = leftsib; + ropaque->btpo_next = rnext; ropaque->btpo.level = 
xlrec->level; ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; @@ -324,8 +255,7 @@ btree_xlog_split(bool onleft, bool isroot, /* don't release the buffer yet; we touch right page's first item below */ /* Now reconstruct left (original) sibling page */ - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->node, xlrec->leftsib, - &lbuf) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO) { /* * To retain the same physical order of the tuples that they had, we @@ -339,9 +269,31 @@ btree_xlog_split(bool onleft, bool isroot, Page lpage = (Page) BufferGetPage(lbuf); BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; + Item newitem; + Size newitemsz = 0; Page newlpage; OffsetNumber leftoff; + datapos = XLogRecGetBlockData(record, 0, &datalen); + + if (onleft) + { + newitem = (Item) datapos; + newitemsz = MAXALIGN(IndexTupleSize(newitem)); + datapos += newitemsz; + datalen -= newitemsz; + } + + /* Extract left hikey and its size (assuming 16-bit alignment) */ + if (!isleaf) + { + left_hikey = (Item) datapos; + left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); + datapos += left_hikeysz; + datalen -= left_hikeysz; + } + Assert(datalen == 0); + newlpage = PageGetTempPageCopySpecial(lpage); /* Set high key */ @@ -358,7 +310,7 @@ btree_xlog_split(bool onleft, bool isroot, Item item; /* add the new item if it was inserted on left page */ - if (onleft && off == newitemoff) + if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) @@ -376,7 +328,7 @@ btree_xlog_split(bool onleft, bool isroot, } /* cope with possibility that newitem goes at the end */ - if (onleft && off == newitemoff) + if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) @@ -390,7 +342,7 @@ btree_xlog_split(bool onleft, bool isroot, lopaque->btpo_flags = 
BTP_INCOMPLETE_SPLIT; if (isleaf) lopaque->btpo_flags |= BTP_LEAF; - lopaque->btpo_next = xlrec->rightsib; + lopaque->btpo_next = rightsib; lopaque->btpo_cycleid = 0; PageSetLSN(lpage, lsn); @@ -410,22 +362,16 @@ btree_xlog_split(bool onleft, bool isroot, * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. */ - if (xlrec->rnext != P_NONE) + if (rnext != P_NONE) { - /* - * the backup block containing right sibling is 1 or 2, depending - * whether this was a leaf or internal page. - */ - int rnext_index = isleaf ? 1 : 2; Buffer buffer; - if (XLogReadBufferForRedo(lsn, record, rnext_index, xlrec->node, - xlrec->rnext, &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { Page page = (Page) BufferGetPage(buffer); BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); - pageop->btpo_prev = xlrec->rightsib; + pageop->btpo_prev = rightsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -436,8 +382,9 @@ btree_xlog_split(bool onleft, bool isroot, } static void -btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) +btree_xlog_vacuum(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); Buffer buffer; Page page; @@ -466,9 +413,13 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) */ if (HotStandbyActiveInReplay()) { + RelFileNode thisrnode; + BlockNumber thisblkno; BlockNumber blkno; - for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++) + XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); + + for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) { /* * We use RBM_NORMAL_NO_LOG mode because it's not an error @@ -483,7 +434,7 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) * buffer manager we could optimise this so that if the block is * not in shared_buffers we confirm it as unpinned. 
*/ - buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, + buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, RBM_NORMAL_NO_LOG); if (BufferIsValid(buffer)) { @@ -497,20 +448,23 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) * Like in btvacuumpage(), we need to take a cleanup lock on every leaf * page. See nbtree/README for details. */ - if (XLogReadBufferForRedoExtended(lsn, record, 0, - xlrec->node, MAIN_FORKNUM, xlrec->block, - RBM_NORMAL, true, &buffer) + if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) == BLK_NEEDS_REDO) { + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 0, &len); + page = (Page) BufferGetPage(buffer); - if (record->xl_len > SizeOfBtreeVacuum) + if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; - unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum); - unend = (OffsetNumber *) ((char *) xlrec + record->xl_len); + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); @@ -542,13 +496,16 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) * XXX optimise later with something like XLogPrefetchBuffer() */ static TransactionId -btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec) +btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record) { + xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); OffsetNumber *unused; Buffer ibuffer, hbuffer; Page ipage, hpage; + RelFileNode rnode; + BlockNumber blkno; ItemId iitemid, hitemid; IndexTuple itup; @@ -588,9 +545,11 @@ btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec) * InvalidTransactionId to cancel all HS transactions. That's probably * overkill, but it's safe, and certainly better than panicking here. 
*/ - ibuffer = XLogReadBuffer(xlrec->node, xlrec->block, false); + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL); if (!BufferIsValid(ibuffer)) return InvalidTransactionId; + LockBuffer(ibuffer, BT_READ); ipage = (Page) BufferGetPage(ibuffer); /* @@ -611,12 +570,13 @@ btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec) * Locate the heap page that the index tuple points at */ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); - hbuffer = XLogReadBuffer(xlrec->hnode, hblkno, false); + hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL); if (!BufferIsValid(hbuffer)) { UnlockReleaseBuffer(ibuffer); return InvalidTransactionId; } + LockBuffer(hbuffer, BUFFER_LOCK_SHARE); hpage = (Page) BufferGetPage(hbuffer); /* @@ -678,8 +638,9 @@ btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec) } static void -btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) +btree_xlog_delete(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); Buffer buffer; Page page; @@ -698,21 +659,23 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) */ if (InHotStandby) { - TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(xlrec); + TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record); + RelFileNode rnode; - ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node); + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); } /* * We don't need to take a cleanup lock to apply these changes. See * nbtree/README for details. 
*/ - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->node, xlrec->block, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); - if (record->xl_len > SizeOfBtreeDelete) + if (XLogRecGetDataLen(record) > SizeOfBtreeDelete) { OffsetNumber *unused; @@ -736,17 +699,15 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) } static void -btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record) +btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record); - BlockNumber parent; Buffer buffer; Page page; BTPageOpaque pageop; IndexTupleData trunctuple; - parent = ItemPointerGetBlockNumber(&(xlrec->target.tid)); - /* * In normal operation, we would lock all the pages this WAL record * touches before changing any of them. In WAL replay, it should be okay @@ -756,8 +717,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record) */ /* parent page */ - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->target.node, parent, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { OffsetNumber poffset; ItemId itemid; @@ -768,7 +728,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record) page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); - poffset = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); + poffset = xlrec->poffset; nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); @@ -788,8 +748,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); /* Rewrite the leaf page as a halfdead page */ - buffer = XLogReadBuffer(xlrec->target.node, xlrec->leafblk, true); - Assert(BufferIsValid(buffer)); + buffer = 
XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); @@ -822,17 +781,16 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record) static void -btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) +btree_xlog_unlink_page(uint8 info, XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); - BlockNumber target; BlockNumber leftsib; BlockNumber rightsib; Buffer buffer; Page page; BTPageOpaque pageop; - target = xlrec->deadblk; leftsib = xlrec->leftsib; rightsib = xlrec->rightsib; @@ -845,8 +803,7 @@ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) */ /* Fix left-link of right sibling */ - if (XLogReadBufferForRedo(lsn, record, 0, xlrec->node, rightsib, &buffer) - == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -861,8 +818,7 @@ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) /* Fix right-link of left sibling, if any */ if (leftsib != P_NONE) { - if (XLogReadBufferForRedo(lsn, record, 1, xlrec->node, leftsib, &buffer) - == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -876,8 +832,7 @@ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) } /* Rewrite target page as empty deleted page */ - buffer = XLogReadBuffer(xlrec->node, target, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); @@ -898,7 +853,7 @@ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) * itself, update the leaf to point to the next remaining child in the * 
branch. */ - if (target != xlrec->leafblk) + if (XLogRecHasBlockRef(record, 3)) { /* * There is no real data on the page, so we just re-create it from @@ -906,8 +861,7 @@ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) */ IndexTupleData trunctuple; - buffer = XLogReadBuffer(xlrec->node, xlrec->leafblk, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 3); page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -936,27 +890,21 @@ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) /* Update metapage if needed */ if (info == XLOG_BTREE_UNLINK_PAGE_META) - { - xl_btree_metadata md; - - memcpy(&md, (char *) xlrec + SizeOfBtreeUnlinkPage, - sizeof(xl_btree_metadata)); - _bt_restore_meta(xlrec->node, lsn, - md.root, md.level, - md.fastroot, md.fastlevel); - } + _bt_restore_meta(record, 4); } static void -btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) +btree_xlog_newroot(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque pageop; + char *ptr; + Size len; - buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); @@ -969,34 +917,24 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) pageop->btpo_flags |= BTP_LEAF; pageop->btpo_cycleid = 0; - if (record->xl_len > SizeOfBtreeNewroot) + if (xlrec->level > 0) { - IndexTuple itup; - BlockNumber cblkno; - - _bt_restore_page(page, - (char *) xlrec + SizeOfBtreeNewroot, - record->xl_len - SizeOfBtreeNewroot); - /* extract block number of the left-hand split page */ - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY)); - cblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); - 
Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); + ptr = XLogRecGetBlockData(record, 0, &len); + _bt_restore_page(page, ptr, len); /* Clear the incomplete-split flag in left child */ - _bt_clear_incomplete_split(lsn, record, 0, xlrec->node, cblkno); + _bt_clear_incomplete_split(record, 1); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); - _bt_restore_meta(xlrec->node, lsn, - xlrec->rootblk, xlrec->level, - xlrec->rootblk, xlrec->level); + _bt_restore_meta(record, 2); } static void -btree_xlog_reuse_page(XLogRecPtr lsn, XLogRecord *record) +btree_xlog_reuse_page(XLogReaderState *record) { xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record); @@ -1015,58 +953,55 @@ btree_xlog_reuse_page(XLogRecPtr lsn, XLogRecord *record) ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node); } - - /* Backup blocks are not used in reuse_page records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); } void -btree_redo(XLogRecPtr lsn, XLogRecord *record) +btree_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info) { case XLOG_BTREE_INSERT_LEAF: - btree_xlog_insert(true, false, lsn, record); + btree_xlog_insert(true, false, record); break; case XLOG_BTREE_INSERT_UPPER: - btree_xlog_insert(false, false, lsn, record); + btree_xlog_insert(false, false, record); break; case XLOG_BTREE_INSERT_META: - btree_xlog_insert(false, true, lsn, record); + btree_xlog_insert(false, true, record); break; case XLOG_BTREE_SPLIT_L: - btree_xlog_split(true, false, lsn, record); + btree_xlog_split(true, false, record); break; case XLOG_BTREE_SPLIT_R: - btree_xlog_split(false, false, lsn, record); + btree_xlog_split(false, false, record); break; case XLOG_BTREE_SPLIT_L_ROOT: - btree_xlog_split(true, true, lsn, record); + btree_xlog_split(true, true, record); break; case XLOG_BTREE_SPLIT_R_ROOT: - btree_xlog_split(false, 
true, lsn, record); + btree_xlog_split(false, true, record); break; case XLOG_BTREE_VACUUM: - btree_xlog_vacuum(lsn, record); + btree_xlog_vacuum(record); break; case XLOG_BTREE_DELETE: - btree_xlog_delete(lsn, record); + btree_xlog_delete(record); break; case XLOG_BTREE_MARK_PAGE_HALFDEAD: - btree_xlog_mark_page_halfdead(info, lsn, record); + btree_xlog_mark_page_halfdead(info, record); break; case XLOG_BTREE_UNLINK_PAGE: case XLOG_BTREE_UNLINK_PAGE_META: - btree_xlog_unlink_page(info, lsn, record); + btree_xlog_unlink_page(info, record); break; case XLOG_BTREE_NEWROOT: - btree_xlog_newroot(lsn, record); + btree_xlog_newroot(record); break; case XLOG_BTREE_REUSE_PAGE: - btree_xlog_reuse_page(lsn, record); + btree_xlog_reuse_page(record); break; default: elog(PANIC, "btree_redo: unknown op code %u", info); diff --git a/src/backend/access/rmgrdesc/brindesc.c b/src/backend/access/rmgrdesc/brindesc.c index 97dc3c0fa9..6cda6f8ffd 100644 --- a/src/backend/access/rmgrdesc/brindesc.c +++ b/src/backend/access/rmgrdesc/brindesc.c @@ -17,64 +17,49 @@ #include "access/brin_xlog.h" void -brin_desc(StringInfo buf, XLogRecord *record) +brin_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; info &= XLOG_BRIN_OPMASK; if (info == XLOG_BRIN_CREATE_INDEX) { xl_brin_createidx *xlrec = (xl_brin_createidx *) rec; - appendStringInfo(buf, "v%d pagesPerRange %u rel %u/%u/%u", - xlrec->version, xlrec->pagesPerRange, - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode); + appendStringInfo(buf, "v%d pagesPerRange %u", + xlrec->version, xlrec->pagesPerRange); } else if (info == XLOG_BRIN_INSERT) { xl_brin_insert *xlrec = (xl_brin_insert *) rec; - appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u TID (%u,%u)", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, - xlrec->heapBlk, xlrec->revmapBlk, + 
appendStringInfo(buf, "heapBlk %u pagesPerRange %u offnum %u", + xlrec->heapBlk, xlrec->pagesPerRange, - ItemPointerGetBlockNumber(&xlrec->tid), - ItemPointerGetOffsetNumber(&xlrec->tid)); + xlrec->offnum); } else if (info == XLOG_BRIN_UPDATE) { xl_brin_update *xlrec = (xl_brin_update *) rec; - appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u old TID (%u,%u) TID (%u,%u)", - xlrec->insert.node.spcNode, xlrec->insert.node.dbNode, - xlrec->insert.node.relNode, - xlrec->insert.heapBlk, xlrec->insert.revmapBlk, + appendStringInfo(buf, "heapBlk %u pagesPerRange %u old offnum %u, new offnum %u", + xlrec->insert.heapBlk, xlrec->insert.pagesPerRange, - ItemPointerGetBlockNumber(&xlrec->oldtid), - ItemPointerGetOffsetNumber(&xlrec->oldtid), - ItemPointerGetBlockNumber(&xlrec->insert.tid), - ItemPointerGetOffsetNumber(&xlrec->insert.tid)); + xlrec->oldOffnum, + xlrec->insert.offnum); } else if (info == XLOG_BRIN_SAMEPAGE_UPDATE) { xl_brin_samepage_update *xlrec = (xl_brin_samepage_update *) rec; - appendStringInfo(buf, "rel %u/%u/%u TID (%u,%u)", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, - ItemPointerGetBlockNumber(&xlrec->tid), - ItemPointerGetOffsetNumber(&xlrec->tid)); + appendStringInfo(buf, "offnum %u", xlrec->offnum); } else if (info == XLOG_BRIN_REVMAP_EXTEND) { xl_brin_revmap_extend *xlrec = (xl_brin_revmap_extend *) rec; - appendStringInfo(buf, "rel %u/%u/%u targetBlk %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->targetBlk); + appendStringInfo(buf, "targetBlk %u", xlrec->targetBlk); } } diff --git a/src/backend/access/rmgrdesc/clogdesc.c b/src/backend/access/rmgrdesc/clogdesc.c index 4a12e286e4..8de72963e6 100644 --- a/src/backend/access/rmgrdesc/clogdesc.c +++ b/src/backend/access/rmgrdesc/clogdesc.c @@ -18,10 +18,10 @@ void -clog_desc(StringInfo buf, XLogRecord *record) +clog_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = 
record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == CLOG_ZEROPAGE || info == CLOG_TRUNCATE) { diff --git a/src/backend/access/rmgrdesc/dbasedesc.c b/src/backend/access/rmgrdesc/dbasedesc.c index 446e5f97f4..ee1d83baa4 100644 --- a/src/backend/access/rmgrdesc/dbasedesc.c +++ b/src/backend/access/rmgrdesc/dbasedesc.c @@ -19,10 +19,10 @@ void -dbase_desc(StringInfo buf, XLogRecord *record) +dbase_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_DBASE_CREATE) { diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c index 2f783cee2b..8754214f64 100644 --- a/src/backend/access/rmgrdesc/gindesc.c +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -15,16 +15,10 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/xlogutils.h" #include "lib/stringinfo.h" #include "storage/relfilenode.h" -static void -desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno) -{ - appendStringInfo(buf, "node: %u/%u/%u blkno: %u", - node.spcNode, node.dbNode, node.relNode, blkno); -} - static void desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData) { @@ -77,26 +71,25 @@ desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData) } void -gin_desc(StringInfo buf, XLogRecord *record) +gin_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info) { case XLOG_GIN_CREATE_INDEX: - desc_node(buf, *(RelFileNode *) rec, GIN_ROOT_BLKNO); + /* no further information */ break; case XLOG_GIN_CREATE_PTREE: - desc_node(buf, ((ginxlogCreatePostingTree *) rec)->node, ((ginxlogCreatePostingTree *) rec)->blkno); + /* no further information */ break; case XLOG_GIN_INSERT: 
{ ginxlogInsert *xlrec = (ginxlogInsert *) rec; char *payload = rec + sizeof(ginxlogInsert); - desc_node(buf, xlrec->node, xlrec->blkno); - appendStringInfo(buf, " isdata: %c isleaf: %c", + appendStringInfo(buf, "isdata: %c isleaf: %c", (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F', (xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F'); if (!(xlrec->flags & GIN_INSERT_ISLEAF)) @@ -119,7 +112,7 @@ gin_desc(StringInfo buf, XLogRecord *record) ginxlogRecompressDataLeaf *insertData = (ginxlogRecompressDataLeaf *) payload; - if (record->xl_info & XLR_BKP_BLOCK(0)) + if (XLogRecHasBlockImage(record, 0)) appendStringInfo(buf, " (full page image)"); else desc_recompress_leaf(buf, insertData); @@ -139,39 +132,38 @@ gin_desc(StringInfo buf, XLogRecord *record) { ginxlogSplit *xlrec = (ginxlogSplit *) rec; - desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno); - appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->flags & GIN_SPLIT_ROOT) ? 'T' : 'F'); + appendStringInfo(buf, "isrootsplit: %c", + (((ginxlogSplit *) rec)->flags & GIN_SPLIT_ROOT) ? 'T' : 'F'); appendStringInfo(buf, " isdata: %c isleaf: %c", (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F', (xlrec->flags & GIN_INSERT_ISLEAF) ? 
'T' : 'F'); } break; case XLOG_GIN_VACUUM_PAGE: - desc_node(buf, ((ginxlogVacuumPage *) rec)->node, ((ginxlogVacuumPage *) rec)->blkno); + /* no further information */ break; case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: { ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec; - desc_node(buf, xlrec->node, xlrec->blkno); - if (record->xl_info & XLR_BKP_BLOCK(0)) + if (XLogRecHasBlockImage(record, 0)) appendStringInfo(buf, " (full page image)"); else desc_recompress_leaf(buf, &xlrec->data); } break; case XLOG_GIN_DELETE_PAGE: - desc_node(buf, ((ginxlogDeletePage *) rec)->node, ((ginxlogDeletePage *) rec)->blkno); + /* no further information */ break; case XLOG_GIN_UPDATE_META_PAGE: - desc_node(buf, ((ginxlogUpdateMeta *) rec)->node, GIN_METAPAGE_BLKNO); + /* no further information */ break; case XLOG_GIN_INSERT_LISTPAGE: - desc_node(buf, ((ginxlogInsertListPage *) rec)->node, ((ginxlogInsertListPage *) rec)->blkno); + /* no further information */ break; case XLOG_GIN_DELETE_LISTPAGE: - appendStringInfo(buf, "%d pages, ", ((ginxlogDeleteListPages *) rec)->ndeleted); - desc_node(buf, ((ginxlogDeleteListPages *) rec)->node, GIN_METAPAGE_BLKNO); + appendStringInfo(buf, "ndeleted: %d", + ((ginxlogDeleteListPages *) rec)->ndeleted); break; } } diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index db3ba13ccd..576c644c2a 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -18,34 +18,23 @@ #include "lib/stringinfo.h" #include "storage/relfilenode.h" -static void -out_target(StringInfo buf, RelFileNode node) -{ - appendStringInfo(buf, "rel %u/%u/%u", - node.spcNode, node.dbNode, node.relNode); -} - static void out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) { - out_target(buf, xlrec->node); - appendStringInfo(buf, "; block number %u", xlrec->blkno); } static void out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec) { - appendStringInfoString(buf, 
"page_split: "); - out_target(buf, xlrec->node); - appendStringInfo(buf, "; block number %u splits to %d pages", - xlrec->origblkno, xlrec->npage); + appendStringInfo(buf, "page_split: splits to %d pages", + xlrec->npage); } void -gist_desc(StringInfo buf, XLogRecord *record) +gist_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info) { @@ -56,10 +45,6 @@ gist_desc(StringInfo buf, XLogRecord *record) out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec); break; case XLOG_GIST_CREATE_INDEX: - appendStringInfo(buf, "rel %u/%u/%u", - ((RelFileNode *) rec)->spcNode, - ((RelFileNode *) rec)->dbNode, - ((RelFileNode *) rec)->relNode); break; } } diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c index c58461c6ff..71afaa9cbd 100644 --- a/src/backend/access/rmgrdesc/hashdesc.c +++ b/src/backend/access/rmgrdesc/hashdesc.c @@ -17,7 +17,7 @@ #include "access/hash.h" void -hash_desc(StringInfo buf, XLogRecord *record) +hash_desc(StringInfo buf, XLogReaderState *record) { } diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index ee2c073f71..958b0b0e85 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -16,15 +16,6 @@ #include "access/heapam_xlog.h" -static void -out_target(StringInfo buf, xl_heaptid *target) -{ - appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", - target->node.spcNode, target->node.dbNode, target->node.relNode, - ItemPointerGetBlockNumber(&(target->tid)), - ItemPointerGetOffsetNumber(&(target->tid))); -} - static void out_infobits(StringInfo buf, uint8 infobits) { @@ -41,23 +32,23 @@ out_infobits(StringInfo buf, uint8 infobits) } void -heap_desc(StringInfo buf, XLogRecord *record) +heap_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = 
record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; info &= XLOG_HEAP_OPMASK; if (info == XLOG_HEAP_INSERT) { xl_heap_insert *xlrec = (xl_heap_insert *) rec; - out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "off %u", xlrec->offnum); } else if (info == XLOG_HEAP_DELETE) { xl_heap_delete *xlrec = (xl_heap_delete *) rec; - out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "off %u", xlrec->offnum); appendStringInfoChar(buf, ' '); out_infobits(buf, xlrec->infobits_set); } @@ -65,24 +56,24 @@ heap_desc(StringInfo buf, XLogRecord *record) { xl_heap_update *xlrec = (xl_heap_update *) rec; - out_target(buf, &(xlrec->target)); - appendStringInfo(buf, " xmax %u ", xlrec->old_xmax); + appendStringInfo(buf, "off %u xmax %u", + xlrec->old_offnum, + xlrec->old_xmax); out_infobits(buf, xlrec->old_infobits_set); - appendStringInfo(buf, "; new tid %u/%u xmax %u", - ItemPointerGetBlockNumber(&(xlrec->newtid)), - ItemPointerGetOffsetNumber(&(xlrec->newtid)), + appendStringInfo(buf, "; new off %u xmax %u", + xlrec->new_offnum, xlrec->new_xmax); } else if (info == XLOG_HEAP_HOT_UPDATE) { xl_heap_update *xlrec = (xl_heap_update *) rec; - out_target(buf, &(xlrec->target)); - appendStringInfo(buf, " xmax %u ", xlrec->old_xmax); + appendStringInfo(buf, "off %u xmax %u", + xlrec->old_offnum, + xlrec->old_xmax); out_infobits(buf, xlrec->old_infobits_set); - appendStringInfo(buf, "; new tid %u/%u xmax %u", - ItemPointerGetBlockNumber(&(xlrec->newtid)), - ItemPointerGetOffsetNumber(&(xlrec->newtid)), + appendStringInfo(buf, "; new off %u xmax %u", + xlrec->new_offnum, xlrec->new_xmax); } else if (info == XLOG_HEAP_LOCK) @@ -90,40 +81,34 @@ heap_desc(StringInfo buf, XLogRecord *record) xl_heap_lock *xlrec = (xl_heap_lock *) rec; appendStringInfo(buf, "xid %u: ", xlrec->locking_xid); - out_target(buf, &(xlrec->target)); - appendStringInfoChar(buf, ' '); + appendStringInfo(buf, "off %u ", xlrec->offnum); out_infobits(buf, 
xlrec->infobits_set); } else if (info == XLOG_HEAP_INPLACE) { xl_heap_inplace *xlrec = (xl_heap_inplace *) rec; - out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "off %u", xlrec->offnum); } } void -heap2_desc(StringInfo buf, XLogRecord *record) +heap2_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; info &= XLOG_HEAP_OPMASK; if (info == XLOG_HEAP2_CLEAN) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; - appendStringInfo(buf, "rel %u/%u/%u; blk %u remxid %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, - xlrec->latestRemovedXid); + appendStringInfo(buf, "remxid %u", xlrec->latestRemovedXid); } else if (info == XLOG_HEAP2_FREEZE_PAGE) { xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec; - appendStringInfo(buf, "rel %u/%u/%u; blk %u; cutoff xid %u ntuples %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, + appendStringInfo(buf, "cutoff xid %u ntuples %u", xlrec->cutoff_xid, xlrec->ntuples); } else if (info == XLOG_HEAP2_CLEANUP_INFO) @@ -136,17 +121,13 @@ heap2_desc(StringInfo buf, XLogRecord *record) { xl_heap_visible *xlrec = (xl_heap_visible *) rec; - appendStringInfo(buf, "rel %u/%u/%u; blk %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block); + appendStringInfo(buf, "cutoff xid %u", xlrec->cutoff_xid); } else if (info == XLOG_HEAP2_MULTI_INSERT) { xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; - appendStringInfo(buf, "rel %u/%u/%u; blk %u; %d tuples", - xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, - xlrec->blkno, xlrec->ntuples); + appendStringInfo(buf, "%d tuples", xlrec->ntuples); } else if (info == XLOG_HEAP2_LOCK_UPDATED) { @@ -154,13 +135,18 @@ heap2_desc(StringInfo buf, XLogRecord *record) appendStringInfo(buf, "xmax %u msk %04x; ", xlrec->xmax, xlrec->infobits_set); - 
out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "off %u", xlrec->offnum); } else if (info == XLOG_HEAP2_NEW_CID) { xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec; - out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", + xlrec->target_node.spcNode, + xlrec->target_node.dbNode, + xlrec->target_node.relNode, + ItemPointerGetBlockNumber(&(xlrec->target_tid)), + ItemPointerGetOffsetNumber(&(xlrec->target_tid))); appendStringInfo(buf, "; cmin: %u, cmax: %u, combo: %u", xlrec->cmin, xlrec->cmax, xlrec->combocid); } diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index afc5aca197..0902cb73c6 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -47,10 +47,10 @@ out_member(StringInfo buf, MultiXactMember *member) } void -multixact_desc(StringInfo buf, XLogRecord *record) +multixact_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE || info == XLOG_MULTIXACT_ZERO_MEM_PAGE) diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 8b63f2b6ba..85795f6409 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -16,20 +16,11 @@ #include "access/nbtree.h" -static void -out_target(StringInfo buf, xl_btreetid *target) -{ - appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", - target->node.spcNode, target->node.dbNode, target->node.relNode, - ItemPointerGetBlockNumber(&(target->tid)), - ItemPointerGetOffsetNumber(&(target->tid))); -} - void -btree_desc(StringInfo buf, XLogRecord *record) +btree_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch 
(info) { @@ -39,7 +30,7 @@ btree_desc(StringInfo buf, XLogRecord *record) { xl_btree_insert *xlrec = (xl_btree_insert *) rec; - out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "off %u", xlrec->offnum); break; } case XLOG_BTREE_SPLIT_L: @@ -49,11 +40,7 @@ btree_desc(StringInfo buf, XLogRecord *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; - appendStringInfo(buf, "rel %u/%u/%u ", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode); - appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d", - xlrec->leftsib, xlrec->rightsib, xlrec->rnext, + appendStringInfo(buf, "level %u, firstright %d", xlrec->level, xlrec->firstright); break; } @@ -61,9 +48,7 @@ btree_desc(StringInfo buf, XLogRecord *record) { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "rel %u/%u/%u; blk %u, lastBlockVacuumed %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, + appendStringInfo(buf, "lastBlockVacuumed %u", xlrec->lastBlockVacuumed); break; } @@ -71,18 +56,14 @@ btree_desc(StringInfo buf, XLogRecord *record) { xl_btree_delete *xlrec = (xl_btree_delete *) rec; - appendStringInfo(buf, "index %u/%u/%u; iblk %u, heap %u/%u/%u;", - xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, - xlrec->block, - xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode); + appendStringInfo(buf, "%d items", xlrec->nitems); break; } case XLOG_BTREE_MARK_PAGE_HALFDEAD: { xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) rec; - out_target(buf, &(xlrec->target)); - appendStringInfo(buf, "; topparent %u; leaf %u; left %u; right %u", + appendStringInfo(buf, "topparent %u; leaf %u; left %u; right %u", xlrec->topparent, xlrec->leafblk, xlrec->leftblk, xlrec->rightblk); break; } @@ -91,22 +72,19 @@ btree_desc(StringInfo buf, XLogRecord *record) { xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec; - appendStringInfo(buf, "rel %u/%u/%u; ", - xlrec->node.spcNode, 
xlrec->node.dbNode, xlrec->node.relNode); - appendStringInfo(buf, "dead %u; left %u; right %u; btpo_xact %u; ", - xlrec->deadblk, xlrec->leftsib, xlrec->rightsib, xlrec->btpo_xact); - appendStringInfo(buf, "leaf %u; leafleft %u; leafright %u; topparent %u", - xlrec->leafblk, xlrec->leafleftsib, xlrec->leafrightsib, xlrec->topparent); + appendStringInfo(buf, "left %u; right %u; btpo_xact %u; ", + xlrec->leftsib, xlrec->rightsib, + xlrec->btpo_xact); + appendStringInfo(buf, "leafleft %u; leafright %u; topparent %u", + xlrec->leafleftsib, xlrec->leafrightsib, + xlrec->topparent); break; } case XLOG_BTREE_NEWROOT: { xl_btree_newroot *xlrec = (xl_btree_newroot *) rec; - appendStringInfo(buf, "rel %u/%u/%u; root %u lev %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, - xlrec->rootblk, xlrec->level); + appendStringInfo(buf, "lev %u", xlrec->level); break; } case XLOG_BTREE_REUSE_PAGE: @@ -115,7 +93,7 @@ btree_desc(StringInfo buf, XLogRecord *record) appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u", xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->latestRemovedXid); + xlrec->node.relNode, xlrec->latestRemovedXid); break; } } diff --git a/src/backend/access/rmgrdesc/relmapdesc.c b/src/backend/access/rmgrdesc/relmapdesc.c index ef7c533fe5..5bda1da25c 100644 --- a/src/backend/access/rmgrdesc/relmapdesc.c +++ b/src/backend/access/rmgrdesc/relmapdesc.c @@ -17,10 +17,10 @@ #include "utils/relmapper.h" void -relmap_desc(StringInfo buf, XLogRecord *record) +relmap_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_RELMAP_UPDATE) { diff --git a/src/backend/access/rmgrdesc/seqdesc.c b/src/backend/access/rmgrdesc/seqdesc.c index 73de3969df..b8da96310c 100644 --- a/src/backend/access/rmgrdesc/seqdesc.c +++ b/src/backend/access/rmgrdesc/seqdesc.c @@ -18,10 +18,10 @@ void 
-seq_desc(StringInfo buf, XLogRecord *record) +seq_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; xl_seq_rec *xlrec = (xl_seq_rec *) rec; if (info == XLOG_SEQ_LOG) diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index 109e3eaf04..4e8c06f5b9 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -19,10 +19,10 @@ void -smgr_desc(StringInfo buf, XLogRecord *record) +smgr_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_SMGR_CREATE) { diff --git a/src/backend/access/rmgrdesc/spgdesc.c b/src/backend/access/rmgrdesc/spgdesc.c index 3ee0427dcb..319c5f9d70 100644 --- a/src/backend/access/rmgrdesc/spgdesc.c +++ b/src/backend/access/rmgrdesc/spgdesc.c @@ -16,70 +16,66 @@ #include "access/spgist_private.h" -static void -out_target(StringInfo buf, RelFileNode node) -{ - appendStringInfo(buf, "rel %u/%u/%u ", - node.spcNode, node.dbNode, node.relNode); -} - void -spg_desc(StringInfo buf, XLogRecord *record) +spg_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info) { case XLOG_SPGIST_CREATE_INDEX: - appendStringInfo(buf, "rel %u/%u/%u", - ((RelFileNode *) rec)->spcNode, - ((RelFileNode *) rec)->dbNode, - ((RelFileNode *) rec)->relNode); break; case XLOG_SPGIST_ADD_LEAF: - out_target(buf, ((spgxlogAddLeaf *) rec)->node); - appendStringInfo(buf, "%u", - ((spgxlogAddLeaf *) rec)->blknoLeaf); + { + spgxlogAddLeaf *xlrec = (spgxlogAddLeaf *) rec; + + appendStringInfo(buf, "add leaf to page"); + appendStringInfo(buf, "; off %u; headoff %u; parentoff %u", + 
xlrec->offnumLeaf, xlrec->offnumHeadLeaf, + xlrec->offnumParent); + if (xlrec->newPage) + appendStringInfo(buf, " (newpage)"); + if (xlrec->storesNulls) + appendStringInfo(buf, " (nulls)"); + } break; case XLOG_SPGIST_MOVE_LEAFS: - out_target(buf, ((spgxlogMoveLeafs *) rec)->node); - appendStringInfo(buf, "%u leafs from page %u to page %u", - ((spgxlogMoveLeafs *) rec)->nMoves, - ((spgxlogMoveLeafs *) rec)->blknoSrc, - ((spgxlogMoveLeafs *) rec)->blknoDst); + appendStringInfo(buf, "%u leafs", + ((spgxlogMoveLeafs *) rec)->nMoves); break; case XLOG_SPGIST_ADD_NODE: - out_target(buf, ((spgxlogAddNode *) rec)->node); - appendStringInfo(buf, "%u:%u", - ((spgxlogAddNode *) rec)->blkno, + appendStringInfo(buf, "off %u", ((spgxlogAddNode *) rec)->offnum); break; case XLOG_SPGIST_SPLIT_TUPLE: - out_target(buf, ((spgxlogSplitTuple *) rec)->node); - appendStringInfo(buf, "%u:%u to %u:%u", - ((spgxlogSplitTuple *) rec)->blknoPrefix, + appendStringInfo(buf, "prefix off: %u, postfix off: %u (same %d, new %d)", ((spgxlogSplitTuple *) rec)->offnumPrefix, - ((spgxlogSplitTuple *) rec)->blknoPostfix, - ((spgxlogSplitTuple *) rec)->offnumPostfix); + ((spgxlogSplitTuple *) rec)->offnumPostfix, + ((spgxlogSplitTuple *) rec)->postfixBlkSame, + ((spgxlogSplitTuple *) rec)->newPage + ); break; case XLOG_SPGIST_PICKSPLIT: - out_target(buf, ((spgxlogPickSplit *) rec)->node); + { + spgxlogPickSplit *xlrec = (spgxlogPickSplit *) rec; + + appendStringInfo(buf, "ndel %u; nins %u", + xlrec->nDelete, xlrec->nInsert); + if (xlrec->innerIsParent) + appendStringInfo(buf, " (innerIsParent)"); + if (xlrec->isRootSplit) + appendStringInfo(buf, " (isRootSplit)"); + } break; case XLOG_SPGIST_VACUUM_LEAF: - out_target(buf, ((spgxlogVacuumLeaf *) rec)->node); - appendStringInfo(buf, "page %u", - ((spgxlogVacuumLeaf *) rec)->blkno); + /* no further information */ break; case XLOG_SPGIST_VACUUM_ROOT: - out_target(buf, ((spgxlogVacuumRoot *) rec)->node); - appendStringInfo(buf, "page %u", - 
((spgxlogVacuumRoot *) rec)->blkno); + /* no further information */ break; case XLOG_SPGIST_VACUUM_REDIRECT: - out_target(buf, ((spgxlogVacuumRedirect *) rec)->node); - appendStringInfo(buf, "page %u, newest XID %u", - ((spgxlogVacuumRedirect *) rec)->blkno, + appendStringInfo(buf, "newest XID %u", ((spgxlogVacuumRedirect *) rec)->newestRedirectXid); break; } diff --git a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c index d09041f8df..0ce1aa325c 100644 --- a/src/backend/access/rmgrdesc/standbydesc.c +++ b/src/backend/access/rmgrdesc/standbydesc.c @@ -37,10 +37,10 @@ standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec) } void -standby_desc(StringInfo buf, XLogRecord *record) +standby_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_STANDBY_LOCK) { diff --git a/src/backend/access/rmgrdesc/tblspcdesc.c b/src/backend/access/rmgrdesc/tblspcdesc.c index b6b0e6394d..8b2ebb4d92 100644 --- a/src/backend/access/rmgrdesc/tblspcdesc.c +++ b/src/backend/access/rmgrdesc/tblspcdesc.c @@ -18,10 +18,10 @@ void -tblspc_desc(StringInfo buf, XLogRecord *record) +tblspc_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_TBLSPC_CREATE) { diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 22a22efc73..f5450a9b25 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -137,10 +137,10 @@ xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) } void -xact_desc(StringInfo buf, XLogRecord *record) +xact_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; 
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_XACT_COMMIT_COMPACT) { diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index e0957ff3a8..4088ba99b7 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -32,10 +32,10 @@ const struct config_enum_entry wal_level_options[] = { }; void -xlog_desc(StringInfo buf, XLogRecord *record) +xlog_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_CHECKPOINT_SHUTDOWN || info == XLOG_CHECKPOINT_ONLINE) @@ -76,11 +76,7 @@ xlog_desc(StringInfo buf, XLogRecord *record) } else if (info == XLOG_FPI) { - BkpBlock *bkp = (BkpBlock *) rec; - - appendStringInfo(buf, "%s block %u", - relpathperm(bkp->node, bkp->fork), - bkp->block); + /* no further information to print */ } else if (info == XLOG_BACKUP_END) { diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c index 21a071ab19..1a17cc467e 100644 --- a/src/backend/access/spgist/spgdoinsert.c +++ b/src/backend/access/spgist/spgdoinsert.c @@ -16,8 +16,8 @@ #include "postgres.h" #include "access/genam.h" -#include "access/xloginsert.h" #include "access/spgist_private.h" +#include "access/xloginsert.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "utils/rel.h" @@ -202,25 +202,17 @@ static void addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, SPPageDesc *current, SPPageDesc *parent, bool isNulls, bool isNew) { - XLogRecData rdata[4]; spgxlogAddLeaf xlrec; - xlrec.node = index->rd_node; - xlrec.blknoLeaf = current->blkno; xlrec.newPage = isNew; xlrec.storesNulls = isNulls; /* these will be filled below as needed */ xlrec.offnumLeaf = InvalidOffsetNumber; xlrec.offnumHeadLeaf = InvalidOffsetNumber; - xlrec.blknoParent = InvalidBlockNumber; 
xlrec.offnumParent = InvalidOffsetNumber; xlrec.nodeI = 0; - ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); - ACCEPT_RDATA_DATA(leafTuple, leafTuple->size, 1); - ACCEPT_RDATA_BUFFER(current->buffer, 2); - START_CRIT_SECTION(); if (current->offnum == InvalidOffsetNumber || @@ -237,13 +229,10 @@ addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, /* Must update parent's downlink if any */ if (parent->buffer != InvalidBuffer) { - xlrec.blknoParent = parent->blkno; xlrec.offnumParent = parent->offnum; xlrec.nodeI = parent->node; saveNodeLink(index, parent, current->blkno, current->offnum); - - ACCEPT_RDATA_BUFFER(parent->buffer, 3); } } else @@ -303,12 +292,20 @@ addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, { XLogRecPtr recptr; - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_LEAF, rdata); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *) leafTuple, leafTuple->size); + + XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD); + if (xlrec.offnumParent != InvalidOffsetNumber) + XLogRegisterBuffer(1, parent->buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_LEAF); PageSetLSN(current->page, recptr); /* update parent only if we actually changed it */ - if (xlrec.blknoParent != InvalidBlockNumber) + if (xlrec.offnumParent != InvalidOffsetNumber) { PageSetLSN(parent->page, recptr); } @@ -399,7 +396,6 @@ moveLeafs(Relation index, SpGistState *state, OffsetNumber *toDelete; OffsetNumber *toInsert; BlockNumber nblkno; - XLogRecData rdata[7]; spgxlogMoveLeafs xlrec; char *leafdata, *leafptr; @@ -455,20 +451,6 @@ moveLeafs(Relation index, SpGistState *state, nblkno = BufferGetBlockNumber(nbuf); Assert(nblkno != current->blkno); - /* prepare WAL info */ - xlrec.node = index->rd_node; - STORE_STATE(state, xlrec.stateSrc); - - xlrec.blknoSrc = current->blkno; - xlrec.blknoDst = nblkno; - xlrec.nMoves = nDelete; - xlrec.replaceDead = replaceDead; - 
xlrec.storesNulls = isNulls; - - xlrec.blknoParent = parent->blkno; - xlrec.offnumParent = parent->offnum; - xlrec.nodeI = parent->node; - leafdata = leafptr = palloc(size); START_CRIT_SECTION(); @@ -533,15 +515,29 @@ moveLeafs(Relation index, SpGistState *state, { XLogRecPtr recptr; - ACCEPT_RDATA_DATA(&xlrec, SizeOfSpgxlogMoveLeafs, 0); - ACCEPT_RDATA_DATA(toDelete, sizeof(OffsetNumber) * nDelete, 1); - ACCEPT_RDATA_DATA(toInsert, sizeof(OffsetNumber) * nInsert, 2); - ACCEPT_RDATA_DATA(leafdata, leafptr - leafdata, 3); - ACCEPT_RDATA_BUFFER(current->buffer, 4); - ACCEPT_RDATA_BUFFER(nbuf, 5); - ACCEPT_RDATA_BUFFER(parent->buffer, 6); + /* prepare WAL info */ + STORE_STATE(state, xlrec.stateSrc); - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_MOVE_LEAFS, rdata); + xlrec.nMoves = nDelete; + xlrec.replaceDead = replaceDead; + xlrec.storesNulls = isNulls; + + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogMoveLeafs); + XLogRegisterData((char *) toDelete, + sizeof(OffsetNumber) * nDelete); + XLogRegisterData((char *) toInsert, + sizeof(OffsetNumber) * nInsert); + XLogRegisterData((char *) leafdata, leafptr - leafdata); + + XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, nbuf, REGBUF_STANDARD | (xlrec.newPage ? 
REGBUF_WILL_INIT : 0)); + XLogRegisterBuffer(2, parent->buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_MOVE_LEAFS); PageSetLSN(current->page, recptr); PageSetLSN(npage, recptr); @@ -701,8 +697,6 @@ doPickSplit(Relation index, SpGistState *state, int currentFreeSpace; int totalLeafSizes; bool allTheSame; - XLogRecData rdata[10]; - int nRdata; spgxlogPickSplit xlrec; char *leafdata, *leafptr; @@ -725,7 +719,6 @@ doPickSplit(Relation index, SpGistState *state, newLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n); leafPageSelect = (uint8 *) palloc(sizeof(uint8) * n); - xlrec.node = index->rd_node; STORE_STATE(state, xlrec.stateSrc); /* @@ -971,10 +964,6 @@ doPickSplit(Relation index, SpGistState *state, } /* - * Because a WAL record can't involve more than four buffers, we can only - * afford to deal with two leaf pages in each picksplit action, ie the - * current page and at most one other. - * * The new leaf tuples converted from the existing ones should require the * same or less space, and therefore should all fit onto one page * (although that's not necessarily the current page, since we can't @@ -1108,17 +1097,13 @@ doPickSplit(Relation index, SpGistState *state, } /* Start preparing WAL record */ - xlrec.blknoSrc = current->blkno; - xlrec.blknoDest = InvalidBlockNumber; xlrec.nDelete = 0; xlrec.initSrc = isNew; xlrec.storesNulls = isNulls; + xlrec.isRootSplit = SpGistBlockIsRoot(current->blkno); leafdata = leafptr = (char *) palloc(totalLeafSizes); - ACCEPT_RDATA_DATA(&xlrec, SizeOfSpgxlogPickSplit, 0); - nRdata = 1; - /* Here we begin making the changes to the target pages */ START_CRIT_SECTION(); @@ -1150,12 +1135,6 @@ doPickSplit(Relation index, SpGistState *state, else { xlrec.nDelete = nToDelete; - ACCEPT_RDATA_DATA(toDelete, - sizeof(OffsetNumber) * nToDelete, - nRdata); - nRdata++; - ACCEPT_RDATA_BUFFER(current->buffer, nRdata); - nRdata++; if (!state->isBuild) { @@ -1240,25 +1219,8 @@ doPickSplit(Relation index, 
SpGistState *state, if (newLeafBuffer != InvalidBuffer) { MarkBufferDirty(newLeafBuffer); - /* also save block number for WAL */ - xlrec.blknoDest = BufferGetBlockNumber(newLeafBuffer); - if (!xlrec.initDest) - { - ACCEPT_RDATA_BUFFER(newLeafBuffer, nRdata); - nRdata++; - } } - xlrec.nInsert = nToInsert; - ACCEPT_RDATA_DATA(toInsert, sizeof(OffsetNumber) * nToInsert, nRdata); - nRdata++; - ACCEPT_RDATA_DATA(leafPageSelect, sizeof(uint8) * nToInsert, nRdata); - nRdata++; - ACCEPT_RDATA_DATA(innerTuple, innerTuple->size, nRdata); - nRdata++; - ACCEPT_RDATA_DATA(leafdata, leafptr - leafdata, nRdata); - nRdata++; - /* Remember current buffer, since we're about to change "current" */ saveCurrent = *current; @@ -1276,7 +1238,6 @@ doPickSplit(Relation index, SpGistState *state, current->blkno = parent->blkno; current->buffer = parent->buffer; current->page = parent->page; - xlrec.blknoInner = current->blkno; xlrec.offnumInner = current->offnum = SpGistPageAddNewItem(state, current->page, (Item) innerTuple, innerTuple->size, @@ -1285,14 +1246,11 @@ doPickSplit(Relation index, SpGistState *state, /* * Update parent node link and mark parent page dirty */ - xlrec.blknoParent = parent->blkno; + xlrec.innerIsParent = true; xlrec.offnumParent = parent->offnum; xlrec.nodeI = parent->node; saveNodeLink(index, parent, current->blkno, current->offnum); - ACCEPT_RDATA_BUFFER(parent->buffer, nRdata); - nRdata++; - /* * Update redirection link (in old current buffer) */ @@ -1314,7 +1272,6 @@ doPickSplit(Relation index, SpGistState *state, current->buffer = newInnerBuffer; current->blkno = BufferGetBlockNumber(current->buffer); current->page = BufferGetPage(current->buffer); - xlrec.blknoInner = current->blkno; xlrec.offnumInner = current->offnum = SpGistPageAddNewItem(state, current->page, (Item) innerTuple, innerTuple->size, @@ -1326,16 +1283,11 @@ doPickSplit(Relation index, SpGistState *state, /* * Update parent node link and mark parent page dirty */ - xlrec.blknoParent = 
parent->blkno; + xlrec.innerIsParent = (parent->buffer == current->buffer); xlrec.offnumParent = parent->offnum; xlrec.nodeI = parent->node; saveNodeLink(index, parent, current->blkno, current->offnum); - ACCEPT_RDATA_BUFFER(current->buffer, nRdata); - nRdata++; - ACCEPT_RDATA_BUFFER(parent->buffer, nRdata); - nRdata++; - /* * Update redirection link (in old current buffer) */ @@ -1357,8 +1309,8 @@ doPickSplit(Relation index, SpGistState *state, SpGistInitBuffer(current->buffer, (isNulls ? SPGIST_NULLS : 0)); xlrec.initInner = true; + xlrec.innerIsParent = false; - xlrec.blknoInner = current->blkno; xlrec.offnumInner = current->offnum = PageAddItem(current->page, (Item) innerTuple, innerTuple->size, InvalidOffsetNumber, false, false); @@ -1367,7 +1319,6 @@ doPickSplit(Relation index, SpGistState *state, innerTuple->size); /* No parent link to update, nor redirection to do */ - xlrec.blknoParent = InvalidBlockNumber; xlrec.offnumParent = InvalidOffsetNumber; xlrec.nodeI = 0; @@ -1381,9 +1332,46 @@ doPickSplit(Relation index, SpGistState *state, if (RelationNeedsWAL(index)) { XLogRecPtr recptr; + int flags; + + XLogBeginInsert(); + + xlrec.nInsert = nToInsert; + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogPickSplit); + + XLogRegisterData((char *) toDelete, + sizeof(OffsetNumber) * xlrec.nDelete); + XLogRegisterData((char *) toInsert, + sizeof(OffsetNumber) * xlrec.nInsert); + XLogRegisterData((char *) leafPageSelect, + sizeof(uint8) * xlrec.nInsert); + XLogRegisterData((char *) innerTuple, innerTuple->size); + XLogRegisterData(leafdata, leafptr - leafdata); + + flags = REGBUF_STANDARD; + if (xlrec.initSrc) + flags |= REGBUF_WILL_INIT; + if (BufferIsValid(saveCurrent.buffer)) + XLogRegisterBuffer(0, saveCurrent.buffer, flags); + + if (BufferIsValid(newLeafBuffer)) + { + flags = REGBUF_STANDARD; + if (xlrec.initDest) + flags |= REGBUF_WILL_INIT; + XLogRegisterBuffer(1, newLeafBuffer, flags); + } + XLogRegisterBuffer(2, current->buffer, REGBUF_STANDARD); + if 
(parent->buffer != InvalidBuffer) + { + if (parent->buffer != current->buffer) + XLogRegisterBuffer(3, parent->buffer, REGBUF_STANDARD); + else + Assert(xlrec.innerIsParent); + } /* Issue the WAL record */ - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_PICKSPLIT, rdata); + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_PICKSPLIT); /* Update page LSNs on all affected pages */ if (newLeafBuffer != InvalidBuffer) @@ -1489,7 +1477,6 @@ spgAddNodeAction(Relation index, SpGistState *state, int nodeN, Datum nodeLabel) { SpGistInnerTuple newInnerTuple; - XLogRecData rdata[5]; spgxlogAddNode xlrec; /* Should not be applied to nulls */ @@ -1499,25 +1486,18 @@ spgAddNodeAction(Relation index, SpGistState *state, newInnerTuple = addNode(state, innerTuple, nodeLabel, nodeN); /* Prepare WAL record */ - xlrec.node = index->rd_node; STORE_STATE(state, xlrec.stateSrc); - xlrec.blkno = current->blkno; xlrec.offnum = current->offnum; /* we don't fill these unless we need to change the parent downlink */ - xlrec.blknoParent = InvalidBlockNumber; + xlrec.parentBlk = -1; xlrec.offnumParent = InvalidOffsetNumber; xlrec.nodeI = 0; /* we don't fill these unless tuple has to be moved */ - xlrec.blknoNew = InvalidBlockNumber; xlrec.offnumNew = InvalidOffsetNumber; xlrec.newPage = false; - ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); - ACCEPT_RDATA_DATA(newInnerTuple, newInnerTuple->size, 1); - ACCEPT_RDATA_BUFFER(current->buffer, 2); - if (PageGetExactFreeSpace(current->page) >= newInnerTuple->size - innerTuple->size) { @@ -1539,7 +1519,13 @@ spgAddNodeAction(Relation index, SpGistState *state, { XLogRecPtr recptr; - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE, rdata); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *) newInnerTuple, newInnerTuple->size); + + XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE); PageSetLSN(current->page, recptr); } @@ -1565,7 +1551,6 
@@ spgAddNodeAction(Relation index, SpGistState *state, saveCurrent = *current; - xlrec.blknoParent = parent->blkno; xlrec.offnumParent = parent->offnum; xlrec.nodeI = parent->node; @@ -1580,8 +1565,6 @@ spgAddNodeAction(Relation index, SpGistState *state, current->blkno = BufferGetBlockNumber(current->buffer); current->page = BufferGetPage(current->buffer); - xlrec.blknoNew = current->blkno; - /* * Let's just make real sure new current isn't same as old. Right now * that's impossible, but if SpGistGetBuffer ever got smart enough to @@ -1590,17 +1573,19 @@ spgAddNodeAction(Relation index, SpGistState *state, * replay would be subtly wrong, so I think a mere assert isn't enough * here. */ - if (xlrec.blknoNew == xlrec.blkno) + if (current->blkno == saveCurrent.blkno) elog(ERROR, "SPGiST new buffer shouldn't be same as old buffer"); /* * New current and parent buffer will both be modified; but note that * parent buffer could be same as either new or old current. */ - ACCEPT_RDATA_BUFFER(current->buffer, 3); - if (parent->buffer != current->buffer && - parent->buffer != saveCurrent.buffer) - ACCEPT_RDATA_BUFFER(parent->buffer, 4); + if (parent->buffer == saveCurrent.buffer) + xlrec.parentBlk = 0; + else if (parent->buffer == current->buffer) + xlrec.parentBlk = 1; + else + xlrec.parentBlk = 2; START_CRIT_SECTION(); @@ -1647,7 +1632,20 @@ spgAddNodeAction(Relation index, SpGistState *state, { XLogRecPtr recptr; - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE, rdata); + XLogBeginInsert(); + + /* orig page */ + XLogRegisterBuffer(0, saveCurrent.buffer, REGBUF_STANDARD); + /* new page */ + XLogRegisterBuffer(1, current->buffer, REGBUF_STANDARD); + /* parent page (if different from orig and new) */ + if (xlrec.parentBlk == 2) + XLogRegisterBuffer(2, parent->buffer, REGBUF_STANDARD); + + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *) newInnerTuple, newInnerTuple->size); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE); /* 
we don't bother to check if any of these are redundant */ PageSetLSN(current->page, recptr); @@ -1682,7 +1680,6 @@ spgSplitNodeAction(Relation index, SpGistState *state, BlockNumber postfixBlkno; OffsetNumber postfixOffset; int i; - XLogRecData rdata[5]; spgxlogSplitTuple xlrec; Buffer newBuffer = InvalidBuffer; @@ -1725,14 +1722,8 @@ spgSplitNodeAction(Relation index, SpGistState *state, postfixTuple->allTheSame = innerTuple->allTheSame; /* prep data for WAL record */ - xlrec.node = index->rd_node; xlrec.newPage = false; - ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); - ACCEPT_RDATA_DATA(prefixTuple, prefixTuple->size, 1); - ACCEPT_RDATA_DATA(postfixTuple, postfixTuple->size, 2); - ACCEPT_RDATA_BUFFER(current->buffer, 3); - /* * If we can't fit both tuples on the current page, get a new page for the * postfix tuple. In particular, can't split to the root page. @@ -1752,7 +1743,6 @@ spgSplitNodeAction(Relation index, SpGistState *state, GBUF_INNER_PARITY(current->blkno + 1), postfixTuple->size + sizeof(ItemIdData), &xlrec.newPage); - ACCEPT_RDATA_BUFFER(newBuffer, 4); } START_CRIT_SECTION(); @@ -1767,27 +1757,28 @@ spgSplitNodeAction(Relation index, SpGistState *state, if (xlrec.offnumPrefix != current->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", prefixTuple->size); - xlrec.blknoPrefix = current->blkno; /* * put postfix tuple into appropriate page */ if (newBuffer == InvalidBuffer) { - xlrec.blknoPostfix = postfixBlkno = current->blkno; + postfixBlkno = current->blkno; xlrec.offnumPostfix = postfixOffset = SpGistPageAddNewItem(state, current->page, (Item) postfixTuple, postfixTuple->size, NULL, false); + xlrec.postfixBlkSame = true; } else { - xlrec.blknoPostfix = postfixBlkno = BufferGetBlockNumber(newBuffer); + postfixBlkno = BufferGetBlockNumber(newBuffer); xlrec.offnumPostfix = postfixOffset = SpGistPageAddNewItem(state, BufferGetPage(newBuffer), (Item) postfixTuple, postfixTuple->size, NULL, false); MarkBufferDirty(newBuffer); + 
xlrec.postfixBlkSame = false; } /* @@ -1808,7 +1799,23 @@ spgSplitNodeAction(Relation index, SpGistState *state, { XLogRecPtr recptr; - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_SPLIT_TUPLE, rdata); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *) prefixTuple, prefixTuple->size); + XLogRegisterData((char *) postfixTuple, postfixTuple->size); + + XLogRegisterBuffer(0, current->buffer, REGBUF_STANDARD); + if (newBuffer != InvalidBuffer) + { + int flags; + + flags = REGBUF_STANDARD; + if (xlrec.newPage) + flags |= REGBUF_WILL_INIT; + XLogRegisterBuffer(1, newBuffer, flags); + } + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_SPLIT_TUPLE); PageSetLSN(current->page, recptr); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index e1dfc8e358..f168ac5c5c 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -105,15 +105,18 @@ spgbuild(PG_FUNCTION_ARGS) if (RelationNeedsWAL(index)) { XLogRecPtr recptr; - XLogRecData rdata; - /* WAL data is just the relfilenode */ - rdata.data = (char *) &(index->rd_node); - rdata.len = sizeof(RelFileNode); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; + XLogBeginInsert(); - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_CREATE_INDEX, &rdata); + /* + * Replay will re-initialize the pages, so don't take full pages + * images. No other data to log. 
+ */ + XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, rootbuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); + XLogRegisterBuffer(2, nullbuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_CREATE_INDEX); PageSetLSN(BufferGetPage(metabuffer), recptr); PageSetLSN(BufferGetPage(rootbuffer), recptr); diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 2e05d22b74..c95b80b5c7 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -127,7 +127,6 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, { Page page = BufferGetPage(buffer); spgxlogVacuumLeaf xlrec; - XLogRecData rdata[8]; OffsetNumber toDead[MaxIndexTuplesPerPage]; OffsetNumber toPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber moveSrc[MaxIndexTuplesPerPage]; @@ -323,20 +322,6 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove) elog(ERROR, "inconsistent counts of deletable tuples"); - /* Prepare WAL record */ - xlrec.node = index->rd_node; - xlrec.blkno = BufferGetBlockNumber(buffer); - STORE_STATE(&bds->spgstate, xlrec.stateSrc); - - ACCEPT_RDATA_DATA(&xlrec, SizeOfSpgxlogVacuumLeaf, 0); - ACCEPT_RDATA_DATA(toDead, sizeof(OffsetNumber) * xlrec.nDead, 1); - ACCEPT_RDATA_DATA(toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder, 2); - ACCEPT_RDATA_DATA(moveSrc, sizeof(OffsetNumber) * xlrec.nMove, 3); - ACCEPT_RDATA_DATA(moveDest, sizeof(OffsetNumber) * xlrec.nMove, 4); - ACCEPT_RDATA_DATA(chainSrc, sizeof(OffsetNumber) * xlrec.nChain, 5); - ACCEPT_RDATA_DATA(chainDest, sizeof(OffsetNumber) * xlrec.nChain, 6); - ACCEPT_RDATA_BUFFER(buffer, 7); - /* Do the updates */ START_CRIT_SECTION(); @@ -389,7 +374,22 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, { XLogRecPtr recptr; - recptr = XLogInsert(RM_SPGIST_ID, 
XLOG_SPGIST_VACUUM_LEAF, rdata); + XLogBeginInsert(); + + STORE_STATE(&bds->spgstate, xlrec.stateSrc); + + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumLeaf); + /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ + XLogRegisterData((char *) toDead, sizeof(OffsetNumber) * xlrec.nDead); + XLogRegisterData((char *) toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder); + XLogRegisterData((char *) moveSrc, sizeof(OffsetNumber) * xlrec.nMove); + XLogRegisterData((char *) moveDest, sizeof(OffsetNumber) * xlrec.nMove); + XLogRegisterData((char *) chainSrc, sizeof(OffsetNumber) * xlrec.nChain); + XLogRegisterData((char *) chainDest, sizeof(OffsetNumber) * xlrec.nChain); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF); PageSetLSN(page, recptr); } @@ -407,12 +407,10 @@ vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); spgxlogVacuumRoot xlrec; - XLogRecData rdata[3]; OffsetNumber toDelete[MaxIndexTuplesPerPage]; OffsetNumber i, max = PageGetMaxOffsetNumber(page); - xlrec.blkno = BufferGetBlockNumber(buffer); xlrec.nDelete = 0; /* Scan page, identify tuples to delete, accumulate stats */ @@ -448,15 +446,6 @@ vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) if (xlrec.nDelete == 0) return; /* nothing more to do */ - /* Prepare WAL record */ - xlrec.node = index->rd_node; - STORE_STATE(&bds->spgstate, xlrec.stateSrc); - - ACCEPT_RDATA_DATA(&xlrec, SizeOfSpgxlogVacuumRoot, 0); - /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ - ACCEPT_RDATA_DATA(toDelete, sizeof(OffsetNumber) * xlrec.nDelete, 1); - ACCEPT_RDATA_BUFFER(buffer, 2); - /* Do the update */ START_CRIT_SECTION(); @@ -469,7 +458,19 @@ vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) { XLogRecPtr recptr; - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT, rdata); + XLogBeginInsert(); + + /* Prepare WAL 
record */ + STORE_STATE(&bds->spgstate, xlrec.stateSrc); + + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRoot); + /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ + XLogRegisterData((char *) toDelete, + sizeof(OffsetNumber) * xlrec.nDelete); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT); PageSetLSN(page, recptr); } @@ -499,10 +500,7 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber itemnos[MaxIndexTuplesPerPage]; spgxlogVacuumRedirect xlrec; - XLogRecData rdata[3]; - xlrec.node = index->rd_node; - xlrec.blkno = BufferGetBlockNumber(buffer); xlrec.nToPlaceholder = 0; xlrec.newestRedirectXid = InvalidTransactionId; @@ -585,11 +583,15 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) { XLogRecPtr recptr; - ACCEPT_RDATA_DATA(&xlrec, SizeOfSpgxlogVacuumRedirect, 0); - ACCEPT_RDATA_DATA(itemToPlaceholder, sizeof(OffsetNumber) * xlrec.nToPlaceholder, 1); - ACCEPT_RDATA_BUFFER(buffer, 2); + XLogBeginInsert(); - recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT, rdata); + XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRedirect); + XLogRegisterData((char *) itemToPlaceholder, + sizeof(OffsetNumber) * xlrec.nToPlaceholder); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT); PageSetLSN(page, recptr); } diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 920739436a..ac6d4bd369 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -71,33 +71,30 @@ addOrReplaceTuple(Page page, Item tuple, int size, OffsetNumber offset) } static void -spgRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) +spgRedoCreateIndex(XLogReaderState *record) { - RelFileNode *node = (RelFileNode *) XLogRecGetData(record); + XLogRecPtr lsn = 
record->EndRecPtr; Buffer buffer; Page page; - /* Backup blocks are not used in create_index records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - - buffer = XLogReadBuffer(*node, SPGIST_METAPAGE_BLKNO, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 0); + Assert(BufferGetBlockNumber(buffer) == SPGIST_METAPAGE_BLKNO); page = (Page) BufferGetPage(buffer); SpGistInitMetapage(page); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); - buffer = XLogReadBuffer(*node, SPGIST_ROOT_BLKNO, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 1); + Assert(BufferGetBlockNumber(buffer) == SPGIST_ROOT_BLKNO); SpGistInitBuffer(buffer, SPGIST_LEAF); page = (Page) BufferGetPage(buffer); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); - buffer = XLogReadBuffer(*node, SPGIST_NULL_BLKNO, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 2); + Assert(BufferGetBlockNumber(buffer) == SPGIST_NULL_BLKNO); SpGistInitBuffer(buffer, SPGIST_LEAF | SPGIST_NULLS); page = (Page) BufferGetPage(buffer); PageSetLSN(page, lsn); @@ -106,8 +103,9 @@ spgRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) } static void -spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) +spgRedoAddLeaf(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogAddLeaf *xldata = (spgxlogAddLeaf *) ptr; char *leafTuple; @@ -128,15 +126,13 @@ spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) */ if (xldata->newPage) { - buffer = XLogReadBuffer(xldata->node, xldata->blknoLeaf, true); + buffer = XLogInitBufferForRedo(record, 0); SpGistInitBuffer(buffer, SPGIST_LEAF | (xldata->storesNulls ? 
SPGIST_NULLS : 0)); action = BLK_NEEDS_REDO; } else - action = XLogReadBufferForRedo(lsn, record, 0, - xldata->node, xldata->blknoLeaf, - &buffer); + action = XLogReadBufferForRedo(record, 0, &buffer); if (action == BLK_NEEDS_REDO) { @@ -164,7 +160,8 @@ spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) { /* replacing a DEAD tuple */ PageIndexTupleDelete(page, xldata->offnumLeaf); - if (PageAddItem(page, (Item) leafTuple, leafTupleHdr.size, + if (PageAddItem(page, + (Item) leafTuple, leafTupleHdr.size, xldata->offnumLeaf, false, false) != xldata->offnumLeaf) elog(ERROR, "failed to add item of size %u to SPGiST index page", leafTupleHdr.size); @@ -177,13 +174,14 @@ spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); /* update parent downlink if necessary */ - if (xldata->blknoParent != InvalidBlockNumber) + if (xldata->offnumParent != InvalidOffsetNumber) { - if (XLogReadBufferForRedo(lsn, record, 1, - xldata->node, xldata->blknoParent, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { SpGistInnerTuple tuple; + BlockNumber blknoLeaf; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &blknoLeaf); page = BufferGetPage(buffer); @@ -191,7 +189,7 @@ spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(tuple, xldata->nodeI, - xldata->blknoLeaf, xldata->offnumLeaf); + blknoLeaf, xldata->offnumLeaf); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -202,8 +200,9 @@ spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) } static void -spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) +spgRedoMoveLeafs(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr; SpGistState state; @@ -213,6 +212,9 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; XLogRedoAction action; + BlockNumber blknoDst; + + 
XLogRecGetBlockTag(record, 1, NULL, NULL, &blknoDst); fillFakeState(&state, xldata->stateSrc); @@ -235,15 +237,14 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) /* Insert tuples on the dest page (do first, so redirect is valid) */ if (xldata->newPage) { - buffer = XLogReadBuffer(xldata->node, xldata->blknoDst, true); + buffer = XLogInitBufferForRedo(record, 1); SpGistInitBuffer(buffer, SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); action = BLK_NEEDS_REDO; } else - action = XLogReadBufferForRedo(lsn, record, 1, - xldata->node, xldata->blknoDst, - &buffer); + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) { int i; @@ -260,7 +261,8 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) * field. */ leafTuple = ptr; - memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData)); + memcpy(&leafTupleHdr, leafTuple, + sizeof(SpGistLeafTupleData)); addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size, toInsert[i]); @@ -274,14 +276,14 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); /* Delete tuples from the source page, inserting a redirection pointer */ - if (XLogReadBufferForRedo(lsn, record, 0, xldata->node, xldata->blknoSrc, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); + spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves, state.isBuild ? 
SPGIST_PLACEHOLDER : SPGIST_REDIRECT, SPGIST_PLACEHOLDER, - xldata->blknoDst, + blknoDst, toInsert[nInsert - 1]); PageSetLSN(page, lsn); @@ -291,8 +293,7 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); /* And update the parent downlink */ - if (XLogReadBufferForRedo(lsn, record, 2, xldata->node, xldata->blknoParent, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { SpGistInnerTuple tuple; @@ -302,7 +303,7 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(tuple, xldata->nodeI, - xldata->blknoDst, toInsert[nInsert - 1]); + blknoDst, toInsert[nInsert - 1]); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -312,8 +313,9 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) } static void -spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) +spgRedoAddNode(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogAddNode *xldata = (spgxlogAddNode *) ptr; char *innerTuple; @@ -321,7 +323,6 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) SpGistState state; Buffer buffer; Page page; - int bbi; XLogRedoAction action; ptr += sizeof(spgxlogAddNode); @@ -331,17 +332,18 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) fillFakeState(&state, xldata->stateSrc); - if (xldata->blknoNew == InvalidBlockNumber) + if (!XLogRecHasBlockRef(record, 1)) { /* update in place */ - Assert(xldata->blknoParent == InvalidBlockNumber); - if (XLogReadBufferForRedo(lsn, record, 0, xldata->node, xldata->blkno, - &buffer) == BLK_NEEDS_REDO) + Assert(xldata->parentBlk == -1); + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); + PageIndexTupleDelete(page, xldata->offnum); if (PageAddItem(page, (Item) innerTuple, innerTupleHdr.size, - xldata->offnum, false, false) != xldata->offnum) + xldata->offnum, + false, false) != xldata->offnum) 
elog(ERROR, "failed to add item of size %u to SPGiST index page", innerTupleHdr.size); @@ -353,30 +355,30 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) } else { + BlockNumber blkno; + BlockNumber blknoNew; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &blkno); + XLogRecGetBlockTag(record, 1, NULL, NULL, &blknoNew); + /* * In normal operation we would have all three pages (source, dest, * and parent) locked simultaneously; but in WAL replay it should be * safe to update them one at a time, as long as we do it in the right - * order. - * - * The logic here depends on the assumption that blkno != blknoNew, - * else we can't tell which BKP bit goes with which page, and the LSN - * checks could go wrong too. + * order. We must insert the new tuple before replacing the old tuple + * with the redirect tuple. */ - Assert(xldata->blkno != xldata->blknoNew); /* Install new tuple first so redirect is valid */ if (xldata->newPage) { - buffer = XLogReadBuffer(xldata->node, xldata->blknoNew, true); /* AddNode is not used for nulls pages */ + buffer = XLogInitBufferForRedo(record, 1); SpGistInitBuffer(buffer, 0); action = BLK_NEEDS_REDO; } else - action = XLogReadBufferForRedo(lsn, record, 1, - xldata->node, xldata->blknoNew, - &buffer); + action = XLogReadBufferForRedo(record, 1, &buffer); if (action == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); @@ -385,22 +387,26 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) innerTupleHdr.size, xldata->offnumNew); /* - * If parent is in this same page, don't advance LSN; doing so - * would fool us into not applying the parent downlink update - * below. We'll update the LSN when we fix the parent downlink. + * If parent is in this same page, update it now. 
*/ - if (xldata->blknoParent != xldata->blknoNew) + if (xldata->parentBlk == 1) { - PageSetLSN(page, lsn); + SpGistInnerTuple parentTuple; + + parentTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + spgUpdateNodeLink(parentTuple, xldata->nodeI, + blknoNew, xldata->offnumNew); } + PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Delete old tuple, replacing it with redirect or placeholder tuple */ - if (XLogReadBufferForRedo(lsn, record, 0, xldata->node, xldata->blkno, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { SpGistDeadTuple dt; @@ -412,11 +418,12 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) InvalidOffsetNumber); else dt = spgFormDeadTuple(&state, SPGIST_REDIRECT, - xldata->blknoNew, + blknoNew, xldata->offnumNew); PageIndexTupleDelete(page, xldata->offnum); - if (PageAddItem(page, (Item) dt, dt->size, xldata->offnum, + if (PageAddItem(page, (Item) dt, dt->size, + xldata->offnum, false, false) != xldata->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", dt->size); @@ -427,67 +434,55 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) SpGistPageGetOpaque(page)->nRedirection++; /* - * If parent is in this same page, don't advance LSN; doing so - * would fool us into not applying the parent downlink update - * below. We'll update the LSN when we fix the parent downlink. + * If parent is in this same page, update it now. */ - if (xldata->blknoParent != xldata->blkno) + if (xldata->parentBlk == 0) { - PageSetLSN(page, lsn); + SpGistInnerTuple parentTuple; + + parentTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + spgUpdateNodeLink(parentTuple, xldata->nodeI, + blknoNew, xldata->offnumNew); } + PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* - * Update parent downlink. 
Since parent could be in either of the - * previous two buffers, it's a bit tricky to determine which BKP bit - * applies. + * Update parent downlink (if we didn't do it as part of the source or + * destination page update already). */ - if (xldata->blknoParent == xldata->blkno) - bbi = 0; - else if (xldata->blknoParent == xldata->blknoNew) - bbi = 1; - else - bbi = 2; - - if (record->xl_info & XLR_BKP_BLOCK(bbi)) + if (xldata->parentBlk == 2) { - if (bbi == 2) /* else we already did it */ - (void) RestoreBackupBlock(lsn, record, bbi, false, false); - action = BLK_RESTORED; - buffer = InvalidBuffer; - } - else - { - action = XLogReadBufferForRedo(lsn, record, bbi, xldata->node, - xldata->blknoParent, &buffer); - Assert(action != BLK_RESTORED); - } - if (action == BLK_NEEDS_REDO) - { - SpGistInnerTuple innerTuple; + if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) + { + SpGistInnerTuple parentTuple; - page = BufferGetPage(buffer); + page = BufferGetPage(buffer); - innerTuple = (SpGistInnerTuple) PageGetItem(page, + parentTuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); - spgUpdateNodeLink(innerTuple, xldata->nodeI, - xldata->blknoNew, xldata->offnumNew); + spgUpdateNodeLink(parentTuple, xldata->nodeI, + blknoNew, xldata->offnumNew); - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); } } static void -spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) +spgRedoSplitTuple(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogSplitTuple *xldata = (spgxlogSplitTuple *) ptr; char *prefixTuple; @@ -496,6 +491,7 @@ spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) SpGistInnerTupleData postfixTupleHdr; Buffer buffer; Page page; + XLogRedoAction action; ptr += sizeof(spgxlogSplitTuple); 
prefixTuple = ptr; @@ -513,22 +509,17 @@ spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) */ /* insert postfix tuple first to avoid dangling link */ - if (xldata->blknoPostfix != xldata->blknoPrefix) + if (!xldata->postfixBlkSame) { - XLogRedoAction action; - if (xldata->newPage) { - buffer = XLogReadBuffer(xldata->node, xldata->blknoPostfix, true); + buffer = XLogInitBufferForRedo(record, 1); /* SplitTuple is not used for nulls pages */ SpGistInitBuffer(buffer, 0); action = BLK_NEEDS_REDO; } else - action = XLogReadBufferForRedo(lsn, record, 1, - xldata->node, xldata->blknoPostfix, - &buffer); - + action = XLogReadBufferForRedo(record, 1, &buffer); if (action == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); @@ -544,18 +535,19 @@ spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) } /* now handle the original page */ - if (XLogReadBufferForRedo(lsn, record, 0, xldata->node, xldata->blknoPrefix, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); + PageIndexTupleDelete(page, xldata->offnumPrefix); if (PageAddItem(page, (Item) prefixTuple, prefixTupleHdr.size, xldata->offnumPrefix, false, false) != xldata->offnumPrefix) elog(ERROR, "failed to add item of size %u to SPGiST index page", prefixTupleHdr.size); - if (xldata->blknoPostfix == xldata->blknoPrefix) - addOrReplaceTuple(page, (Item) postfixTuple, postfixTupleHdr.size, + if (xldata->postfixBlkSame) + addOrReplaceTuple(page, (Item) postfixTuple, + postfixTupleHdr.size, xldata->offnumPostfix); PageSetLSN(page, lsn); @@ -566,8 +558,9 @@ spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) } static void -spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) +spgRedoPickSplit(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr; char *innerTuple; @@ -578,14 +571,16 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) uint8 
*leafPageSelect; Buffer srcBuffer; Buffer destBuffer; + Buffer innerBuffer; Page srcPage; Page destPage; - Buffer innerBuffer; Page page; - int bbi; int i; + BlockNumber blknoInner; XLogRedoAction action; + XLogRecGetBlockTag(record, 2, NULL, NULL, &blknoInner); + fillFakeState(&state, xldata->stateSrc); ptr += SizeOfSpgxlogPickSplit; @@ -603,13 +598,7 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) /* now ptr points to the list of leaf tuples */ - /* - * It's a bit tricky to identify which pages have been handled as - * full-page images, so we explicitly count each referenced buffer. - */ - bbi = 0; - - if (SpGistBlockIsRoot(xldata->blknoSrc)) + if (xldata->isRootSplit) { /* when splitting root, we touch it only in the guise of new inner */ srcBuffer = InvalidBuffer; @@ -618,8 +607,7 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) else if (xldata->initSrc) { /* just re-init the source page */ - srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, true); - Assert(BufferIsValid(srcBuffer)); + srcBuffer = XLogInitBufferForRedo(record, 0); srcPage = (Page) BufferGetPage(srcBuffer); SpGistInitBuffer(srcBuffer, @@ -634,9 +622,8 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) * inserting leaf tuples and the new inner tuple, else the added * redirect tuple will be a dangling link.) 
*/ - if (XLogReadBufferForRedo(lsn, record, bbi, - xldata->node, xldata->blknoSrc, - &srcBuffer) == BLK_NEEDS_REDO) + srcPage = NULL; + if (XLogReadBufferForRedo(record, 0, &srcBuffer) == BLK_NEEDS_REDO) { srcPage = BufferGetPage(srcBuffer); @@ -650,7 +637,7 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) toDelete, xldata->nDelete, SPGIST_REDIRECT, SPGIST_PLACEHOLDER, - xldata->blknoInner, + blknoInner, xldata->offnumInner); else spgPageIndexMultiDelete(&state, srcPage, @@ -662,15 +649,10 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) /* don't update LSN etc till we're done with it */ } - else - { - srcPage = NULL; /* don't do any page updates */ - } - bbi++; } /* try to access dest page if any */ - if (xldata->blknoDest == InvalidBlockNumber) + if (!XLogRecHasBlockRef(record, 1)) { destBuffer = InvalidBuffer; destPage = NULL; @@ -678,8 +660,7 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) else if (xldata->initDest) { /* just re-init the dest page */ - destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, true); - Assert(BufferIsValid(destBuffer)); + destBuffer = XLogInitBufferForRedo(record, 1); destPage = (Page) BufferGetPage(destBuffer); SpGistInitBuffer(destBuffer, @@ -692,17 +673,10 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) * We could probably release the page lock immediately in the * full-page-image case, but for safety let's hold it till later. 
*/ - if (XLogReadBufferForRedo(lsn, record, bbi, - xldata->node, xldata->blknoDest, - &destBuffer) == BLK_NEEDS_REDO) - { + if (XLogReadBufferForRedo(record, 1, &destBuffer) == BLK_NEEDS_REDO) destPage = (Page) BufferGetPage(destBuffer); - } else - { destPage = NULL; /* don't do any page updates */ - } - bbi++; } /* restore leaf tuples to src and/or dest page */ @@ -739,14 +713,12 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) /* restore new inner tuple */ if (xldata->initInner) { - innerBuffer = XLogReadBuffer(xldata->node, xldata->blknoInner, true); - SpGistInitBuffer(innerBuffer, - (xldata->storesNulls ? SPGIST_NULLS : 0)); + innerBuffer = XLogInitBufferForRedo(record, 2); + SpGistInitBuffer(innerBuffer, (xldata->storesNulls ? SPGIST_NULLS : 0)); action = BLK_NEEDS_REDO; } else - action = XLogReadBufferForRedo(lsn, record, bbi, xldata->node, - xldata->blknoInner, &innerBuffer); + action = XLogReadBufferForRedo(record, 2, &innerBuffer); if (action == BLK_NEEDS_REDO) { @@ -756,14 +728,14 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) xldata->offnumInner); /* if inner is also parent, update link while we're here */ - if (xldata->blknoInner == xldata->blknoParent) + if (xldata->innerIsParent) { SpGistInnerTuple parent; parent = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parent, xldata->nodeI, - xldata->blknoInner, xldata->offnumInner); + blknoInner, xldata->offnumInner); } PageSetLSN(page, lsn); @@ -771,7 +743,6 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) } if (BufferIsValid(innerBuffer)) UnlockReleaseBuffer(innerBuffer); - bbi++; /* * Now we can release the leaf-page locks. 
It's okay to do this before @@ -783,18 +754,11 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(destBuffer); /* update parent downlink, unless we did it above */ - if (xldata->blknoParent == InvalidBlockNumber) - { - /* no parent cause we split the root */ - Assert(SpGistBlockIsRoot(xldata->blknoInner)); - } - else if (xldata->blknoInner != xldata->blknoParent) + if (XLogRecHasBlockRef(record, 3)) { Buffer parentBuffer; - if (XLogReadBufferForRedo(lsn, record, bbi, - xldata->node, xldata->blknoParent, - &parentBuffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 3, &parentBuffer) == BLK_NEEDS_REDO) { SpGistInnerTuple parent; @@ -803,7 +767,7 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) parent = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parent, xldata->nodeI, - xldata->blknoInner, xldata->offnumInner); + blknoInner, xldata->offnumInner); PageSetLSN(page, lsn); MarkBufferDirty(parentBuffer); @@ -811,11 +775,14 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) if (BufferIsValid(parentBuffer)) UnlockReleaseBuffer(parentBuffer); } + else + Assert(xldata->innerIsParent || xldata->isRootSplit); } static void -spgRedoVacuumLeaf(XLogRecPtr lsn, XLogRecord *record) +spgRedoVacuumLeaf(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogVacuumLeaf *xldata = (spgxlogVacuumLeaf *) ptr; OffsetNumber *toDead; @@ -844,8 +811,7 @@ spgRedoVacuumLeaf(XLogRecPtr lsn, XLogRecord *record) ptr += sizeof(OffsetNumber) * xldata->nChain; chainDest = (OffsetNumber *) ptr; - if (XLogReadBufferForRedo(lsn, record, 0, xldata->node, xldata->blkno, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); @@ -897,8 +863,9 @@ spgRedoVacuumLeaf(XLogRecPtr lsn, XLogRecord *record) } static void -spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record) 
+spgRedoVacuumRoot(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogVacuumRoot *xldata = (spgxlogVacuumRoot *) ptr; OffsetNumber *toDelete; @@ -907,8 +874,7 @@ spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record) toDelete = xldata->offsets; - if (XLogReadBufferForRedo(lsn, record, 0, xldata->node, xldata->blkno, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); @@ -923,8 +889,9 @@ spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record) } static void -spgRedoVacuumRedirect(XLogRecPtr lsn, XLogRecord *record) +spgRedoVacuumRedirect(XLogReaderState *record) { + XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogVacuumRedirect *xldata = (spgxlogVacuumRedirect *) ptr; OffsetNumber *itemToPlaceholder; @@ -939,12 +906,16 @@ spgRedoVacuumRedirect(XLogRecPtr lsn, XLogRecord *record) if (InHotStandby) { if (TransactionIdIsValid(xldata->newestRedirectXid)) + { + RelFileNode node; + + XLogRecGetBlockTag(record, 0, &node, NULL, NULL); ResolveRecoveryConflictWithSnapshot(xldata->newestRedirectXid, - xldata->node); + node); + } } - if (XLogReadBufferForRedo(lsn, record, 0, xldata->node, xldata->blkno, - &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); SpGistPageOpaque opaque = SpGistPageGetOpaque(page); @@ -995,40 +966,40 @@ spgRedoVacuumRedirect(XLogRecPtr lsn, XLogRecord *record) } void -spg_redo(XLogRecPtr lsn, XLogRecord *record) +spg_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; MemoryContext oldCxt; oldCxt = MemoryContextSwitchTo(opCtx); switch (info) { case XLOG_SPGIST_CREATE_INDEX: - spgRedoCreateIndex(lsn, record); + spgRedoCreateIndex(record); break; case XLOG_SPGIST_ADD_LEAF: - spgRedoAddLeaf(lsn, record); + spgRedoAddLeaf(record); 
break; case XLOG_SPGIST_MOVE_LEAFS: - spgRedoMoveLeafs(lsn, record); + spgRedoMoveLeafs(record); break; case XLOG_SPGIST_ADD_NODE: - spgRedoAddNode(lsn, record); + spgRedoAddNode(record); break; case XLOG_SPGIST_SPLIT_TUPLE: - spgRedoSplitTuple(lsn, record); + spgRedoSplitTuple(record); break; case XLOG_SPGIST_PICKSPLIT: - spgRedoPickSplit(lsn, record); + spgRedoPickSplit(record); break; case XLOG_SPGIST_VACUUM_LEAF: - spgRedoVacuumLeaf(lsn, record); + spgRedoVacuumLeaf(record); break; case XLOG_SPGIST_VACUUM_ROOT: - spgRedoVacuumRoot(lsn, record); + spgRedoVacuumRoot(record); break; case XLOG_SPGIST_VACUUM_REDIRECT: - spgRedoVacuumRedirect(lsn, record); + spgRedoVacuumRedirect(record); break; default: elog(PANIC, "spg_redo: unknown op code %u", info); diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 92b12fbb6c..ba6ae05d65 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -440,96 +440,164 @@ happen before the WAL record is inserted; see notes in SyncOneBuffer().) Note that marking a buffer dirty with MarkBufferDirty() should only happen iff you write a WAL record; see Writing Hints below. -5. If the relation requires WAL-logging, build a WAL log record and pass it -to XLogInsert(); then update the page's LSN using the returned XLOG -location. For instance, +5. If the relation requires WAL-logging, build a WAL record using +XLogBeginInsert and XLogRegister* functions, and insert it. (See +"Constructing a WAL record" below). Then update the page's LSN using the +returned XLOG location. For instance, - recptr = XLogInsert(rmgr_id, info, rdata); + XLogBeginInsert(); + XLogRegisterBuffer(...) + XLogRegisterData(...) + recptr = XLogInsert(rmgr_id, info); PageSetLSN(dp, recptr); - // Note that we no longer do PageSetTLI() from 9.3 onwards - // since that field on a page has now changed its meaning. 6. END_CRIT_SECTION() 7. Unlock and unpin the buffer(s). 
-XLogInsert's "rdata" argument is an array of pointer/size items identifying -chunks of data to be written in the XLOG record, plus optional shared-buffer -IDs for chunks that are in shared buffers rather than temporary variables. -The "rdata" array must mention (at least once) each of the shared buffers -being modified, unless the action is such that the WAL replay routine can -reconstruct the entire page contents. XLogInsert includes the logic that -tests to see whether a shared buffer has been modified since the last -checkpoint. If not, the entire page contents are logged rather than just the -portion(s) pointed to by "rdata". +Complex changes (such as a multilevel index insertion) normally need to be +described by a series of atomic-action WAL records. The intermediate states +must be self-consistent, so that if the replay is interrupted between any +two actions, the system is fully functional. In btree indexes, for example, +a page split requires a new page to be allocated, and an insertion of a new +key in the parent btree level, but for locking reasons this has to be +reflected by two separate WAL records. Replaying the first record, to +allocate the new page and move tuples to it, sets a flag on the page to +indicate that the key has not been inserted to the parent yet. Replaying the +second record clears the flag. This intermediate state is never seen by +other backends during normal operation, because the lock on the child page +is held across the two actions, but will be seen if the operation is +interrupted before writing the second WAL record. The search algorithm works +with the intermediate state as normal, but if an insertion encounters a page +with the incomplete-split flag set, it will finish the interrupted split by +inserting the key to the parent, before proceeding. 
-Because XLogInsert drops the rdata components associated with buffers it -chooses to log in full, the WAL replay routines normally need to test to see -which buffers were handled that way --- otherwise they may be misled about -what the XLOG record actually contains. XLOG records that describe multi-page -changes therefore require some care to design: you must be certain that you -know what data is indicated by each "BKP" bit. An example of the trickiness -is that in a HEAP_UPDATE record, BKP(0) normally is associated with the source -page and BKP(1) is associated with the destination page --- but if these are -the same page, only BKP(0) would have been set. -For this reason as well as the risk of deadlocking on buffer locks, it's best -to design WAL records so that they reflect small atomic actions involving just -one or a few pages. The current XLOG infrastructure cannot handle WAL records -involving references to more than four shared buffers, anyway. +Constructing a WAL record +------------------------- -In the case where the WAL record contains enough information to re-generate -the entire contents of a page, do *not* show that page's buffer ID in the -rdata array, even if some of the rdata items point into the buffer. This is -because you don't want XLogInsert to log the whole page contents. The -standard replay-routine pattern for this case is +A WAL record consists of a header common to all WAL record types, +record-specific data, and information about the data blocks modified. Each +modified data block is identified by an ID number, and can optionally have +more record-specific data associated with the block. If XLogInsert decides +that a full-page image of a block needs to be taken, the data associated +with that block is not included. 
- buffer = XLogReadBuffer(rnode, blkno, true); - Assert(BufferIsValid(buffer)); - page = (Page) BufferGetPage(buffer); +The API for constructing a WAL record consists of five functions: +XLogBeginInsert, XLogRegisterBuffer, XLogRegisterData, XLogRegisterBufData, +and XLogInsert. First, call XLogBeginInsert(). Then register all the buffers +modified, and data needed to replay the changes, using XLogRegister* +functions. Finally, insert the constructed record to the WAL by calling +XLogInsert(). - ... initialize the page ... + XLogBeginInsert(); - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + /* register buffers modified as part of this WAL-logged action */ + XLogRegisterBuffer(0, lbuffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_STANDARD); -In the case where the WAL record provides only enough information to -incrementally update the page, the rdata array *must* mention the buffer -ID at least once; otherwise there is no defense against torn-page problems. -The standard replay-routine pattern for this case is + /* register data that is always included in the WAL record */ + XLogRegisterData(&xlrec, SizeOfFictionalAction); - if (XLogReadBufferForRedo(lsn, record, N, rnode, blkno, &buffer) == BLK_NEEDS_REDO) - { - page = (Page) BufferGetPage(buffer); + /* + * register data associated with a buffer. This will not be included + * in the record if a full-page image is taken. + */ + XLogRegisterBufData(0, tuple->data, tuple->len); - ... apply the change ... + /* more data associated with the buffer */ + XLogRegisterBufData(0, data2, len2); - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); + /* + * Ok, all the data and buffers to include in the WAL record have + * been registered. Insert the record. 
+ */ + recptr = XLogInsert(RM_FOO_ID, XLOG_FOOBAR_DO_STUFF); -XLogReadBufferForRedo reads the page from disk, and checks what action needs to -be taken to the page. If the XLR_BKP_BLOCK(N) flag is set, it restores the -full page image and returns BLK_RESTORED. If there is no full page image, but -page cannot be found or if the change has already been replayed (i.e. the -page's LSN >= the record we're replaying), it returns BLK_NOTFOUND or BLK_DONE, -respectively. Usually, the redo routine only needs to pay attention to the -BLK_NEEDS_REDO return code, which means that the routine should apply the -incremental change. In any case, the caller is responsible for unlocking and -releasing the buffer. Note that XLogReadBufferForRedo returns the buffer -locked even if no redo is required, unless the page does not exist. +Details of the API functions: -As noted above, for a multi-page update you need to be able to determine -which XLR_BKP_BLOCK(N) flag applies to each page. If a WAL record reflects -a combination of fully-rewritable and incremental updates, then the rewritable -pages don't count for the XLR_BKP_BLOCK(N) numbering. (XLR_BKP_BLOCK(N) is -associated with the N'th distinct buffer ID seen in the "rdata" array, and -per the above discussion, fully-rewritable buffers shouldn't be mentioned in -"rdata".) +void XLogBeginInsert(void) + + Must be called before XLogRegisterBuffer and XLogRegisterData. + +void XLogResetInsertion(void) + + Clear any currently registered data and buffers from the WAL record + construction workspace. This is only needed if you have already called + XLogBeginInsert(), but decide to not insert the record after all. 
+ +void XLogEnsureRecordSpace(int max_block_id, int nrdatas) + + Normally, the WAL record construction buffers have the following limits: + + * highest block ID that can be used is 4 (allowing five block references) + * Max 20 chunks of registered data + + These default limits are enough for most record types that change some + on-disk structures. For the odd case that requires more data, or needs to + modify more buffers, these limits can be raised by calling + XLogEnsureRecordSpace(). XLogEnsureRecordSpace() must be called before + XLogBeginInsert(), and outside a critical section. + +void XLogRegisterBuffer(uint8 block_id, Buffer buf, uint8 flags); + + XLogRegisterBuffer adds information about a data block to the WAL record. + block_id is an arbitrary number used to identify this page reference in + the redo routine. The information needed to re-find the page at redo - + relfilenode, fork, and block number - are included in the WAL record. + + XLogInsert will automatically include a full copy of the page contents, if + this is the first modification of the buffer since the last checkpoint. + It is important to register every buffer modified by the action with + XLogRegisterBuffer, to avoid torn-page hazards. + + The flags control when and how the buffer contents are included in the + WAL record. Normally, a full-page image is taken only if the page has not + been modified since the last checkpoint, and only if full_page_writes=on + or an online backup is in progress. The REGBUF_FORCE_IMAGE flag can be + used to force a full-page image to always be included; that is useful + e.g. for an operation that rewrites most of the page, so that tracking the + details is not worth it. For the rare case where it is not necessary to + protect from torn pages, REGBUF_NO_IMAGE flag can be used to suppress + full page image from being taken. 
REGBUF_WILL_INIT also suppresses a full + page image, but the redo routine must re-generate the page from scratch, + without looking at the old page contents. Re-initializing the page + protects from torn page hazards like a full page image does. + + The REGBUF_STANDARD flag can be specified together with the other flags to + indicate that the page follows the standard page layout. It causes the + area between pd_lower and pd_upper to be left out from the image, reducing + WAL volume. + + If the REGBUF_KEEP_DATA flag is given, any per-buffer data registered with + XLogRegisterBufData() is included in the WAL record even if a full-page + image is taken. + +void XLogRegisterData(char *data, int len); + + XLogRegisterData is used to include arbitrary data in the WAL record. If + XLogRegisterData() is called multiple times, the data are appended, and + will be made available to the redo routine as one contiguous chunk. + +void XLogRegisterBufData(uint8 block_id, char *data, int len); + + XLogRegisterBufData is used to include data associated with a particular + buffer that was registered earlier with XLogRegisterBuffer(). If + XLogRegisterBufData() is called multiple times with the same block ID, the + data are appended, and will be made available to the redo routine as one + contiguous chunk. + + If a full-page image of the buffer is taken at insertion, the data is not + included in the WAL record, unless the REGBUF_KEEP_DATA flag is used. + + +Writing a REDO routine +---------------------- + +A REDO routine uses the data and page references included in the WAL record +to reconstruct the new state of the page. The record decoding functions +and macros in xlogreader.c/h can be used to extract the data from the record. 
When replaying a WAL record that describes changes on multiple pages, you must be careful to lock the pages properly to prevent concurrent Hot Standby @@ -545,23 +613,6 @@ either an exclusive buffer lock or a shared lock plus buffer header lock, or be writing the data block directly rather than through shared buffers while holding AccessExclusiveLock on the relation. -Due to all these constraints, complex changes (such as a multilevel index -insertion) normally need to be described by a series of atomic-action WAL -records. The intermediate states must be self-consistent, so that if the -replay is interrupted between any two actions, the system is fully -functional. In btree indexes, for example, a page split requires a new page -to be allocated, and an insertion of a new key in the parent btree level, -but for locking reasons this has to be reflected by two separate WAL -records. Replaying the first record, to allocate the new page and move -tuples to it, sets a flag on the page to indicate that the key has not been -inserted to the parent yet. Replaying the second record clears the flag. -This intermediate state is never seen by other backends during normal -operation, because the lock on the child page is held across the two -actions, but will be seen if the operation is interrupted before writing -the second WAL record. The search algorithm works with the intermediate -state as normal, but if an insertion encounters a page with the -incomplete-split flag set, it will finish the interrupted split by -inserting the key to the parent, before proceeding. 
Writing Hints ------------- diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 5ee070bd0a..313bd04240 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -699,13 +699,9 @@ CLOGPagePrecedes(int page1, int page2) static void WriteZeroPageXlogRec(int pageno) { - XLogRecData rdata; - - rdata.data = (char *) (&pageno); - rdata.len = sizeof(int); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE, &rdata); + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); } /* @@ -717,14 +713,11 @@ WriteZeroPageXlogRec(int pageno) static void WriteTruncateXlogRec(int pageno) { - XLogRecData rdata; XLogRecPtr recptr; - rdata.data = (char *) (&pageno); - rdata.len = sizeof(int); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE, &rdata); + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE); XLogFlush(recptr); } @@ -732,12 +725,12 @@ WriteTruncateXlogRec(int pageno) * CLOG resource manager's routines */ void -clog_redo(XLogRecPtr lsn, XLogRecord *record) +clog_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* Backup blocks are not used in clog records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); if (info == CLOG_ZEROPAGE) { diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 3c20bb37e4..fff9f83733 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -720,7 +720,6 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) { MultiXactId multi; MultiXactOffset offset; - XLogRecData rdata[2]; xl_multixact_create xlrec; debug_elog3(DEBUG2, 
"Create: %s", @@ -796,17 +795,11 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) * the status flags in one XLogRecData, then all the xids in another one? * Not clear that it's worth the trouble though. */ - rdata[0].data = (char *) (&xlrec); - rdata[0].len = SizeOfMultiXactCreate; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate); + XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember)); - rdata[1].data = (char *) members; - rdata[1].len = nmembers * sizeof(MultiXactMember); - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; - - (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID, rdata); + (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); /* Now enter the information into the OFFSETs and MEMBERs logs */ RecordNewMultiXact(multi, offset, nmembers, members); @@ -2705,25 +2698,21 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) static void WriteMZeroPageXlogRec(int pageno, uint8 info) { - XLogRecData rdata; - - rdata.data = (char *) (&pageno); - rdata.len = sizeof(int); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - (void) XLogInsert(RM_MULTIXACT_ID, info, &rdata); + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_MULTIXACT_ID, info); } /* * MULTIXACT resource manager's routines */ void -multixact_redo(XLogRecPtr lsn, XLogRecord *record) +multixact_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* Backup blocks are not used in multixact records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) { @@ -2775,7 +2764,7 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) * should be unnecessary, since any XID found here ought to have other * evidence 
in the XLOG, but let's be safe. */ - max_xid = record->xl_xid; + max_xid = XLogRecGetXid(record); for (i = 0; i < xlrec->nmembers; i++) { if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid)) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index d23c292edc..40de84e934 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -889,14 +889,21 @@ typedef struct TwoPhaseRecordOnDisk /* * During prepare, the state file is assembled in memory before writing it - * to WAL and the actual state file. We use a chain of XLogRecData blocks - * so that we will be able to pass the state file contents directly to - * XLogInsert. + * to WAL and the actual state file. We use a chain of StateFileChunk blocks + * for that. */ +typedef struct StateFileChunk +{ + char *data; + uint32 len; + struct StateFileChunk *next; +} StateFileChunk; + static struct xllist { - XLogRecData *head; /* first data block in the chain */ - XLogRecData *tail; /* last block in chain */ + StateFileChunk *head; /* first data block in the chain */ + StateFileChunk *tail; /* last block in chain */ + uint32 num_chunks; uint32 bytes_free; /* free bytes left in tail block */ uint32 total_len; /* total data bytes in chain */ } records; @@ -917,11 +924,11 @@ save_state_data(const void *data, uint32 len) if (padlen > records.bytes_free) { - records.tail->next = palloc0(sizeof(XLogRecData)); + records.tail->next = palloc0(sizeof(StateFileChunk)); records.tail = records.tail->next; - records.tail->buffer = InvalidBuffer; records.tail->len = 0; records.tail->next = NULL; + records.num_chunks++; records.bytes_free = Max(padlen, 512); records.tail->data = palloc(records.bytes_free); @@ -951,8 +958,7 @@ StartPrepare(GlobalTransaction gxact) SharedInvalidationMessage *invalmsgs; /* Initialize linked list */ - records.head = palloc0(sizeof(XLogRecData)); - records.head->buffer = InvalidBuffer; + records.head = 
palloc0(sizeof(StateFileChunk)); records.head->len = 0; records.head->next = NULL; @@ -960,6 +966,7 @@ StartPrepare(GlobalTransaction gxact) records.head->data = palloc(records.bytes_free); records.tail = records.head; + records.num_chunks = 1; records.total_len = 0; @@ -1019,7 +1026,7 @@ EndPrepare(GlobalTransaction gxact) TransactionId xid = pgxact->xid; TwoPhaseFileHeader *hdr; char path[MAXPGPATH]; - XLogRecData *record; + StateFileChunk *record; pg_crc32 statefile_crc; pg_crc32 bogus_crc; int fd; @@ -1117,12 +1124,16 @@ EndPrepare(GlobalTransaction gxact) * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. */ + XLogEnsureRecordSpace(0, records.num_chunks); + START_CRIT_SECTION(); MyPgXact->delayChkpt = true; - gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE, - records.head); + XLogBeginInsert(); + for (record = records.head; record != NULL; record = record->next) + XLogRegisterData(record->data, record->len); + gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE); XLogFlush(gxact->prepare_lsn); /* If we crash now, we have prepared: WAL replay will fix things */ @@ -1180,6 +1191,7 @@ EndPrepare(GlobalTransaction gxact) SyncRepWaitForLSN(gxact->prepare_lsn); records.tail = records.head = NULL; + records.num_chunks = 0; } /* @@ -2071,8 +2083,6 @@ RecordTransactionCommitPrepared(TransactionId xid, SharedInvalidationMessage *invalmsgs, bool initfileinval) { - XLogRecData rdata[4]; - int lastrdata = 0; xl_xact_commit_prepared xlrec; XLogRecPtr recptr; @@ -2094,39 +2104,24 @@ RecordTransactionCommitPrepared(TransactionId xid, xlrec.crec.nsubxacts = nchildren; xlrec.crec.nmsgs = ninvalmsgs; - rdata[0].data = (char *) (&xlrec); - rdata[0].len = MinSizeOfXactCommitPrepared; - rdata[0].buffer = InvalidBuffer; + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactCommitPrepared); + /* dump rels to delete */ if (nrels > 0) - { - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) rels; - 
rdata[1].len = nrels * sizeof(RelFileNode); - rdata[1].buffer = InvalidBuffer; - lastrdata = 1; - } + XLogRegisterData((char *) rels, nrels * sizeof(RelFileNode)); + /* dump committed child Xids */ if (nchildren > 0) - { - rdata[lastrdata].next = &(rdata[2]); - rdata[2].data = (char *) children; - rdata[2].len = nchildren * sizeof(TransactionId); - rdata[2].buffer = InvalidBuffer; - lastrdata = 2; - } + XLogRegisterData((char *) children, + nchildren * sizeof(TransactionId)); + /* dump cache invalidation messages */ if (ninvalmsgs > 0) - { - rdata[lastrdata].next = &(rdata[3]); - rdata[3].data = (char *) invalmsgs; - rdata[3].len = ninvalmsgs * sizeof(SharedInvalidationMessage); - rdata[3].buffer = InvalidBuffer; - lastrdata = 3; - } - rdata[lastrdata].next = NULL; + XLogRegisterData((char *) invalmsgs, + ninvalmsgs * sizeof(SharedInvalidationMessage)); - recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_PREPARED, rdata); + recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_PREPARED); /* * We don't currently try to sleep before flush here ... 
nor is there any @@ -2169,8 +2164,6 @@ RecordTransactionAbortPrepared(TransactionId xid, int nrels, RelFileNode *rels) { - XLogRecData rdata[3]; - int lastrdata = 0; xl_xact_abort_prepared xlrec; XLogRecPtr recptr; @@ -2189,30 +2182,20 @@ RecordTransactionAbortPrepared(TransactionId xid, xlrec.arec.xact_time = GetCurrentTimestamp(); xlrec.arec.nrels = nrels; xlrec.arec.nsubxacts = nchildren; - rdata[0].data = (char *) (&xlrec); - rdata[0].len = MinSizeOfXactAbortPrepared; - rdata[0].buffer = InvalidBuffer; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactAbortPrepared); + /* dump rels to delete */ if (nrels > 0) - { - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) rels; - rdata[1].len = nrels * sizeof(RelFileNode); - rdata[1].buffer = InvalidBuffer; - lastrdata = 1; - } + XLogRegisterData((char *) rels, nrels * sizeof(RelFileNode)); + /* dump committed child Xids */ if (nchildren > 0) - { - rdata[lastrdata].next = &(rdata[2]); - rdata[2].data = (char *) children; - rdata[2].len = nchildren * sizeof(TransactionId); - rdata[2].buffer = InvalidBuffer; - lastrdata = 2; - } - rdata[lastrdata].next = NULL; + XLogRegisterData((char *) children, + nchildren * sizeof(TransactionId)); - recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT_PREPARED, rdata); + recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT_PREPARED); /* Always flush, since we're about to remove the 2PC state file */ XLogFlush(recptr); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 6f92bad07c..763e9deb6f 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -571,7 +571,6 @@ AssignTransactionId(TransactionState s) if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS || log_unknown_top) { - XLogRecData rdata[2]; xl_xact_assignment xlrec; /* @@ -582,17 +581,12 @@ AssignTransactionId(TransactionState s) Assert(TransactionIdIsValid(xlrec.xtop)); xlrec.nsubxacts = nUnreportedXids; - rdata[0].data = (char *) &xlrec; 
- rdata[0].len = MinSizeOfXactAssignment; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &rdata[1]; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, MinSizeOfXactAssignment); + XLogRegisterData((char *) unreportedXids, + nUnreportedXids * sizeof(TransactionId)); - rdata[1].data = (char *) unreportedXids; - rdata[1].len = nUnreportedXids * sizeof(TransactionId); - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; - - (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT, rdata); + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT); nUnreportedXids = 0; /* mark top, not current xact as having been logged */ @@ -1087,8 +1081,6 @@ RecordTransactionCommit(void) if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit || XLogLogicalInfoActive()) { - XLogRecData rdata[4]; - int lastrdata = 0; xl_xact_commit xlrec; /* @@ -1107,63 +1099,38 @@ RecordTransactionCommit(void) xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; xlrec.nmsgs = nmsgs; - rdata[0].data = (char *) (&xlrec); - rdata[0].len = MinSizeOfXactCommit; - rdata[0].buffer = InvalidBuffer; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactCommit); /* dump rels to delete */ if (nrels > 0) - { - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) rels; - rdata[1].len = nrels * sizeof(RelFileNode); - rdata[1].buffer = InvalidBuffer; - lastrdata = 1; - } + XLogRegisterData((char *) rels, + nrels * sizeof(RelFileNode)); /* dump committed child Xids */ if (nchildren > 0) - { - rdata[lastrdata].next = &(rdata[2]); - rdata[2].data = (char *) children; - rdata[2].len = nchildren * sizeof(TransactionId); - rdata[2].buffer = InvalidBuffer; - lastrdata = 2; - } + XLogRegisterData((char *) children, + nchildren * sizeof(TransactionId)); /* dump shared cache invalidation messages */ if (nmsgs > 0) - { - rdata[lastrdata].next = &(rdata[3]); - rdata[3].data = (char *) invalMessages; - rdata[3].len = nmsgs * sizeof(SharedInvalidationMessage); - rdata[3].buffer = 
InvalidBuffer; - lastrdata = 3; - } - rdata[lastrdata].next = NULL; - - (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata); + XLogRegisterData((char *) invalMessages, + nmsgs * sizeof(SharedInvalidationMessage)); + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT); } else { - XLogRecData rdata[2]; - int lastrdata = 0; xl_xact_commit_compact xlrec; xlrec.xact_time = xactStopTimestamp; xlrec.nsubxacts = nchildren; - rdata[0].data = (char *) (&xlrec); - rdata[0].len = MinSizeOfXactCommitCompact; - rdata[0].buffer = InvalidBuffer; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactCommitCompact); /* dump committed child Xids */ if (nchildren > 0) - { - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) children; - rdata[1].len = nchildren * sizeof(TransactionId); - rdata[1].buffer = InvalidBuffer; - lastrdata = 1; - } - rdata[lastrdata].next = NULL; + XLogRegisterData((char *) children, + nchildren * sizeof(TransactionId)); - (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_COMPACT, rdata); + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_COMPACT); } } @@ -1436,8 +1403,6 @@ RecordTransactionAbort(bool isSubXact) RelFileNode *rels; int nchildren; TransactionId *children; - XLogRecData rdata[3]; - int lastrdata = 0; xl_xact_abort xlrec; /* @@ -1486,30 +1451,20 @@ RecordTransactionAbort(bool isSubXact) } xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; - rdata[0].data = (char *) (&xlrec); - rdata[0].len = MinSizeOfXactAbort; - rdata[0].buffer = InvalidBuffer; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactAbort); + /* dump rels to delete */ if (nrels > 0) - { - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) rels; - rdata[1].len = nrels * sizeof(RelFileNode); - rdata[1].buffer = InvalidBuffer; - lastrdata = 1; - } + XLogRegisterData((char *) rels, nrels * sizeof(RelFileNode)); + /* dump committed child Xids */ if (nchildren > 0) - { - rdata[lastrdata].next = &(rdata[2]); - rdata[2].data = (char *) children; - 
rdata[2].len = nchildren * sizeof(TransactionId); - rdata[2].buffer = InvalidBuffer; - lastrdata = 2; - } - rdata[lastrdata].next = NULL; + XLogRegisterData((char *) children, + nchildren * sizeof(TransactionId)); - (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata); + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT); /* * Report the latest async abort LSN, so that the WAL writer knows to @@ -2351,6 +2306,9 @@ AbortTransaction(void) AbortBufferIO(); UnlockBuffers(); + /* Reset WAL record construction state */ + XLogResetInsertion(); + /* * Also clean up any open wait for lock, since the lock manager will choke * if we try to wait for another lock before doing this. @@ -4299,6 +4257,9 @@ AbortSubTransaction(void) AbortBufferIO(); UnlockBuffers(); + /* Reset WAL record construction state */ + XLogResetInsertion(); + /* * Also clean up any open wait for lock, since the lock manager will choke * if we try to wait for another lock before doing this. @@ -4938,42 +4899,42 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid) } void -xact_redo(XLogRecPtr lsn, XLogRecord *record) +xact_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* Backup blocks are not used in xact records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); if (info == XLOG_XACT_COMMIT_COMPACT) { xl_xact_commit_compact *xlrec = (xl_xact_commit_compact *) XLogRecGetData(record); - xact_redo_commit_compact(xlrec, record->xl_xid, lsn); + xact_redo_commit_compact(xlrec, XLogRecGetXid(record), record->EndRecPtr); } else if (info == XLOG_XACT_COMMIT) { xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); - xact_redo_commit(xlrec, record->xl_xid, lsn); + xact_redo_commit(xlrec, XLogRecGetXid(record), record->EndRecPtr); } else if (info == XLOG_XACT_ABORT) { xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); - xact_redo_abort(xlrec, 
record->xl_xid); + xact_redo_abort(xlrec, XLogRecGetXid(record)); } else if (info == XLOG_XACT_PREPARE) { /* the record contents are exactly the 2PC file */ - RecreateTwoPhaseFile(record->xl_xid, - XLogRecGetData(record), record->xl_len); + RecreateTwoPhaseFile(XLogRecGetXid(record), + XLogRecGetData(record), XLogRecGetDataLen(record)); } else if (info == XLOG_XACT_COMMIT_PREPARED) { xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record); - xact_redo_commit(&xlrec->crec, xlrec->xid, lsn); + xact_redo_commit(&xlrec->crec, xlrec->xid, record->EndRecPtr); RemoveTwoPhaseFile(xlrec->xid, false); } else if (info == XLOG_XACT_ABORT_PREPARED) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 60531277dc..2059bbeda4 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -757,10 +757,10 @@ static MemoryContext walDebugCxt = NULL; static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo); -static bool recoveryStopsBefore(XLogRecord *record); -static bool recoveryStopsAfter(XLogRecord *record); +static bool recoveryStopsBefore(XLogReaderState *record); +static bool recoveryStopsAfter(XLogReaderState *record); static void recoveryPausesHere(void); -static bool recoveryApplyDelay(XLogRecord *record); +static bool recoveryApplyDelay(XLogReaderState *record); static void SetLatestXTime(TimestampTz xtime); static void SetCurrentChunkStartTime(TimestampTz xtime); static void CheckRequiredParameterValues(void); @@ -807,9 +807,9 @@ static char *str_time(pg_time_t tnow); static bool CheckForStandbyTrigger(void); #ifdef WAL_DEBUG -static void xlog_outrec(StringInfo buf, XLogRecord *record); +static void xlog_outrec(StringInfo buf, XLogReaderState *record); #endif -static void xlog_outdesc(StringInfo buf, RmgrId rmid, XLogRecord *record); +static void xlog_outdesc(StringInfo buf, XLogReaderState *record); static void 
pg_start_backup_callback(int code, Datum arg); static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, bool *backupFromStandby); @@ -861,7 +861,6 @@ XLogRecPtr XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) { XLogCtlInsert *Insert = &XLogCtl->Insert; - XLogRecData *rdt; pg_crc32 rdata_crc; bool inserted; XLogRecord *rechdr = (XLogRecord *) rdata->data; @@ -870,28 +869,13 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) XLogRecPtr StartPos; XLogRecPtr EndPos; + /* we assume that all of the record header is in the first chunk */ + Assert(rdata->len >= SizeOfXLogRecord); + /* cross-check on whether we should be here or not */ if (!XLogInsertAllowed()) elog(ERROR, "cannot make new WAL entries during recovery"); - /* - * Calculate CRC of the data, including all the backup blocks - * - * Note that the record header isn't added into the CRC initially since we - * don't know the prev-link yet. Thus, the CRC will represent the CRC of - * the whole record in the order: rdata, then backup blocks, then record - * header. - */ - INIT_CRC32C(rdata_crc); - for (rdt = rdata->next; rdt != NULL; rdt = rdt->next) - COMP_CRC32C(rdata_crc, rdt->data, rdt->len); - - /* - * Calculate CRC of the header, except for prev-link, because we don't - * know it yet. It will be added later. - */ - COMP_CRC32C(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev)); - /*---------- * * We have now done all the preparatory work we can without holding a @@ -976,10 +960,11 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) if (inserted) { /* - * Now that xl_prev has been filled in, finish CRC calculation of the - * record header. + * Now that xl_prev has been filled in, calculate CRC of the record + * header. 
*/ - COMP_CRC32C(rdata_crc, ((char *) &rechdr->xl_prev), sizeof(XLogRecPtr)); + rdata_crc = rechdr->xl_crc; + COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc)); FIN_CRC32C(rdata_crc); rechdr->xl_crc = rdata_crc; @@ -1053,34 +1038,47 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) #ifdef WAL_DEBUG if (XLOG_DEBUG) { + static XLogReaderState *debug_reader = NULL; StringInfoData buf; - MemoryContext oldCxt = MemoryContextSwitchTo(walDebugCxt); + StringInfoData recordBuf; + char *errormsg = NULL; + MemoryContext oldCxt; + + oldCxt = MemoryContextSwitchTo(walDebugCxt); initStringInfo(&buf); appendStringInfo(&buf, "INSERT @ %X/%X: ", (uint32) (EndPos >> 32), (uint32) EndPos); - xlog_outrec(&buf, rechdr); - if (rdata->data != NULL) + + /* + * We have to piece together the WAL record data from the XLogRecData + * entries, so that we can pass it to the rm_desc function as one + * contiguous chunk. + */ + initStringInfo(&recordBuf); + for (; rdata != NULL; rdata = rdata->next) + appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len); + + if (!debug_reader) + debug_reader = XLogReaderAllocate(NULL, NULL); + + if (!debug_reader || + !DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data, + &errormsg)) + { + appendStringInfo(&buf, "error decoding record: %s", + errormsg ? errormsg : "no error message"); + } + else { - StringInfoData recordbuf; - - /* - * We have to piece together the WAL record data from the - * XLogRecData entries, so that we can pass it to the rm_desc - * function as one contiguous chunk. 
- */ - initStringInfo(&recordbuf); - appendBinaryStringInfo(&recordbuf, (char *) rechdr, sizeof(XLogRecord)); - for (; rdata != NULL; rdata = rdata->next) - appendBinaryStringInfo(&recordbuf, rdata->data, rdata->len); - appendStringInfoString(&buf, " - "); - xlog_outdesc(&buf, rechdr->xl_rmid, (XLogRecord *) recordbuf.data); + xlog_outdesc(&buf, debug_reader); } elog(LOG, "%s", buf.data); + pfree(buf.data); + pfree(recordBuf.data); MemoryContextSwitchTo(oldCxt); - MemoryContextReset(walDebugCxt); } #endif @@ -1170,7 +1168,7 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) uint64 startbytepos; uint64 endbytepos; uint64 prevbytepos; - uint32 size = SizeOfXLogRecord; + uint32 size = MAXALIGN(SizeOfXLogRecord); XLogRecPtr ptr; uint32 segleft; @@ -1234,9 +1232,6 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, XLogRecPtr CurrPos; XLogPageHeader pagehdr; - /* The first chunk is the record header */ - Assert(rdata->len == SizeOfXLogRecord); - /* * Get a pointer to the right place in the right WAL buffer to start * inserting to. @@ -1309,9 +1304,6 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, } Assert(written == write_len); - /* Align the end position, so that the next record starts aligned */ - CurrPos = MAXALIGN64(CurrPos); - /* * If this was an xlog-switch, it's not enough to write the switch record, * we also have to consume all the remaining space in the WAL segment. 
We @@ -1341,6 +1333,11 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, CurrPos += XLOG_BLCKSZ; } } + else + { + /* Align the end position, so that the next record starts aligned */ + CurrPos = MAXALIGN64(CurrPos); + } if (CurrPos != EndPos) elog(PANIC, "space reserved for WAL record does not match what was written"); @@ -4470,6 +4467,7 @@ BootStrapXLOG(void) XLogPageHeader page; XLogLongPageHeader longpage; XLogRecord *record; + char *recptr; bool use_existent; uint64 sysidentifier; struct timeval tv; @@ -4541,17 +4539,23 @@ BootStrapXLOG(void) longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; /* Insert the initial checkpoint record */ - record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD); + recptr = ((char *) page + SizeOfXLogLongPHD); + record = (XLogRecord *) recptr; record->xl_prev = 0; record->xl_xid = InvalidTransactionId; - record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint); - record->xl_len = sizeof(checkPoint); + record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint); record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; record->xl_rmid = RM_XLOG_ID; - memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint)); + recptr += SizeOfXLogRecord; + /* fill the XLogRecordDataHeaderShort struct */ + *(recptr++) = XLR_BLOCK_ID_DATA_SHORT; + *(recptr++) = sizeof(checkPoint); + memcpy(recptr, &checkPoint, sizeof(checkPoint)); + recptr += sizeof(checkPoint); + Assert(recptr - (char *) record == record->xl_tot_len); INIT_CRC32C(crc); - COMP_CRC32C(crc, &checkPoint, sizeof(checkPoint)); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); FIN_CRC32C(crc); record->xl_crc = crc; @@ -4984,36 +4988,37 @@ exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo) * timestamps. 
*/ static bool -getRecordTimestamp(XLogRecord *record, TimestampTz *recordXtime) +getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) { - uint8 record_info = record->xl_info & ~XLR_INFO_MASK; + uint8 record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 rmid = XLogRecGetRmid(record); - if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT) + if (rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT) { *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time; return true; } - if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT) + if (rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT) { *recordXtime = ((xl_xact_commit_compact *) XLogRecGetData(record))->xact_time; return true; } - if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT) + if (rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT) { *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time; return true; } - if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_PREPARED) + if (rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_PREPARED) { *recordXtime = ((xl_xact_commit_prepared *) XLogRecGetData(record))->crec.xact_time; return true; } - if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT) + if (rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT) { *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time; return true; } - if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT_PREPARED) + if (rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT_PREPARED) { *recordXtime = ((xl_xact_abort_prepared *) XLogRecGetData(record))->arec.xact_time; return true; @@ -5030,7 +5035,7 @@ getRecordTimestamp(XLogRecord *record, TimestampTz *recordXtime) * new timeline's history file. 
*/ static bool -recoveryStopsBefore(XLogRecord *record) +recoveryStopsBefore(XLogReaderState *record) { bool stopsHere = false; uint8 record_info; @@ -5052,14 +5057,14 @@ recoveryStopsBefore(XLogRecord *record) } /* Otherwise we only consider stopping before COMMIT or ABORT records. */ - if (record->xl_rmid != RM_XACT_ID) + if (XLogRecGetRmid(record) != RM_XACT_ID) return false; - record_info = record->xl_info & ~XLR_INFO_MASK; + record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT) { isCommit = true; - recordXid = record->xl_xid; + recordXid = XLogRecGetXid(record); } else if (record_info == XLOG_XACT_COMMIT_PREPARED) { @@ -5069,7 +5074,7 @@ recoveryStopsBefore(XLogRecord *record) else if (record_info == XLOG_XACT_ABORT) { isCommit = false; - recordXid = record->xl_xid; + recordXid = XLogRecGetXid(record); } else if (record_info == XLOG_XACT_ABORT_PREPARED) { @@ -5140,19 +5145,21 @@ recoveryStopsBefore(XLogRecord *record) * record in XLogCtl->recoveryLastXTime. */ static bool -recoveryStopsAfter(XLogRecord *record) +recoveryStopsAfter(XLogReaderState *record) { uint8 record_info; + uint8 rmid; TimestampTz recordXtime; - record_info = record->xl_info & ~XLR_INFO_MASK; + record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + rmid = XLogRecGetRmid(record); /* * There can be many restore points that share the same name; we stop at * the first one. 
*/ if (recoveryTarget == RECOVERY_TARGET_NAME && - record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT) + rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT) { xl_restore_point *recordRestorePointData; @@ -5173,7 +5180,7 @@ recoveryStopsAfter(XLogRecord *record) } } - if (record->xl_rmid == RM_XACT_ID && + if (rmid == RM_XACT_ID && (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT || record_info == XLOG_XACT_COMMIT_PREPARED || @@ -5192,7 +5199,7 @@ recoveryStopsAfter(XLogRecord *record) else if (record_info == XLOG_XACT_ABORT_PREPARED) recordXid = ((xl_xact_abort_prepared *) XLogRecGetData(record))->xid; else - recordXid = record->xl_xid; + recordXid = XLogRecGetXid(record); /* * There can be only one transaction end record with this exact @@ -5307,7 +5314,7 @@ SetRecoveryPause(bool recoveryPause) * usability. */ static bool -recoveryApplyDelay(XLogRecord *record) +recoveryApplyDelay(XLogReaderState *record) { uint8 record_info; TimestampTz xtime; @@ -5326,8 +5333,8 @@ recoveryApplyDelay(XLogRecord *record) * so there is already opportunity for issues caused by early conflicts on * standbys. 
*/ - record_info = record->xl_info & ~XLR_INFO_MASK; - if (!(record->xl_rmid == RM_XACT_ID && + record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + if (!(XLogRecGetRmid(record) == RM_XACT_ID && (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT || record_info == XLOG_XACT_COMMIT_PREPARED))) @@ -5696,7 +5703,7 @@ StartupXLOG(void) record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true); if (record != NULL) { - memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN); ereport(DEBUG1, (errmsg("checkpoint record is at %X/%X", @@ -5793,7 +5800,7 @@ StartupXLOG(void) ereport(PANIC, (errmsg("could not locate a valid checkpoint record"))); } - memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN); } @@ -6230,9 +6237,9 @@ StartupXLOG(void) appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr, (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr); - xlog_outrec(&buf, record); + xlog_outrec(&buf, xlogreader); appendStringInfoString(&buf, " - "); - xlog_outdesc(&buf, record->xl_rmid, record); + xlog_outdesc(&buf, xlogreader); elog(LOG, "%s", buf.data); pfree(buf.data); } @@ -6260,7 +6267,7 @@ StartupXLOG(void) /* * Have we reached our recovery target? */ - if (recoveryStopsBefore(record)) + if (recoveryStopsBefore(xlogreader)) { reachedStopPoint = true; /* see below */ break; @@ -6270,7 +6277,7 @@ StartupXLOG(void) * If we've been asked to lag the master, wait on latch until * enough time has passed. */ - if (recoveryApplyDelay(record)) + if (recoveryApplyDelay(xlogreader)) { /* * We test for paused recovery again here. 
If user sets @@ -6285,7 +6292,7 @@ StartupXLOG(void) /* Setup error traceback support for ereport() */ errcallback.callback = rm_redo_error_callback; - errcallback.arg = (void *) record; + errcallback.arg = (void *) xlogreader; errcallback.previous = error_context_stack; error_context_stack = &errcallback; @@ -6324,7 +6331,7 @@ StartupXLOG(void) { CheckPoint checkPoint; - memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); newTLI = checkPoint.ThisTimeLineID; prevTLI = checkPoint.PrevTimeLineID; } @@ -6332,7 +6339,7 @@ StartupXLOG(void) { xl_end_of_recovery xlrec; - memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery)); + memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery)); newTLI = xlrec.ThisTimeLineID; prevTLI = xlrec.PrevTimeLineID; } @@ -6366,7 +6373,7 @@ StartupXLOG(void) RecordKnownAssignedTransactionIds(record->xl_xid); /* Now apply the WAL record itself */ - RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record); + RmgrTable[record->xl_rmid].rm_redo(xlogreader); /* Pop the error context stack */ error_context_stack = errcallback.previous; @@ -6394,7 +6401,7 @@ StartupXLOG(void) WalSndWakeup(); /* Exit loop if we reached inclusive recovery target */ - if (recoveryStopsAfter(record)) + if (recoveryStopsAfter(xlogreader)) { reachedStopPoint = true; break; @@ -7148,8 +7155,7 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, } return NULL; } - if (record->xl_len != sizeof(CheckPoint) || - record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint)) + if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)) { switch (whichChkpt) { @@ -7194,6 +7200,9 @@ InitXLOGAccess(void) (void) GetRedoRecPtr(); /* Also update our copy of doPageWrites. 
*/ doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + + /* Also initialize the working areas for constructing WAL records */ + InitXLogInsert(); } /* @@ -7490,7 +7499,6 @@ CreateCheckPoint(int flags) CheckPoint checkPoint; XLogRecPtr recptr; XLogCtlInsert *Insert = &XLogCtl->Insert; - XLogRecData rdata; uint32 freespace; XLogSegNo _logSegNo; XLogRecPtr curInsert; @@ -7760,15 +7768,11 @@ CreateCheckPoint(int flags) /* * Now insert the checkpoint record into XLOG. */ - rdata.data = (char *) (&checkPoint); - rdata.len = sizeof(checkPoint); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - + XLogBeginInsert(); + XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint)); recptr = XLogInsert(RM_XLOG_ID, shutdown ? XLOG_CHECKPOINT_SHUTDOWN : - XLOG_CHECKPOINT_ONLINE, - &rdata); + XLOG_CHECKPOINT_ONLINE); XLogFlush(recptr); @@ -7908,7 +7912,6 @@ static void CreateEndOfRecoveryRecord(void) { xl_end_of_recovery xlrec; - XLogRecData rdata; XLogRecPtr recptr; /* sanity check */ @@ -7926,12 +7929,9 @@ CreateEndOfRecoveryRecord(void) START_CRIT_SECTION(); - rdata.data = (char *) &xlrec; - rdata.len = sizeof(xl_end_of_recovery); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - - recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY); XLogFlush(recptr); @@ -8307,13 +8307,9 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) void XLogPutNextOid(Oid nextOid) { - XLogRecData rdata; - - rdata.data = (char *) (&nextOid); - rdata.len = sizeof(Oid); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata); + XLogBeginInsert(); + XLogRegisterData((char *) (&nextOid), sizeof(Oid)); + (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID); /* * We need not flush the NEXTOID record immediately, because any of the @@ -8349,15 +8345,10 @@ XLogRecPtr RequestXLogSwitch(void) 
{ XLogRecPtr RecPtr; - XLogRecData rdata; - /* XLOG SWITCH, alone among xlog record types, has no data */ - rdata.buffer = InvalidBuffer; - rdata.data = NULL; - rdata.len = 0; - rdata.next = NULL; - - RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata); + /* XLOG SWITCH has no data */ + XLogBeginInsert(); + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH); return RecPtr; } @@ -8369,18 +8360,15 @@ XLogRecPtr XLogRestorePoint(const char *rpName) { XLogRecPtr RecPtr; - XLogRecData rdata; xl_restore_point xlrec; xlrec.rp_time = GetCurrentTimestamp(); strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN); - rdata.buffer = InvalidBuffer; - rdata.data = (char *) &xlrec; - rdata.len = sizeof(xl_restore_point); - rdata.next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point)); - RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata); + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT); ereport(LOG, (errmsg("restore point \"%s\" created at %X/%X", @@ -8412,7 +8400,6 @@ XLogReportParameters(void) */ if (wal_level != ControlFile->wal_level || XLogIsNeeded()) { - XLogRecData rdata; xl_parameter_change xlrec; XLogRecPtr recptr; @@ -8423,12 +8410,10 @@ XLogReportParameters(void) xlrec.wal_level = wal_level; xlrec.wal_log_hints = wal_log_hints; - rdata.buffer = InvalidBuffer; - rdata.data = (char *) &xlrec; - rdata.len = sizeof(xlrec); - rdata.next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata); + recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE); XLogFlush(recptr); } @@ -8486,14 +8471,10 @@ UpdateFullPageWrites(void) */ if (XLogStandbyInfoActive() && !RecoveryInProgress()) { - XLogRecData rdata; + XLogBeginInsert(); + XLogRegisterData((char *) (&fullPageWrites), sizeof(bool)); - rdata.data = (char *) (&fullPageWrites); - rdata.len = sizeof(bool); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - - XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, 
&rdata); + XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE); } if (!fullPageWrites) @@ -8558,12 +8539,13 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI) * not all record types are related to control file updates. */ void -xlog_redo(XLogRecPtr lsn, XLogRecord *record) +xlog_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; - /* Backup blocks are not used by XLOG rmgr */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */ + Assert(!XLogRecHasAnyBlockRefs(record) || info == XLOG_FPI); if (info == XLOG_NEXTOID) { @@ -8750,14 +8732,12 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) } else if (info == XLOG_FPI) { - char *data; - BkpBlock bkpb; + Buffer buffer; /* - * Full-page image (FPI) records contain a backup block stored - * "inline" in the normal data since the locking when writing hint - * records isn't sufficient to use the normal backup block mechanism, - * which assumes exclusive lock on the buffer supplied. + * Full-page image (FPI) records contain nothing else but a backup + * block. The block reference must include a full-page image - + * otherwise there would be no point in this record. * * Since the only change in these backup block are hint bits, there * are no recovery conflicts generated. @@ -8766,11 +8746,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) * smgr implementation has no need to implement anything. 
Which means * nothing is needed in md.c etc */ - data = XLogRecGetData(record); - memcpy(&bkpb, data, sizeof(BkpBlock)); - data += sizeof(BkpBlock); - - RestoreBackupBlockContents(lsn, bkpb, data, false, false); + if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) + elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); + UnlockReleaseBuffer(buffer); } else if (info == XLOG_BACKUP_END) { @@ -8867,22 +8845,42 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) #ifdef WAL_DEBUG static void -xlog_outrec(StringInfo buf, XLogRecord *record) +xlog_outrec(StringInfo buf, XLogReaderState *record) { - int i; + int block_id; appendStringInfo(buf, "prev %X/%X; xid %u", - (uint32) (record->xl_prev >> 32), - (uint32) record->xl_prev, - record->xl_xid); + (uint32) (XLogRecGetPrev(record) >> 32), + (uint32) XLogRecGetPrev(record), + XLogRecGetXid(record)); appendStringInfo(buf, "; len %u", - record->xl_len); + XLogRecGetDataLen(record)); - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) + /* decode block references */ + for (block_id = 0; block_id <= record->max_block_id; block_id++) { - if (record->xl_info & XLR_BKP_BLOCK(i)) - appendStringInfo(buf, "; bkpb%d", i); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blk; + + if (!XLogRecHasBlockRef(record, block_id)) + continue; + + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); + if (forknum != MAIN_FORKNUM) + appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, + blk); + else + appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + blk); + if (XLogRecHasBlockImage(record, block_id)) + appendStringInfo(buf, " FPW"); } } #endif /* WAL_DEBUG */ @@ -8892,17 +8890,18 @@ xlog_outrec(StringInfo buf, XLogRecord *record) * optionally followed by a colon, a space, and a further description. 
*/ static void -xlog_outdesc(StringInfo buf, RmgrId rmid, XLogRecord *record) +xlog_outdesc(StringInfo buf, XLogReaderState *record) { + RmgrId rmid = XLogRecGetRmid(record); + uint8 info = XLogRecGetInfo(record); const char *id; appendStringInfoString(buf, RmgrTable[rmid].rm_name); appendStringInfoChar(buf, '/'); - id = RmgrTable[rmid].rm_identify(record->xl_info); + id = RmgrTable[rmid].rm_identify(info); if (id == NULL) - appendStringInfo(buf, "UNKNOWN (%X): ", - record->xl_info & ~XLR_INFO_MASK); + appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK); else appendStringInfo(buf, "%s: ", id); @@ -9411,7 +9410,6 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) XLogRecPtr startpoint; XLogRecPtr stoppoint; TimeLineID stoptli; - XLogRecData rdata; pg_time_t stamp_time; char strfbuf[128]; char histfilepath[MAXPGPATH]; @@ -9618,11 +9616,9 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) /* * Write the backup-end xlog record */ - rdata.data = (char *) (&startpoint); - rdata.len = sizeof(startpoint); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata); + XLogBeginInsert(); + XLogRegisterData((char *) (&startpoint), sizeof(startpoint)); + stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END); stoptli = ThisTimeLineID; /* @@ -9930,15 +9926,13 @@ read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, static void rm_redo_error_callback(void *arg) { - XLogRecord *record = (XLogRecord *) arg; + XLogReaderState *record = (XLogReaderState *) arg; StringInfoData buf; initStringInfo(&buf); - xlog_outdesc(&buf, record->xl_rmid, record); + xlog_outdesc(&buf, record); - /* don't bother emitting empty description */ - if (buf.len > 0) - errcontext("xlog redo %s", buf.data); + errcontext("xlog redo %s", buf.data); pfree(buf.data); } diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 
b83343bf5b..89c407e521 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -3,6 +3,12 @@ * xloginsert.c * Functions for constructing WAL records * + * Constructing a WAL record begins with a call to XLogBeginInsert, + * followed by a number of XLogRegister* calls. The registered data is + * collected in private working memory, and finally assembled into a chain + * of XLogRecData structs by a call to XLogRecordAssemble(). See + * access/transam/README for details. + * * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -24,39 +30,366 @@ #include "utils/memutils.h" #include "pg_trace.h" -static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, - XLogRecData *rdata, - XLogRecPtr RedoRecPtr, bool doPageWrites, - XLogRecPtr *fpw_lsn, XLogRecData **rdt_lastnormal); -static void XLogFillBkpBlock(Buffer buffer, bool buffer_std, BkpBlock *bkpb); +/* + * For each block reference registered with XLogRegisterBuffer, we fill in + * a registered_buffer struct. + */ +typedef struct +{ + bool in_use; /* is this slot in use? 
*/ + uint8 flags; /* REGBUF_* flags */ + RelFileNode rnode; /* identifies the relation and block */ + ForkNumber forkno; + BlockNumber block; + Page page; /* page content */ + uint32 rdata_len; /* total length of data in rdata chain */ + XLogRecData *rdata_head; /* head of the chain of data registered with + * this block */ + XLogRecData *rdata_tail; /* last entry in the chain, or &rdata_head if + * empty */ + + XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to + * backup block data in XLogRecordAssemble() */ +} registered_buffer; + +static registered_buffer *registered_buffers; +static int max_registered_buffers; /* allocated size */ +static int max_registered_block_id = 0; /* highest block_id + 1 + * currently registered */ /* - * Insert an XLOG record having the specified RMID and info bytes, - * with the body of the record being the data chunk(s) described by - * the rdata chain (see xloginsert.h for notes about rdata). + * A chain of XLogRecDatas to hold the "main data" of a WAL record, registered + * with XLogRegisterData(...). + */ +static XLogRecData *mainrdata_head; +static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head; +static uint32 mainrdata_len; /* total # of bytes in chain */ + +/* + * These are used to hold the record header while constructing a record. + * 'hdr_scratch' is not a plain variable, but is palloc'd at initialization, + * because we want it to be MAXALIGNed and padding bytes zeroed. + * + * For simplicity, it's allocated large enough to hold the headers for any + * WAL record. + */ +static XLogRecData hdr_rdt; +static char *hdr_scratch = NULL; + +#define HEADER_SCRATCH_SIZE \ + (SizeOfXLogRecord + \ + MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \ + SizeOfXLogRecordDataHeaderLong) + +/* + * An array of XLogRecData structs, to hold registered data. 
+ */ +static XLogRecData *rdatas; +static int num_rdatas; /* entries currently used */ +static int max_rdatas; /* allocated size */ + +static bool begininsert_called = false; + +/* Memory context to hold the registered buffer and data references. */ +static MemoryContext xloginsert_cxt; + +static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn); + +/* + * Begin constructing a WAL record. This must be called before the + * XLogRegister* functions and XLogInsert(). + */ +void +XLogBeginInsert(void) +{ + Assert(max_registered_block_id == 0); + Assert(mainrdata_last == (XLogRecData *) &mainrdata_head); + Assert(mainrdata_len == 0); + Assert(!begininsert_called); + + /* cross-check on whether we should be here or not */ + if (!XLogInsertAllowed()) + elog(ERROR, "cannot make new WAL entries during recovery"); + + begininsert_called = true; +} + +/* + * Ensure that there are enough buffer and data slots in the working area, + * for subsequent XLogRegisterBuffer, XLogRegisterData and XLogRegisterBufData + * calls. + * + * There is always space for a small number of buffers and data chunks, enough + * for most record types. This function is for the exceptional cases that need + * more. + */ +void +XLogEnsureRecordSpace(int max_block_id, int ndatas) +{ + int nbuffers; + + /* + * This must be called before entering a critical section, because + * allocating memory inside a critical section can fail. repalloc() will + * check the same, but better to check it here too so that we fail + * consistently even if the arrays happen to be large enough already. 
+ */ + Assert(CritSectionCount == 0); + + /* the minimum values can't be decreased */ + if (max_block_id < XLR_NORMAL_MAX_BLOCK_ID) + max_block_id = XLR_NORMAL_MAX_BLOCK_ID; + if (ndatas < XLR_NORMAL_RDATAS) + ndatas = XLR_NORMAL_RDATAS; + + if (max_block_id > XLR_MAX_BLOCK_ID) + elog(ERROR, "maximum number of WAL record block references exceeded"); + nbuffers = max_block_id + 1; + + if (nbuffers > max_registered_buffers) + { + registered_buffers = (registered_buffer *) + repalloc(registered_buffers, sizeof(registered_buffer) * nbuffers); + + /* + * At least the padding bytes in the structs must be zeroed, because + * they are included in WAL data, but initialize it all for tidiness. + */ + MemSet(&registered_buffers[max_registered_buffers], 0, + (nbuffers - max_registered_buffers) * sizeof(registered_buffer)); + max_registered_buffers = nbuffers; + } + + if (ndatas > max_rdatas) + { + rdatas = (XLogRecData *) repalloc(rdatas, sizeof(XLogRecData) * ndatas); + max_rdatas = ndatas; + } +} + +/* + * Reset WAL record construction buffers. + */ +void +XLogResetInsertion(void) +{ + int i; + + for (i = 0; i < max_registered_block_id; i++) + registered_buffers[i].in_use = false; + + num_rdatas = 0; + max_registered_block_id = 0; + mainrdata_len = 0; + mainrdata_last = (XLogRecData *) &mainrdata_head; + begininsert_called = false; +} + +/* + * Register a reference to a buffer with the WAL record being constructed. + * This must be called for every page that the WAL-logged operation modifies. 
+ */ +void +XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) +{ + registered_buffer *regbuf; + + /* NO_IMAGE doesn't make sense with FORCE_IMAGE */ + Assert(!((flags & REGBUF_FORCE_IMAGE) && (flags & (REGBUF_NO_IMAGE)))); + Assert(begininsert_called); + + if (block_id >= max_registered_block_id) + { + if (block_id >= max_registered_buffers) + elog(ERROR, "too many registered buffers"); + max_registered_block_id = block_id + 1; + } + + regbuf = &registered_buffers[block_id]; + + BufferGetTag(buffer, &regbuf->rnode, &regbuf->forkno, &regbuf->block); + regbuf->page = BufferGetPage(buffer); + regbuf->flags = flags; + regbuf->rdata_tail = (XLogRecData *) &regbuf->rdata_head; + regbuf->rdata_len = 0; + + /* + * Check that this page hasn't already been registered with some other + * block_id. + */ +#ifdef USE_ASSERT_CHECKING + { + int i; + + for (i = 0; i < max_registered_block_id; i++) + { + registered_buffer *regbuf_old = &registered_buffers[i]; + + if (i == block_id || !regbuf_old->in_use) + continue; + + Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) || + regbuf_old->forkno != regbuf->forkno || + regbuf_old->block != regbuf->block); + } + } +#endif + + regbuf->in_use = true; +} + +/* + * Like XLogRegisterBuffer, but for registering a block that's not in the + * shared buffer pool (i.e. when you don't have a Buffer for it). 
+ */ +void +XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum, + BlockNumber blknum, Page page, uint8 flags) +{ + registered_buffer *regbuf; + + /* This is currently only used to WAL-log a full-page image of a page */ + Assert(flags & REGBUF_FORCE_IMAGE); + Assert(begininsert_called); + + if (block_id >= max_registered_block_id) + max_registered_block_id = block_id + 1; + + if (block_id >= max_registered_buffers) + elog(ERROR, "too many registered buffers"); + + regbuf = &registered_buffers[block_id]; + + regbuf->rnode = *rnode; + regbuf->forkno = forknum; + regbuf->block = blknum; + regbuf->page = page; + regbuf->flags = flags; + regbuf->rdata_tail = (XLogRecData *) &regbuf->rdata_head; + regbuf->rdata_len = 0; + + /* + * Check that this page hasn't already been registered with some other + * block_id. + */ +#ifdef USE_ASSERT_CHECKING + { + int i; + + for (i = 0; i < max_registered_block_id; i++) + { + registered_buffer *regbuf_old = &registered_buffers[i]; + + if (i == block_id || !regbuf_old->in_use) + continue; + + Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) || + regbuf_old->forkno != regbuf->forkno || + regbuf_old->block != regbuf->block); + } + } +#endif + + regbuf->in_use = true; +} + +/* + * Add data to the WAL record that's being constructed. + * + * The data is appended to the "main chunk", available at replay with + * XLogRecGetData(). + */ +void +XLogRegisterData(char *data, int len) +{ + XLogRecData *rdata; + + Assert(begininsert_called); + + if (num_rdatas >= max_rdatas) + elog(ERROR, "too much WAL data"); + rdata = &rdatas[num_rdatas++]; + + rdata->data = data; + rdata->len = len; + + /* + * we use the mainrdata_last pointer to track the end of the chain, so no + * need to clear 'next' here. + */ + + mainrdata_last->next = rdata; + mainrdata_last = rdata; + + mainrdata_len += len; +} + +/* + * Add buffer-specific data to the WAL record that's being constructed. 
+ * + * Block_id must reference a block previously registered with + * XLogRegisterBuffer(). If this is called more than once for the same + * block_id, the data is appended. + * + * The maximum amount of data that can be registered per block is 65535 + * bytes. That should be plenty; if you need more than BLCKSZ bytes to + * reconstruct the changes to the page, you might as well just log a full + * copy of it. (the "main data" that's not associated with a block is not + * limited) + */ +void +XLogRegisterBufData(uint8 block_id, char *data, int len) +{ + registered_buffer *regbuf; + XLogRecData *rdata; + + Assert(begininsert_called); + + /* find the registered buffer struct */ + regbuf = &registered_buffers[block_id]; + if (!regbuf->in_use) + elog(ERROR, "no block with id %d registered with WAL insertion", + block_id); + + if (num_rdatas >= max_rdatas) + elog(ERROR, "too much WAL data"); + rdata = &rdatas[num_rdatas++]; + + rdata->data = data; + rdata->len = len; + + regbuf->rdata_tail->next = rdata; + regbuf->rdata_tail = rdata; + regbuf->rdata_len += len; +} + +/* + * Insert an XLOG record having the specified RMID and info bytes, with the + * body of the record being the data and buffer references registered earlier + * with XLogRegister* calls. * * Returns XLOG pointer to end of record (beginning of next record). * This can be used as LSN for data pages affected by the logged action. * (LSN is the XLOG point up to which the XLOG must be flushed to disk * before the data page can be written out. This implements the basic * WAL rule "write the log before the data".) - * - * NB: this routine feels free to scribble on the XLogRecData structs, - * though not on the data they reference. This is OK since the XLogRecData - * structs are always just temporaries in the calling code. 
*/ XLogRecPtr -XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) +XLogInsert(RmgrId rmid, uint8 info) { - XLogRecPtr RedoRecPtr; - bool doPageWrites; XLogRecPtr EndPos; - XLogRecPtr fpw_lsn; - XLogRecData *rdt; - XLogRecData *rdt_lastnormal; - /* info's high bits are reserved for use by me */ - if (info & XLR_INFO_MASK) + /* XLogBeginInsert() must have been called. */ + if (!begininsert_called) + elog(ERROR, "XLogBeginInsert was not called"); + + /* + * The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are + * reserved for use by me. + */ + if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0) elog(PANIC, "invalid xlog info mask %02X", info); TRACE_POSTGRESQL_XLOG_INSERT(rmid, info); @@ -67,292 +400,282 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) */ if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID) { + XLogResetInsertion(); EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */ return EndPos; } - /* - * Get values needed to decide whether to do full-page writes. Since we - * don't yet have an insertion lock, these could change under us, but - * XLogInsertRecord will recheck them once it has a lock. - */ - GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); - - /* - * Assemble an XLogRecData chain representing the WAL record, including - * any backup blocks needed. - * - * We may have to loop back to here if a race condition is detected in - * XLogInsertRecord. We could prevent the race by doing all this work - * while holding an insertion lock, but it seems better to avoid doing CRC - * calculations while holding one. - */ -retry: - rdt = XLogRecordAssemble(rmid, info, rdata, RedoRecPtr, doPageWrites, - &fpw_lsn, &rdt_lastnormal); - - EndPos = XLogInsertRecord(rdt, fpw_lsn); - - if (EndPos == InvalidXLogRecPtr) + do { + XLogRecPtr RedoRecPtr; + bool doPageWrites; + XLogRecPtr fpw_lsn; + XLogRecData *rdt; + /* - * Undo the changes we made to the rdata chain, and retry. 
- * - * XXX: This doesn't undo *all* the changes; the XLogRecData - * entries for buffers that we had already decided to back up have - * had their data-pointers cleared. That's OK, as long as we - * decide to back them up on the next iteration as well. Hence, - * don't allow "doPageWrites" value to go from true to false after - * we've modified the rdata chain. + * Get values needed to decide whether to do full-page writes. Since + * we don't yet have an insertion lock, these could change under us, + * but XLogInsertRecData will recheck them once it has a lock. */ - bool newDoPageWrites; + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); - GetFullPageWriteInfo(&RedoRecPtr, &newDoPageWrites); - doPageWrites = doPageWrites || newDoPageWrites; - rdt_lastnormal->next = NULL; + rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, + &fpw_lsn); - goto retry; - } + EndPos = XLogInsertRecord(rdt, fpw_lsn); + } while (EndPos == InvalidXLogRecPtr); + + XLogResetInsertion(); return EndPos; } /* - * Assemble a full WAL record, including backup blocks, from an XLogRecData - * chain, ready for insertion with XLogInsertRecord(). The record header - * fields are filled in, except for the xl_prev field and CRC. + * Assemble a WAL record from the registered data and buffers into an + * XLogRecData chain, ready for insertion with XLogInsertRecord(). * - * The rdata chain is modified, adding entries for full-page images. - * *rdt_lastnormal is set to point to the last normal (ie. not added by - * this function) entry. It can be used to reset the chain to its original - * state. + * The record header fields are filled in, except for the xl_prev field. The + * calculated CRC does not include xl_prev either. * - * If the rdata chain contains any buffer references, and a full-page image - * was not taken of all the buffers, *fpw_lsn is set to the lowest LSN among - * such pages. 
This signals that the assembled record is only good for - * insertion on the assumption that the RedoRecPtr and doPageWrites values - * were up-to-date. + * If there are any registered buffers, and a full-page image was not taken + * of all them, *page_writes_omitted is set to true. This signals that the + * assembled record is only good for insertion on the assumption that the + * RedoRecPtr and doPageWrites values were up-to-date. */ static XLogRecData * -XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecData *rdata, +XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, - XLogRecPtr *fpw_lsn, XLogRecData **rdt_lastnormal) + XLogRecPtr *fpw_lsn) { - bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); XLogRecData *rdt; - Buffer dtbuf[XLR_MAX_BKP_BLOCKS]; - bool dtbuf_bkp[XLR_MAX_BKP_BLOCKS]; - uint32 len, - total_len; - unsigned i; + uint32 total_len = 0; + int block_id; + pg_crc32 rdata_crc; + registered_buffer *prev_regbuf = NULL; + XLogRecData *rdt_datas_last; + XLogRecord *rechdr; + char *scratch = hdr_scratch; /* - * These need to be static because they are returned to the caller as part - * of the XLogRecData chain. + * Note: this function can be called multiple times for the same record. + * All the modifications we do to the rdata chains below must handle that. 
*/ - static BkpBlock dtbuf_xlg[XLR_MAX_BKP_BLOCKS]; - static XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS]; - static XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS]; - static XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS]; - static XLogRecData hdr_rdt; - static XLogRecord *rechdr; - if (rechdr == NULL) - { - static char rechdrbuf[SizeOfXLogRecord + MAXIMUM_ALIGNOF]; + /* The record begins with the fixed-size header */ + rechdr = (XLogRecord *) scratch; + scratch += SizeOfXLogRecord; - rechdr = (XLogRecord *) MAXALIGN(&rechdrbuf); - MemSet(rechdr, 0, SizeOfXLogRecord); - } - - /* The record begins with the header */ - hdr_rdt.data = (char *) rechdr; - hdr_rdt.len = SizeOfXLogRecord; - hdr_rdt.next = rdata; - total_len = SizeOfXLogRecord; + hdr_rdt.next = NULL; + rdt_datas_last = &hdr_rdt; + hdr_rdt.data = hdr_scratch; /* - * Here we scan the rdata chain, to determine which buffers must be backed - * up. - * - * We add entries for backup blocks to the chain, so that they don't need - * any special treatment in the critical section where the chunks are - * copied into the WAL buffers. Those entries have to be unlinked from the - * chain if we have to loop back here. + * Make an rdata chain containing all the data portions of all block + * references. This includes the data for full-page images. Also append + * the headers for the block references in the scratch buffer. 
*/ - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - dtbuf[i] = InvalidBuffer; - dtbuf_bkp[i] = false; - } - *fpw_lsn = InvalidXLogRecPtr; - len = 0; - for (rdt = rdata;;) + for (block_id = 0; block_id < max_registered_block_id; block_id++) { - if (rdt->buffer == InvalidBuffer) - { - /* Simple data, just include it */ - len += rdt->len; - } - else - { - /* Find info for buffer */ - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - if (rdt->buffer == dtbuf[i]) - { - /* Buffer already referenced by earlier chain item */ - if (dtbuf_bkp[i]) - { - rdt->data = NULL; - rdt->len = 0; - } - else if (rdt->data) - len += rdt->len; - break; - } - if (dtbuf[i] == InvalidBuffer) - { - /* OK, put it in this slot */ - XLogRecPtr page_lsn; - bool needs_backup; + registered_buffer *regbuf = &registered_buffers[block_id]; + bool needs_backup; + bool needs_data; + XLogRecordBlockHeader bkpb; + XLogRecordBlockImageHeader bimg; + bool samerel; - dtbuf[i] = rdt->buffer; - - /* - * Determine whether the buffer has to be backed up. - * - * We assume page LSN is first data on *every* page that - * can be passed to XLogInsert, whether it has the - * standard page layout or not. We don't need to take the - * buffer header lock for PageGetLSN because we hold an - * exclusive lock on the page and/or the relation. 
- */ - page_lsn = PageGetLSN(BufferGetPage(rdt->buffer)); - if (!doPageWrites) - needs_backup = false; - else if (page_lsn <= RedoRecPtr) - needs_backup = true; - else - needs_backup = false; - - if (needs_backup) - { - /* - * The page needs to be backed up, so set up BkpBlock - */ - XLogFillBkpBlock(rdt->buffer, rdt->buffer_std, - &(dtbuf_xlg[i])); - dtbuf_bkp[i] = true; - rdt->data = NULL; - rdt->len = 0; - } - else - { - if (rdt->data) - len += rdt->len; - if (*fpw_lsn == InvalidXLogRecPtr || - page_lsn < *fpw_lsn) - { - *fpw_lsn = page_lsn; - } - } - break; - } - } - if (i >= XLR_MAX_BKP_BLOCKS) - elog(PANIC, "can backup at most %d blocks per xlog record", - XLR_MAX_BKP_BLOCKS); - } - /* Break out of loop when rdt points to last chain item */ - if (rdt->next == NULL) - break; - rdt = rdt->next; - } - total_len += len; - - /* - * Make additional rdata chain entries for the backup blocks, so that we - * don't need to special-case them in the write loop. This modifies the - * original rdata chain, but we keep a pointer to the last regular entry, - * rdt_lastnormal, so that we can undo this if we have to start over. - * - * At the exit of this loop, total_len includes the backup block data. - * - * Also set the appropriate info bits to show which buffers were backed - * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer - * value (ignoring InvalidBuffer) appearing in the rdata chain. 
- */ - *rdt_lastnormal = rdt; - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - BkpBlock *bkpb; - char *page; - - if (!dtbuf_bkp[i]) + if (!regbuf->in_use) continue; - info |= XLR_BKP_BLOCK(i); - - bkpb = &(dtbuf_xlg[i]); - page = (char *) BufferGetBlock(dtbuf[i]); - - rdt->next = &(dtbuf_rdt1[i]); - rdt = rdt->next; - - rdt->data = (char *) bkpb; - rdt->len = sizeof(BkpBlock); - total_len += sizeof(BkpBlock); - - rdt->next = &(dtbuf_rdt2[i]); - rdt = rdt->next; - - if (bkpb->hole_length == 0) + /* Determine if this block needs to be backed up */ + if (regbuf->flags & REGBUF_FORCE_IMAGE) + needs_backup = true; + else if (regbuf->flags & REGBUF_NO_IMAGE) + needs_backup = false; + else if (!doPageWrites) + needs_backup = false; + else { - rdt->data = page; - rdt->len = BLCKSZ; - total_len += BLCKSZ; - rdt->next = NULL; + /* + * We assume page LSN is first data on *every* page that can be + * passed to XLogInsert, whether it has the standard page layout + * or not. + */ + XLogRecPtr page_lsn = PageGetLSN(regbuf->page); + + needs_backup = (page_lsn <= RedoRecPtr); + if (!needs_backup) + { + if (*fpw_lsn == InvalidXLogRecPtr || page_lsn < *fpw_lsn) + *fpw_lsn = page_lsn; + } + } + + /* Determine if the buffer data needs to included */ + if (regbuf->rdata_len == 0) + needs_data = false; + else if ((regbuf->flags & REGBUF_KEEP_DATA) != 0) + needs_data = true; + else + needs_data = !needs_backup; + + bkpb.id = block_id; + bkpb.fork_flags = regbuf->forkno; + bkpb.data_length = 0; + + if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) + bkpb.fork_flags |= BKPBLOCK_WILL_INIT; + + if (needs_backup) + { + Page page = regbuf->page; + + /* + * The page needs to be backed up, so set up *bimg + */ + if (regbuf->flags & REGBUF_STANDARD) + { + /* Assume we can omit data between pd_lower and pd_upper */ + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + 
{ + bimg.hole_offset = lower; + bimg.hole_length = upper - lower; + } + else + { + /* No "hole" to compress out */ + bimg.hole_offset = 0; + bimg.hole_length = 0; + } + } + else + { + /* Not a standard page header, don't try to eliminate "hole" */ + bimg.hole_offset = 0; + bimg.hole_length = 0; + } + + /* Fill in the remaining fields in the XLogRecordBlockData struct */ + bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; + + total_len += BLCKSZ - bimg.hole_length; + + /* + * Construct XLogRecData entries for the page content. + */ + rdt_datas_last->next = &regbuf->bkp_rdatas[0]; + rdt_datas_last = rdt_datas_last->next; + if (bimg.hole_length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = BLCKSZ; + } + else + { + /* must skip the hole */ + rdt_datas_last->data = page; + rdt_datas_last->len = bimg.hole_offset; + + rdt_datas_last->next = &regbuf->bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; + + rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length); + rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length); + } + } + + if (needs_data) + { + /* + * Link the caller-supplied rdata chain for this buffer to the + * overall list. 
+ */ + bkpb.fork_flags |= BKPBLOCK_HAS_DATA; + bkpb.data_length = regbuf->rdata_len; + total_len += regbuf->rdata_len; + + rdt_datas_last->next = regbuf->rdata_head; + rdt_datas_last = regbuf->rdata_tail; + } + + if (prev_regbuf && RelFileNodeEquals(regbuf->rnode, prev_regbuf->rnode)) + { + samerel = true; + bkpb.fork_flags |= BKPBLOCK_SAME_REL; + } + else + samerel = false; + prev_regbuf = regbuf; + + /* Ok, copy the header to the scratch buffer */ + memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); + scratch += SizeOfXLogRecordBlockHeader; + if (needs_backup) + { + memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); + scratch += SizeOfXLogRecordBlockImageHeader; + } + if (!samerel) + { + memcpy(scratch, &regbuf->rnode, sizeof(RelFileNode)); + scratch += sizeof(RelFileNode); + } + memcpy(scratch, &regbuf->block, sizeof(BlockNumber)); + scratch += sizeof(BlockNumber); + } + + /* followed by main data, if any */ + if (mainrdata_len > 0) + { + if (mainrdata_len > 255) + { + *(scratch++) = XLR_BLOCK_ID_DATA_LONG; + memcpy(scratch, &mainrdata_len, sizeof(uint32)); + scratch += sizeof(uint32); } else { - /* must skip the hole */ - rdt->data = page; - rdt->len = bkpb->hole_offset; - total_len += bkpb->hole_offset; - - rdt->next = &(dtbuf_rdt3[i]); - rdt = rdt->next; - - rdt->data = page + (bkpb->hole_offset + bkpb->hole_length); - rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length); - total_len += rdt->len; - rdt->next = NULL; + *(scratch++) = XLR_BLOCK_ID_DATA_SHORT; + *(scratch++) = (uint8) mainrdata_len; + } + rdt_datas_last->next = mainrdata_head; + rdt_datas_last = mainrdata_last; + total_len += mainrdata_len; + } + rdt_datas_last->next = NULL; + + hdr_rdt.len = (scratch - hdr_scratch); + total_len += hdr_rdt.len; /* - * We disallow len == 0 because it provides a useful bit of extra error - * checking in ReadRecord. This means that all callers of XLogInsert - * must supply at least some not-in-a-buffer data. 
However, we make an - * exception for XLOG SWITCH records because we don't want them to ever - * cross a segment boundary. + * Calculate CRC of the data + * + * Note that the record header isn't added into the CRC initially since we + * don't know the prev-link yet. Thus, the CRC will represent the CRC of + * the whole record in the order: rdata, then backup blocks, then record + * header. */ - if (len == 0 && !isLogSwitch) - elog(PANIC, "invalid xlog record length %u", rechdr->xl_len); + INIT_CRC32C(rdata_crc); + COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord); + for (rdt = hdr_rdt.next; rdt != NULL; rdt = rdt->next) + COMP_CRC32C(rdata_crc, rdt->data, rdt->len); /* * Fill in the fields in the record header. Prev-link is filled in later, - * once we know where in the WAL the record will be inserted. CRC is also - * not calculated yet. + * once we know where in the WAL the record will be inserted. The CRC does + * not include the record header yet. */ rechdr->xl_xid = GetCurrentTransactionIdIfAny(); rechdr->xl_tot_len = total_len; - rechdr->xl_len = len; /* doesn't include backup blocks */ rechdr->xl_info = info; rechdr->xl_rmid = rmid; rechdr->xl_prev = InvalidXLogRecPtr; + rechdr->xl_crc = rdata_crc; return &hdr_rdt; } @@ -429,45 +752,41 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) if (lsn <= RedoRecPtr) { - XLogRecData rdata[2]; - BkpBlock bkpb; + int flags; char copied_buffer[BLCKSZ]; char *origdata = (char *) BufferGetBlock(buffer); - - /* Make a BkpBlock struct representing the buffer */ - XLogFillBkpBlock(buffer, buffer_std, &bkpb); + RelFileNode rnode; + ForkNumber forkno; + BlockNumber blkno; /* * Copy buffer so we don't have to worry about concurrent hint bit or * lsn updates. We assume pd_lower/upper cannot be changed without an * exclusive lock, so the contents bkp are not racy. 
- * - * With buffer_std set to false, XLogFillBkpBlock() sets hole_length - * and hole_offset to 0; so the following code is safe for either - * case. */ - memcpy(copied_buffer, origdata, bkpb.hole_offset); - memcpy(copied_buffer + bkpb.hole_offset, - origdata + bkpb.hole_offset + bkpb.hole_length, - BLCKSZ - bkpb.hole_offset - bkpb.hole_length); + if (buffer_std) + { + /* Assume we can omit data between pd_lower and pd_upper */ + Page page = BufferGetPage(buffer); + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; - /* - * Header for backup block. - */ - rdata[0].data = (char *) &bkpb; - rdata[0].len = sizeof(BkpBlock); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + memcpy(copied_buffer, origdata, lower); + memcpy(copied_buffer + upper, origdata + upper, BLCKSZ - upper); + } + else + memcpy(copied_buffer, origdata, BLCKSZ); - /* - * Save copy of the buffer. - */ - rdata[1].data = copied_buffer; - rdata[1].len = BLCKSZ - bkpb.hole_length; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; + XLogBeginInsert(); - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata); + flags = REGBUF_FORCE_IMAGE; + if (buffer_std) + flags |= REGBUF_STANDARD; + + BufferGetTag(buffer, &rnode, &forkno, &blkno); + XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer, flags); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); } return recptr; @@ -489,71 +808,16 @@ XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std) { - BkpBlock bkpb; + int flags; XLogRecPtr recptr; - XLogRecData rdata[3]; - - /* NO ELOG(ERROR) from here till newpage op is logged */ - START_CRIT_SECTION(); - - bkpb.node = *rnode; - bkpb.fork = forkNum; - bkpb.block = blkno; + flags = REGBUF_FORCE_IMAGE; if (page_std) - { - /* Assume we can omit data between pd_lower and pd_upper */ - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; + flags |= 
REGBUF_STANDARD; - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) - { - bkpb.hole_offset = lower; - bkpb.hole_length = upper - lower; - } - else - { - /* No "hole" to compress out */ - bkpb.hole_offset = 0; - bkpb.hole_length = 0; - } - } - else - { - /* Not a standard page header, don't try to eliminate "hole" */ - bkpb.hole_offset = 0; - bkpb.hole_length = 0; - } - - rdata[0].data = (char *) &bkpb; - rdata[0].len = sizeof(BkpBlock); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - - if (bkpb.hole_length == 0) - { - rdata[1].data = (char *) page; - rdata[1].len = BLCKSZ; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; - } - else - { - /* must skip the hole */ - rdata[1].data = (char *) page; - rdata[1].len = bkpb.hole_offset; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = &rdata[2]; - - rdata[2].data = (char *) page + (bkpb.hole_offset + bkpb.hole_length); - rdata[2].len = BLCKSZ - (bkpb.hole_offset + bkpb.hole_length); - rdata[2].buffer = InvalidBuffer; - rdata[2].next = NULL; - } - - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata); + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags); + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); /* * The page may be uninitialized. If so, we can't set the LSN because that @@ -564,8 +828,6 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, PageSetLSN(page, recptr); } - END_CRIT_SECTION(); - return recptr; } @@ -596,38 +858,38 @@ log_newpage_buffer(Buffer buffer, bool page_std) } /* - * Fill a BkpBlock for a buffer. + * Allocate working buffers needed for WAL record construction. 
*/ -static void -XLogFillBkpBlock(Buffer buffer, bool buffer_std, BkpBlock *bkpb) +void +InitXLogInsert(void) { - BufferGetTag(buffer, &bkpb->node, &bkpb->fork, &bkpb->block); - - if (buffer_std) + /* Initialize the working areas */ + if (xloginsert_cxt == NULL) { - /* Assume we can omit data between pd_lower and pd_upper */ - Page page = BufferGetPage(buffer); - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; + xloginsert_cxt = AllocSetContextCreate(TopMemoryContext, + "WAL record construction", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + } - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) - { - bkpb->hole_offset = lower; - bkpb->hole_length = upper - lower; - } - else - { - /* No "hole" to compress out */ - bkpb->hole_offset = 0; - bkpb->hole_length = 0; - } - } - else + if (registered_buffers == NULL) { - /* Not a standard page header, don't try to eliminate "hole" */ - bkpb->hole_offset = 0; - bkpb->hole_length = 0; + registered_buffers = (registered_buffer *) + MemoryContextAllocZero(xloginsert_cxt, + sizeof(registered_buffer) * (XLR_NORMAL_MAX_BLOCK_ID + 1)); + max_registered_buffers = XLR_NORMAL_MAX_BLOCK_ID + 1; } + if (rdatas == NULL) + { + rdatas = MemoryContextAlloc(xloginsert_cxt, + sizeof(XLogRecData) * XLR_NORMAL_RDATAS); + max_rdatas = XLR_NORMAL_RDATAS; + } + + /* + * Allocate a buffer to hold the header information for a WAL record. + */ + if (hdr_scratch == NULL) + hdr_scratch = palloc0(HEADER_SCRATCH_SIZE); } diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 7d573cc585..67d6223436 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -37,6 +37,8 @@ report_invalid_record(XLogReaderState *state, const char *fmt,...) the supplied arguments. 
*/ __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3))); +static void ResetDecoder(XLogReaderState *state); + /* size of the buffer allocated for error message. */ #define MAX_ERRORMSG_LEN 1000 @@ -59,46 +61,33 @@ report_invalid_record(XLogReaderState *state, const char *fmt,...) /* * Allocate and initialize a new XLogReader. * - * Returns NULL if the xlogreader couldn't be allocated. + * The returned XLogReader is palloc'd. (In FRONTEND code, that means that + * running out-of-memory causes an immediate exit(1). */ XLogReaderState * XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data) { XLogReaderState *state; - AssertArg(pagereadfunc != NULL); + state = (XLogReaderState *) palloc0(sizeof(XLogReaderState)); - state = (XLogReaderState *) malloc(sizeof(XLogReaderState)); - if (!state) - return NULL; - MemSet(state, 0, sizeof(XLogReaderState)); + state->max_block_id = -1; /* * Permanently allocate readBuf. We do it this way, rather than just * making a static array, for two reasons: (1) no need to waste the * storage in most instantiations of the backend; (2) a static char array - * isn't guaranteed to have any particular alignment, whereas malloc() + * isn't guaranteed to have any particular alignment, whereas palloc() * will provide MAXALIGN'd storage. 
*/ - state->readBuf = (char *) malloc(XLOG_BLCKSZ); - if (!state->readBuf) - { - free(state); - return NULL; - } + state->readBuf = (char *) palloc(XLOG_BLCKSZ); state->read_page = pagereadfunc; /* system_identifier initialized to zeroes above */ state->private_data = private_data; /* ReadRecPtr and EndRecPtr initialized to zeroes above */ /* readSegNo, readOff, readLen, readPageTLI initialized to zeroes above */ - state->errormsg_buf = malloc(MAX_ERRORMSG_LEN + 1); - if (!state->errormsg_buf) - { - free(state->readBuf); - free(state); - return NULL; - } + state->errormsg_buf = palloc(MAX_ERRORMSG_LEN + 1); state->errormsg_buf[0] = '\0'; /* @@ -107,9 +96,9 @@ XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data) */ if (!allocate_recordbuf(state, 0)) { - free(state->errormsg_buf); - free(state->readBuf); - free(state); + pfree(state->errormsg_buf); + pfree(state->readBuf); + pfree(state); return NULL; } @@ -119,11 +108,24 @@ XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data) void XLogReaderFree(XLogReaderState *state) { - free(state->errormsg_buf); + int block_id; + + for (block_id = 0; block_id <= state->max_block_id; block_id++) + { + if (state->blocks[block_id].in_use) + { + if (state->blocks[block_id].data) + pfree(state->blocks[block_id].data); + } + } + if (state->main_data) + pfree(state->main_data); + + pfree(state->errormsg_buf); if (state->readRecordBuf) - free(state->readRecordBuf); - free(state->readBuf); - free(state); + pfree(state->readRecordBuf); + pfree(state->readBuf); + pfree(state); } /* @@ -146,14 +148,8 @@ allocate_recordbuf(XLogReaderState *state, uint32 reclength) newSize = Max(newSize, 5 * Max(BLCKSZ, XLOG_BLCKSZ)); if (state->readRecordBuf) - free(state->readRecordBuf); - state->readRecordBuf = (char *) malloc(newSize); - if (!state->readRecordBuf) - { - state->readRecordBufSize = 0; - return false; - } - + pfree(state->readRecordBuf); + state->readRecordBuf = (char *) palloc(newSize); state->readRecordBufSize 
= newSize; return true; } @@ -191,6 +187,8 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) *errormsg = NULL; state->errormsg_buf[0] = '\0'; + ResetDecoder(state); + if (RecPtr == InvalidXLogRecPtr) { RecPtr = state->EndRecPtr; @@ -440,7 +438,10 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) state->EndRecPtr -= state->EndRecPtr % XLogSegSize; } - return record; + if (DecodeXLogRecord(state, record, errormsg)) + return record; + else + return NULL; err: @@ -579,30 +580,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess) { - /* - * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is - * required. - */ - if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH) - { - if (record->xl_len != 0) - { - report_invalid_record(state, - "invalid xlog switch record at %X/%X", - (uint32) (RecPtr >> 32), (uint32) RecPtr); - return false; - } - } - else if (record->xl_len == 0) - { - report_invalid_record(state, - "record with zero length at %X/%X", - (uint32) (RecPtr >> 32), (uint32) RecPtr); - return false; - } - if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len || - record->xl_tot_len > SizeOfXLogRecord + record->xl_len + - XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ)) + if (record->xl_tot_len < SizeOfXLogRecord) { report_invalid_record(state, "invalid record length at %X/%X", @@ -663,79 +641,17 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * We assume all of the record (that is, xl_tot_len bytes) has been read * into memory at *record. Also, ValidXLogRecordHeader() has accepted the * record's header, which means in particular that xl_tot_len is at least - * SizeOfXlogRecord, so it is safe to fetch xl_len. + * SizeOfXlogRecord. 
*/ static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) { pg_crc32 crc; - int i; - uint32 len = record->xl_len; - BkpBlock bkpb; - char *blk; - size_t remaining = record->xl_tot_len; - /* First the rmgr data */ - if (remaining < SizeOfXLogRecord + len) - { - /* ValidXLogRecordHeader() should've caught this already... */ - report_invalid_record(state, "invalid record length at %X/%X", - (uint32) (recptr >> 32), (uint32) recptr); - return false; - } - remaining -= SizeOfXLogRecord + len; + /* Calculate the CRC */ INIT_CRC32C(crc); - COMP_CRC32C(crc, XLogRecGetData(record), len); - - /* Add in the backup blocks, if any */ - blk = (char *) XLogRecGetData(record) + len; - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - uint32 blen; - - if (!(record->xl_info & XLR_BKP_BLOCK(i))) - continue; - - if (remaining < sizeof(BkpBlock)) - { - report_invalid_record(state, - "invalid backup block size in record at %X/%X", - (uint32) (recptr >> 32), (uint32) recptr); - return false; - } - memcpy(&bkpb, blk, sizeof(BkpBlock)); - - if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ) - { - report_invalid_record(state, - "incorrect hole size in record at %X/%X", - (uint32) (recptr >> 32), (uint32) recptr); - return false; - } - blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length; - - if (remaining < blen) - { - report_invalid_record(state, - "invalid backup block size in record at %X/%X", - (uint32) (recptr >> 32), (uint32) recptr); - return false; - } - remaining -= blen; - COMP_CRC32C(crc, blk, blen); - blk += blen; - } - - /* Check that xl_tot_len agrees with our calculation */ - if (remaining != 0) - { - report_invalid_record(state, - "incorrect total length in record at %X/%X", - (uint32) (recptr >> 32), (uint32) recptr); - return false; - } - - /* Finally include the record header */ + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + /* include the record header last */ COMP_CRC32C(crc, (char *) 
record, offsetof(XLogRecord, xl_crc)); FIN_CRC32C(crc); @@ -985,3 +901,321 @@ out: } #endif /* FRONTEND */ + + +/* ---------------------------------------- + * Functions for decoding the data and block references in a record. + * ---------------------------------------- + */ + +/* private function to reset the state between records */ +static void +ResetDecoder(XLogReaderState *state) +{ + int block_id; + + state->decoded_record = NULL; + + state->main_data_len = 0; + + for (block_id = 0; block_id <= state->max_block_id; block_id++) + { + state->blocks[block_id].in_use = false; + state->blocks[block_id].has_image = false; + state->blocks[block_id].has_data = false; + } + state->max_block_id = -1; +} + +/* + * Decode the previously read record. + * + * On error, a human-readable error message is returned in *errormsg, and + * the return value is false. + */ +bool +DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) +{ + /* + * read next _size bytes from record buffer, but check for overrun first. 
+ */ +#define COPY_HEADER_FIELD(_dst, _size) \ + do { \ + if (remaining < _size) \ + goto shortdata_err; \ + memcpy(_dst, ptr, _size); \ + ptr += _size; \ + remaining -= _size; \ + } while(0) + + char *ptr; + uint32 remaining; + uint32 datatotal; + RelFileNode *rnode = NULL; + uint8 block_id; + + ResetDecoder(state); + + state->decoded_record = record; + + ptr = (char *) record; + ptr += SizeOfXLogRecord; + remaining = record->xl_tot_len - SizeOfXLogRecord; + + /* Decode the headers */ + datatotal = 0; + while (remaining > datatotal) + { + COPY_HEADER_FIELD(&block_id, sizeof(uint8)); + + if (block_id == XLR_BLOCK_ID_DATA_SHORT) + { + /* XLogRecordDataHeaderShort */ + uint8 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint8)); + + state->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_DATA_LONG) + { + /* XLogRecordDataHeaderLong */ + uint32 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint32)); + state->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id <= XLR_MAX_BLOCK_ID) + { + /* XLogRecordBlockHeader */ + DecodedBkpBlock *blk; + uint8 fork_flags; + + if (block_id <= state->max_block_id) + { + report_invalid_record(state, + "out-of-order block_id %u at %X/%X", + block_id, + (uint32) (state->ReadRecPtr >> 32), + (uint32) state->ReadRecPtr); + goto err; + } + state->max_block_id = block_id; + + blk = &state->blocks[block_id]; + blk->in_use = true; + + COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); + blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; + blk->flags = fork_flags; + blk->has_image = ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0); + blk->has_data = ((fork_flags & BKPBLOCK_HAS_DATA) != 0); + + COPY_HEADER_FIELD(&blk->data_len, sizeof(uint16)); + /* cross-check that the HAS_DATA flag is set iff data_length > 
0 */ + if (blk->has_data && blk->data_len == 0) + report_invalid_record(state, + "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + if (!blk->has_data && blk->data_len != 0) + report_invalid_record(state, + "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X", + (unsigned int) blk->data_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + datatotal += blk->data_len; + + if (blk->has_image) + { + COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); + COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); + datatotal += BLCKSZ - blk->hole_length; + } + if (!(fork_flags & BKPBLOCK_SAME_REL)) + { + COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode)); + rnode = &blk->rnode; + } + else + { + if (rnode == NULL) + { + report_invalid_record(state, + "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + } + + blk->rnode = *rnode; + } + COPY_HEADER_FIELD(&blk->blkno, sizeof(BlockNumber)); + } + else + { + report_invalid_record(state, + "invalid block_id %u at %X/%X", + block_id, + (uint32) (state->ReadRecPtr >> 32), + (uint32) state->ReadRecPtr); + goto err; + } + } + + if (remaining != datatotal) + goto shortdata_err; + + /* + * Ok, we've parsed the fragment headers, and verified that the total + * length of the payload in the fragments is equal to the amount of data + * left. Copy the data of each fragment to a separate buffer. + * + * We could just set up pointers into readRecordBuf, but we want to align + * the data for the convenience of the callers. Backup images are not + * copied, however; they don't need alignment. 
+ */ + + /* block data first */ + for (block_id = 0; block_id <= state->max_block_id; block_id++) + { + DecodedBkpBlock *blk = &state->blocks[block_id]; + + if (!blk->in_use) + continue; + if (blk->has_image) + { + blk->bkp_image = ptr; + ptr += BLCKSZ - blk->hole_length; + } + if (blk->has_data) + { + if (!blk->data || blk->data_len > blk->data_bufsz) + { + if (blk->data) + pfree(blk->data); + blk->data_bufsz = blk->data_len; + blk->data = palloc(blk->data_bufsz); + } + memcpy(blk->data, ptr, blk->data_len); + ptr += blk->data_len; + } + } + + /* and finally, the main data */ + if (state->main_data_len > 0) + { + if (!state->main_data || state->main_data_len > state->main_data_bufsz) + { + if (state->main_data) + pfree(state->main_data); + state->main_data_bufsz = state->main_data_len; + state->main_data = palloc(state->main_data_bufsz); + } + memcpy(state->main_data, ptr, state->main_data_len); + ptr += state->main_data_len; + } + + return true; + +shortdata_err: + report_invalid_record(state, + "record with invalid length at %X/%X", + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); +err: + *errormsg = state->errormsg_buf; + + return false; +} + +/* + * Returns information about the block that a block reference refers to. + * + * If the WAL record contains a block reference with the given ID, *rnode, + * *forknum, and *blknum are filled in (if not NULL), and returns TRUE. + * Otherwise returns FALSE. + */ +bool +XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, + RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum) +{ + DecodedBkpBlock *bkpb; + + if (!record->blocks[block_id].in_use) + return false; + + bkpb = &record->blocks[block_id]; + if (rnode) + *rnode = bkpb->rnode; + if (forknum) + *forknum = bkpb->forknum; + if (blknum) + *blknum = bkpb->blkno; + return true; +} + +/* + * Returns the data associated with a block reference, or NULL if there is + * no data (e.g. because a full-page image was taken instead). 
The returned + * pointer points to a MAXALIGNed buffer. + */ +char * +XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len) +{ + DecodedBkpBlock *bkpb; + + if (!record->blocks[block_id].in_use) + return NULL; + + bkpb = &record->blocks[block_id]; + + if (!bkpb->has_data) + { + if (len) + *len = 0; + return NULL; + } + else + { + if (len) + *len = bkpb->data_len; + return bkpb->data; + } +} + +/* + * Restore a full-page image from a backup block attached to an XLOG record. + * + * Returns the buffer number containing the page. + */ +bool +RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) +{ + DecodedBkpBlock *bkpb; + + if (!record->blocks[block_id].in_use) + return false; + if (!record->blocks[block_id].has_image) + return false; + + bkpb = &record->blocks[block_id]; + + if (bkpb->hole_length == 0) + { + memcpy(page, bkpb->bkp_image, BLCKSZ); + } + else + { + memcpy(page, bkpb->bkp_image, bkpb->hole_offset); + /* must zero-fill the hole */ + MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); + memcpy(page + (bkpb->hole_offset + bkpb->hole_length), + bkpb->bkp_image + bkpb->hole_offset, + BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); + } + + return true; +} diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index cf04081c19..ae323a0db8 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -253,9 +253,8 @@ XLogCheckInvalidPages(void) * * 'lsn' is the LSN of the record being replayed. It is compared with the * page's LSN to determine if the record has already been replayed. - * 'rnode' and 'blkno' point to the block being replayed (main fork number - * is implied, use XLogReadBufferForRedoExtended for other forks). - * 'block_index' identifies the backup block in the record for the page. + * 'block_id' is the ID number the block was registered with, when the WAL + * record was created. 
* * Returns one of the following: * @@ -272,15 +271,36 @@ XLogCheckInvalidPages(void) * single-process crash recovery, but some subroutines such as MarkBufferDirty * will complain if we don't have the lock. In hot standby mode it's * definitely necessary.) + * + * Note: when a backup block is available in XLOG, we restore it + * unconditionally, even if the page in the database appears newer. This is + * to protect ourselves against database pages that were partially or + * incorrectly written during a crash. We assume that the XLOG data must be + * good because it has passed a CRC check, while the database page might not + * be. This will force us to replay all subsequent modifications of the page + * that appear in XLOG, rather than possibly ignoring them as already + * applied, but that's not a huge drawback. */ XLogRedoAction -XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, int block_index, - RelFileNode rnode, BlockNumber blkno, +XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf) { - return XLogReadBufferForRedoExtended(lsn, record, block_index, - rnode, MAIN_FORKNUM, blkno, - RBM_NORMAL, false, buf); + return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL, + false, buf); +} + +/* + * Pin and lock a buffer referenced by a WAL record, for the purpose of + * re-initializing it. + */ +Buffer +XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id) +{ + Buffer buf; + + XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false, + &buf); + return buf; } /* @@ -299,21 +319,54 @@ XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, int block_index, * using LockBufferForCleanup(), instead of a regular exclusive lock. 
*/ XLogRedoAction -XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record, - int block_index, RelFileNode rnode, - ForkNumber forkno, BlockNumber blkno, +XLogReadBufferForRedoExtended(XLogReaderState *record, + uint8 block_id, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf) { - if (record->xl_info & XLR_BKP_BLOCK(block_index)) + XLogRecPtr lsn = record->EndRecPtr; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + Page page; + + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) { - *buf = RestoreBackupBlock(lsn, record, block_index, - get_cleanup_lock, true); + /* Caller specified a bogus block_id */ + elog(PANIC, "failed to locate backup block with ID %d", block_id); + } + + /* If it's a full-page image, restore it. */ + if (XLogRecHasBlockImage(record, block_id)) + { + *buf = XLogReadBufferExtended(rnode, forknum, blkno, + get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); + page = BufferGetPage(*buf); + if (!RestoreBlockImage(record, block_id, page)) + elog(ERROR, "failed to restore block image"); + + /* + * The page may be uninitialized. If so, we can't set the LSN because + * that would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, lsn); + } + + MarkBufferDirty(*buf); + return BLK_RESTORED; } else { - *buf = XLogReadBufferExtended(rnode, forkno, blkno, mode); + if ((record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0 && + mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) + { + elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); + } + + *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode); if (BufferIsValid(*buf)) { if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) @@ -333,37 +386,6 @@ XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record, } } -/* - * XLogReadBuffer - * Read a page during XLOG replay. 
- * - * This is a shorthand of XLogReadBufferExtended() followed by - * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), for reading from the main - * fork. - * - * (Getting the buffer lock is not really necessary during single-process - * crash recovery, but some subroutines such as MarkBufferDirty will complain - * if we don't have the lock. In hot standby mode it's definitely necessary.) - * - * The returned buffer is exclusively-locked. - * - * For historical reasons, instead of a ReadBufferMode argument, this only - * supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false) - * modes. - */ -Buffer -XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) -{ - Buffer buf; - - buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, - init ? RBM_ZERO_AND_LOCK : RBM_NORMAL); - if (BufferIsValid(buf) && !init) - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - - return buf; -} - /* * XLogReadBufferExtended * Read a page during XLOG replay @@ -383,6 +405,11 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't * exist, and we don't check for all-zeroes. Thus, no log entry is made * to imply that the page should be dropped or truncated later. + * + * NB: A redo function should normally not call this directly. To get a page + * to modify, use XLogReplayBuffer instead. It is important that all pages + * modified by a WAL record are registered in the WAL records, or they will be + * invisible to tools that that need to know which pages are modified. */ Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, @@ -473,124 +500,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, return buffer; } -/* - * Restore a full-page image from a backup block attached to an XLOG record. - * - * lsn: LSN of the XLOG record being replayed - * record: the complete XLOG record - * block_index: which backup block to restore (0 .. 
XLR_MAX_BKP_BLOCKS - 1) - * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock - * keep_buffer: TRUE to return the buffer still locked and pinned - * - * Returns the buffer number containing the page. Note this is not terribly - * useful unless keep_buffer is specified as TRUE. - * - * Note: when a backup block is available in XLOG, we restore it - * unconditionally, even if the page in the database appears newer. - * This is to protect ourselves against database pages that were partially - * or incorrectly written during a crash. We assume that the XLOG data - * must be good because it has passed a CRC check, while the database - * page might not be. This will force us to replay all subsequent - * modifications of the page that appear in XLOG, rather than possibly - * ignoring them as already applied, but that's not a huge drawback. - * - * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer, - * else a normal exclusive lock is used. During crash recovery, that's just - * pro forma because there can't be any regular backends in the system, but - * in hot standby mode the distinction is important. - * - * If 'keep_buffer' is true, return without releasing the buffer lock and pin; - * then caller is responsible for doing UnlockReleaseBuffer() later. This - * is needed in some cases when replaying XLOG records that touch multiple - * pages, to prevent inconsistent states from being visible to other backends. - * (Again, that's only important in hot standby mode.) 
- */ -Buffer -RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index, - bool get_cleanup_lock, bool keep_buffer) -{ - BkpBlock bkpb; - char *blk; - int i; - - /* Locate requested BkpBlock in the record */ - blk = (char *) XLogRecGetData(record) + record->xl_len; - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - if (!(record->xl_info & XLR_BKP_BLOCK(i))) - continue; - - memcpy(&bkpb, blk, sizeof(BkpBlock)); - blk += sizeof(BkpBlock); - - if (i == block_index) - { - /* Found it, apply the update */ - return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock, - keep_buffer); - } - - blk += BLCKSZ - bkpb.hole_length; - } - - /* Caller specified a bogus block_index */ - elog(ERROR, "failed to restore block_index %d", block_index); - return InvalidBuffer; /* keep compiler quiet */ -} - -/* - * Workhorse for RestoreBackupBlock usable without an xlog record - * - * Restores a full-page image from BkpBlock and a data pointer. - */ -Buffer -RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk, - bool get_cleanup_lock, bool keep_buffer) -{ - Buffer buffer; - Page page; - - buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block, - get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); - Assert(BufferIsValid(buffer)); - - page = (Page) BufferGetPage(buffer); - - if (bkpb.hole_length == 0) - { - memcpy((char *) page, blk, BLCKSZ); - } - else - { - memcpy((char *) page, blk, bkpb.hole_offset); - /* must zero-fill the hole */ - MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length); - memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length), - blk + bkpb.hole_offset, - BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); - } - - /* - * The checksum value on this page is currently invalid. We don't need to - * reset it here since it will be set before being written. - */ - - /* - * The page may be uninitialized. If so, we can't set the LSN because that - * would corrupt the page. 
- */ - if (!PageIsNew(page)) - { - PageSetLSN(page, lsn); - } - MarkBufferDirty(buffer); - - if (!keep_buffer) - UnlockReleaseBuffer(buffer); - - return buffer; -} - /* * Struct actually returned by XLogFakeRelcacheEntry, though the declared * return type is Relation. diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 46780e71d6..3f5e1700f0 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -125,7 +125,6 @@ void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum) { xl_smgr_create xlrec; - XLogRecData rdata; /* * Make an XLOG entry reporting the file creation. @@ -133,12 +132,9 @@ log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum) xlrec.rnode = *rnode; xlrec.forkNum = forkNum; - rdata.data = (char *) &xlrec; - rdata.len = sizeof(xlrec); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); } /* @@ -268,18 +264,16 @@ RelationTruncate(Relation rel, BlockNumber nblocks) * Make an XLOG entry reporting the file truncation. 
*/ XLogRecPtr lsn; - XLogRecData rdata; xl_smgr_truncate xlrec; xlrec.blkno = nblocks; xlrec.rnode = rel->rd_node; - rdata.data = (char *) &xlrec; - rdata.len = sizeof(xlrec); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &rdata); + lsn = XLogInsert(RM_SMGR_ID, + XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); /* * Flush, because otherwise the truncation of the main relation might @@ -479,12 +473,13 @@ AtSubAbort_smgr(void) } void -smgr_redo(XLogRecPtr lsn, XLogRecord *record) +smgr_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* Backup blocks are not used in smgr records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); if (info == XLOG_SMGR_CREATE) { @@ -505,8 +500,8 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record) /* * Forcibly create relation if it doesn't exist (which suggests that * it was dropped somewhere later in the WAL sequence). As in - * XLogReadBuffer, we prefer to recreate the rel and replay the log as - * best we can until the drop is seen. + * XLogReadBufferForRedo, we prefer to recreate the rel and replay the + * log as best we can until the drop is seen. 
*/ smgrcreate(reln, MAIN_FORKNUM, true); diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 94c82d3741..1a5244cade 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -619,19 +619,17 @@ createdb(const CreatedbStmt *stmt) /* Record the filesystem change in XLOG */ { xl_dbase_create_rec xlrec; - XLogRecData rdata[1]; xlrec.db_id = dboid; xlrec.tablespace_id = dsttablespace; xlrec.src_db_id = src_dboid; xlrec.src_tablespace_id = srctablespace; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_dbase_create_rec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_rec)); - (void) XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE, rdata); + (void) XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); } } heap_endscan(scan); @@ -1226,19 +1224,17 @@ movedb(const char *dbname, const char *tblspcname) */ { xl_dbase_create_rec xlrec; - XLogRecData rdata[1]; xlrec.db_id = db_id; xlrec.tablespace_id = dst_tblspcoid; xlrec.src_db_id = db_id; xlrec.src_tablespace_id = src_tblspcoid; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_dbase_create_rec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_rec)); - (void) XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE, rdata); + (void) XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); } /* @@ -1330,17 +1326,15 @@ movedb(const char *dbname, const char *tblspcname) */ { xl_dbase_drop_rec xlrec; - XLogRecData rdata[1]; xlrec.db_id = db_id; xlrec.tablespace_id = src_tblspcoid; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_dbase_drop_rec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_drop_rec)); - (void) XLogInsert(RM_DBASE_ID, XLOG_DBASE_DROP, 
rdata); + (void) XLogInsert(RM_DBASE_ID, + XLOG_DBASE_DROP | XLR_SPECIAL_REL_UPDATE); } /* Now it's safe to release the database lock */ @@ -1870,17 +1864,15 @@ remove_dbtablespaces(Oid db_id) /* Record the filesystem change in XLOG */ { xl_dbase_drop_rec xlrec; - XLogRecData rdata[1]; xlrec.db_id = db_id; xlrec.tablespace_id = dsttablespace; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_dbase_drop_rec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_drop_rec)); - (void) XLogInsert(RM_DBASE_ID, XLOG_DBASE_DROP, rdata); + (void) XLogInsert(RM_DBASE_ID, + XLOG_DBASE_DROP | XLR_SPECIAL_REL_UPDATE); } pfree(dstpath); @@ -2043,12 +2035,12 @@ get_database_name(Oid dbid) * DATABASE resource manager's routines */ void -dbase_redo(XLogRecPtr lsn, XLogRecord *record) +dbase_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* Backup blocks are not used in dbase records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); if (info == XLOG_DBASE_CREATE) { diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index cb8b27a93c..ba5b938863 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -372,20 +372,16 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) { xl_seq_rec xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); xlrec.node = rel->rd_node; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_seq_rec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) tuple->t_data; - rdata[1].len = tuple->t_len; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; + XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); + XLogRegisterData((char *) tuple->t_data, tuple->t_len); 
- recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG, rdata); + recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } @@ -454,21 +450,17 @@ AlterSequence(AlterSeqStmt *stmt) { xl_seq_rec xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; Page page = BufferGetPage(buf); + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); + xlrec.node = seqrel->rd_node; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_seq_rec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); - rdata[1].data = (char *) seqtuple.t_data; - rdata[1].len = seqtuple.t_len; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; + XLogRegisterData((char *) seqtuple.t_data, seqtuple.t_len); - recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG, rdata); + recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } @@ -706,7 +698,6 @@ nextval_internal(Oid relid) { xl_seq_rec xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; /* * We don't log the current state of the tuple, but rather the state @@ -714,6 +705,8 @@ nextval_internal(Oid relid) * that many future WAL records, at the cost that we lose those * sequence values if we crash. 
*/ + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); /* set values that will be saved in xlog */ seq->last_value = next; @@ -721,17 +714,11 @@ nextval_internal(Oid relid) seq->log_cnt = 0; xlrec.node = seqrel->rd_node; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_seq_rec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) seqtuple.t_data; - rdata[1].len = seqtuple.t_len; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; + XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); + XLogRegisterData((char *) seqtuple.t_data, seqtuple.t_len); - recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG, rdata); + recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } @@ -894,21 +881,16 @@ do_setval(Oid relid, int64 next, bool iscalled) { xl_seq_rec xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; Page page = BufferGetPage(buf); + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); + xlrec.node = seqrel->rd_node; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_seq_rec); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); + XLogRegisterData((char *) &xlrec, sizeof(xl_seq_rec)); + XLogRegisterData((char *) seqtuple.t_data, seqtuple.t_len); - rdata[1].data = (char *) seqtuple.t_data; - rdata[1].len = seqtuple.t_len; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; - - recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG, rdata); + recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG); PageSetLSN(page, recptr); } @@ -1552,9 +1534,10 @@ pg_sequence_parameters(PG_FUNCTION_ARGS) void -seq_redo(XLogRecPtr lsn, XLogRecord *record) +seq_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; Buffer buffer; Page page; Page localpage; @@ -1563,14 +1546,10 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record) xl_seq_rec *xlrec = (xl_seq_rec *) 
XLogRecGetData(record); sequence_magic *sm; - /* Backup blocks are not used in seq records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); - if (info != XLOG_SEQ_LOG) elog(PANIC, "seq_redo: unknown op code %u", info); - buffer = XLogReadBuffer(xlrec->node, 0, true); - Assert(BufferIsValid(buffer)); + buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); /* @@ -1589,7 +1568,7 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record) sm->magic = SEQ_MAGIC; item = (char *) xlrec + sizeof(xl_seq_rec); - itemsz = record->xl_len - sizeof(xl_seq_rec); + itemsz = XLogRecGetDataLen(record) - sizeof(xl_seq_rec); if (PageAddItem(localpage, (Item) item, itemsz, FirstOffsetNumber, false, false) == InvalidOffsetNumber) diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 616308bc2d..3c9af5776a 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -354,20 +354,15 @@ CreateTableSpace(CreateTableSpaceStmt *stmt) /* Record the filesystem change in XLOG */ { xl_tblspc_create_rec xlrec; - XLogRecData rdata[2]; xlrec.ts_id = tablespaceoid; - rdata[0].data = (char *) &xlrec; - rdata[0].len = offsetof(xl_tblspc_create_rec, ts_path); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) location; - rdata[1].len = strlen(location) + 1; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, + offsetof(xl_tblspc_create_rec, ts_path)); + XLogRegisterData((char *) location, strlen(location) + 1); - (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE, rdata); + (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE); } /* @@ -515,15 +510,13 @@ DropTableSpace(DropTableSpaceStmt *stmt) /* Record the filesystem change in XLOG */ { xl_tblspc_drop_rec xlrec; - XLogRecData rdata[1]; xlrec.ts_id = tablespaceoid; - rdata[0].data = (char *) &xlrec; - rdata[0].len = sizeof(xl_tblspc_drop_rec); - rdata[0].buffer = 
InvalidBuffer; - rdata[0].next = NULL; - (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_DROP, rdata); + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_tblspc_drop_rec)); + + (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_DROP); } /* @@ -1408,12 +1401,12 @@ get_tablespace_name(Oid spc_oid) * TABLESPACE resource manager's routines */ void -tblspc_redo(XLogRecPtr lsn, XLogRecord *record) +tblspc_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* Backup blocks are not used in tblspc records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); if (info == XLOG_TBLSPC_CREATE) { diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 8e78aafda7..1c7dac38fc 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -31,7 +31,9 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xlogutils.h" #include "access/xlogreader.h" +#include "access/xlogrecord.h" #include "catalog/pg_control.h" @@ -46,8 +48,7 @@ typedef struct XLogRecordBuffer { XLogRecPtr origptr; XLogRecPtr endptr; - XLogRecord record; - char *record_data; + XLogReaderState *record; } XLogRecordBuffer; /* RMGR Handlers */ @@ -79,17 +80,16 @@ static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tup); * context. 
*/ void -LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record) +LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *record) { XLogRecordBuffer buf; buf.origptr = ctx->reader->ReadRecPtr; buf.endptr = ctx->reader->EndRecPtr; - buf.record = *record; - buf.record_data = XLogRecGetData(record); + buf.record = record; /* cast so we get a warning when new rmgrs are added */ - switch ((RmgrIds) buf.record.xl_rmid) + switch ((RmgrIds) XLogRecGetRmid(record)) { /* * Rmgrs we care about for logical decoding. Add new rmgrs in @@ -135,7 +135,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record) case RM_BRIN_ID: break; case RM_NEXT_ID: - elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid); + elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) XLogRecGetRmid(buf.record)); } } @@ -146,7 +146,7 @@ static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { SnapBuild *builder = ctx->snapshot_builder; - uint8 info = buf->record.xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK; switch (info) { @@ -185,8 +185,8 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { SnapBuild *builder = ctx->snapshot_builder; ReorderBuffer *reorder = ctx->reorder; - XLogRecord *r = &buf->record; - uint8 info = r->xl_info & ~XLR_INFO_MASK; + XLogReaderState *r = buf->record; + uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; /* no point in doing anything yet, data could not be decoded anyway */ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) @@ -200,12 +200,12 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) TransactionId *subxacts = NULL; SharedInvalidationMessage *invals = NULL; - xlrec = (xl_xact_commit *) buf->record_data; + xlrec = (xl_xact_commit *) XLogRecGetData(r); subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); invals = (SharedInvalidationMessage *) 
&(subxacts[xlrec->nsubxacts]); - DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId, + DecodeCommit(ctx, buf, XLogRecGetXid(r), xlrec->dbId, xlrec->xact_time, xlrec->nsubxacts, subxacts, xlrec->nmsgs, invals); @@ -220,7 +220,7 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) SharedInvalidationMessage *invals = NULL; /* Prepared commits contain a normal commit record... */ - prec = (xl_xact_commit_prepared *) buf->record_data; + prec = (xl_xact_commit_prepared *) XLogRecGetData(r); xlrec = &prec->crec; subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); @@ -237,9 +237,9 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { xl_xact_commit_compact *xlrec; - xlrec = (xl_xact_commit_compact *) buf->record_data; + xlrec = (xl_xact_commit_compact *) XLogRecGetData(r); - DecodeCommit(ctx, buf, r->xl_xid, InvalidOid, + DecodeCommit(ctx, buf, XLogRecGetXid(r), InvalidOid, xlrec->xact_time, xlrec->nsubxacts, xlrec->subxacts, 0, NULL); @@ -250,11 +250,11 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) xl_xact_abort *xlrec; TransactionId *sub_xids; - xlrec = (xl_xact_abort *) buf->record_data; + xlrec = (xl_xact_abort *) XLogRecGetData(r); sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); - DecodeAbort(ctx, buf->origptr, r->xl_xid, + DecodeAbort(ctx, buf->origptr, XLogRecGetXid(r), sub_xids, xlrec->nsubxacts); break; } @@ -265,7 +265,7 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) TransactionId *sub_xids; /* prepared abort contain a normal commit abort... 
*/ - prec = (xl_xact_abort_prepared *) buf->record_data; + prec = (xl_xact_abort_prepared *) XLogRecGetData(r); xlrec = &prec->arec; sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); @@ -282,7 +282,7 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) int i; TransactionId *sub_xid; - xlrec = (xl_xact_assignment *) buf->record_data; + xlrec = (xl_xact_assignment *) XLogRecGetData(r); sub_xid = &xlrec->xsub[0]; @@ -316,14 +316,14 @@ static void DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { SnapBuild *builder = ctx->snapshot_builder; - XLogRecord *r = &buf->record; - uint8 info = r->xl_info & ~XLR_INFO_MASK; + XLogReaderState *r = buf->record; + uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; switch (info) { case XLOG_RUNNING_XACTS: { - xl_running_xacts *running = (xl_running_xacts *) buf->record_data; + xl_running_xacts *running = (xl_running_xacts *) XLogRecGetData(r); SnapBuildProcessRunningXacts(builder, buf->origptr, running); @@ -352,8 +352,8 @@ DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK; - TransactionId xid = buf->record.xl_xid; + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); SnapBuild *builder = ctx->snapshot_builder; /* no point in doing anything yet */ @@ -370,7 +370,7 @@ DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { xl_heap_new_cid *xlrec; - xlrec = (xl_heap_new_cid *) buf->record_data; + xlrec = (xl_heap_new_cid *) XLogRecGetData(buf->record); SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec); break; @@ -405,8 +405,8 @@ DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) static void DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK; - TransactionId xid = 
buf->record.xl_xid; + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); SnapBuild *builder = ctx->snapshot_builder; /* no point in doing anything yet */ @@ -576,34 +576,35 @@ DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - XLogRecord *r = &buf->record; + XLogReaderState *r = buf->record; xl_heap_insert *xlrec; ReorderBufferChange *change; + RelFileNode target_node; - xlrec = (xl_heap_insert *) buf->record_data; + xlrec = (xl_heap_insert *) XLogRecGetData(r); /* only interested in our database */ - if (xlrec->target.node.dbNode != ctx->slot->data.database) + XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL); + if (target_node.dbNode != ctx->slot->data.database) return; change = ReorderBufferGetChange(ctx->reorder); change->action = REORDER_BUFFER_CHANGE_INSERT; - memcpy(&change->data.tp.relnode, &xlrec->target.node, sizeof(RelFileNode)); + memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) { - Assert(r->xl_len > (SizeOfHeapInsert + SizeOfHeapHeader)); + Size tuplelen; + char *tupledata = XLogRecGetBlockData(r, 0, &tuplelen); change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); - DecodeXLogTuple((char *) xlrec + SizeOfHeapInsert, - r->xl_len - SizeOfHeapInsert, - change->data.tp.newtuple); + DecodeXLogTuple(tupledata, tuplelen, change->data.tp.newtuple); } change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); } /* @@ -615,62 +616,47 @@ DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - XLogRecord *r = &buf->record; + XLogReaderState *r = buf->record; xl_heap_update *xlrec; - 
xl_heap_header_len xlhdr; ReorderBufferChange *change; char *data; + Size datalen; + RelFileNode target_node; - xlrec = (xl_heap_update *) buf->record_data; + xlrec = (xl_heap_update *) XLogRecGetData(r); /* only interested in our database */ - if (xlrec->target.node.dbNode != ctx->slot->data.database) + XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL); + if (target_node.dbNode != ctx->slot->data.database) return; change = ReorderBufferGetChange(ctx->reorder); change->action = REORDER_BUFFER_CHANGE_UPDATE; - memcpy(&change->data.tp.relnode, &xlrec->target.node, sizeof(RelFileNode)); - - /* caution, remaining data in record is not aligned */ - data = buf->record_data + SizeOfHeapUpdate; + memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) { - Assert(r->xl_len > (SizeOfHeapUpdate + SizeOfHeapHeaderLen)); - - memcpy(&xlhdr, data, sizeof(xlhdr)); - data += offsetof(xl_heap_header_len, header); + data = XLogRecGetBlockData(r, 0, &datalen); change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); - DecodeXLogTuple(data, - xlhdr.t_len + SizeOfHeapHeader, - change->data.tp.newtuple); - /* skip over the rest of the tuple header */ - data += SizeOfHeapHeader; - /* skip over the tuple data */ - data += xlhdr.t_len; + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); } if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD) { - memcpy(&xlhdr, data, sizeof(xlhdr)); - data += offsetof(xl_heap_header_len, header); + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfHeapUpdate; change->data.tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder); - DecodeXLogTuple(data, - xlhdr.t_len + SizeOfHeapHeader, - change->data.tp.oldtuple); -#ifdef NOT_USED - data += SizeOfHeapHeader; - data += xlhdr.t_len; -#endif + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); } change->data.tp.clear_toast_afterwards = true; - 
ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); } /* @@ -681,36 +667,38 @@ DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - XLogRecord *r = &buf->record; + XLogReaderState *r = buf->record; xl_heap_delete *xlrec; ReorderBufferChange *change; + RelFileNode target_node; - xlrec = (xl_heap_delete *) buf->record_data; + xlrec = (xl_heap_delete *) XLogRecGetData(r); /* only interested in our database */ - if (xlrec->target.node.dbNode != ctx->slot->data.database) + XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL); + if (target_node.dbNode != ctx->slot->data.database) return; change = ReorderBufferGetChange(ctx->reorder); change->action = REORDER_BUFFER_CHANGE_DELETE; - memcpy(&change->data.tp.relnode, &xlrec->target.node, sizeof(RelFileNode)); + memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); /* old primary key stored */ if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD) { - Assert(r->xl_len > (SizeOfHeapDelete + SizeOfHeapHeader)); + Assert(XLogRecGetDataLen(r) > (SizeOfHeapDelete + SizeOfHeapHeader)); change->data.tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder); DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete, - r->xl_len - SizeOfHeapDelete, + XLogRecGetDataLen(r) - SizeOfHeapDelete, change->data.tp.oldtuple); } change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); } /* @@ -721,27 +709,24 @@ DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - XLogRecord *r = &buf->record; + XLogReaderState *r = buf->record; xl_heap_multi_insert *xlrec; int i; char *data; - bool isinit = (r->xl_info & 
XLOG_HEAP_INIT_PAGE) != 0; + char *tupledata; + Size tuplelen; + RelFileNode rnode; - xlrec = (xl_heap_multi_insert *) buf->record_data; + xlrec = (xl_heap_multi_insert *) XLogRecGetData(r); /* only interested in our database */ - if (xlrec->node.dbNode != ctx->slot->data.database) + XLogRecGetBlockTag(r, 0, &rnode, NULL, NULL); + if (rnode.dbNode != ctx->slot->data.database) return; - data = buf->record_data + SizeOfHeapMultiInsert; - - /* - * OffsetNumbers (which are not of interest to us) are stored when - * XLOG_HEAP_INIT_PAGE is not set -- skip over them. - */ - if (!isinit) - data += sizeof(OffsetNumber) * xlrec->ntuples; + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + data = tupledata; for (i = 0; i < xlrec->ntuples; i++) { ReorderBufferChange *change; @@ -751,7 +736,7 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change = ReorderBufferGetChange(ctx->reorder); change->action = REORDER_BUFFER_CHANGE_INSERT; - memcpy(&change->data.tp.relnode, &xlrec->node, sizeof(RelFileNode)); + memcpy(&change->data.tp.relnode, &rnode, sizeof(RelFileNode)); /* * CONTAINS_NEW_TUPLE will always be set currently as multi_insert @@ -806,9 +791,10 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) else change->data.tp.clear_toast_afterwards = false; - ReorderBufferQueueChange(ctx->reorder, r->xl_xid, + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); } + Assert(data == tupledata + tuplelen); } /* diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 875b89a628..8c318cd4b5 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -34,6 +34,7 @@ #include "miscadmin.h" #include "access/xact.h" +#include "access/xlog_internal.h" #include "replication/decode.h" #include "replication/logical.h" @@ -455,12 +456,12 @@ DecodingContextFindStartpoint(LogicalDecodingContext *ctx) record = 
XLogReadRecord(ctx->reader, startptr, &err); if (err) elog(ERROR, "%s", err); - - Assert(record); + if (!record) + elog(ERROR, "no record found"); /* shouldn't happen */ startptr = InvalidXLogRecPtr; - LogicalDecodingProcessRecord(ctx, record); + LogicalDecodingProcessRecord(ctx, ctx->reader); /* only continue till we found a consistent spot */ if (DecodingContextReady(ctx)) diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c index 3a5ec2f61d..1977f098c7 100644 --- a/src/backend/replication/logical/logicalfuncs.c +++ b/src/backend/replication/logical/logicalfuncs.c @@ -21,6 +21,8 @@ #include "funcapi.h" #include "miscadmin.h" +#include "access/xlog_internal.h" + #include "catalog/pg_type.h" #include "nodes/makefuncs.h" @@ -431,7 +433,7 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin * store the description into our tuplestore. */ if (record != NULL) - LogicalDecodingProcessRecord(ctx, record); + LogicalDecodingProcessRecord(ctx, ctx->reader); /* check limits */ if (upto_lsn != InvalidXLogRecPtr && diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 7d8f40738d..6e75398eab 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -54,6 +54,7 @@ #include "access/transam.h" #include "access/tuptoaster.h" #include "access/xact.h" +#include "access/xlog_internal.h" #include "catalog/catalog.h" #include "lib/binaryheap.h" #include "miscadmin.h" diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 200b54d7c2..20f9b04adf 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -699,7 +699,7 @@ SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn); 
ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn, - xlrec->target.node, xlrec->target.tid, + xlrec->target_node, xlrec->target_tid, xlrec->cmin, xlrec->cmax, xlrec->combocid); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 385d18ba1b..addae8f6ce 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2444,7 +2444,7 @@ XLogSendLogical(void) if (record != NULL) { - LogicalDecodingProcessRecord(logical_decoding_ctx, record); + LogicalDecodingProcessRecord(logical_decoding_ctx, logical_decoding_ctx->reader); sentPtr = logical_decoding_ctx->reader->EndRecPtr; } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 8c3720bc73..4269dda66b 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -759,12 +759,12 @@ StandbyReleaseOldLocks(int nxids, TransactionId *xids) */ void -standby_redo(XLogRecPtr lsn, XLogRecord *record) +standby_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* Backup blocks are not used in standby records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); /* Do nothing if we're not in hot standby mode */ if (standbyState == STANDBY_DISABLED) @@ -928,8 +928,6 @@ static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) { xl_running_xacts xlrec; - XLogRecData rdata[2]; - int lastrdata = 0; XLogRecPtr recptr; xlrec.xcnt = CurrRunningXacts->xcnt; @@ -940,23 +938,15 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; /* Header */ - rdata[0].data = (char *) (&xlrec); - rdata[0].len = MinSizeOfXactRunningXacts; - rdata[0].buffer = InvalidBuffer; + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts); /* array of TransactionIds */ 
if (xlrec.xcnt > 0) - { - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) CurrRunningXacts->xids; - rdata[1].len = (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId); - rdata[1].buffer = InvalidBuffer; - lastrdata = 1; - } + XLogRegisterData((char *) CurrRunningXacts->xids, + (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); - rdata[lastrdata].next = NULL; - - recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS, rdata); + recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); if (CurrRunningXacts->subxid_overflow) elog(trace_recovery(DEBUG2), @@ -996,22 +986,15 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks) { - XLogRecData rdata[2]; xl_standby_locks xlrec; xlrec.nlocks = nlocks; - rdata[0].data = (char *) &xlrec; - rdata[0].len = offsetof(xl_standby_locks, locks); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &rdata[1]; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks)); + XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock)); - rdata[1].data = (char *) locks; - rdata[1].len = nlocks * sizeof(xl_standby_lock); - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; - - (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK, rdata); + (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK); } /* diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c index d1f64e58c8..b90d6b5c7f 100644 --- a/src/backend/utils/cache/relmapper.c +++ b/src/backend/utils/cache/relmapper.c @@ -754,7 +754,6 @@ write_relmap_file(bool shared, RelMapFile *newmap, if (write_wal) { xl_relmap_update xlrec; - XLogRecData rdata[2]; XLogRecPtr lsn; /* now errors are fatal ... 
*/ @@ -764,16 +763,11 @@ write_relmap_file(bool shared, RelMapFile *newmap, xlrec.tsid = tsid; xlrec.nbytes = sizeof(RelMapFile); - rdata[0].data = (char *) (&xlrec); - rdata[0].len = MinSizeOfRelmapUpdate; - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) newmap; - rdata[1].len = sizeof(RelMapFile); - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate); + XLogRegisterData((char *) newmap, sizeof(RelMapFile)); - lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata); + lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE); /* As always, WAL must hit the disk before the data update does */ XLogFlush(lsn); @@ -907,12 +901,12 @@ perform_relmap_update(bool shared, const RelMapFile *updates) * RELMAP resource manager's routines */ void -relmap_redo(XLogRecPtr lsn, XLogRecord *record) +relmap_redo(XLogReaderState *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* Backup blocks are not used in relmap records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + Assert(!XLogRecHasAnyBlockRefs(record)); if (info == XLOG_RELMAP_UPDATE) { diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index 2ba9946982..666e8dbaa2 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -1006,6 +1006,7 @@ WriteEmptyXLOG(void) char path[MAXPGPATH]; int fd; int nbytes; + char *recptr; /* Use malloc() to ensure buffer is MAXALIGNED */ buffer = (char *) pg_malloc(XLOG_BLCKSZ); @@ -1023,18 +1024,21 @@ WriteEmptyXLOG(void) longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; /* Insert the initial checkpoint record */ - record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD); + recptr = (char *) page + SizeOfXLogLongPHD; + record = (XLogRecord *) recptr; record->xl_prev = 0; record->xl_xid = InvalidTransactionId; - record->xl_tot_len = 
SizeOfXLogRecord + sizeof(CheckPoint); - record->xl_len = sizeof(CheckPoint); + record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint); record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; record->xl_rmid = RM_XLOG_ID; - memcpy(XLogRecGetData(record), &ControlFile.checkPointCopy, + recptr += SizeOfXLogRecord; + *(recptr++) = XLR_BLOCK_ID_DATA_SHORT; + *(recptr++) = sizeof(CheckPoint); + memcpy(recptr, &ControlFile.checkPointCopy, sizeof(CheckPoint)); INIT_CRC32C(crc); - COMP_CRC32C(crc, &ControlFile.checkPointCopy, sizeof(CheckPoint)); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); FIN_CRC32C(crc); record->xl_crc = crc; diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h index d748db4d0c..6dc9eb3eca 100644 --- a/src/include/access/brin_xlog.h +++ b/src/include/access/brin_xlog.h @@ -14,7 +14,7 @@ #ifndef BRIN_XLOG_H #define BRIN_XLOG_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/bufpage.h" #include "storage/itemptr.h" @@ -42,59 +42,82 @@ */ #define XLOG_BRIN_INIT_PAGE 0x80 -/* This is what we need to know about a BRIN index create */ +/* + * This is what we need to know about a BRIN index create. + * + * Backup block 0: metapage + */ typedef struct xl_brin_createidx { BlockNumber pagesPerRange; - RelFileNode node; uint16 version; } xl_brin_createidx; #define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16)) /* * This is what we need to know about a BRIN tuple insert + * + * Backup block 0: main page, block data is the new BrinTuple. 
+ * Backup block 1: revmap page */ typedef struct xl_brin_insert { - RelFileNode node; BlockNumber heapBlk; /* extra information needed to update the revmap */ - BlockNumber revmapBlk; BlockNumber pagesPerRange; - uint16 tuplen; - ItemPointerData tid; - /* tuple data follows at end of struct */ + /* offset number in the main page to insert the tuple to. */ + OffsetNumber offnum; } xl_brin_insert; -#define SizeOfBrinInsert (offsetof(xl_brin_insert, tid) + sizeof(ItemPointerData)) +#define SizeOfBrinInsert (offsetof(xl_brin_insert, offnum) + sizeof(OffsetNumber)) /* - * A cross-page update is the same as an insert, but also store the old tid. + * A cross-page update is the same as an insert, but also stores information + * about the old tuple. + * + * Like in xlog_brin_update: + * Backup block 0: new page, block data includes the new BrinTuple. + * Backup block 1: revmap page + * + * And in addition: + * Backup block 2: old page */ typedef struct xl_brin_update { - ItemPointerData oldtid; + /* offset number of old tuple on old page */ + OffsetNumber oldOffnum; + xl_brin_insert insert; } xl_brin_update; #define SizeOfBrinUpdate (offsetof(xl_brin_update, insert) + SizeOfBrinInsert) -/* This is what we need to know about a BRIN tuple samepage update */ +/* + * This is what we need to know about a BRIN tuple samepage update + * + * Backup block 0: updated page, with new BrinTuple as block data + */ typedef struct xl_brin_samepage_update { - RelFileNode node; - ItemPointerData tid; - /* tuple data follows at end of struct */ + OffsetNumber offnum; } xl_brin_samepage_update; -#define SizeOfBrinSamepageUpdate (offsetof(xl_brin_samepage_update, tid) + sizeof(ItemPointerData)) +#define SizeOfBrinSamepageUpdate (sizeof(OffsetNumber)) -/* This is what we need to know about a revmap extension */ +/* + * This is what we need to know about a revmap extension + * + * Backup block 0: metapage + * Backup block 1: new revmap page + */ typedef struct xl_brin_revmap_extend { - 
RelFileNode node; + /* + * XXX: This is actually redundant - the block number is stored as part of + * backup block 1. + */ BlockNumber targetBlk; } xl_brin_revmap_extend; @@ -102,8 +125,8 @@ typedef struct xl_brin_revmap_extend sizeof(BlockNumber)) -extern void brin_desc(StringInfo buf, XLogRecord *record); -extern void brin_redo(XLogRecPtr lsn, XLogRecord *record); +extern void brin_redo(XLogReaderState *record); +extern void brin_desc(StringInfo buf, XLogReaderState *record); extern const char *brin_identify(uint8 info); #endif /* BRIN_XLOG_H */ diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 04ac4ba311..fe5e4c634d 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -11,7 +11,7 @@ #ifndef CLOG_H #define CLOG_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" /* @@ -48,8 +48,8 @@ extern void TruncateCLOG(TransactionId oldestXact); #define CLOG_ZEROPAGE 0x00 #define CLOG_TRUNCATE 0x10 -extern void clog_redo(XLogRecPtr lsn, XLogRecord *record); -extern void clog_desc(StringInfo buf, XLogRecord *record); +extern void clog_redo(XLogReaderState *record); +extern void clog_desc(StringInfo buf, XLogReaderState *record); extern const char *clog_identify(uint8 info); #endif /* CLOG_H */ diff --git a/src/include/access/gin.h b/src/include/access/gin.h index 433e56f20d..fe5f77b173 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -10,7 +10,7 @@ #ifndef GIN_H #define GIN_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/block.h" #include "utils/relcache.h" @@ -74,8 +74,8 @@ extern void ginGetStats(Relation index, GinStatsData *stats); extern void ginUpdateStats(Relation index, const GinStatsData *stats); /* ginxlog.c */ -extern void gin_redo(XLogRecPtr lsn, XLogRecord *record); -extern void gin_desc(StringInfo buf, XLogRecord *record); +extern void gin_redo(XLogReaderState *record); +extern void 
gin_desc(StringInfo buf, XLogReaderState *record); extern const char *gin_identify(uint8 info); extern void gin_xlog_startup(void); extern void gin_xlog_cleanup(void); diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 333316d78e..3d46f20bb8 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -13,7 +13,6 @@ #include "access/genam.h" #include "access/gin.h" #include "access/itup.h" -#include "access/xloginsert.h" #include "fmgr.h" #include "storage/bufmgr.h" #include "utils/rbtree.h" @@ -397,22 +396,22 @@ typedef struct typedef struct ginxlogCreatePostingTree { - RelFileNode node; - BlockNumber blkno; uint32 size; /* A compressed posting list follows */ } ginxlogCreatePostingTree; -#define XLOG_GIN_INSERT 0x20 - /* * The format of the insertion record varies depending on the page type. * ginxlogInsert is the common part between all variants. + * + * Backup Blk 0: target page + * Backup Blk 1: left child, if this insertion finishes an incomplete split */ + +#define XLOG_GIN_INSERT 0x20 + typedef struct { - RelFileNode node; - BlockNumber blkno; uint16 flags; /* GIN_SPLIT_ISLEAF and/or GIN_SPLIT_ISDATA */ /* @@ -477,14 +476,17 @@ typedef struct PostingItem newitem; } ginxlogInsertDataInternal; - +/* + * Backup Blk 0: new left page (= original page, if not root split) + * Backup Blk 1: new right page + * Backup Blk 2: original page / new root page, if root split + * Backup Blk 3: left child, if this insertion completes an earlier split + */ #define XLOG_GIN_SPLIT 0x30 typedef struct ginxlogSplit { RelFileNode node; - BlockNumber lblkno; - BlockNumber rblkno; BlockNumber rrlink; /* right link, or root's blocknumber if root * split */ BlockNumber leftChildBlkno; /* valid on a non-leaf split */ @@ -538,15 +540,6 @@ typedef struct */ #define XLOG_GIN_VACUUM_PAGE 0x40 -typedef struct ginxlogVacuumPage -{ - RelFileNode node; - BlockNumber blkno; - uint16 hole_offset; /* number of bytes before "hole" */ - 
uint16 hole_length; /* number of bytes in "hole" */ - /* entire page contents (minus the hole) follow at end of record */ -} ginxlogVacuumPage; - /* * Vacuuming posting tree leaf page is WAL-logged like recompression caused * by insertion. @@ -555,26 +548,28 @@ typedef struct ginxlogVacuumPage typedef struct ginxlogVacuumDataLeafPage { - RelFileNode node; - BlockNumber blkno; - ginxlogRecompressDataLeaf data; } ginxlogVacuumDataLeafPage; +/* + * Backup Blk 0: deleted page + * Backup Blk 1: parent + * Backup Blk 2: left sibling + */ #define XLOG_GIN_DELETE_PAGE 0x50 typedef struct ginxlogDeletePage { - RelFileNode node; - BlockNumber blkno; - BlockNumber parentBlkno; OffsetNumber parentOffset; - BlockNumber leftBlkno; BlockNumber rightLink; } ginxlogDeletePage; #define XLOG_GIN_UPDATE_META_PAGE 0x60 +/* + * Backup Blk 0: metapage + * Backup Blk 1: tail page + */ typedef struct ginxlogUpdateMeta { RelFileNode node; @@ -591,22 +586,29 @@ typedef struct ginxlogUpdateMeta typedef struct ginxlogInsertListPage { - RelFileNode node; - BlockNumber blkno; BlockNumber rightlink; int32 ntuples; /* array of inserted tuples follows */ } ginxlogInsertListPage; +/* + * Backup Blk 0: metapage + * Backup Blk 1 to (ndeleted + 1): deleted pages + */ + #define XLOG_GIN_DELETE_LISTPAGE 0x80 -#define GIN_NDELETE_AT_ONCE 16 +/* + * The WAL record for deleting list pages must contain a block reference to + * all the deleted pages, so the number of pages that can be deleted in one + * record is limited by XLR_MAX_BLOCK_ID. (block_id 0 is used for the + * metapage.) 
+ */ +#define GIN_NDELETE_AT_ONCE Min(16, XLR_MAX_BLOCK_ID - 1) typedef struct ginxlogDeleteListPages { - RelFileNode node; GinMetaPageData metadata; int32 ndeleted; - BlockNumber toDelete[GIN_NDELETE_AT_ONCE]; } ginxlogDeleteListPages; @@ -673,7 +675,7 @@ typedef struct GinBtreeData /* insert methods */ OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber); - GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, XLogRecData **, Page *, Page *); + GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, Page *, Page *); void *(*prepareDownlink) (GinBtree, Buffer); void (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, Page); diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 21daf3b2b6..2cbc918ad1 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -16,7 +16,7 @@ #include "access/gist.h" #include "access/itup.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "fmgr.h" #include "storage/bufmgr.h" #include "storage/buffile.h" @@ -185,34 +185,33 @@ typedef GISTScanOpaqueData *GISTScanOpaque; #define XLOG_GIST_CREATE_INDEX 0x50 /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ +/* + * Backup Blk 0: updated page. + * Backup Blk 1: If this operation completes a page split, by inserting a + * downlink for the split page, the left half of the split + */ typedef struct gistxlogPageUpdate { - RelFileNode node; - BlockNumber blkno; - - /* - * If this operation completes a page split, by inserting a downlink for - * the split page, leftchild points to the left half of the split. - */ - BlockNumber leftchild; - /* number of deleted offsets */ uint16 ntodelete; + uint16 ntoinsert; /* - * follow: 1. todelete OffsetNumbers 2. tuples to insert + * In payload of blk 0 : 1. todelete OffsetNumbers 2. 
tuples to insert */ } gistxlogPageUpdate; +/* + * Backup Blk 0: If this operation completes a page split, by inserting a + * downlink for the split page, the left half of the split + * Backup Blk 1 - npage: split pages (1 is the original page) + */ typedef struct gistxlogPageSplit { - RelFileNode node; - BlockNumber origblkno; /* splitted page */ BlockNumber origrlink; /* rightlink of the page before split */ GistNSN orignsn; /* NSN of the page before split */ bool origleaf; /* was splitted page a leaf page? */ - BlockNumber leftchild; /* like in gistxlogPageUpdate */ uint16 npage; /* # of pages in the split */ bool markfollowright; /* set F_FOLLOW_RIGHT flags */ @@ -451,8 +450,8 @@ extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate); /* gistxlog.c */ -extern void gist_redo(XLogRecPtr lsn, XLogRecord *record); -extern void gist_desc(StringInfo buf, XLogRecord *record); +extern void gist_redo(XLogReaderState *record); +extern void gist_desc(StringInfo buf, XLogReaderState *record); extern const char *gist_identify(uint8 info); extern void gist_xlog_startup(void); extern void gist_xlog_cleanup(void); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index c175a5c182..afd06ff7de 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -20,7 +20,7 @@ #include "access/genam.h" #include "access/itup.h" #include "access/sdir.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "fmgr.h" #include "lib/stringinfo.h" #include "storage/bufmgr.h" @@ -356,8 +356,8 @@ extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); /* hash.c */ -extern void hash_redo(XLogRecPtr lsn, XLogRecord *record); -extern void hash_desc(StringInfo buf, XLogRecord *record); +extern void hash_redo(XLogReaderState *record); +extern void hash_desc(StringInfo buf, XLogReaderState *record); extern const char 
*hash_identify(uint8 info); #endif /* HASH_H */ diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 1d64264b01..853e2dd491 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -15,7 +15,7 @@ #define HEAPAM_XLOG_H #include "access/htup.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/buf.h" #include "storage/bufpage.h" @@ -78,27 +78,11 @@ #define XLOG_HEAP_CONTAINS_OLD \ (XLOG_HEAP_CONTAINS_OLD_TUPLE | XLOG_HEAP_CONTAINS_OLD_KEY) -/* - * All what we need to find changed tuple - * - * NB: on most machines, sizeof(xl_heaptid) will include some trailing pad - * bytes for alignment. We don't want to store the pad space in the XLOG, - * so use SizeOfHeapTid for space calculations. Similar comments apply for - * the other xl_FOO structs. - */ -typedef struct xl_heaptid -{ - RelFileNode node; - ItemPointerData tid; /* changed tuple id */ -} xl_heaptid; - -#define SizeOfHeapTid (offsetof(xl_heaptid, tid) + SizeOfIptrData) - /* This is what we need to know about delete */ typedef struct xl_heap_delete { - xl_heaptid target; /* deleted tuple id */ TransactionId xmax; /* xmax of the deleted tuple */ + OffsetNumber offnum; /* deleted tuple's offset */ uint8 infobits_set; /* infomask bits */ uint8 flags; } xl_heap_delete; @@ -122,45 +106,33 @@ typedef struct xl_heap_header #define SizeOfHeapHeader (offsetof(xl_heap_header, t_hoff) + sizeof(uint8)) -/* - * Variant of xl_heap_header that contains the length of the tuple, which is - * useful if the length of the tuple cannot be computed using the overall - * record length. E.g. because there are several tuples inside a single - * record. 
- */ -typedef struct xl_heap_header_len -{ - uint16 t_len; - xl_heap_header header; -} xl_heap_header_len; - -#define SizeOfHeapHeaderLen (offsetof(xl_heap_header_len, header) + SizeOfHeapHeader) - /* This is what we need to know about insert */ typedef struct xl_heap_insert { - xl_heaptid target; /* inserted tuple id */ + OffsetNumber offnum; /* inserted tuple's offset */ uint8 flags; - /* xl_heap_header & TUPLE DATA FOLLOWS AT END OF STRUCT */ + + /* xl_heap_header & TUPLE DATA in backup block 0 */ } xl_heap_insert; #define SizeOfHeapInsert (offsetof(xl_heap_insert, flags) + sizeof(uint8)) /* - * This is what we need to know about a multi-insert. The record consists of - * xl_heap_multi_insert header, followed by a xl_multi_insert_tuple and tuple - * data for each tuple. 'offsets' array is omitted if the whole page is - * reinitialized (XLOG_HEAP_INIT_PAGE) + * This is what we need to know about a multi-insert. + * + * The main data of the record consists of this xl_heap_multi_insert header. + * 'offsets' array is omitted if the whole page is reinitialized + * (XLOG_HEAP_INIT_PAGE). + * + * In block 0's data portion, there is an xl_multi_insert_tuple struct, + * followed by the tuple data for each tuple. There is padding to align + * each xl_multi_insert struct. 
*/ typedef struct xl_heap_multi_insert { - RelFileNode node; - BlockNumber blkno; uint8 flags; uint16 ntuples; OffsetNumber offsets[1]; - - /* TUPLE DATA (xl_multi_insert_tuples) FOLLOW AT END OF STRUCT */ } xl_heap_multi_insert; #define SizeOfHeapMultiInsert offsetof(xl_heap_multi_insert, offsets) @@ -176,34 +148,39 @@ typedef struct xl_multi_insert_tuple #define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8)) -/* This is what we need to know about update|hot_update */ +/* + * This is what we need to know about update|hot_update + * + * Backup blk 0: new page + * + * If XLOG_HEAP_PREFIX_FROM_OLD or XLOG_HEAP_SUFFIX_FROM_OLD flags are set, + * the prefix and/or suffix come first, as one or two uint16s. + * + * After that, xl_heap_header and new tuple data follow. The new tuple + * data doesn't include the prefix and suffix, which are copied from the + * old tuple on replay. + * + * If HEAP_CONTAINS_NEW_TUPLE_DATA flag is given, the tuple data is + * included even if a full-page image was taken. + * + * Backup blk 1: old page, if different. (no data, just a reference to the blk) + */ typedef struct xl_heap_update { - xl_heaptid target; /* deleted tuple id */ TransactionId old_xmax; /* xmax of the old tuple */ - TransactionId new_xmax; /* xmax of the new tuple */ - ItemPointerData newtid; /* new inserted tuple id */ + OffsetNumber old_offnum; /* old tuple's offset */ uint8 old_infobits_set; /* infomask bits to set on old tuple */ uint8 flags; + TransactionId new_xmax; /* xmax of the new tuple */ + OffsetNumber new_offnum; /* new tuple's offset */ /* - * If XLOG_HEAP_PREFIX_FROM_OLD or XLOG_HEAP_SUFFIX_FROM_OLD flags are - * set, the prefix and/or suffix come next, as one or two uint16s. - * - * After that, xl_heap_header_len and new tuple data follow. The new - * tuple data and length don't include the prefix and suffix, which are - * copied from the old tuple on replay. 
The new tuple data is omitted if - * a full-page image of the page was taken (unless the - * XLOG_HEAP_CONTAINS_NEW_TUPLE flag is set, in which case it's included - * anyway). - * * If XLOG_HEAP_CONTAINS_OLD_TUPLE or XLOG_HEAP_CONTAINS_OLD_KEY flags are - * set, another xl_heap_header_len struct and tuple data for the old tuple - * follows. + * set, a xl_heap_header struct and tuple data for the old tuple follows. */ } xl_heap_update; -#define SizeOfHeapUpdate (offsetof(xl_heap_update, flags) + sizeof(uint8)) +#define SizeOfHeapUpdate (offsetof(xl_heap_update, new_offnum) + sizeof(OffsetNumber)) /* * This is what we need to know about vacuum page cleanup/redirect @@ -218,12 +195,10 @@ typedef struct xl_heap_update */ typedef struct xl_heap_clean { - RelFileNode node; - BlockNumber block; TransactionId latestRemovedXid; uint16 nredirected; uint16 ndead; - /* OFFSET NUMBERS FOLLOW */ + /* OFFSET NUMBERS are in the block reference 0 */ } xl_heap_clean; #define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) @@ -251,8 +226,8 @@ typedef struct xl_heap_cleanup_info /* This is what we need to know about lock */ typedef struct xl_heap_lock { - xl_heaptid target; /* locked tuple id */ TransactionId locking_xid; /* might be a MultiXactId not xid */ + OffsetNumber offnum; /* locked tuple's offset on page */ int8 infobits_set; /* infomask and infomask2 bits to set */ } xl_heap_lock; @@ -261,8 +236,8 @@ typedef struct xl_heap_lock /* This is what we need to know about locking an updated version of a row */ typedef struct xl_heap_lock_updated { - xl_heaptid target; TransactionId xmax; + OffsetNumber offnum; uint8 infobits_set; } xl_heap_lock_updated; @@ -271,11 +246,11 @@ typedef struct xl_heap_lock_updated /* This is what we need to know about in-place update */ typedef struct xl_heap_inplace { - xl_heaptid target; /* updated tuple id */ + OffsetNumber offnum; /* updated tuple's offset on page */ /* TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_inplace; 
-#define SizeOfHeapInplace (offsetof(xl_heap_inplace, target) + SizeOfHeapTid) +#define SizeOfHeapInplace (offsetof(xl_heap_inplace, offnum) + sizeof(OffsetNumber)) /* * This struct represents a 'freeze plan', which is what we need to know about @@ -296,23 +271,26 @@ typedef struct xl_heap_freeze_tuple /* * This is what we need to know about a block being frozen during vacuum + * + * Backup block 0's data contains an array of xl_heap_freeze_tuple structs, + * one for each tuple. */ typedef struct xl_heap_freeze_page { - RelFileNode node; - BlockNumber block; TransactionId cutoff_xid; uint16 ntuples; - xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER]; } xl_heap_freeze_page; -#define SizeOfHeapFreezePage offsetof(xl_heap_freeze_page, tuples) +#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, ntuples) + sizeof(uint16)) -/* This is what we need to know about setting a visibility map bit */ +/* + * This is what we need to know about setting a visibility map bit + * + * Backup blk 0: visibility map buffer + * Backup blk 1: heap buffer + */ typedef struct xl_heap_visible { - RelFileNode node; - BlockNumber block; TransactionId cutoff_xid; } xl_heap_visible; @@ -338,10 +316,11 @@ typedef struct xl_heap_new_cid /* * Store the relfilenode/ctid pair to facilitate lookups. 
*/ - xl_heaptid target; + RelFileNode target_node; + ItemPointerData target_tid; } xl_heap_new_cid; -#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target) + SizeOfHeapTid) +#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target_tid) + sizeof(ItemPointerData)) /* logical rewrite xlog record header */ typedef struct xl_heap_rewrite_mapping @@ -357,13 +336,13 @@ typedef struct xl_heap_rewrite_mapping extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId *latestRemovedXid); -extern void heap_redo(XLogRecPtr lsn, XLogRecord *record); -extern void heap_desc(StringInfo buf, XLogRecord *record); +extern void heap_redo(XLogReaderState *record); +extern void heap_desc(StringInfo buf, XLogReaderState *record); extern const char *heap_identify(uint8 info); -extern void heap2_redo(XLogRecPtr lsn, XLogRecord *record); -extern void heap2_desc(StringInfo buf, XLogRecord *record); +extern void heap2_redo(XLogReaderState *record); +extern void heap2_desc(StringInfo buf, XLogReaderState *record); extern const char *heap2_identify(uint8 info); -extern void heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r); +extern void heap_xlog_logical_rewrite(XLogReaderState *r); extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid); diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 294d21bd18..300c2a52f0 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -498,6 +498,7 @@ do { \ * you can, say, fit 2 tuples of size MaxHeapTupleSize/2 on the same page. 
*/ #define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData))) +#define MinHeapTupleSize MAXALIGN(offsetof(HeapTupleHeaderData, t_bits)) /* * MaxHeapTuplesPerPage is an upper bound on the number of tuples that can diff --git a/src/include/access/itup.h b/src/include/access/itup.h index de17936b10..e4dc51e872 100644 --- a/src/include/access/itup.h +++ b/src/include/access/itup.h @@ -133,6 +133,7 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap; * IndexTupleData struct. We arrive at the divisor because each tuple * must be maxaligned, and it must have an associated item pointer. */ +#define MinIndexTupleSize MAXALIGN(sizeof(IndexTupleData) + 1) #define MaxIndexTuplesPerPage \ ((int) ((BLCKSZ - SizeOfPageHeaderData) / \ (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData)))) diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 43d737505d..ac58a3766d 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -11,7 +11,7 @@ #ifndef MULTIXACT_H #define MULTIXACT_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" @@ -135,8 +135,8 @@ extern void multixact_twophase_postcommit(TransactionId xid, uint16 info, extern void multixact_twophase_postabort(TransactionId xid, uint16 info, void *recdata, uint32 len); -extern void multixact_redo(XLogRecPtr lsn, XLogRecord *record); -extern void multixact_desc(StringInfo buf, XLogRecord *record); +extern void multixact_redo(XLogReaderState *record); +extern void multixact_desc(StringInfo buf, XLogReaderState *record); extern const char *multixact_identify(uint8 info); extern char *mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 6ecd2ced62..d3d258bcc9 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -17,7 +17,7 @@ #include "access/genam.h" #include "access/itup.h" 
#include "access/sdir.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "catalog/pg_index.h" #include "lib/stringinfo.h" #include "storage/bufmgr.h" @@ -227,15 +227,6 @@ typedef struct BTMetaPageData #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from * FSM */ -/* - * All that we need to find changed index tuple - */ -typedef struct xl_btreetid -{ - RelFileNode node; - ItemPointerData tid; /* changed tuple id */ -} xl_btreetid; - /* * All that we need to regenerate the meta-data page */ @@ -252,16 +243,17 @@ typedef struct xl_btree_metadata * * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. * Note that INSERT_META implies it's not a leaf page. + * + * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META + * Backup Blk 2: xl_btree_metadata, if INSERT_META */ typedef struct xl_btree_insert { - xl_btreetid target; /* inserted tuple id */ - /* BlockNumber finishes_split field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */ - /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */ - /* INDEX TUPLE FOLLOWS AT END OF STRUCT */ + OffsetNumber offnum; } xl_btree_insert; -#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData) +#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) /* * On insert with split, we save all the items going into the right sibling @@ -278,45 +270,41 @@ typedef struct xl_btree_insert * the root page, and thus that a newroot record rather than an insert or * split record should follow. Note that a split record never carries a * metapage update --- we'll do that in the parent-level update. + * + * Backup Blk 0: original page / new left page + * + * The left page's data portion contains the new item, if it's the _L variant. + * (In the _R variants, the new item is one of the right page's tuples.) 
+ * If level > 0, an IndexTuple representing the HIKEY of the left page + * follows. We don't need this on leaf pages, because it's the same as the + * leftmost key in the new right page. + * + * Backup Blk 1: new right page + * + * The right page's data portion contains the right page's tuples in the + * form used by _bt_restore_page. + * + * Backup Blk 2: next block (orig page's rightlink), if any + * Backup Blk 3: child's left sibling, if non-leaf split */ typedef struct xl_btree_split { - RelFileNode node; - BlockNumber leftsib; /* orig page / new left page */ - BlockNumber rightsib; /* new right page */ - BlockNumber rnext; /* next block (orig page's rightlink) */ uint32 level; /* tree level of page being split */ OffsetNumber firstright; /* first item moved to right page */ - - /* - * In the _L variants, next are OffsetNumber newitemoff and the new item. - * (In the _R variants, the new item is one of the right page's tuples.) - * The new item, but not newitemoff, is suppressed if XLogInsert chooses - * to store the left page's whole page image. - * - * If level > 0, an IndexTuple representing the HIKEY of the left page - * follows. We don't need this on leaf pages, because it's the same as - * the leftmost key in the new right page. Also, it's suppressed if - * XLogInsert chooses to store the left page's whole page image. - * - * If level > 0, BlockNumber of the page whose incomplete-split flag this - * insertion clears. (not aligned) - * - * Last are the right page's tuples in the form used by _bt_restore_page. - */ + OffsetNumber newitemoff; /* new item's offset (if placed on left page) */ } xl_btree_split; -#define SizeOfBtreeSplit (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber)) +#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) /* * This is what we need to know about delete of individual leaf index tuples. 
* The WAL record can represent deletion of any number of index tuples on a * single index page when *not* executed by VACUUM. + * + * Backup Blk 0: index page */ typedef struct xl_btree_delete { - RelFileNode node; /* RelFileNode of the index */ - BlockNumber block; RelFileNode hnode; /* RelFileNode of the heap the index currently * points at */ int nitems; @@ -361,8 +349,6 @@ typedef struct xl_btree_reuse_page */ typedef struct xl_btree_vacuum { - RelFileNode node; - BlockNumber block; BlockNumber lastBlockVacuumed; /* TARGET OFFSET NUMBERS FOLLOW */ @@ -376,10 +362,13 @@ typedef struct xl_btree_vacuum * remove this tuple's downlink and the *following* tuple's key). Note that * the leaf page is empty, so we don't need to store its content --- it is * just reinitialized during recovery using the rest of the fields. + * + * Backup Blk 0: leaf block + * Backup Blk 1: top parent */ typedef struct xl_btree_mark_page_halfdead { - xl_btreetid target; /* deleted tuple id in parent page */ + OffsetNumber poffset; /* deleted tuple id in parent page */ /* information needed to recreate the leaf page: */ BlockNumber leafblk; /* leaf block ultimately being deleted */ @@ -394,11 +383,15 @@ typedef struct xl_btree_mark_page_halfdead * This is what we need to know about deletion of a btree page. Note we do * not store any content for the deleted page --- it is just rewritten as empty * during recovery, apart from resetting the btpo.xact. 
+ * + * Backup Blk 0: target block being deleted + * Backup Blk 1: target block's left sibling, if any + * Backup Blk 2: target block's right sibling + * Backup Blk 3: leaf block (if different from target) + * Backup Blk 4: metapage (if rightsib becomes new fast root) */ typedef struct xl_btree_unlink_page { - RelFileNode node; - BlockNumber deadblk; /* target block being deleted */ BlockNumber leftsib; /* target block's left sibling, if any */ BlockNumber rightsib; /* target block's right sibling */ @@ -406,7 +399,6 @@ typedef struct xl_btree_unlink_page * Information needed to recreate the leaf page, when target is an * internal page. */ - BlockNumber leafblk; BlockNumber leafleftsib; BlockNumber leafrightsib; BlockNumber topparent; /* next child down in the branch */ @@ -423,13 +415,15 @@ typedef struct xl_btree_unlink_page * * Note that although this implies rewriting the metadata page, we don't need * an xl_btree_metadata record --- the rootblk and level are sufficient. + * + * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) + * Backup Blk 1: left child (if splitting an old root) + * Backup Blk 2: metapage */ typedef struct xl_btree_newroot { - RelFileNode node; - BlockNumber rootblk; /* location of new root */ + BlockNumber rootblk; /* location of new root (redundant with blk 0) */ uint32 level; /* its tree level */ - /* 0 or 2 INDEX TUPLES FOLLOW AT END OF STRUCT */ } xl_btree_newroot; #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) @@ -726,8 +720,8 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); /* * prototypes for functions in nbtxlog.c */ -extern void btree_redo(XLogRecPtr lsn, XLogRecord *record); -extern void btree_desc(StringInfo buf, XLogRecord *record); +extern void btree_redo(XLogReaderState *record); +extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); #endif /* NBTREE_H */ diff --git a/src/include/access/spgist.h 
b/src/include/access/spgist.h index ccf1ed7786..3aa96bde86 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -15,7 +15,7 @@ #define SPGIST_H #include "access/skey.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "fmgr.h" #include "lib/stringinfo.h" @@ -197,8 +197,8 @@ extern Datum spgbulkdelete(PG_FUNCTION_ARGS); extern Datum spgvacuumcleanup(PG_FUNCTION_ARGS); /* spgxlog.c */ -extern void spg_redo(XLogRecPtr lsn, XLogRecord *record); -extern void spg_desc(StringInfo buf, XLogRecord *record); +extern void spg_redo(XLogReaderState *record); +extern void spg_desc(StringInfo buf, XLogReaderState *record); extern const char *spg_identify(uint8 info); extern void spg_xlog_startup(void); extern void spg_xlog_cleanup(void); diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index 3330644651..4b6fdee801 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -18,7 +18,6 @@ #include "access/spgist.h" #include "nodes/tidbitmap.h" #include "storage/buf.h" -#include "storage/relfilenode.h" #include "utils/relcache.h" @@ -351,35 +350,8 @@ typedef SpGistDeadTupleData *SpGistDeadTuple; /* * XLOG stuff - * - * ACCEPT_RDATA_* can only use fixed-length rdata arrays, because of lengthof */ -#define ACCEPT_RDATA_DATA(p, s, i) \ - do { \ - Assert((i) < lengthof(rdata)); \ - rdata[i].data = (char *) (p); \ - rdata[i].len = (s); \ - rdata[i].buffer = InvalidBuffer; \ - rdata[i].buffer_std = true; \ - rdata[i].next = NULL; \ - if ((i) > 0) \ - rdata[(i) - 1].next = rdata + (i); \ - } while(0) - -#define ACCEPT_RDATA_BUFFER(b, i) \ - do { \ - Assert((i) < lengthof(rdata)); \ - rdata[i].data = NULL; \ - rdata[i].len = 0; \ - rdata[i].buffer = (b); \ - rdata[i].buffer_std = true; \ - rdata[i].next = NULL; \ - if ((i) > 0) \ - rdata[(i) - 1].next = rdata + (i); \ - } while(0) - - /* XLOG record types for SPGiST */ #define XLOG_SPGIST_CREATE_INDEX 0x00 #define 
XLOG_SPGIST_ADD_LEAF 0x10 @@ -408,36 +380,36 @@ typedef struct spgxlogState (d).isBuild = (s)->isBuild; \ } while(0) - +/* + * Backup Blk 0: destination page for leaf tuple + * Backup Blk 1: parent page (if any) + */ typedef struct spgxlogAddLeaf { - RelFileNode node; - - BlockNumber blknoLeaf; /* destination page for leaf tuple */ bool newPage; /* init dest page? */ bool storesNulls; /* page is in the nulls tree? */ OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */ OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */ - BlockNumber blknoParent; /* where the parent downlink is, if any */ - OffsetNumber offnumParent; + OffsetNumber offnumParent; /* where the parent downlink is, if any */ uint16 nodeI; /* new leaf tuple follows (unaligned!) */ } spgxlogAddLeaf; +/* + * Backup Blk 0: source leaf page + * Backup Blk 1: destination leaf page + * Backup Blk 2: parent page + */ typedef struct spgxlogMoveLeafs { - RelFileNode node; - - BlockNumber blknoSrc; /* source leaf page */ - BlockNumber blknoDst; /* destination leaf page */ uint16 nMoves; /* number of tuples moved from source page */ bool newPage; /* init dest page? */ bool replaceDead; /* are we replacing a DEAD source tuple? */ bool storesNulls; /* pages are in the nulls tree? 
*/ - BlockNumber blknoParent; /* where the parent downlink is */ + /* where the parent downlink is */ OffsetNumber offnumParent; uint16 nodeI; @@ -452,11 +424,6 @@ typedef struct spgxlogMoveLeafs * Note: if replaceDead is true then there is only one inserted tuple * number and only one leaf tuple in the data, because we are not copying * the dead tuple from the source - * - * Buffer references in the rdata array are: - * Src page - * Dest page - * Parent page *---------- */ OffsetNumber offsets[1]; @@ -464,21 +431,43 @@ typedef struct spgxlogMoveLeafs #define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets) +/* + * Backup Blk 0: original page + * Backup Blk 1: where new tuple goes, if not same place + * Backup Blk 2: where parent downlink is, if updated and different from + * the old and new + */ typedef struct spgxlogAddNode { - RelFileNode node; + /* + * Offset of the original inner tuple, in the original page (on backup + * block 0). + */ + OffsetNumber offnum; - BlockNumber blkno; /* block number of original inner tuple */ - OffsetNumber offnum; /* offset of original inner tuple */ - - BlockNumber blknoParent; /* where parent downlink is, if updated */ - OffsetNumber offnumParent; - uint16 nodeI; - - BlockNumber blknoNew; /* where new tuple goes, if not same place */ + /* + * Offset of the new tuple, on the new page (on backup block 1). Invalid, + * if we overwrote the old tuple in the original page). + */ OffsetNumber offnumNew; bool newPage; /* init new page? */ + /*---- + * Where is the parent downlink? parentBlk indicates which page it's on, + * and offnumParent is the offset within the page. 
The possible values for + * parentBlk are: + * + * 0: parent == original page + * 1: parent == new page + * 2: parent == different page (blk ref 2) + * -1: parent not updated + *---- + */ + char parentBlk; + OffsetNumber offnumParent; /* offset within the parent page */ + + uint16 nodeI; + spgxlogState stateSrc; /* @@ -486,41 +475,51 @@ typedef struct spgxlogAddNode */ } spgxlogAddNode; +/* + * Backup Blk 0: where the prefix tuple goes + * Backup Blk 1: where the postfix tuple goes (if different page) + */ typedef struct spgxlogSplitTuple { - RelFileNode node; - - BlockNumber blknoPrefix; /* where the prefix tuple goes */ + /* where the prefix tuple goes */ OffsetNumber offnumPrefix; - BlockNumber blknoPostfix; /* where the postfix tuple goes */ + /* where the postfix tuple goes */ OffsetNumber offnumPostfix; bool newPage; /* need to init that page? */ + bool postfixBlkSame; /* was postfix tuple put on same page as + * prefix? */ /* - * new prefix inner tuple follows, then new postfix inner tuple - * (both are unaligned!) + * new prefix inner tuple follows, then new postfix inner tuple (both are + * unaligned!) */ } spgxlogSplitTuple; +/* + * Buffer references in the rdata array are: + * Backup Blk 0: Src page (only if not root) + * Backup Blk 1: Dest page (if used) + * Backup Blk 2: Inner page + * Backup Blk 3: Parent page (if any, and different from Inner) + */ typedef struct spgxlogPickSplit { - RelFileNode node; + bool isRootSplit; - BlockNumber blknoSrc; /* original leaf page */ - BlockNumber blknoDest; /* other leaf page, if any */ uint16 nDelete; /* n to delete from Src */ uint16 nInsert; /* n to insert on Src and/or Dest */ bool initSrc; /* re-init the Src page? */ bool initDest; /* re-init the Dest page? */ - BlockNumber blknoInner; /* where to put new inner tuple */ + /* where to put new inner tuple */ OffsetNumber offnumInner; bool initInner; /* re-init the Inner page? */ bool storesNulls; /* pages are in the nulls tree? 
*/ - BlockNumber blknoParent; /* where the parent downlink is, if any */ + /* where the parent downlink is, if any */ + bool innerIsParent; /* is parent the same as inner page? */ OffsetNumber offnumParent; uint16 nodeI; @@ -533,24 +532,15 @@ typedef struct spgxlogPickSplit * array of page selector bytes for inserted tuples, length nInsert * new inner tuple (unaligned!) * list of leaf tuples, length nInsert (unaligned!) - * - * Buffer references in the rdata array are: - * Src page (only if not root and not being init'd) - * Dest page (if used and not being init'd) - * Inner page (only if not being init'd) - * Parent page (if any; could be same as Inner) *---------- */ - OffsetNumber offsets[1]; + OffsetNumber offsets[1]; } spgxlogPickSplit; #define SizeOfSpgxlogPickSplit offsetof(spgxlogPickSplit, offsets) typedef struct spgxlogVacuumLeaf { - RelFileNode node; - - BlockNumber blkno; /* block number to clean */ uint16 nDead; /* number of tuples to become DEAD */ uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */ uint16 nMove; /* number of tuples to move */ @@ -576,9 +566,6 @@ typedef struct spgxlogVacuumLeaf typedef struct spgxlogVacuumRoot { /* vacuum a root page when it is also a leaf */ - RelFileNode node; - - BlockNumber blkno; /* block number to clean */ uint16 nDelete; /* number of tuples to delete */ spgxlogState stateSrc; @@ -591,9 +578,6 @@ typedef struct spgxlogVacuumRoot typedef struct spgxlogVacuumRedirect { - RelFileNode node; - - BlockNumber blkno; /* block number to clean */ uint16 nToPlaceholder; /* number of redirects to make placeholders */ OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */ TransactionId newestRedirectXid; /* newest XID of removed redirects */ diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 11a51b2685..b018aa4f5d 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -14,7 +14,7 @@ #ifndef XACT_H #define XACT_H -#include "access/xlogrecord.h" +#include 
"access/xlogreader.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "storage/relfilenode.h" @@ -256,8 +256,8 @@ extern void UnregisterSubXactCallback(SubXactCallback callback, void *arg); extern int xactGetCommittedChildren(TransactionId **ptr); -extern void xact_redo(XLogRecPtr lsn, XLogRecord *record); -extern void xact_desc(StringInfo buf, XLogRecord *record); +extern void xact_redo(XLogReaderState *record); +extern void xact_desc(StringInfo buf, XLogReaderState *record); extern const char *xact_identify(uint8 info); #endif /* XACT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 6f8b5f46e1..d06fbc0ec1 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -14,7 +14,7 @@ #include "access/rmgr.h" #include "access/xlogdefs.h" #include "access/xloginsert.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" @@ -186,7 +186,9 @@ typedef struct CheckpointStatsData extern CheckpointStatsData CheckpointStats; -extern XLogRecPtr XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn); +struct XLogRecData; + +extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, XLogRecPtr fpw_lsn); extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); @@ -198,8 +200,8 @@ extern XLogSegNo XLogGetLastRemovedSegno(void); extern void XLogSetAsyncXactLSN(XLogRecPtr record); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); -extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); -extern void xlog_desc(StringInfo buf, XLogRecord *record); +extern void xlog_redo(XLogReaderState *record); +extern void xlog_desc(StringInfo buf, XLogReaderState *record); extern const char *xlog_identify(uint8 info); extern void issue_xlog_fsync(int fd, XLogSegNo segno); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 19b2ef8d90..423ef4d7fa 
100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -20,7 +20,7 @@ #define XLOG_INTERNAL_H #include "access/xlogdefs.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "pgtime.h" @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD080 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD081 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { @@ -203,6 +203,17 @@ typedef struct xl_end_of_recovery TimeLineID PrevTimeLineID; /* previous TLI we forked off from */ } xl_end_of_recovery; +/* + * The functions in xloginsert.c construct a chain of XLogRecData structs + * to represent the final WAL record. + */ +typedef struct XLogRecData +{ + struct XLogRecData *next; /* next struct in chain, or NULL */ + char *data; /* start of rmgr data to include */ + uint32 len; /* length of rmgr data to include */ +} XLogRecData; + /* * Method table for resource managers. * @@ -219,8 +230,8 @@ typedef struct xl_end_of_recovery typedef struct RmgrData { const char *rm_name; - void (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr); - void (*rm_desc) (StringInfo buf, XLogRecord *rptr); + void (*rm_redo) (XLogReaderState *record); + void (*rm_desc) (StringInfo buf, XLogReaderState *record); const char *(*rm_identify) (uint8 info); void (*rm_startup) (void); void (*rm_cleanup) (void); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 30c2e84cbc..e5ab71e230 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -18,49 +18,43 @@ #include "storage/relfilenode.h" /* - * The rmgr data to be written by XLogInsert() is defined by a chain of - * one or more XLogRecData structs. 
(Multiple structs would be used when - * parts of the source data aren't physically adjacent in memory, or when - * multiple associated buffers need to be specified.) - * - * If buffer is valid then XLOG will check if buffer must be backed up - * (ie, whether this is first change of that page since last checkpoint). - * If so, the whole page contents are attached to the XLOG record, and XLOG - * sets XLR_BKP_BLOCK(N) bit in xl_info. Note that the buffer must be pinned - * and exclusive-locked by the caller, so that it won't change under us. - * NB: when the buffer is backed up, we DO NOT insert the data pointed to by - * this XLogRecData struct into the XLOG record, since we assume it's present - * in the buffer. Therefore, rmgr redo routines MUST pay attention to - * XLR_BKP_BLOCK(N) to know what is actually stored in the XLOG record. - * The N'th XLR_BKP_BLOCK bit corresponds to the N'th distinct buffer - * value (ignoring InvalidBuffer) appearing in the rdata chain. - * - * When buffer is valid, caller must set buffer_std to indicate whether the - * page uses standard pd_lower/pd_upper header fields. If this is true, then - * XLOG is allowed to omit the free space between pd_lower and pd_upper from - * the backed-up page image. Note that even when buffer_std is false, the - * page MUST have an LSN field as its first eight bytes! - * - * Note: data can be NULL to indicate no rmgr data associated with this chain - * entry. This can be sensible (ie, not a wasted entry) if buffer is valid. - * The implication is that the buffer has been changed by the operation being - * logged, and so may need to be backed up, but the change can be redone using - * only information already present elsewhere in the XLOG entry. + * The minimum size of the WAL construction working area. 
If you need to + * register more than XLR_NORMAL_MAX_BLOCK_ID block references or have more + * than XLR_NORMAL_RDATAS data chunks in a single WAL record, you must call + * XLogEnsureRecordSpace() first to allocate more working memory. */ -typedef struct XLogRecData -{ - char *data; /* start of rmgr data to include */ - uint32 len; /* length of rmgr data to include */ - Buffer buffer; /* buffer associated with data, if any */ - bool buffer_std; /* buffer has standard pd_lower/pd_upper */ - struct XLogRecData *next; /* next struct in chain, or NULL */ -} XLogRecData; +#define XLR_NORMAL_MAX_BLOCK_ID 4 +#define XLR_NORMAL_RDATAS 20 + +/* flags for XLogRegisterBuffer */ +#define REGBUF_FORCE_IMAGE 0x01 /* force a full-page image */ +#define REGBUF_NO_IMAGE 0x02 /* don't take a full-page image */ +#define REGBUF_WILL_INIT (0x04 | 0x02) /* page will be re-initialized at + * replay (implies NO_IMAGE) */ +#define REGBUF_STANDARD 0x08 /* page follows "standard" page layout, + * (data between pd_lower and pd_upper + * will be skipped) */ +#define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image + * is taken */ + +/* prototypes for public functions in xloginsert.c: */ +extern void XLogBeginInsert(void); +extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info); +extern void XLogEnsureRecordSpace(int nbuffers, int ndatas); +extern void XLogRegisterData(char *data, int len); +extern void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags); +extern void XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, + ForkNumber forknum, BlockNumber blknum, char *page, + uint8 flags); +extern void XLogRegisterBufData(uint8 block_id, char *data, int len); +extern void XLogResetInsertion(void); +extern bool XLogCheckBufferNeedsBackup(Buffer buffer); -extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blk, char *page, bool page_std); extern XLogRecPtr 
log_newpage_buffer(Buffer buffer, bool page_std); extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); -extern bool XLogCheckBufferNeedsBackup(Buffer buffer); + +extern void InitXLogInsert(void); #endif /* XLOGINSERT_H */ diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index ea873a2d9c..eb6cc8996a 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -14,12 +14,18 @@ * * The basic idea is to allocate an XLogReaderState via * XLogReaderAllocate(), and call XLogReadRecord() until it returns NULL. + * + * After reading a record with XLogReadRecord(), it's decomposed into + * the per-block and main data parts, and the parts can be accessed + * with the XLogRec* macros and functions. You can also decode a + * record that's already constructed in memory, without reading from + * disk, by calling the DecodeXLogRecord() function. *------------------------------------------------------------------------- */ #ifndef XLOGREADER_H #define XLOGREADER_H -#include "access/xlog_internal.h" +#include "access/xlogrecord.h" typedef struct XLogReaderState XLogReaderState; @@ -31,6 +37,32 @@ typedef int (*XLogPageReadCB) (XLogReaderState *xlogreader, char *readBuf, TimeLineID *pageTLI); +typedef struct +{ + /* Is this block ref in use? 
*/ + bool in_use; + + /* Identify the block this refers to */ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + + /* copy of the fork_flags field from the XLogRecordBlockHeader */ + uint8 flags; + + /* Information on full-page image, if any */ + bool has_image; + char *bkp_image; + uint16 hole_offset; + uint16 hole_length; + + /* Buffer holding the rmgr-specific data associated with this block */ + bool has_data; + char *data; + uint16 data_len; + uint16 data_bufsz; +} DecodedBkpBlock; + struct XLogReaderState { /* ---------------------------------------- @@ -79,6 +111,25 @@ struct XLogReaderState XLogRecPtr ReadRecPtr; /* start of last record read */ XLogRecPtr EndRecPtr; /* end+1 of last record read */ + + /* ---------------------------------------- + * Decoded representation of current record + * + * Use XLogRecGet* functions to investigate the record; these fields + * should not be accessed directly. + * ---------------------------------------- + */ + XLogRecord *decoded_record; /* currently decoded record */ + + char *main_data; /* record's main data portion */ + uint32 main_data_len; /* main data portion's length */ + uint32 main_data_bufsz; /* allocated size of the buffer */ + + /* information about blocks referenced by the record. 
*/ + DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1]; + + int max_block_id; /* highest block_id in use (-1 if none) */ + /* ---------------------------------------- * private/internal state * ---------------------------------------- @@ -123,4 +174,28 @@ extern struct XLogRecord *XLogReadRecord(XLogReaderState *state, extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr); #endif /* FRONTEND */ +/* Functions for decoding an XLogRecord */ + +extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, + char **errmsg); + +#define XLogRecGetTotalLen(decoder) ((decoder)->decoded_record->xl_tot_len) +#define XLogRecGetPrev(decoder) ((decoder)->decoded_record->xl_prev) +#define XLogRecGetInfo(decoder) ((decoder)->decoded_record->xl_info) +#define XLogRecGetRmid(decoder) ((decoder)->decoded_record->xl_rmid) +#define XLogRecGetXid(decoder) ((decoder)->decoded_record->xl_xid) +#define XLogRecGetData(decoder) ((decoder)->main_data) +#define XLogRecGetDataLen(decoder) ((decoder)->main_data_len) +#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->max_block_id >= 0) +#define XLogRecHasBlockRef(decoder, block_id) \ + ((decoder)->blocks[block_id].in_use) +#define XLogRecHasBlockImage(decoder, block_id) \ + ((decoder)->blocks[block_id].has_image) + +extern bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *dst); +extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len); +extern bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, + RelFileNode *rnode, ForkNumber *forknum, + BlockNumber *blknum); + #endif /* XLOGREADER_H */ diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index ab0fb1c500..11ddfac9c7 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -20,81 +20,161 @@ /* * The overall layout of an XLOG record is: * Fixed-size header (XLogRecord struct) - * rmgr-specific data - * BkpBlock - * backup block data - * BkpBlock - 
* backup block data + * XLogRecordBlockHeader struct + * XLogRecordBlockHeader struct * ... + * XLogRecordDataHeader[Short|Long] struct + * block data + * block data + * ... + * main data * - * where there can be zero to four backup blocks (as signaled by xl_info flag - * bits). XLogRecord structs always start on MAXALIGN boundaries in the WAL - * files, and we round up SizeOfXLogRecord so that the rmgr data is also - * guaranteed to begin on a MAXALIGN boundary. However, no padding is added - * to align BkpBlock structs or backup block data. + * There can be zero or more XLogRecordBlockHeaders, and 0 or more bytes of + * rmgr-specific data not associated with a block. XLogRecord structs + * always start on MAXALIGN boundaries in the WAL files, but the rest of + * the fields are not aligned. * - * NOTE: xl_len counts only the rmgr data, not the XLogRecord header, - * and also not any backup blocks. xl_tot_len counts everything. Neither - * length field is rounded up to an alignment boundary. + * The XLogRecordBlockHeader, XLogRecordDataHeaderShort and + * XLogRecordDataHeaderLong structs all begin with a single 'id' byte. It's + * used to distinguish between block references, and the main data structs. 
*/ typedef struct XLogRecord { uint32 xl_tot_len; /* total len of entire record */ TransactionId xl_xid; /* xact id */ - uint32 xl_len; /* total len of rmgr data */ + XLogRecPtr xl_prev; /* ptr to previous record in log */ uint8 xl_info; /* flag bits, see below */ RmgrId xl_rmid; /* resource manager for this record */ /* 2 bytes of padding here, initialize to zero */ - XLogRecPtr xl_prev; /* ptr to previous record in log */ pg_crc32 xl_crc; /* CRC for this record */ - /* If MAXALIGN==8, there are 4 wasted bytes here */ - - /* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */ + /* XLogRecordBlockHeaders and XLogRecordDataHeader follow, no padding */ } XLogRecord; -#define SizeOfXLogRecord MAXALIGN(sizeof(XLogRecord)) - -#define XLogRecGetData(record) ((char*) (record) + SizeOfXLogRecord) +#define SizeOfXLogRecord (offsetof(XLogRecord, xl_crc) + sizeof(pg_crc32)) /* - * XLOG uses only low 4 bits of xl_info. High 4 bits may be used by rmgr. + * The high 4 bits in xl_info may be used freely by rmgr. The + * XLR_SPECIAL_REL_UPDATE bit can be passed by XLogInsert caller. The rest + * are set internally by XLogInsert. */ #define XLR_INFO_MASK 0x0F +#define XLR_RMGR_INFO_MASK 0xF0 /* - * If we backed up any disk blocks with the XLOG record, we use flag bits in - * xl_info to signal it. We support backup of up to 4 disk blocks per XLOG - * record. + * If a WAL record modifies any relation files, in ways not covered by the + * usual block references, this flag is set. This is not used for anything + * by PostgreSQL itself, but it allows external tools that read WAL and keep + * track of modified blocks to recognize such special record types. */ -#define XLR_BKP_BLOCK_MASK 0x0F /* all info bits used for bkp blocks */ -#define XLR_MAX_BKP_BLOCKS 4 -#define XLR_BKP_BLOCK(iblk) (0x08 >> (iblk)) /* iblk in 0..3 */ +#define XLR_SPECIAL_REL_UPDATE 0x01 /* - * Header info for a backup block appended to an XLOG record. + * Header info for block data appended to an XLOG record. 
+ * + * Note that we don't attempt to align the XLogRecordBlockHeader struct! + * So, the struct must be copied to aligned local storage before use. + * 'data_length' is the length of the payload data associated with this, + * and includes the possible full-page image, and rmgr-specific data. It + * does not include the XLogRecordBlockHeader struct itself. + */ +typedef struct XLogRecordBlockHeader +{ + uint8 id; /* block reference ID */ + uint8 fork_flags; /* fork within the relation, and flags */ + uint16 data_length; /* number of payload bytes (not including page + * image) */ + + /* If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows */ + /* If BKPBLOCK_SAME_REL is not set, a RelFileNode follows */ + /* BlockNumber follows */ +} XLogRecordBlockHeader; + +#define SizeOfXLogRecordBlockHeader (offsetof(XLogRecordBlockHeader, data_length) + sizeof(uint16)) + +/* + * Additional header information when a full-page image is included + * (i.e. when BKPBLOCK_HAS_IMAGE is set). * * As a trivial form of data compression, the XLOG code is aware that * PG data pages usually contain an unused "hole" in the middle, which * contains only zero bytes. If hole_length > 0 then we have removed * such a "hole" from the stored data (and it's not counted in the * XLOG record's CRC, either). Hence, the amount of block data actually - * present following the BkpBlock struct is BLCKSZ - hole_length bytes. - * - * Note that we don't attempt to align either the BkpBlock struct or the - * block's data. So, the struct must be copied to aligned local storage - * before use. + * present is BLCKSZ - hole_length bytes. 
*/ -typedef struct BkpBlock +typedef struct XLogRecordBlockImageHeader { - RelFileNode node; /* relation containing block */ - ForkNumber fork; /* fork within the relation */ - BlockNumber block; /* block number */ uint16 hole_offset; /* number of bytes before "hole" */ uint16 hole_length; /* number of bytes in "hole" */ +} XLogRecordBlockImageHeader; + +#define SizeOfXLogRecordBlockImageHeader sizeof(XLogRecordBlockImageHeader) + +/* + * Maximum size of the header for a block reference. This is used to size a + * temporary buffer for constructing the header. + */ +#define MaxSizeOfXLogRecordBlockHeader \ + (SizeOfXLogRecordBlockHeader + \ + SizeOfXLogRecordBlockImageHeader + \ + sizeof(RelFileNode) + \ + sizeof(BlockNumber)) + +/* + * The fork number fits in the lower 4 bits in the fork_flags field. The upper + * bits are used for flags. + */ +#define BKPBLOCK_FORK_MASK 0x0F +#define BKPBLOCK_FLAG_MASK 0xF0 +#define BKPBLOCK_HAS_IMAGE 0x10 /* block data is an XLogRecordBlockImage */ +#define BKPBLOCK_HAS_DATA 0x20 +#define BKPBLOCK_WILL_INIT 0x40 /* redo will re-init the page */ +#define BKPBLOCK_SAME_REL 0x80 /* RelFileNode omitted, same as previous */ + +/* + * XLogRecordDataHeaderShort/Long are used for the "main data" portion of + * the record. If the length of the data is less than 256 bytes, the short + * form is used, with a single byte to hold the length. Otherwise the long + * form is used. + * + * (These structs are currently not used in the code, they are here just for + * documentation purposes). 
+ */ +typedef struct XLogRecordDataHeaderShort +{ + uint8 id; /* XLR_BLOCK_ID_DATA_SHORT */ + uint8 data_length; /* number of payload bytes */ +} XLogRecordDataHeaderShort; + +#define SizeOfXLogRecordDataHeaderShort (sizeof(uint8) * 2) + +typedef struct XLogRecordDataHeaderLong +{ + uint8 id; /* XLR_BLOCK_ID_DATA_LONG */ + /* followed by uint32 data_length, unaligned */ +} XLogRecordDataHeaderLong; + +#define SizeOfXLogRecordDataHeaderLong (sizeof(uint8) + sizeof(uint32)) + +/* + * Block IDs used to distinguish different kinds of record fragments. Block + * references are numbered from 0 to XLR_MAX_BLOCK_ID. A rmgr is free to use + * any ID number in that range (although you should stick to small numbers, + * because the WAL machinery is optimized for that case). A couple of ID + * numbers are reserved to denote the "main" data portion of the record. + * + * The maximum is currently set at 32, quite arbitrarily. Most records only + * need a handful of block references, but there are a few exceptions that + * need more. 
+ */ +#define XLR_MAX_BLOCK_ID 32 + +#define XLR_BLOCK_ID_DATA_SHORT 255 +#define XLR_BLOCK_ID_DATA_LONG 254 + + - /* ACTUAL BLOCK DATA FOLLOWS AT END OF STRUCT */ -} BkpBlock #endif /* XLOGRECORD_H */ diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 8d90696723..68f72cfac6 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -11,7 +11,7 @@ #ifndef XLOG_UTILS_H #define XLOG_UTILS_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "storage/bufmgr.h" @@ -33,26 +33,17 @@ typedef enum * replayed) */ } XLogRedoAction; -extern XLogRedoAction XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, - int block_index, RelFileNode rnode, BlockNumber blkno, - Buffer *buf); -extern XLogRedoAction XLogReadBufferForRedoExtended(XLogRecPtr lsn, - XLogRecord *record, int block_index, - RelFileNode rnode, ForkNumber forkno, - BlockNumber blkno, +extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, + uint8 buffer_id, Buffer *buf); +extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); +extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, + uint8 buffer_id, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf); -extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init); extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode); -extern Buffer RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, - int block_index, - bool get_cleanup_lock, bool keep_buffer); -extern Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, - char *blk, bool get_cleanup_lock, bool keep_buffer); - extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h 
index 6c687e3a82..31a51c42f6 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -14,7 +14,7 @@ #ifndef STORAGE_XLOG_H #define STORAGE_XLOG_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/block.h" #include "storage/relfilenode.h" @@ -44,8 +44,8 @@ typedef struct xl_smgr_truncate extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum); -extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record); -extern void smgr_desc(StringInfo buf, XLogRecord *record); +extern void smgr_redo(XLogReaderState *record); +extern void smgr_desc(StringInfo buf, XLogReaderState *record); extern const char *smgr_identify(uint8 info); #endif /* STORAGE_XLOG_H */ diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h index b79d9fc864..bcf0e48cbb 100644 --- a/src/include/commands/dbcommands.h +++ b/src/include/commands/dbcommands.h @@ -14,7 +14,7 @@ #ifndef DBCOMMANDS_H #define DBCOMMANDS_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "nodes/parsenodes.h" @@ -63,8 +63,8 @@ extern Oid AlterDatabaseOwner(const char *dbname, Oid newOwnerId); extern Oid get_database_oid(const char *dbname, bool missingok); extern char *get_database_name(Oid dbid); -extern void dbase_redo(XLogRecPtr lsn, XLogRecord *rptr); -extern void dbase_desc(StringInfo buf, XLogRecord *rptr); +extern void dbase_redo(XLogReaderState *rptr); +extern void dbase_desc(StringInfo buf, XLogReaderState *rptr); extern const char *dbase_identify(uint8 info); extern void check_encoding_locale_matches(int encoding, const char *collate, const char *ctype); diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 7cbe6f9a81..386f1e677c 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -13,7 +13,7 @@ #ifndef SEQUENCE_H #define SEQUENCE_H -#include "access/xlogrecord.h" +#include 
"access/xlogreader.h" #include "fmgr.h" #include "lib/stringinfo.h" #include "nodes/parsenodes.h" @@ -77,8 +77,8 @@ extern Oid AlterSequence(AlterSeqStmt *stmt); extern void ResetSequence(Oid seq_relid); extern void ResetSequenceCaches(void); -extern void seq_redo(XLogRecPtr lsn, XLogRecord *rptr); -extern void seq_desc(StringInfo buf, XLogRecord *rptr); +extern void seq_redo(XLogReaderState *rptr); +extern void seq_desc(StringInfo buf, XLogReaderState *rptr); extern const char *seq_identify(uint8 info); #endif /* SEQUENCE_H */ diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h index afd9e05cb7..80e021e2d4 100644 --- a/src/include/commands/tablespace.h +++ b/src/include/commands/tablespace.h @@ -14,7 +14,7 @@ #ifndef TABLESPACE_H #define TABLESPACE_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "nodes/parsenodes.h" @@ -56,8 +56,8 @@ extern char *get_tablespace_name(Oid spc_oid); extern bool directory_is_empty(const char *path); -extern void tblspc_redo(XLogRecPtr lsn, XLogRecord *rptr); -extern void tblspc_desc(StringInfo buf, XLogRecord *rptr); +extern void tblspc_redo(XLogReaderState *rptr); +extern void tblspc_desc(StringInfo buf, XLogReaderState *rptr); extern const char *tblspc_identify(uint8 info); #endif /* TABLESPACE_H */ diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h index e4185287a1..385c4a7b50 100644 --- a/src/include/replication/decode.h +++ b/src/include/replication/decode.h @@ -15,6 +15,6 @@ #include "replication/logical.h" void LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, - XLogRecord *record); + XLogReaderState *record); #endif diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index c89989fd20..d2599be0cf 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -14,7 +14,7 @@ #ifndef STANDBY_H #define STANDBY_H -#include "access/xlogrecord.h" +#include 
"access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/lock.h" #include "storage/procsignal.h" @@ -82,8 +82,8 @@ typedef struct xl_running_xacts /* Recovery handlers for the Standby Rmgr (RM_STANDBY_ID) */ -extern void standby_redo(XLogRecPtr lsn, XLogRecord *record); -extern void standby_desc(StringInfo buf, XLogRecord *record); +extern void standby_redo(XLogReaderState *record); +extern void standby_desc(StringInfo buf, XLogReaderState *record); extern const char *standby_identify(uint8 info); /* diff --git a/src/include/utils/relmapper.h b/src/include/utils/relmapper.h index bd5836b0d9..1f2c960ebe 100644 --- a/src/include/utils/relmapper.h +++ b/src/include/utils/relmapper.h @@ -14,7 +14,7 @@ #ifndef RELMAPPER_H #define RELMAPPER_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" /* ---------------- @@ -59,8 +59,8 @@ extern void RelationMapInitialize(void); extern void RelationMapInitializePhase2(void); extern void RelationMapInitializePhase3(void); -extern void relmap_redo(XLogRecPtr lsn, XLogRecord *record); -extern void relmap_desc(StringInfo buf, XLogRecord *record); +extern void relmap_redo(XLogReaderState *record); +extern void relmap_desc(StringInfo buf, XLogReaderState *record); extern const char *relmap_identify(uint8 info); #endif /* RELMAPPER_H */