diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile index 66175ae7da..dc33054641 100644 --- a/src/backend/access/heap/Makefile +++ b/src/backend/access/heap/Makefile @@ -4,7 +4,7 @@ # Makefile for access/heap # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.18 2008/02/19 10:30:06 petere Exp $ +# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.19 2008/12/03 13:05:22 heikki Exp $ # #------------------------------------------------------------------------- @@ -12,6 +12,6 @@ subdir = src/backend/access/heap top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o +OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o visibilitymap.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index f6d75c6e2b..c561e8f960 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.270 2008/11/19 10:34:50 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.271 2008/12/03 13:05:22 heikki Exp $ * * * INTERFACE ROUTINES @@ -47,6 +47,7 @@ #include "access/transam.h" #include "access/tuptoaster.h" #include "access/valid.h" +#include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlogutils.h" #include "catalog/catalog.h" @@ -195,6 +196,7 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) int ntup; OffsetNumber lineoff; ItemId lpp; + bool all_visible; Assert(page < scan->rs_nblocks); @@ -233,20 +235,32 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) lines = PageGetMaxOffsetNumber(dp); ntup = 0; + /* + * If the all-visible flag indicates that all tuples on the page are + * visible to everyone, we can skip the per-tuple visibility tests. + */ + all_visible = PageIsAllVisible(dp); + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff); lineoff <= lines; lineoff++, lpp++) { if (ItemIdIsNormal(lpp)) { - HeapTupleData loctup; bool valid; - loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); - loctup.t_len = ItemIdGetLength(lpp); - ItemPointerSet(&(loctup.t_self), page, lineoff); + if (all_visible) + valid = true; + else + { + HeapTupleData loctup; - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); + loctup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(loctup.t_self), page, lineoff); + + valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); + } if (valid) scan->rs_vistuples[ntup++] = lineoff; } @@ -1860,6 +1874,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, TransactionId xid = GetCurrentTransactionId(); HeapTuple heaptup; Buffer buffer; + bool all_visible_cleared = false; if (relation->rd_rel->relhasoids) { @@ -1920,6 +1935,12 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, RelationPutHeapTuple(relation, buffer, heaptup); + if (PageIsAllVisible(BufferGetPage(buffer))) + { + all_visible_cleared = true; + PageClearAllVisible(BufferGetPage(buffer)); + } + /* * XXX Should we set PageSetPrunable on this page ? 
* @@ -1943,6 +1964,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, Page page = BufferGetPage(buffer); uint8 info = XLOG_HEAP_INSERT; + xlrec.all_visible_cleared = all_visible_cleared; xlrec.target.node = relation->rd_node; xlrec.target.tid = heaptup->t_self; rdata[0].data = (char *) &xlrec; @@ -1994,6 +2016,11 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, UnlockReleaseBuffer(buffer); + /* Clear the bit in the visibility map if necessary */ + if (all_visible_cleared) + visibilitymap_clear(relation, + ItemPointerGetBlockNumber(&(heaptup->t_self))); + /* * If tuple is cachable, mark it for invalidation from the caches in case * we abort. Note it is OK to do this after releasing the buffer, because @@ -2070,6 +2097,7 @@ heap_delete(Relation relation, ItemPointer tid, Buffer buffer; bool have_tuple_lock = false; bool iscombo; + bool all_visible_cleared = false; Assert(ItemPointerIsValid(tid)); @@ -2216,6 +2244,12 @@ l1: */ PageSetPrunable(page, xid); + if (PageIsAllVisible(page)) + { + all_visible_cleared = true; + PageClearAllVisible(page); + } + /* store transaction information of xact deleting the tuple */ tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | @@ -2237,6 +2271,7 @@ l1: XLogRecPtr recptr; XLogRecData rdata[2]; + xlrec.all_visible_cleared = all_visible_cleared; xlrec.target.node = relation->rd_node; xlrec.target.tid = tp.t_self; rdata[0].data = (char *) &xlrec; @@ -2281,6 +2316,10 @@ l1: */ CacheInvalidateHeapTuple(relation, &tp); + /* Clear the bit in the visibility map if necessary */ + if (all_visible_cleared) + visibilitymap_clear(relation, BufferGetBlockNumber(buffer)); + /* Now we can release the buffer */ ReleaseBuffer(buffer); @@ -2388,6 +2427,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, bool have_tuple_lock = false; bool iscombo; bool use_hot_update = false; + bool all_visible_cleared = false; + bool all_visible_cleared_new = false; Assert(ItemPointerIsValid(otid)); @@ -2763,6 +2804,12 @@ l2: MarkBufferDirty(newbuf); MarkBufferDirty(buffer); + /* + * Note: we mustn't clear PD_ALL_VISIBLE flags before writing the WAL + * record, because log_heap_update looks at those flags to set the + * corresponding flags in the WAL record. + */ + /* XLOG stuff */ if (!relation->rd_istemp) { @@ -2778,6 +2825,18 @@ l2: PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); } + /* Clear PD_ALL_VISIBLE flags */ + if (PageIsAllVisible(BufferGetPage(buffer))) + { + all_visible_cleared = true; + PageClearAllVisible(BufferGetPage(buffer)); + } + if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf))) + { + all_visible_cleared_new = true; + PageClearAllVisible(BufferGetPage(newbuf)); + } + END_CRIT_SECTION(); if (newbuf != buffer) @@ -2791,6 +2850,12 @@ l2: */ CacheInvalidateHeapTuple(relation, &oldtup); + /* Clear bits in visibility map */ + if (all_visible_cleared) + visibilitymap_clear(relation, BufferGetBlockNumber(buffer)); + if (all_visible_cleared_new) + visibilitymap_clear(relation, BufferGetBlockNumber(newbuf)); + /* Now we can release the buffer(s) */ if (newbuf != buffer) ReleaseBuffer(newbuf); @@ -3411,6 +3476,11 @@ l3: LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + /* + * Don't update the visibility map here. Locking a tuple doesn't + * change visibility info. + */ + /* * Now that we have successfully marked the tuple as locked, we can * release the lmgr tuple lock, if we had it. 
@@ -3916,7 +3986,9 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, xlrec.target.node = reln->rd_node; xlrec.target.tid = from; + xlrec.all_visible_cleared = PageIsAllVisible(BufferGetPage(oldbuf)); xlrec.newtid = newtup->t_self; + xlrec.new_all_visible_cleared = PageIsAllVisible(BufferGetPage(newbuf)); rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; @@ -4185,13 +4257,25 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) OffsetNumber offnum; ItemId lp = NULL; HeapTupleHeader htup; + BlockNumber blkno; + + blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + + /* + * The visibility map always needs to be updated, even if the heap page + * is already up-to-date. + */ + if (xlrec->all_visible_cleared) + { + Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + visibilitymap_clear(reln, blkno); + FreeFakeRelcacheEntry(reln); + } if (record->xl_info & XLR_BKP_BLOCK_1) return; - buffer = XLogReadBuffer(xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - false); + buffer = XLogReadBuffer(xlrec->target.node, blkno, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); @@ -4223,6 +4307,9 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) /* Mark the page as a candidate for pruning */ PageSetPrunable(page, record->xl_xid); + if (xlrec->all_visible_cleared) + PageClearAllVisible(page); + /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; PageSetLSN(page, lsn); @@ -4249,11 +4336,22 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) Size freespace; BlockNumber blkno; + blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + + /* + * The visibility map always needs to be updated, even if the heap page + * is already up-to-date. + */ + if (xlrec->all_visible_cleared) + { + Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + visibilitymap_clear(reln, blkno); + FreeFakeRelcacheEntry(reln); + } + if (record->xl_info & XLR_BKP_BLOCK_1) return; - blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid)); - if (record->xl_info & XLOG_HEAP_INIT_PAGE) { buffer = XLogReadBuffer(xlrec->target.node, blkno, true); @@ -4307,6 +4405,10 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); + + if (xlrec->all_visible_cleared) + PageClearAllVisible(page); + MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); @@ -4347,6 +4449,18 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) uint32 newlen; Size freespace; + /* + * The visibility map always needs to be updated, even if the heap page + * is already up-to-date. + */ + if (xlrec->all_visible_cleared) + { + Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + visibilitymap_clear(reln, + ItemPointerGetBlockNumber(&xlrec->target.tid)); + FreeFakeRelcacheEntry(reln); + } + if (record->xl_info & XLR_BKP_BLOCK_1) { if (samepage) @@ -4411,6 +4525,9 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) /* Mark the page as a candidate for pruning */ PageSetPrunable(page, record->xl_xid); + if (xlrec->all_visible_cleared) + PageClearAllVisible(page); + /* * this test is ugly, but necessary to avoid thinking that insert change * is already applied @@ -4426,6 +4543,17 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) newt:; + /* + * The visibility map always needs to be updated, even if the heap page + * is already up-to-date. 
+ */ + if (xlrec->new_all_visible_cleared) + { + Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + visibilitymap_clear(reln, ItemPointerGetBlockNumber(&xlrec->newtid)); + FreeFakeRelcacheEntry(reln); + } + if (record->xl_info & XLR_BKP_BLOCK_2) return; @@ -4504,6 +4632,9 @@ newsame:; if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); + if (xlrec->new_all_visible_cleared) + PageClearAllVisible(page); + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ PageSetLSN(page, lsn); diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c new file mode 100644 index 0000000000..e3cbb4e3dd --- /dev/null +++ b/src/backend/access/heap/visibilitymap.c @@ -0,0 +1,478 @@ +/*------------------------------------------------------------------------- + * + * visibilitymap.c + * bitmap for tracking visibility of heap tuples + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/access/heap/visibilitymap.c,v 1.1 2008/12/03 13:05:22 heikki Exp $ + * + * INTERFACE ROUTINES + * visibilitymap_clear - clear a bit in the visibility map + * visibilitymap_pin - pin a map page for setting a bit + * visibilitymap_set - set a bit in a previously pinned page + * visibilitymap_test - test if a bit is set + * + * NOTES + * + * The visibility map is a bitmap with one bit per heap page. A set bit means + * that all tuples on the page are visible to all transactions, and doesn't + * therefore need to be vacuumed. The map is conservative in the sense that we + * make sure that whenever a bit is set, we know the condition is true, but if + * a bit is not set, it might or might not be. + * + * There's no explicit WAL logging in the functions in this file. The callers + * must make sure that whenever a bit is cleared, the bit is cleared on WAL + * replay of the updating operation as well. Setting bits during recovery + * isn't necessary for correctness. + * + * Currently, the visibility map is only used as a hint, to speed up VACUUM. + * A corrupted visibility map won't cause data corruption, although it can + * make VACUUM skip pages that need vacuuming, until the next anti-wraparound + * vacuum. The visibility map is not used for anti-wraparound vacuums, because + * an anti-wraparound vacuum needs to freeze tuples and observe the latest xid + * present in the table, also on pages that don't have any dead tuples. + * + * Although the visibility map is just a hint at the moment, the PD_ALL_VISIBLE + * flag on heap pages *must* be correct. + * + * LOCKING + * + * In heapam.c, whenever a page is modified so that not all tuples on the + * page are visible to everyone anymore, the corresponding bit in the + * visibility map is cleared. The bit in the visibility map is cleared + * after releasing the lock on the heap page, to avoid holding the lock + * over possible I/O to read in the visibility map page. + * + * To set a bit, you need to hold a lock on the heap page. That prevents + * the race condition where VACUUM sees that all tuples on the page are + * visible to everyone, but another backend modifies the page before VACUUM + * sets the bit in the visibility map. 
* + * When a bit is set, the LSN of the visibility map page is updated to make + * sure that the visibility map update doesn't get written to disk before the + * WAL record of the changes that made it possible to set the bit is flushed. + * But when a bit is cleared, we don't have to do that because it's always OK + * to clear a bit in the map from a correctness point of view. + * + * TODO + * + * It would be nice to use the visibility map to skip visibility checks in + * index scans. + * + * Currently, the visibility map is not 100% correct all the time. + * During updates, the bit in the visibility map is cleared after releasing + * the lock on the heap page. During the window between releasing the lock + * and clearing the bit in the visibility map, the bit in the visibility map + * is set, but the new insertion or deletion is not yet visible to other + * backends. + * + * That might actually be OK for the index scans, though. The newly inserted + * tuple wouldn't have an index pointer yet, so all tuples reachable from an + * index would still be visible to all other backends, and deletions wouldn't + * be visible to other backends yet. + * + * There's another hole in the way the PD_ALL_VISIBLE flag is set. When + * vacuum observes that all tuples are visible to all, it sets the flag on + * the heap page, and also sets the bit in the visibility map. If we then + * crash, and only the visibility map page was flushed to disk, we'll have + * a bit set in the visibility map, but the corresponding flag on the heap + * page is not set. If the heap page is then updated, the updater won't + * know to clear the bit in the visibility map. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/visibilitymap.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/inval.h" + +/*#define TRACE_VISIBILITYMAP */ + +/* + * Size of the bitmap on each visibility map page, in bytes. There are no + * extra headers, so the whole page except for the standard page header + * is used for the bitmap. + */ +#define MAPSIZE (BLCKSZ - SizeOfPageHeaderData) + +/* Number of bits allocated for each heap block. */ +#define BITS_PER_HEAPBLOCK 1 + +/* Number of heap blocks we can represent in one byte. */ +#define HEAPBLOCKS_PER_BYTE 8 + +/* Number of heap blocks we can represent in one visibility map page. */ +#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) + +/* Mapping from heap block number to the right bit in the visibility map */ +#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) +#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) +#define HEAPBLK_TO_MAPBIT(x) ((x) % HEAPBLOCKS_PER_BYTE) + +/* prototypes for internal routines */ +static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); +static void vm_extend(Relation rel, BlockNumber nvmblocks); + + +/* + * visibilitymap_clear - clear a bit in visibility map + * + * Clear a bit in the visibility map, marking that not all tuples are + * visible to all transactions anymore.
+ */ +void +visibilitymap_clear(Relation rel, BlockNumber heapBlk) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + int mapBit = HEAPBLK_TO_MAPBIT(heapBlk); + uint8 mask = 1 << mapBit; + Buffer mapBuffer; + char *map; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + mapBuffer = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(mapBuffer)) + return; /* nothing to do */ + + LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); + map = PageGetContents(BufferGetPage(mapBuffer)); + + if (map[mapByte] & mask) + { + map[mapByte] &= ~mask; + + MarkBufferDirty(mapBuffer); + } + + UnlockReleaseBuffer(mapBuffer); +} + +/* + * visibilitymap_pin - pin a map page for setting a bit + * + * Setting a bit in the visibility map is a two-phase operation. First, call + * visibilitymap_pin, to pin the visibility map page containing the bit for + * the heap page. Because that can require I/O to read the map page, you + * shouldn't hold a lock on the heap page while doing that. Then, call + * visibilitymap_set to actually set the bit. + * + * On entry, *buf should be InvalidBuffer or a valid buffer returned by + * an earlier call to visibilitymap_pin or visibilitymap_test on the same + * relation. On return, *buf is a valid buffer with the map page containing + * the bit for heapBlk. + * + * If the page doesn't exist in the map file yet, it is extended. + */ +void +visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + + /* Reuse the old pinned buffer if possible */ + if (BufferIsValid(*buf)) + { + if (BufferGetBlockNumber(*buf) == mapBlock) + return; + + ReleaseBuffer(*buf); + } + *buf = vm_readbuf(rel, mapBlock, true); +} + +/* + * visibilitymap_set - set a bit on a previously pinned page + * + * recptr is the LSN of the heap page. The LSN of the visibility map page is + * advanced to that, to make sure that the visibility map doesn't get flushed + * to disk before the update to the heap page that made all tuples visible. + * + * This is an opportunistic function. It does nothing, unless *buf + * contains the bit for heapBlk. Call visibilitymap_pin first to pin + * the right map page. This function doesn't do any I/O. + */ +void +visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr, + Buffer *buf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk); + Page page; + char *map; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + /* Check that we have the right page pinned */ + if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != mapBlock) + return; + + page = BufferGetPage(*buf); + map = PageGetContents(page); + LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); + + if (!(map[mapByte] & (1 << mapBit))) + { + map[mapByte] |= (1 << mapBit); + + if (XLByteLT(PageGetLSN(page), recptr)) + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(*buf); + } + + LockBuffer(*buf, BUFFER_LOCK_UNLOCK); +} + +/* + * visibilitymap_test - test if a bit is set + * + * Are all tuples on heapBlk visible to all, according to the visibility map? + * + * On entry, *buf should be InvalidBuffer or a valid buffer returned by an + * earlier call to visibilitymap_pin or visibilitymap_test on the same + * relation.
On return, *buf is a valid buffer with the map page containing + * the bit for heapBlk, or InvalidBuffer. The caller is responsible for + * releasing *buf after it's done testing and setting bits. + */ +bool +visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk); + bool result; + char *map; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk); +#endif + + /* Reuse the old pinned buffer if possible */ + if (BufferIsValid(*buf)) + { + if (BufferGetBlockNumber(*buf) != mapBlock) + { + ReleaseBuffer(*buf); + *buf = InvalidBuffer; + } + } + + if (!BufferIsValid(*buf)) + { + *buf = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(*buf)) + return false; + } + + map = PageGetContents(BufferGetPage(*buf)); + + /* + * We don't need to lock the page, as we're only looking at a single bit. + */ + result = (map[mapByte] & (1 << mapBit)) ? true : false; + + return result; +} + +/* + * visibilitymap_truncate - truncate the visibility map + */ +void +visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) +{ + BlockNumber newnblocks; + /* last remaining block, byte, and bit */ + BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); + uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); + uint8 truncBit = HEAPBLK_TO_MAPBIT(nheapblocks); + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); +#endif + + /* + * If no visibility map has been created yet for this relation, there's + * nothing to truncate. + */ + if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + return; + + /* + * Unless the new size is exactly at a visibility map page boundary, the + * tail bits in the last remaining map page, representing truncated heap + * blocks, need to be cleared. This is not only tidy, but also necessary + * because we don't get a chance to clear the bits if the heap is + * extended again. + */ + if (truncByte != 0 || truncBit != 0) + { + Buffer mapBuffer; + Page page; + char *map; + + newnblocks = truncBlock + 1; + + mapBuffer = vm_readbuf(rel, truncBlock, false); + if (!BufferIsValid(mapBuffer)) + { + /* nothing to do, the file was already smaller */ + return; + } + + page = BufferGetPage(mapBuffer); + map = PageGetContents(page); + + LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); + + /* Clear out the unwanted bytes. */ + MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); + + /* + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + */ + map[truncByte] &= (1 << truncBit) - 1; + + MarkBufferDirty(mapBuffer); + UnlockReleaseBuffer(mapBuffer); + } + else + newnblocks = truncBlock; + + if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) < newnblocks) + { + /* nothing to do, the file was already smaller than the requested size */ + return; + } + + smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks, + rel->rd_istemp); + + /* + * Need to invalidate the relcache entry, because rd_vm_nblocks + * seen by other backends is no longer valid. + */ + if (!InRecovery) + CacheInvalidateRelcache(rel); + + rel->rd_vm_nblocks = newnblocks; +} + +/* + * Read a visibility map page. + * + * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is + * true, the visibility map file is extended.
+ */ +static Buffer +vm_readbuf(Relation rel, BlockNumber blkno, bool extend) +{ + Buffer buf; + + RelationOpenSmgr(rel); + + /* + * The current size of the visibility map fork is kept in relcache, to + * avoid reading beyond EOF. If we haven't cached the size of the map yet, + * do that first. + */ + if (rel->rd_vm_nblocks == InvalidBlockNumber) + { + if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + rel->rd_vm_nblocks = smgrnblocks(rel->rd_smgr, + VISIBILITYMAP_FORKNUM); + else + rel->rd_vm_nblocks = 0; + } + + /* Handle requests beyond EOF */ + if (blkno >= rel->rd_vm_nblocks) + { + if (extend) + vm_extend(rel, blkno + 1); + else + return InvalidBuffer; + } + + /* + * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's + * always safe to clear bits, so it's better to clear corrupt pages than + * error out. + */ + buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, NULL); + if (PageIsNew(BufferGetPage(buf))) + PageInit(BufferGetPage(buf), BLCKSZ, 0); + return buf; +} + +/* + * Ensure that the visibility map fork is at least vm_nblocks long, extending + * it if necessary with zeroed pages. + */ +static void +vm_extend(Relation rel, BlockNumber vm_nblocks) +{ + BlockNumber vm_nblocks_now; + Page pg; + + pg = (Page) palloc(BLCKSZ); + PageInit(pg, BLCKSZ, 0); + + /* + * We use the relation extension lock to lock out other backends trying + * to extend the visibility map at the same time. It also locks out + * extension of the main fork, unnecessarily, but extending the + * visibility map happens seldom enough that it doesn't seem worthwhile to + * have a separate lock tag type for it. + * + * Note that another backend might have extended or created the + * relation before we get the lock. + */ + LockRelationForExtension(rel, ExclusiveLock); + + /* Create the file first if it doesn't exist */ + if ((rel->rd_vm_nblocks == 0 || rel->rd_vm_nblocks == InvalidBlockNumber) + && !smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + { + smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false); + vm_nblocks_now = 0; + } + else + vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM); + + while (vm_nblocks_now < vm_nblocks) + { + smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, + (char *) pg, rel->rd_istemp); + vm_nblocks_now++; + } + + UnlockRelationForExtension(rel, ExclusiveLock); + + pfree(pg); + + /* Update the relcache with the up-to-date size */ + if (!InRecovery) + CacheInvalidateRelcache(rel); + rel->rd_vm_nblocks = vm_nblocks_now; +} diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 466cc64858..6a2588de18 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -11,7 +11,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.64 2008/11/26 17:08:57 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.65 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -377,6 +377,7 @@ CreateFakeRelcacheEntry(RelFileNode rnode) rel->rd_targblock = InvalidBlockNumber; rel->rd_fsm_nblocks = InvalidBlockNumber; + rel->rd_vm_nblocks = InvalidBlockNumber; rel->rd_smgr = NULL; return rel; diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 733b9d9622..8f96298022 100644 --- 
a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/catalog.c,v 1.79 2008/10/06 14:13:17 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/catalog.c,v 1.80 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -54,7 +54,8 @@ */ const char *forkNames[] = { "main", /* MAIN_FORKNUM */ - "fsm" /* FSM_FORKNUM */ + "fsm", /* FSM_FORKNUM */ + "vm" /* VISIBILITYMAP_FORKNUM */ }; /* diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index c8187d511c..dfba476cda 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/storage.c,v 1.1 2008/11/19 10:34:51 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/storage.c,v 1.2 2008/12/03 13:05:22 heikki Exp $ * * NOTES * Some of this code used to be in storage/smgr/smgr.c, and the @@ -19,6 +19,7 @@ #include "postgres.h" +#include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlogutils.h" #include "catalog/catalog.h" @@ -175,6 +176,7 @@ void RelationTruncate(Relation rel, BlockNumber nblocks) { bool fsm; + bool vm; /* Open it at the smgr level if not already done */ RelationOpenSmgr(rel); @@ -187,6 +189,11 @@ RelationTruncate(Relation rel, BlockNumber nblocks) if (fsm) FreeSpaceMapTruncateRel(rel, nblocks); + /* Truncate the visibility map too if it exists. */ + vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM); + if (vm) + visibilitymap_truncate(rel, nblocks); + /* * We WAL-log the truncation before actually truncating, which * means trouble if the truncation fails. If we then crash, the WAL @@ -217,12 +224,12 @@ RelationTruncate(Relation rel, BlockNumber nblocks) /* * Flush, because otherwise the truncation of the main relation - * might hit the disk before the WAL record of truncating the - * FSM is flushed. If we crashed during that window, we'd be - * left with a truncated heap, but the FSM would still contain - * entries for the non-existent heap pages. + * might hit the disk before the WAL record, and the truncation of + * the FSM or visibility map. If we crashed during that window, we'd + * be left with a truncated heap, but the FSM or visibility map would + * still contain entries for the non-existent heap pages. */ - if (fsm) + if (fsm || vm) XLogFlush(lsn); } diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 9cb641fe3f..e016ddb067 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.381 2008/11/19 10:34:51 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.382 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -26,6 +26,7 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/transam.h" +#include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/namespace.h" @@ -3005,10 +3006,19 @@ move_chain_tuple(Relation rel, END_CRIT_SECTION(); + PageClearAllVisible(BufferGetPage(old_buf)); + if (dst_buf != old_buf) + PageClearAllVisible(BufferGetPage(dst_buf)); + LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK); if (dst_buf != old_buf) LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); + /* Clear the bits in the visibility map. 
*/ + visibilitymap_clear(rel, BufferGetBlockNumber(old_buf)); + if (dst_buf != old_buf) + visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf)); + /* Create index entries for the moved tuple */ if (ec->resultRelInfo->ri_NumIndices > 0) { @@ -3107,6 +3117,23 @@ move_plain_tuple(Relation rel, END_CRIT_SECTION(); + /* + * Clear the visible-to-all hint bits on the page, and bits in the + * visibility map. Normally we'd release the locks on the heap pages + * before updating the visibility map, but doesn't really matter here + * because we're holding an AccessExclusiveLock on the relation anyway. + */ + if (PageIsAllVisible(dst_page)) + { + PageClearAllVisible(dst_page); + visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf)); + } + if (PageIsAllVisible(old_page)) + { + PageClearAllVisible(old_page); + visibilitymap_clear(rel, BufferGetBlockNumber(old_buf)); + } + dst_vacpage->free = PageGetFreeSpaceWithFillFactor(rel, dst_page); LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK); LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 4230b2e3ef..d389123a3e 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -29,7 +29,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.111 2008/11/19 10:34:51 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.112 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -40,6 +40,7 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/transam.h" +#include "access/visibilitymap.h" #include "catalog/storage.h" #include "commands/dbcommands.h" #include "commands/vacuum.h" @@ -88,6 +89,7 @@ typedef struct LVRelStats int max_dead_tuples; /* # slots allocated in array */ ItemPointer dead_tuples; /* array of ItemPointerData */ int num_index_scans; + bool scanned_all; /* have we scanned all pages (this far)? */ } LVRelStats; @@ -102,7 +104,7 @@ static BufferAccessStrategy vac_strategy; /* non-export function prototypes */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, - Relation *Irel, int nindexes); + Relation *Irel, int nindexes, bool scan_all); static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats); static void lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, @@ -141,6 +143,7 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; + bool scan_all; pg_rusage_init(&ru0); @@ -161,13 +164,20 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->num_index_scans = 0; + vacrelstats->scanned_all = true; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); + /* Should we use the visibility map or scan all pages? */ + if (vacstmt->freeze_min_age != -1) + scan_all = true; + else + scan_all = false; + /* Do the vacuuming */ - lazy_scan_heap(onerel, vacrelstats, Irel, nindexes); + lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); @@ -186,10 +196,14 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, /* Vacuum the Free Space Map */ FreeSpaceMapVacuum(onerel); - /* Update statistics in pg_class */ + /* + * Update statistics in pg_class. 
We can only advance relfrozenxid if we + * didn't skip any pages. + */ vac_update_relstats(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, - vacrelstats->hasindex, FreezeLimit); + vacrelstats->hasindex, + vacrelstats->scanned_all ? FreezeLimit : InvalidOid); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, @@ -230,13 +244,14 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, - Relation *Irel, int nindexes) + Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, + scanned_pages, vacuumed_pages; double num_tuples, tups_vacuumed, @@ -245,6 +260,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; + Buffer vmbuffer = InvalidBuffer; pg_rusage_init(&ru0); @@ -254,7 +270,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, get_namespace_name(RelationGetNamespace(onerel)), relname))); - empty_pages = vacuumed_pages = 0; + empty_pages = vacuumed_pages = scanned_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) @@ -278,9 +294,28 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; + bool all_visible_according_to_vm = false; + bool all_visible; + + /* + * Skip pages that don't require vacuuming according to the + * visibility map. + */ + if (!scan_all) + { + all_visible_according_to_vm = + visibilitymap_test(onerel, blkno, &vmbuffer); + if (all_visible_according_to_vm) + { + vacrelstats->scanned_all = false; + continue; + } + } vacuum_delay_point(); + scanned_pages++; + /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. @@ -354,7 +389,26 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, { empty_pages++; freespace = PageGetHeapFreeSpace(page); - UnlockReleaseBuffer(buf); + + if (!PageIsAllVisible(page)) + { + SetBufferCommitInfoNeedsSave(buf); + PageSetAllVisible(page); + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + /* Update the visibility map */ + if (!all_visible_according_to_vm) + { + visibilitymap_pin(onerel, blkno, &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_SHARE); + if (PageIsAllVisible(page)) + visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } @@ -371,6 +425,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. 
*/ + all_visible = true; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; @@ -408,6 +463,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + all_visible = false; continue; } @@ -442,6 +498,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, nkeep += 1; else tupgone = true; /* we can delete the tuple */ + all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ @@ -449,6 +506,36 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed + * asynchronously. See SetHintBits for more info. Check + * that the HEAP_XMIN_COMMITTED hint bit is set because of + * that. + */ + if (all_visible) + { + TransactionId xmin; + + if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) + { + all_visible = false; + break; + } + /* + * The inserter definitely committed. But is it + * old enough that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, OldestXmin)) + { + all_visible = false; + break; + } + } break; case HEAPTUPLE_RECENTLY_DEAD: @@ -457,12 +544,15 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * from relation. */ nkeep += 1; + all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ + all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ + all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); @@ -525,12 +615,44 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, freespace = PageGetHeapFreeSpace(page); + /* Update the all-visible flag on the page */ + if (!PageIsAllVisible(page) && all_visible) + { + SetBufferCommitInfoNeedsSave(buf); + PageSetAllVisible(page); + } + else if (PageIsAllVisible(page) && !all_visible) + { + elog(WARNING, "PD_ALL_VISIBLE flag was incorrectly set"); + SetBufferCommitInfoNeedsSave(buf); + PageClearAllVisible(page); + + /* + * Normally, we would drop the lock on the heap page before + * updating the visibility map, but since this is a can't-happen + * case anyway, don't bother. 
+ */ + visibilitymap_clear(onerel, blkno); + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + /* Update the visibility map */ + if (!all_visible_according_to_vm && all_visible) + { + visibilitymap_pin(onerel, blkno, &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_SHARE); + if (PageIsAllVisible(page)) + visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + ReleaseBuffer(buf); + /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; - UnlockReleaseBuffer(buf); - /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record @@ -560,6 +682,13 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vacrelstats->num_index_scans++; } + /* Release the pin on the visibility map page */ + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); @@ -572,9 +701,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, tups_vacuumed, vacuumed_pages))); ereport(elevel, - (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", + (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), - tups_vacuumed, num_tuples, nblocks), + tups_vacuumed, num_tuples, scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index b6a87ed7a8..0ab9f8de11 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.277 2008/11/26 17:08:57 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.278 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -305,6 +305,7 @@ AllocateRelationDesc(Relation relation, Form_pg_class relp) MemSet(relation, 0, sizeof(RelationData)); relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; + relation->rd_vm_nblocks = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; @@ -1377,6 +1378,7 @@ formrdesc(const char *relationName, Oid relationReltype, relation = (Relation) palloc0(sizeof(RelationData)); relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; + relation->rd_vm_nblocks = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; @@ -1665,9 +1667,13 @@ RelationReloadIndexInfo(Relation relation) heap_freetuple(pg_class_tuple); /* We must recalculate physical address in case it changed */ RelationInitPhysicalAddr(relation); - /* Must reset targblock and fsm_nblocks in case rel was truncated */ + /* + * Must reset targblock, fsm_nblocks and vm_nblocks in case rel was + * truncated + */ relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; + relation->rd_vm_nblocks = InvalidBlockNumber; /* Must free any AM cached data, too */ if (relation->rd_amcache) pfree(relation->rd_amcache); @@ -1751,6 +1757,7 @@ 
RelationClearRelation(Relation relation, bool rebuild) { relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; + relation->rd_vm_nblocks = InvalidBlockNumber; if (relation->rd_rel->relkind == RELKIND_INDEX) { relation->rd_isvalid = false; /* needs to be revalidated */ @@ -2346,6 +2353,7 @@ RelationBuildLocalRelation(const char *relname, rel->rd_targblock = InvalidBlockNumber; rel->rd_fsm_nblocks = InvalidBlockNumber; + rel->rd_vm_nblocks = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ rel->rd_smgr = NULL; @@ -3603,6 +3611,7 @@ load_relcache_init_file(void) rel->rd_smgr = NULL; rel->rd_targblock = InvalidBlockNumber; rel->rd_fsm_nblocks = InvalidBlockNumber; + rel->rd_vm_nblocks = InvalidBlockNumber; if (rel->rd_isnailed) rel->rd_refcnt = 1; else diff --git a/src/include/access/htup.h b/src/include/access/htup.h index 3e075236d3..5d1f0b1d90 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.104 2008/11/14 01:57:42 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.105 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -601,9 +601,10 @@ typedef struct xl_heaptid typedef struct xl_heap_delete { xl_heaptid target; /* deleted tuple id */ + bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ } xl_heap_delete; -#define SizeOfHeapDelete (offsetof(xl_heap_delete, target) + SizeOfHeapTid) +#define SizeOfHeapDelete (offsetof(xl_heap_delete, all_visible_cleared) + sizeof(bool)) /* * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted @@ -626,21 +627,24 @@ typedef struct xl_heap_header typedef struct xl_heap_insert { xl_heaptid target; /* inserted tuple id */ + bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ /* xl_heap_header & TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_insert; -#define SizeOfHeapInsert (offsetof(xl_heap_insert, target) + SizeOfHeapTid) +#define SizeOfHeapInsert (offsetof(xl_heap_insert, all_visible_cleared) + sizeof(bool)) /* This is what we need to know about update|move|hot_update */ typedef struct xl_heap_update { xl_heaptid target; /* deleted tuple id */ ItemPointerData newtid; /* new inserted tuple id */ + bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ + bool new_all_visible_cleared; /* same for the page of newtid */ /* NEW TUPLE xl_heap_header (PLUS xmax & xmin IF MOVE OP) */ /* and TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_update; -#define SizeOfHeapUpdate (offsetof(xl_heap_update, newtid) + SizeOfIptrData) +#define SizeOfHeapUpdate (offsetof(xl_heap_update, new_all_visible_cleared) + sizeof(bool)) /* * This is what we need to know about vacuum page cleanup/redirect diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h new file mode 100644 index 0000000000..3803ac40bc --- /dev/null +++ b/src/include/access/visibilitymap.h @@ -0,0 +1,30 @@ +/*------------------------------------------------------------------------- + * + * visibilitymap.h + * visibility map interface + * + * + * Portions Copyright (c) 2007, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL: pgsql/src/include/access/visibilitymap.h,v 1.1 2008/12/03 13:05:22 
heikki Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef VISIBILITYMAP_H +#define VISIBILITYMAP_H + +#include "utils/rel.h" +#include "storage/buf.h" +#include "storage/itemptr.h" +#include "access/xlogdefs.h" + +extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk); +extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk, + Buffer *vmbuf); +extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, + XLogRecPtr recptr, Buffer *vmbuf); +extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); +extern void visibilitymap_truncate(Relation rel, BlockNumber heapblk); + +#endif /* VISIBILITYMAP_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index a5b88e78df..6115d5fc64 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.84 2008/11/03 20:47:49 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.85 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -152,8 +152,10 @@ typedef PageHeaderData *PageHeader; #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new * tuple? */ +#define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to + * everyone */ -#define PD_VALID_FLAG_BITS 0x0003 /* OR of all valid pd_flags bits */ +#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. 
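The xl_heap_delete, xl_heap_insert and xl_heap_update records above each gain a trailing all_visible_cleared bool, and their SizeOf* macros are redefined as offsetof(the new last field) + sizeof(bool) rather than sizeof(struct), so trailing struct padding is never written to WAL. Below is a minimal standalone sketch of that sizing idiom; the demo_* types are hypothetical stand-ins for the real xl_heaptid and xl_heap_delete in access/htup.h, not the server's definitions.

#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Illustration only: simplified stand-ins for xl_heaptid and xl_heap_delete.
 * The record length is taken as offsetof(last field) + sizeof(last field),
 * so any alignment padding the compiler adds after the new bool is not
 * counted, and therefore not copied into the WAL record.
 */
typedef struct
{
	uint32_t	spcNode;		/* stand-in for RelFileNode */
	uint32_t	dbNode;
	uint32_t	relNode;
	uint16_t	bi_hi;			/* stand-in for ItemPointerData */
	uint16_t	bi_lo;
	uint16_t	ip_posid;
} demo_heaptid;

typedef struct
{
	demo_heaptid target;				/* deleted tuple id */
	bool		all_visible_cleared;	/* PD_ALL_VISIBLE was cleared */
} demo_heap_delete;

#define DemoSizeOfHeapDelete \
	(offsetof(demo_heap_delete, all_visible_cleared) + sizeof(bool))

int
main(void)
{
	/* sizeof() typically includes trailing padding; the macro does not */
	printf("sizeof(demo_heap_delete) = %zu\n", sizeof(demo_heap_delete));
	printf("DemoSizeOfHeapDelete     = %zu\n", (size_t) DemoSizeOfHeapDelete);
	return 0;
}

On a typical 4-byte-aligned platform this prints 24 for sizeof() but 21 for the macro, which is why the patch anchors each SizeOf* macro on the new last field instead of the struct size.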
@@ -336,6 +338,13 @@ typedef PageHeaderData *PageHeader; #define PageClearFull(page) \ (((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL) +#define PageIsAllVisible(page) \ + (((PageHeader) (page))->pd_flags & PD_ALL_VISIBLE) +#define PageSetAllVisible(page) \ + (((PageHeader) (page))->pd_flags |= PD_ALL_VISIBLE) +#define PageClearAllVisible(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE) + #define PageIsPrunable(page, oldestxmin) \ ( \ AssertMacro(TransactionIdIsNormal(oldestxmin)), \ diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index adedad61b3..90a0f642d4 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.20 2008/11/19 10:34:52 heikki Exp $ + * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.21 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -24,14 +24,15 @@ typedef enum ForkNumber { InvalidForkNumber = -1, MAIN_FORKNUM = 0, - FSM_FORKNUM + FSM_FORKNUM, + VISIBILITYMAP_FORKNUM /* * NOTE: if you add a new fork, change MAX_FORKNUM below and update the * forkNames array in catalog.c */ } ForkNumber; -#define MAX_FORKNUM FSM_FORKNUM +#define MAX_FORKNUM VISIBILITYMAP_FORKNUM /* * RelFileNode must provide all that we need to know to physically access diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 44540dd649..c75b1cffc3 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.109 2008/11/26 17:08:58 heikki Exp $ + * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.110 2008/12/03 13:05:22 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -195,8 +195,12 @@ typedef struct RelationData List *rd_indpred; /* index predicate tree, if any */ void *rd_amcache; /* available for use by index AM */ - /* size of the FSM, or InvalidBlockNumber if not known yet */ + /* + * sizes of the free space and visibility map forks, or InvalidBlockNumber + * if not known yet + */ BlockNumber rd_fsm_nblocks; + BlockNumber rd_vm_nblocks; /* use "struct" here to avoid needing to include pgstat.h: */ struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
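As context for the addressing scheme in visibilitymap.c above (one bit per heap block, MAPSIZE usable bytes per map page), here is a minimal standalone sketch of the HEAPBLK_TO_MAPBLOCK/MAPBYTE/MAPBIT arithmetic. It is not part of the patch; the DEMO_* constants are assumptions standing in for BLCKSZ and SizeOfPageHeaderData, hard-coded to the usual defaults only so the example compiles outside the server.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustration only: mirrors the addressing macros in visibilitymap.c,
 * with an 8192-byte page and a 24-byte page header assumed.
 */
#define DEMO_BLCKSZ			8192
#define DEMO_PAGEHDRSZ		24

#define MAPSIZE				(DEMO_BLCKSZ - DEMO_PAGEHDRSZ)
#define HEAPBLOCKS_PER_BYTE 8
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x)  (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_MAPBIT(x)   ((x) % HEAPBLOCKS_PER_BYTE)

int
main(void)
{
	/* each 8K map page covers (8192 - 24) * 8 = 65344 heap pages (~510 MB) */
	uint32_t	blocks[] = {0, 7, 8, 65343, 65344, 1000000};
	int			i;

	for (i = 0; i < 6; i++)
		printf("heap block %7u -> map block %u, byte %u, bit %u\n",
			   (unsigned) blocks[i],
			   (unsigned) HEAPBLK_TO_MAPBLOCK(blocks[i]),
			   (unsigned) HEAPBLK_TO_MAPBYTE(blocks[i]),
			   (unsigned) HEAPBLK_TO_MAPBIT(blocks[i]));
	return 0;
}

For example, heap block 65343 maps to bit 7 of byte 8167 on map page 0, while heap block 65344 starts map page 1, which is what lets visibilitymap_pin reuse an already-pinned map buffer for long runs of consecutive heap blocks.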