diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 3fcd9fbe6d..89b73ca991 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -1,48 +1,69 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); -VACUUM test1; -- set up FSM +CREATE TABLE test_rel_forks (a int); +-- Make sure there are enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT i from generate_series(1,2000) i; +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. -SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; main_0 -------- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; -ERROR: block number 1 is out of range for relation "test1" -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; +ERROR: block number 100 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; fsm_0 ------- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; - fsm_1 -------- - 8192 -(1 row) - -SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 20)) AS fsm_20; +ERROR: block number 20 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0; vm_0 ------ 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1; -ERROR: block number 1 is out of range for relation "test1" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1; +ERROR: block number 1 is out of range for relation "test_rel_forks" SELECT octet_length(get_raw_page('xxx', 'main', 0)); ERROR: relation "xxx" does not exist -SELECT octet_length(get_raw_page('test1', 'xxx', 0)); +SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0)); ERROR: invalid fork name HINT: Valid fork names are "main", "fsm", "vm", and "init". -SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0); +SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0)); + fsm_page_contents +------------------- + 0: 39 + + 1: 39 + + 3: 39 + + 7: 39 + + 15: 39 + + 31: 39 + + 63: 39 + + 127: 39 + + 255: 39 + + 511: 39 + + 1023: 39 + + 2047: 39 + + 4095: 39 + + fp_next_slot: 0 + + +(1 row) + +SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0); ?column? 
---------- t (1 row) +DROP TABLE test_rel_forks; +CREATE TABLE test1 (a int, b int); +INSERT INTO test1 VALUES (16777217, 131584); SELECT pagesize, version FROM page_header(get_raw_page('test1', 0)); pagesize | version ----------+--------- @@ -62,26 +83,6 @@ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bi {"\\x01000001","\\x00020200"} (1 row) -SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); - fsm_page_contents -------------------- - 0: 254 + - 1: 254 + - 3: 254 + - 7: 254 + - 15: 254 + - 31: 254 + - 63: 254 + - 127: 254 + - 255: 254 + - 511: 254 + - 1023: 254 + - 2047: 254 + - 4095: 254 + - fp_next_slot: 0 + - -(1 row) - DROP TABLE test1; -- check that using any of these functions with a partitioned table or index -- would fail diff --git a/contrib/pageinspect/sql/page.sql b/contrib/pageinspect/sql/page.sql index 8ac9991837..67166ef54c 100644 --- a/contrib/pageinspect/sql/page.sql +++ b/contrib/pageinspect/sql/page.sql @@ -1,26 +1,35 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); +CREATE TABLE test_rel_forks (a int); +-- Make sure there are enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT i from generate_series(1,2000) i; -VACUUM test1; -- set up FSM +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. -SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 20)) AS fsm_20; -SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0; -SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1; +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1; SELECT octet_length(get_raw_page('xxx', 'main', 0)); -SELECT octet_length(get_raw_page('test1', 'xxx', 0)); +SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0)); -SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0); +SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0)); + +SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0); + +DROP TABLE test_rel_forks; + +CREATE TABLE test1 (a int, b int); +INSERT INTO test1 VALUES (16777217, 131584); SELECT pagesize, version FROM page_header(get_raw_page('test1', 0)); @@ -29,8 +38,6 @@ SELECT page_checksum(get_raw_page('test1', 0), 0) IS NOT NULL AS silly_checksum_ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits) FROM heap_page_items(get_raw_page('test1', 0)); -SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); - DROP TABLE test1; -- check that using any of these functions with a partitioned table or index diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml index 8ef2ac8010..cbdad0c3fb 100644 --- a/doc/src/sgml/storage.sgml +++ b/doc/src/sgml/storage.sgml @@ -590,12 +590,13 @@ tuple would otherwise be too big. 
FSMFree Space Map -Each heap and index relation, except for hash indexes, has a Free Space Map -(FSM) to keep track of available space in the relation. It's stored -alongside the main relation data in a separate relation fork, named after the -filenode number of the relation, plus a _fsm suffix. For example, -if the filenode of a relation is 12345, the FSM is stored in a file called -12345_fsm, in the same directory as the main relation file. +Each heap relation, unless it is very small, and each index relation, except +for hash indexes, has a Free Space Map (FSM) to keep track of available +space in the relation. It's stored alongside the main relation data in a +separate relation fork, named after the filenode number of the relation, plus +a _fsm suffix. For example, if the filenode of a relation +is 12345, the FSM is stored in a file called 12345_fsm, +in the same directory as the main relation file. diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 467d91e681..8f008dd008 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1150,7 +1150,7 @@ terminate_brin_buildstate(BrinBuildState *state) freespace = PageGetFreeSpace(page); blk = BufferGetBlockNumber(state->bs_currentInsertBuf); ReleaseBuffer(state->bs_currentInsertBuf); - RecordPageWithFreeSpace(state->bs_irel, blk, freespace); + RecordPageWithFreeSpace(state->bs_irel, blk, freespace, InvalidBlockNumber); FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1); } diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index 164a468155..2eb354f948 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -310,7 +310,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, if (extended) { - RecordPageWithFreeSpace(idxrel, newblk, freespace); + RecordPageWithFreeSpace(idxrel, newblk, freespace, InvalidBlockNumber); FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1); } @@ -461,7 +461,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, if (extended) { - RecordPageWithFreeSpace(idxrel, blk, freespace); + RecordPageWithFreeSpace(idxrel, blk, freespace, InvalidBlockNumber); FreeSpaceMapVacuumRange(idxrel, blk, blk + 1); } @@ -654,7 +654,7 @@ brin_page_cleanup(Relation idxrel, Buffer buf) /* Measure free space and record it */ RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf), - br_page_get_freespace(page)); + br_page_get_freespace(page), InvalidBlockNumber); } /* @@ -703,7 +703,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, /* Choose initial target page, re-using existing target if known */ newblk = RelationGetTargetBlock(irel); if (newblk == InvalidBlockNumber) - newblk = GetPageWithFreeSpace(irel, itemsz); + newblk = GetPageWithFreeSpace(irel, itemsz, true); /* * Loop until we find a page with sufficient free space. By the time we @@ -895,7 +895,7 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer) * pages whose FSM records were forgotten in a crash. */ RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer), - br_page_get_freespace(page)); + br_page_get_freespace(page), InvalidBlockNumber); } diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index d41d318eef..a9c8ec43a7 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -246,8 +246,14 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate) * Immediately update the bottom level of the FSM. 
This has a good * chance of making this page visible to other concurrently inserting * backends, and we want that to happen without delay. + * + * Since we know the table will end up with extraBlocks additional + * pages, we pass the final number to avoid possible unnecessary + * system calls and to make sure the FSM is created when we add the + * first new page. */ - RecordPageWithFreeSpace(relation, blockNum, freespace); + RecordPageWithFreeSpace(relation, blockNum, freespace, + firstBlock + extraBlocks); } while (--extraBlocks > 0); @@ -384,20 +390,9 @@ RelationGetBufferForTuple(Relation relation, Size len, * We have no cached target page, so ask the FSM for an initial * target. */ - targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace); - - /* - * If the FSM knows nothing of the rel, try the last page before we - * give up and extend. This avoids one-tuple-per-page syndrome during - * bootstrapping or in a recently-started system. - */ - if (targetBlock == InvalidBlockNumber) - { - BlockNumber nblocks = RelationGetNumberOfBlocks(relation); - - if (nblocks > 0) - targetBlock = nblocks - 1; - } + targetBlock = GetPageWithFreeSpace(relation, + len + saveFreeSpace, + false); } loop: @@ -504,6 +499,13 @@ loop: { /* use this page as future insert target, too */ RelationSetTargetBlock(relation, targetBlock); + + /* + * In case we used an in-memory map of available blocks, reset it + * for next use. + */ + FSMClearLocalMap(); + return buffer; } @@ -563,9 +565,12 @@ loop: /* * Check if some other backend has extended a block for us while - * we were waiting on the lock. + * we were waiting on the lock. We only check the FSM -- if there + * isn't one we don't recheck the number of blocks. */ - targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace); + targetBlock = GetPageWithFreeSpace(relation, + len + saveFreeSpace, + true); /* * If some other waiter has already extended the relation, we @@ -670,5 +675,11 @@ loop: */ RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer)); + /* + * In case we used an in-memory map of available blocks, reset it for next + * use. 
+ */ + FSMClearLocalMap(); + return buffer; } diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 26dfb0c7e0..9416c31889 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -153,7 +153,7 @@ static BufferAccessStrategy vac_strategy; static void lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool aggressive); -static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats); +static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats, BlockNumber nblocks); static bool lazy_check_needs_freeze(Buffer buf, bool *hastup); static void lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, @@ -758,7 +758,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, pgstat_progress_update_multi_param(2, hvp_index, hvp_val); /* Remove tuples from heap */ - lazy_vacuum_heap(onerel, vacrelstats); + lazy_vacuum_heap(onerel, vacrelstats, nblocks); /* * Forget the now-vacuumed tuples, and press on, but be careful @@ -897,7 +897,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, Size freespace; freespace = BufferGetPageSize(buf) - SizeOfPageHeaderData; - RecordPageWithFreeSpace(onerel, blkno, freespace); + RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks); } } continue; @@ -941,7 +941,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, } UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, blkno, freespace); + RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks); continue; } @@ -1338,7 +1338,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) - RecordPageWithFreeSpace(onerel, blkno, freespace); + RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks); } /* report that everything is scanned and vacuumed */ @@ -1400,7 +1400,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, /* Remove tuples from heap */ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_PHASE_VACUUM_HEAP); - lazy_vacuum_heap(onerel, vacrelstats); + lazy_vacuum_heap(onerel, vacrelstats, nblocks); vacrelstats->num_index_scans++; } @@ -1471,9 +1471,10 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * Note: the reason for doing this as a second pass is we cannot remove * the tuples until we've removed their index entries, and we want to * process index entry removal in batches as large as possible. + * Note: nblocks is passed as an optimization for RecordPageWithFreeSpace(). 
*/ static void -lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) +lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats, BlockNumber nblocks) { int tupindex; int npages; @@ -1510,7 +1511,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) freespace = PageGetHeapFreeSpace(page); UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, tblk, freespace); + RecordPageWithFreeSpace(onerel, tblk, freespace, nblocks); npages++; } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 0181976964..92bda87804 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -48,6 +48,7 @@ #include "replication/walsender.h" #include "storage/condition_variable.h" #include "storage/fd.h" +#include "storage/freespace.h" #include "storage/lmgr.h" #include "storage/predicate.h" #include "storage/proc.h" @@ -2493,6 +2494,12 @@ AbortTransaction(void) pgstat_report_wait_end(); pgstat_progress_end_command(); + /* + * In case we aborted during RelationGetBufferForTuple(), clear the local + * map of heap pages. + */ + FSMClearLocalMap(); + /* Clean up buffer I/O and buffer context locks, too */ AbortBufferIO(); UnlockBuffers(); @@ -4714,6 +4721,13 @@ AbortSubTransaction(void) pgstat_report_wait_end(); pgstat_progress_end_command(); + + /* + * In case we aborted during RelationGetBufferForTuple(), clear the local + * map of heap pages. + */ + FSMClearLocalMap(); + AbortBufferIO(); UnlockBuffers(); diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README index e7ff23b76f..0d3cd29772 100644 --- a/src/backend/storage/freespace/README +++ b/src/backend/storage/freespace/README @@ -8,7 +8,41 @@ free space to hold a tuple to be stored; or to determine that no such page exists and the relation must be extended by one page. As of PostgreSQL 8.4 each relation has its own, extensible free space map stored in a separate "fork" of its relation. This eliminates the disadvantages of the former -fixed-size FSM. +fixed-size FSM. There are two exceptions: + +1. Hash indexes never have a FSM. +2. For very small tables, a 3-page relation fork would be relatively large +and wasteful, so to save space we refrain from creating the FSM if the +heap has HEAP_FSM_CREATION_THRESHOLD pages or fewer. + +To locate free space in the latter case, we simply try pages directly without +knowing ahead of time how much free space they have. To maintain good +performance, we create a local in-memory map of pages to try, and only mark +every other page as available. For example, in a 3-page heap, the local map +would look like: + +ANAN +0123 + +Pages 0 and 2 are marked "available", and page 1 as "not available". +Page 3 is beyond the end of the relation, so is likewise marked "not +available". First we try page 2, and if that doesn't have sufficient free +space we try page 0 before giving up and extending the relation. There may +be some wasted free space on block 1, but if the relation extends to 4 pages: + +NANA +0123 + +We not only have the new page 3 at our disposal, we can now check page 1 +for free space as well. + +Once the FSM is created for a heap we don't remove it even if somebody deletes +all the rows from the corresponding relation. We don't think it is a useful +optimization as it is quite likely that relation will again grow to the same +size. + +FSM data structure +------------------ It is important to keep the map small so that it can be searched rapidly. 
Therefore, we don't attempt to record the exact free space on a page. @@ -192,5 +226,3 @@ TODO ---- - fastroot to avoid traversing upper nodes with just 1 child -- use a different system for tables that fit into one FSM page, with a - mechanism to switch to the real thing as it grows. diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index eee8286057..d3f207b854 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -76,6 +76,14 @@ #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1) #define FSM_BOTTOM_LEVEL 0 +/* Status codes for the local map. */ + +/* Either already tried, or beyond the end of the relation */ +#define FSM_LOCAL_NOT_AVAIL 0x00 + +/* Available to try */ +#define FSM_LOCAL_AVAIL 0x01 + /* * The internal FSM routines work on a logical addressing scheme. Each * level of the tree can be thought of as a separately addressable file. @@ -89,6 +97,23 @@ typedef struct /* Address of the root page. */ static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0}; +/* Local map of block numbers for small heaps with no FSM. */ +typedef struct +{ + BlockNumber nblocks; + uint8 map[HEAP_FSM_CREATION_THRESHOLD]; +} FSMLocalMap; + +static FSMLocalMap fsm_local_map = +{ + 0, + { + FSM_LOCAL_NOT_AVAIL + } +}; + +#define FSM_LOCAL_MAP_EXISTS (fsm_local_map.nblocks > 0) + /* functions to navigate the tree */ static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot); static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot); @@ -107,10 +132,14 @@ static Size fsm_space_cat_to_avail(uint8 cat); /* workhorse functions for various operations */ static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, uint8 newValue, uint8 minValue); +static void fsm_local_set(Relation rel, BlockNumber cur_nblocks); static BlockNumber fsm_search(Relation rel, uint8 min_cat); +static BlockNumber fsm_local_search(void); static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, BlockNumber start, BlockNumber end, bool *eof); +static bool fsm_allow_writes(Relation rel, BlockNumber heapblk, + BlockNumber nblocks, BlockNumber *get_nblocks); /******** Public API ********/ @@ -127,13 +156,46 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, * amount of free space available on that page and then try again (see * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned, * extend the relation. + * + * For very small heap relations that don't have a FSM, we try every other + * page before extending the relation. To keep track of which pages have + * been tried, initialize a local in-memory map of pages. */ BlockNumber -GetPageWithFreeSpace(Relation rel, Size spaceNeeded) +GetPageWithFreeSpace(Relation rel, Size spaceNeeded, bool check_fsm_only) { uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded); + BlockNumber target_block, + nblocks; - return fsm_search(rel, min_cat); + /* First try the FSM, if it exists. */ + target_block = fsm_search(rel, min_cat); + + if (target_block == InvalidBlockNumber && + (rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_TOASTVALUE) && + !check_fsm_only) + { + nblocks = RelationGetNumberOfBlocks(rel); + + if (nblocks > HEAP_FSM_CREATION_THRESHOLD) + { + /* + * If the FSM knows nothing of the rel, try the last page before + * we give up and extend. This avoids one-tuple-per-page syndrome + * during bootstrapping or in a recently-started system. 
+ */ + target_block = nblocks - 1; + } + else if (nblocks > 0) + { + /* Create or update local map and get first candidate block. */ + fsm_local_set(rel, nblocks); + target_block = fsm_local_search(); + } + } + + return target_block; } /* @@ -144,16 +206,47 @@ GetPageWithFreeSpace(Relation rel, Size spaceNeeded) * also some effort to return a page close to the old page; if there's a * page with enough free space on the same FSM page where the old one page * is located, it is preferred. + * + * For very small heap relations that don't have a FSM, we update the local + * map to indicate we have tried a page, and return the next page to try. */ BlockNumber RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, Size oldSpaceAvail, Size spaceNeeded) { - int old_cat = fsm_space_avail_to_cat(oldSpaceAvail); - int search_cat = fsm_space_needed_to_cat(spaceNeeded); + int old_cat; + int search_cat; FSMAddress addr; uint16 slot; int search_slot; + BlockNumber nblocks = InvalidBlockNumber; + + /* First try the local map, if it exists. */ + if (FSM_LOCAL_MAP_EXISTS) + { + Assert((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_TOASTVALUE) && + fsm_local_map.map[oldPage] == FSM_LOCAL_AVAIL); + + fsm_local_map.map[oldPage] = FSM_LOCAL_NOT_AVAIL; + return fsm_local_search(); + } + + if (!fsm_allow_writes(rel, oldPage, InvalidBlockNumber, &nblocks)) + { + /* + * If we have neither a local map nor a FSM, we probably just tried + * the target block in the smgr relation entry and failed, so we'll + * need to create the local map. + */ + fsm_local_set(rel, nblocks); + return fsm_local_search(); + } + + /* Normal FSM logic follows */ + + old_cat = fsm_space_avail_to_cat(oldSpaceAvail); + search_cat = fsm_space_needed_to_cat(spaceNeeded); /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(oldPage, &slot); @@ -176,20 +269,44 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, * Note that if the new spaceAvail value is higher than the old value stored * in the FSM, the space might not become visible to searchers until the next * FreeSpaceMapVacuum call, which updates the upper level pages. + * + * Callers have no need for a local map. */ void -RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail) +RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, + Size spaceAvail, BlockNumber nblocks) { - int new_cat = fsm_space_avail_to_cat(spaceAvail); + int new_cat; FSMAddress addr; uint16 slot; + BlockNumber dummy; + + if (!fsm_allow_writes(rel, heapBlk, nblocks, &dummy)) + /* No FSM to update and no local map either */ + return; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(heapBlk, &slot); + new_cat = fsm_space_avail_to_cat(spaceAvail); fsm_set_and_search(rel, addr, slot, new_cat, 0); } +/* + * Clear the local map. We must call this when we have found a block with + * enough free space, when we extend the relation, or on transaction abort. 
+ */ +void +FSMClearLocalMap(void) +{ + if (FSM_LOCAL_MAP_EXISTS) + { + fsm_local_map.nblocks = 0; + memset(&fsm_local_map.map, FSM_LOCAL_NOT_AVAIL, + sizeof(fsm_local_map.map)); + } +} + /* * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in * WAL replay @@ -204,6 +321,31 @@ XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, BlockNumber blkno; Buffer buf; Page page; + bool write_to_fsm; + + /* This is meant to mirror the logic in fsm_allow_writes() */ + if (heapBlk >= HEAP_FSM_CREATION_THRESHOLD) + write_to_fsm = true; + else + { + /* Open the relation at smgr level */ + SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + + if (smgrexists(smgr, FSM_FORKNUM)) + write_to_fsm = true; + else + { + BlockNumber heap_nblocks = smgrnblocks(smgr, MAIN_FORKNUM); + + if (heap_nblocks > HEAP_FSM_CREATION_THRESHOLD) + write_to_fsm = true; + else + write_to_fsm = false; + } + } + + if (!write_to_fsm) + return; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(heapBlk, &slot); @@ -904,3 +1046,134 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, return max_avail; } + +/* + * For heaps, we prevent creation of the FSM unless the number of pages + * exceeds HEAP_FSM_CREATION_THRESHOLD. For tables that don't already have + * a FSM, this will save an inode and a few kB of space. + * + * XXX The API is a little awkward -- if the caller passes a valid nblocks + * value, it can avoid invoking a system call. If the caller passes + * InvalidBlockNumber and receives a false return value, it can get an + * up-to-date relation size from get_nblocks. This saves a few cycles in + * the caller, which would otherwise need to get the relation size by itself. + */ +static bool +fsm_allow_writes(Relation rel, BlockNumber heapblk, + BlockNumber nblocks, BlockNumber *get_nblocks) +{ + bool skip_get_nblocks; + + if (heapblk >= HEAP_FSM_CREATION_THRESHOLD) + return true; + + /* Non-heap rels can always create a FSM. */ + if (rel->rd_rel->relkind != RELKIND_RELATION && + rel->rd_rel->relkind != RELKIND_TOASTVALUE) + return true; + + /* + * If the caller knows nblocks, we can avoid a system call later. If it + * doesn't, maybe we have relpages from a previous VACUUM. Since the table + * may have extended since then, we still have to count the pages later if + * we can't return now. + */ + if (nblocks != InvalidBlockNumber) + { + if (nblocks > HEAP_FSM_CREATION_THRESHOLD) + return true; + else + skip_get_nblocks = true; + } + else + { + if (rel->rd_rel->relpages != InvalidBlockNumber && + rel->rd_rel->relpages > HEAP_FSM_CREATION_THRESHOLD) + return true; + else + skip_get_nblocks = false; + } + + RelationOpenSmgr(rel); + if (smgrexists(rel->rd_smgr, FSM_FORKNUM)) + return true; + + if (skip_get_nblocks) + return false; + + /* last resort */ + *get_nblocks = RelationGetNumberOfBlocks(rel); + if (*get_nblocks > HEAP_FSM_CREATION_THRESHOLD) + return true; + else + return false; +} + +/* + * Initialize or update the local map of blocks to try, for when there is + * no FSM. + * + * When we initialize the map, the whole heap is potentially available to + * try. Testing revealed that trying every block can cause a small + * performance dip compared to when we use a FSM, so we try every other + * block instead. + */ +static void +fsm_local_set(Relation rel, BlockNumber cur_nblocks) +{ + BlockNumber blkno, + cached_target_block; + + /* The local map must not be set already. 
*/ + Assert(!FSM_LOCAL_MAP_EXISTS); + + /* + * Starting at the current last block in the relation and working + * backwards, mark alternating blocks as available. + */ + blkno = cur_nblocks - 1; + while (true) + { + fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL; + if (blkno >= 2) + blkno -= 2; + else + break; + } + + /* Cache the number of blocks. */ + fsm_local_map.nblocks = cur_nblocks; + + /* Set the status of the cached target block to 'unavailable'. */ + cached_target_block = RelationGetTargetBlock(rel); + if (cached_target_block != InvalidBlockNumber && + cached_target_block < cur_nblocks) + fsm_local_map.map[cached_target_block] = FSM_LOCAL_NOT_AVAIL; +} + +/* + * Search the local map for an available block to try, in descending order. + * As such, there is no heuristic available to decide which order will be + * better to try, but the probability of having space in the last block in the + * map is higher because that is the most recent block added to the heap. + * + * This function is used when there is no FSM. + */ +static BlockNumber +fsm_local_search(void) +{ + BlockNumber target_block; + + /* Local map must be set by now. */ + Assert(FSM_LOCAL_MAP_EXISTS); + + target_block = fsm_local_map.nblocks; + do + { + target_block--; + if (fsm_local_map.map[target_block] == FSM_LOCAL_AVAIL) + return target_block; + } while (target_block > 0); + + return InvalidBlockNumber; +} diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c index 58cedeaa9f..9d8f43d373 100644 --- a/src/backend/storage/freespace/indexfsm.c +++ b/src/backend/storage/freespace/indexfsm.c @@ -37,7 +37,7 @@ BlockNumber GetFreeIndexPage(Relation rel) { - BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2); + BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2, true); if (blkno != InvalidBlockNumber) RecordUsedIndexPage(rel, blkno); @@ -51,7 +51,7 @@ GetFreeIndexPage(Relation rel) void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock) { - RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1); + RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1, InvalidBlockNumber); } @@ -61,7 +61,7 @@ RecordFreeIndexPage(Relation rel, BlockNumber freeBlock) void RecordUsedIndexPage(Relation rel, BlockNumber usedBlock) { - RecordPageWithFreeSpace(rel, usedBlock, 0); + RecordPageWithFreeSpace(rel, usedBlock, 0, InvalidBlockNumber); } /* diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h index 8b00033438..dbaae651c5 100644 --- a/src/include/storage/freespace.h +++ b/src/include/storage/freespace.h @@ -18,15 +18,20 @@ #include "storage/relfilenode.h" #include "utils/relcache.h" +/* Only create the FSM if the heap has greater than this many blocks */ +#define HEAP_FSM_CREATION_THRESHOLD 4 + /* prototypes for public functions in freespace.c */ extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk); -extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded); +extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded, + bool check_fsm_only); extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, Size oldSpaceAvail, Size spaceNeeded); extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, - Size spaceAvail); + Size spaceAvail, BlockNumber nblocks); +extern void FSMClearLocalMap(void); extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, Size spaceAvail); diff --git a/src/test/regress/expected/fsm.out b/src/test/regress/expected/fsm.out new file mode 
100644 index 0000000000..b02993188c --- /dev/null +++ b/src/test/regress/expected/fsm.out @@ -0,0 +1,48 @@ +-- +-- Free Space Map test +-- +CREATE TABLE fsm_check_size (num int, str text); +-- With one block, there should be no FSM +INSERT INTO fsm_check_size VALUES(1, 'a'); +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size, +pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; + heap_size | fsm_size +-----------+---------- + 8192 | 0 +(1 row) + +-- Extend table with enough blocks to exceed the FSM threshold +DO $$ +DECLARE curtid tid; +num int; +BEGIN +num = 11; + LOOP + INSERT INTO fsm_check_size VALUES (num, 'b') RETURNING ctid INTO curtid; + EXIT WHEN curtid >= tid '(4, 0)'; + num = num + 1; + END LOOP; +END; +$$; +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; + fsm_size +---------- + 24576 +(1 row) + +-- Add long random string to extend TOAST table to 1 block +INSERT INTO fsm_check_size +VALUES(0, (SELECT string_agg(md5(chr(i)), '') + FROM generate_series(1,100) i)); +VACUUM fsm_check_size; +SELECT pg_relation_size(reltoastrelid, 'main') AS toast_size, +pg_relation_size(reltoastrelid, 'fsm') AS toast_fsm_size +FROM pg_class WHERE relname = 'fsm_check_size'; + toast_size | toast_fsm_size +------------+---------------- + 8192 | 0 +(1 row) + +DROP TABLE fsm_check_size; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index cc0bbf5db9..4051a4ad4e 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -68,6 +68,12 @@ test: create_aggregate create_function_3 create_cast constraints triggers inheri # ---------- test: sanity_check +# ---------- +# fsm does a delete followed by vacuum, and running it in parallel can prevent +# removal of rows. +# ---------- +test: fsm + # ---------- # Believe it or not, select creates a table, subsequent # tests need. diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 0c10c7100c..ac1ea622d6 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -80,6 +80,7 @@ test: roleattributes test: create_am test: hash_func test: sanity_check +test: fsm test: errors test: select test: select_into diff --git a/src/test/regress/sql/fsm.sql b/src/test/regress/sql/fsm.sql new file mode 100644 index 0000000000..332c3e2b2d --- /dev/null +++ b/src/test/regress/sql/fsm.sql @@ -0,0 +1,41 @@ +-- +-- Free Space Map test +-- + +CREATE TABLE fsm_check_size (num int, str text); + +-- With one block, there should be no FSM +INSERT INTO fsm_check_size VALUES(1, 'a'); + +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size, +pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; + +-- Extend table with enough blocks to exceed the FSM threshold +DO $$ +DECLARE curtid tid; +num int; +BEGIN +num = 11; + LOOP + INSERT INTO fsm_check_size VALUES (num, 'b') RETURNING ctid INTO curtid; + EXIT WHEN curtid >= tid '(4, 0)'; + num = num + 1; + END LOOP; +END; +$$; + +VACUUM fsm_check_size; +SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size; + +-- Add long random string to extend TOAST table to 1 block +INSERT INTO fsm_check_size +VALUES(0, (SELECT string_agg(md5(chr(i)), '') + FROM generate_series(1,100) i)); + +VACUUM fsm_check_size; +SELECT pg_relation_size(reltoastrelid, 'main') AS toast_size, +pg_relation_size(reltoastrelid, 'fsm') AS toast_fsm_size +FROM pg_class WHERE relname = 'fsm_check_size'; + +DROP TABLE fsm_check_size;
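
The freespace README hunk above describes the no-FSM local map only in prose (the ANAN/NANA diagrams). What follows is a minimal standalone sketch of that marking and search order, written as plain C against stub types rather than the backend headers; the threshold and status codes match the patch, but local_map_set() and local_map_search() here are simplified stand-ins for the patch's fsm_local_set() and fsm_local_search() (for instance, the real fsm_local_set() also marks the relation's cached target block as unavailable).

/*
 * Standalone sketch of the local-map idea for heaps at or below
 * HEAP_FSM_CREATION_THRESHOLD blocks: mark every other block as a
 * candidate, newest block first, then search in descending block order.
 * Build with: cc -o fsm_sketch fsm_sketch.c
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define HEAP_FSM_CREATION_THRESHOLD 4	/* same value as the patch */
#define FSM_LOCAL_NOT_AVAIL 0x00
#define FSM_LOCAL_AVAIL     0x01

typedef uint32_t BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

static struct
{
	BlockNumber nblocks;
	uint8_t		map[HEAP_FSM_CREATION_THRESHOLD];
} fsm_local_map;

/* Simplified fsm_local_set(): mark the last block, then every other one. */
static void
local_map_set(BlockNumber cur_nblocks)
{
	BlockNumber blkno = cur_nblocks - 1;

	/* stands in for the FSMClearLocalMap() the caller would have done */
	memset(fsm_local_map.map, FSM_LOCAL_NOT_AVAIL, sizeof(fsm_local_map.map));

	for (;;)
	{
		fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL;
		if (blkno >= 2)
			blkno -= 2;
		else
			break;
	}
	fsm_local_map.nblocks = cur_nblocks;
}

/* Simplified fsm_local_search(): try candidate blocks in descending order. */
static BlockNumber
local_map_search(void)
{
	BlockNumber target = fsm_local_map.nblocks;

	do
	{
		target--;
		if (fsm_local_map.map[target] == FSM_LOCAL_AVAIL)
			return target;
	} while (target > 0);

	return InvalidBlockNumber;
}

int
main(void)
{
	/* 3-page heap: the README's "ANAN" case; candidates are blocks 2 and 0 */
	local_map_set(3);
	printf("first candidate in a 3-page heap: %u\n", (unsigned) local_map_search());

	/* 4-page heap: the README's "NANA" case; candidates are blocks 3 and 1 */
	local_map_set(4);
	printf("first candidate in a 4-page heap: %u\n", (unsigned) local_map_search());
	return 0;
}

Trying only every other block keeps the no-FSM path from probing the whole heap on each insertion attempt, which, per the comment added to fsm_local_set(), testing showed could cause a small performance dip relative to having a FSM.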
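
The fsm_allow_writes() function added in freespace.c gates whether a page's free space may be recorded in the FSM at all. Below is a hedged, self-contained sketch of that decision order; is_heap_or_toast, fsm_fork_exists and actual_nblocks are stand-in parameters for what the real code obtains from rel->rd_rel->relkind, smgrexists() and RelationGetNumberOfBlocks(), so treat this as an illustration of the checks, not the backend implementation.

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

#define HEAP_FSM_CREATION_THRESHOLD 4
typedef uint32_t BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

/*
 * Sketch of the fsm_allow_writes() decision: may this page's free space
 * be recorded in the FSM, or should the caller fall back to the local map?
 */
static bool
fsm_allow_writes_sketch(bool is_heap_or_toast,		/* RELKIND_RELATION/TOASTVALUE? */
						BlockNumber heapblk,		/* block being recorded */
						BlockNumber nblocks,		/* caller-supplied size, or Invalid */
						BlockNumber relpages,		/* possibly stale pg_class hint */
						bool fsm_fork_exists,		/* smgrexists(FSM_FORKNUM) */
						BlockNumber actual_nblocks)	/* what a system call would return */
{
	/* A block number past the threshold implies the heap is large enough. */
	if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
		return true;

	/* Indexes and other non-heap relations always get a FSM. */
	if (!is_heap_or_toast)
		return true;

	/* Trust a caller-supplied size, or the relpages hint, if either is large. */
	if (nblocks != InvalidBlockNumber && nblocks > HEAP_FSM_CREATION_THRESHOLD)
		return true;
	if (nblocks == InvalidBlockNumber &&
		relpages != InvalidBlockNumber &&
		relpages > HEAP_FSM_CREATION_THRESHOLD)
		return true;

	/* An FSM fork that already exists may always be written to. */
	if (fsm_fork_exists)
		return true;

	/* Last resort: measure the relation (a system call in the real code). */
	if (nblocks == InvalidBlockNumber)
		return actual_nblocks > HEAP_FSM_CREATION_THRESHOLD;

	return false;
}

int
main(void)
{
	/* A 3-block heap with no FSM fork stays FSM-less: prints 0. */
	printf("%d\n", fsm_allow_writes_sketch(true, 1, 3, InvalidBlockNumber,
										   false, 3));

	/* Recording block 10 always allows (and implies) a FSM: prints 1. */
	printf("%d\n", fsm_allow_writes_sketch(true, 10, InvalidBlockNumber,
										   InvalidBlockNumber, false, 11));
	return 0;
}

The ordering of the checks is the point: the cheap tests (block number, relkind, caller-supplied size, relpages hint) come before the smgrexists() probe, and the RelationGetNumberOfBlocks() system call is kept as the last resort, matching the XXX comment in the patch about letting callers avoid unnecessary system calls.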