From 15c121b3ed7eb2f290e19533e41ccca734d23574 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 30 Sep 2008 10:52:14 +0000 Subject: [PATCH] Rewrite the FSM. Instead of relying on a fixed-size shared memory segment, the free space information is stored in a dedicated FSM relation fork, with each relation (except for hash indexes; they don't use FSM). This eliminates the max_fsm_relations and max_fsm_pages GUC options; remove any trace of them from the backend, initdb, and documentation. Rewrite contrib/pg_freespacemap to match the new FSM implementation. Also introduce a new variant of the get_raw_page(regclass, int4, int4) function in contrib/pageinspect that let's you to return pages from any relation fork, and a new fsm_page_contents() function to inspect the new FSM pages. --- contrib/pageinspect/Makefile | 4 +- contrib/pageinspect/fsmfuncs.c | 61 + contrib/pageinspect/pageinspect.sql.in | 17 +- contrib/pageinspect/rawpage.c | 12 +- .../pg_freespacemap/pg_freespacemap.sql.in | 48 +- doc/src/sgml/acronyms.sgml | 4 +- doc/src/sgml/config.sgml | 76 +- doc/src/sgml/pageinspect.sgml | 49 +- doc/src/sgml/pgfreespacemap.sgml | 281 +- doc/src/sgml/ref/vacuum.sgml | 8 +- doc/src/sgml/release.sgml | 5 +- doc/src/sgml/runtime.sgml | 12 +- doc/src/sgml/storage.sgml | 51 +- src/backend/access/gin/gininsert.c | 6 +- src/backend/access/gin/ginutil.c | 5 +- src/backend/access/gin/ginvacuum.c | 35 +- src/backend/access/gist/gist.c | 6 +- src/backend/access/gist/gistutil.c | 5 +- src/backend/access/gist/gistvacuum.c | 37 +- src/backend/access/heap/heapam.c | 6 +- src/backend/access/heap/hio.c | 7 +- src/backend/access/nbtree/nbtpage.c | 5 +- src/backend/access/nbtree/nbtree.c | 80 +- src/backend/access/nbtree/nbtsort.c | 4 +- src/backend/access/transam/rmgr.c | 5 +- src/backend/access/transam/xlogutils.c | 3 +- src/backend/bootstrap/bootstrap.c | 4 +- src/backend/catalog/heap.c | 24 +- src/backend/catalog/index.c | 19 +- src/backend/commands/dbcommands.c | 11 +- 
src/backend/commands/vacuum.c | 47 +- src/backend/commands/vacuumlazy.c | 283 +- src/backend/postmaster/bgwriter.c | 4 +- src/backend/storage/freespace/Makefile | 4 +- src/backend/storage/freespace/README | 195 ++ src/backend/storage/freespace/freespace.c | 2308 +++++------------ src/backend/storage/freespace/fsmpage.c | 352 +++ src/backend/storage/freespace/indexfsm.c | 92 + src/backend/storage/ipc/ipci.c | 9 +- src/backend/storage/smgr/smgr.c | 24 +- src/backend/tcop/postgres.c | 10 +- src/backend/utils/cache/relcache.c | 10 +- src/backend/utils/misc/guc.c | 22 +- src/backend/utils/misc/postgresql.conf.sample | 7 - src/bin/initdb/initdb.c | 29 +- src/include/access/rmgr.h | 3 +- src/include/storage/freespace.h | 150 +- src/include/storage/fsm_internals.h | 73 + src/include/storage/indexfsm.h | 27 + src/include/storage/lwlock.h | 8 +- src/include/storage/relfilenode.h | 9 +- src/include/utils/guc_tables.h | 3 +- src/include/utils/rel.h | 5 +- 53 files changed, 1844 insertions(+), 2720 deletions(-) create mode 100644 contrib/pageinspect/fsmfuncs.c create mode 100644 src/backend/storage/freespace/README create mode 100644 src/backend/storage/freespace/fsmpage.c create mode 100644 src/backend/storage/freespace/indexfsm.c create mode 100644 src/include/storage/fsm_internals.h create mode 100644 src/include/storage/indexfsm.h diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile index 63da705215..3a6b729c17 100644 --- a/contrib/pageinspect/Makefile +++ b/contrib/pageinspect/Makefile @@ -2,12 +2,12 @@ # # pageinspect Makefile # -# $PostgreSQL: pgsql/contrib/pageinspect/Makefile,v 1.3 2007/11/10 23:59:51 momjian Exp $ +# $PostgreSQL: pgsql/contrib/pageinspect/Makefile,v 1.4 2008/09/30 10:52:09 heikki Exp $ # #------------------------------------------------------------------------- MODULE_big = pageinspect -OBJS = rawpage.o heapfuncs.o btreefuncs.o +OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o DATA_built = pageinspect.sql DATA = 
uninstall_pageinspect.sql
diff --git a/contrib/pageinspect/fsmfuncs.c b/contrib/pageinspect/fsmfuncs.c
new file mode 100644
index 0000000000..fb522e5ff5
--- /dev/null
+++ b/contrib/pageinspect/fsmfuncs.c
@@ -0,0 +1,77 @@
+/*-------------------------------------------------------------------------
+ *
+ * fsmfuncs.c
+ *	  Functions to investigate FSM pages
+ *
+ * These functions are restricted to superusers for the fear of introducing
+ * security holes if the input checking isn't as water-tight as it should.
+ * You'd need to be superuser to obtain a raw page image anyway, so
+ * there's hardly any use case for using these without superuser-rights
+ * anyway.
+ *
+ * Copyright (c) 2007-2008, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/pageinspect/fsmfuncs.c,v 1.1 2008/09/30 10:52:09 heikki Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "lib/stringinfo.h"
+#include "storage/fsm_internals.h"
+#include "utils/builtins.h"
+#include "miscadmin.h"
+#include "funcapi.h"
+
+Datum		fsm_page_contents(PG_FUNCTION_ARGS);
+
+/*
+ * Dumps the contents of a FSM page.
+ *
+ * The argument is a raw page image (bytea), such as the one get_raw_page()
+ * returns.  The result is a text value holding one "index: value" line for
+ * every non-zero node on the page, followed by an "fp_next_slot: value" line.
+ */
+PG_FUNCTION_INFO_V1(fsm_page_contents);
+
+Datum
+fsm_page_contents(PG_FUNCTION_ARGS)
+{
+	bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
+	int			raw_page_size;
+	StringInfoData sinfo;
+	FSMPage		fsmpage;
+	int			i;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 (errmsg("must be superuser to use raw page functions"))));
+
+	raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
+
+	/*
+	 * A raw page image is BLCKSZ bytes long.  Reject anything shorter, so
+	 * that we never examine page contents beyond the end of the supplied
+	 * datum.
+	 */
+	if (raw_page_size < BLCKSZ)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("input page too small (%d bytes)", raw_page_size)));
+
+	fsmpage = (FSMPage) PageGetContents(VARDATA(raw_page));
+
+	initStringInfo(&sinfo);
+
+	/* Print only the nodes that carry a non-zero value */
+	for (i = 0; i < NodesPerPage; i++)
+	{
+		if (fsmpage->fp_nodes[i] != 0)
+			appendStringInfo(&sinfo, "%d: %d\n", i, fsmpage->fp_nodes[i]);
+	}
+	appendStringInfo(&sinfo, "fp_next_slot: %d\n", fsmpage->fp_next_slot);
+
+	PG_RETURN_TEXT_P(cstring_to_text(sinfo.data));
+}
diff --git a/contrib/pageinspect/pageinspect.sql.in b/contrib/pageinspect/pageinspect.sql.in
index 1af59f70f4..49fea9eb51 100644
--- a/contrib/pageinspect/pageinspect.sql.in
+++ b/contrib/pageinspect/pageinspect.sql.in
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/pageinspect/pageinspect.sql.in,v 1.4 2007/11/13 04:24:28 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/pageinspect/pageinspect.sql.in,v 1.5 2008/09/30 10:52:09 heikki Exp $ */

 -- Adjust this setting to control where the objects get created.
SET search_path = public; @@ -6,11 +6,16 @@ SET search_path = public; -- -- get_raw_page() -- -CREATE OR REPLACE FUNCTION get_raw_page(text, int4) +CREATE OR REPLACE FUNCTION get_raw_page(text, int4, int4) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION get_raw_page(text, int4) +RETURNS bytea +AS $$ SELECT get_raw_page($1, 0, $2); $$ +LANGUAGE SQL STRICT; + -- -- page_header() -- @@ -92,3 +97,11 @@ CREATE OR REPLACE FUNCTION bt_page_items(IN relname text, IN blkno int4, RETURNS SETOF record AS 'MODULE_PATHNAME', 'bt_page_items' LANGUAGE C STRICT; + +-- +-- fsm_page_contents() +-- +CREATE OR REPLACE FUNCTION fsm_page_contents(IN page bytea) +RETURNS text +AS 'MODULE_PATHNAME', 'fsm_page_contents' +LANGUAGE C STRICT; diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c index 0bc6bdc017..51c6ee179f 100644 --- a/contrib/pageinspect/rawpage.c +++ b/contrib/pageinspect/rawpage.c @@ -8,7 +8,7 @@ * Copyright (c) 2007-2008, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/contrib/pageinspect/rawpage.c,v 1.6 2008/05/12 00:00:43 alvherre Exp $ + * $PostgreSQL: pgsql/contrib/pageinspect/rawpage.c,v 1.7 2008/09/30 10:52:09 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -41,7 +41,8 @@ Datum get_raw_page(PG_FUNCTION_ARGS) { text *relname = PG_GETARG_TEXT_P(0); - uint32 blkno = PG_GETARG_UINT32(1); + uint32 forknum = PG_GETARG_UINT32(1); + uint32 blkno = PG_GETARG_UINT32(2); Relation rel; RangeVar *relrv; @@ -54,6 +55,11 @@ get_raw_page(PG_FUNCTION_ARGS) (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw functions")))); + if (forknum > MAX_FORKNUM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid fork number"))); + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); rel = relation_openrv(relrv, AccessShareLock); @@ -80,7 +86,7 @@ 
get_raw_page(PG_FUNCTION_ARGS) /* Take a verbatim copy of the page */ - buf = ReadBuffer(rel, blkno); + buf = ReadBufferWithFork(rel, forknum, blkno); LockBuffer(buf, BUFFER_LOCK_SHARE); memcpy(raw_page_data, BufferGetPage(buf), BLCKSZ); diff --git a/contrib/pg_freespacemap/pg_freespacemap.sql.in b/contrib/pg_freespacemap/pg_freespacemap.sql.in index e950d9a112..0ab5e1d1ea 100644 --- a/contrib/pg_freespacemap/pg_freespacemap.sql.in +++ b/contrib/pg_freespacemap/pg_freespacemap.sql.in @@ -1,44 +1,26 @@ -/* $PostgreSQL: pgsql/contrib/pg_freespacemap/pg_freespacemap.sql.in,v 1.8 2007/11/13 04:24:28 momjian Exp $ */ +/* $PostgreSQL: pgsql/contrib/pg_freespacemap/pg_freespacemap.sql.in,v 1.9 2008/09/30 10:52:09 heikki Exp $ */ -- Adjust this setting to control where the objects get created. SET search_path = public; --- Register the functions. -CREATE OR REPLACE FUNCTION pg_freespacemap_pages() -RETURNS SETOF RECORD -AS 'MODULE_PATHNAME', 'pg_freespacemap_pages' +-- Register the C function. +CREATE OR REPLACE FUNCTION pg_freespace(regclass, int4) +RETURNS int2 +AS 'MODULE_PATHNAME', 'pg_freespace' LANGUAGE C; -CREATE OR REPLACE FUNCTION pg_freespacemap_relations() +-- pg_freespace shows the recorded space avail at each block in a relation +CREATE OR REPLACE FUNCTION + pg_freespace(rel regclass, blkno OUT int4, avail OUT int2) RETURNS SETOF RECORD -AS 'MODULE_PATHNAME', 'pg_freespacemap_relations' -LANGUAGE C; +AS $$ + SELECT blkno::int4, pg_freespace($1, blkno::int4) AS avail + FROM generate_series(0, pg_relation_size($1) / current_setting('block_size')::bigint - 1) AS blkno; +$$ +LANGUAGE SQL; --- Create views for convenient access. 
-CREATE VIEW pg_freespacemap_pages AS - SELECT P.* FROM pg_freespacemap_pages() AS P - (reltablespace oid, - reldatabase oid, - relfilenode oid, - relblocknumber bigint, - bytes integer); - -CREATE VIEW pg_freespacemap_relations AS - SELECT P.* FROM pg_freespacemap_relations() AS P - (reltablespace oid, - reldatabase oid, - relfilenode oid, - avgrequest integer, - interestingpages integer, - storedpages integer, - nextpage integer); - - -- Don't want these to be available to public. -REVOKE ALL ON FUNCTION pg_freespacemap_pages() FROM PUBLIC; -REVOKE ALL ON pg_freespacemap_pages FROM PUBLIC; - -REVOKE ALL ON FUNCTION pg_freespacemap_relations() FROM PUBLIC; -REVOKE ALL ON pg_freespacemap_relations FROM PUBLIC; +REVOKE ALL ON FUNCTION pg_freespace(regclass, int4) FROM PUBLIC; +REVOKE ALL ON FUNCTION pg_freespace(regclass) FROM PUBLIC; diff --git a/doc/src/sgml/acronyms.sgml b/doc/src/sgml/acronyms.sgml index c7c5f865d9..82d70de730 100644 --- a/doc/src/sgml/acronyms.sgml +++ b/doc/src/sgml/acronyms.sgml @@ -1,4 +1,4 @@ - + Acronyms @@ -216,7 +216,7 @@ FSM - Free Space Map + Free Space Map diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 9d33918a3e..dfb976c473 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1,4 +1,4 @@ - + Server Configuration @@ -896,80 +896,6 @@ SET ENABLE_SEQSCAN TO OFF; - - - Free Space Map - - - free space map - - - - These parameters control the size of the shared free space - map (FSM), which tracks the locations of unused space in the database. - An undersized free space map can cause the database to consume - increasing amounts of disk space over time, because free space that - is not in the map cannot be re-used; instead PostgreSQL - will request more disk space from the operating system when it needs - to store new data. - The last few lines displayed by a database-wide VACUUM VERBOSE - command can help in determining if the current settings are adequate. 
- A NOTICE message is also printed during such an operation - if the current settings are too low. - - - - Increasing these parameters might cause PostgreSQL - to request more System V shared - memory than your operating system's default configuration - allows. See for information on how to - adjust those parameters, if necessary. - - - - - max_fsm_pages (integer) - - max_fsm_pages configuration parameter - - - - Sets the maximum number of disk pages for which free space will - be tracked in the shared free-space map. Six bytes of shared memory - are consumed for each page slot. This setting must be at least - 16 * max_fsm_relations. The default is chosen - by initdb depending on the amount of available memory, - and can range from 20k to 200k pages. - This parameter can only be set at server start. - - - - - - max_fsm_relations (integer) - - max_fsm_relations configuration parameter - - - - Sets the maximum number of relations (tables and indexes) for which - free space will be tracked in the shared free-space map. Roughly - seventy bytes of shared memory are consumed for each slot. - The default is one thousand relations. - This parameter can only be set at server start. - - - - - - - - - See the - command for information on setting this parameter. - - - diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index e398733d01..94249399e1 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -1,4 +1,4 @@ - + pageinspect @@ -19,7 +19,7 @@ - get_raw_page(text, int) returns bytea + get_raw_page(relname text, forknum int, blkno int) returns bytea @@ -27,13 +27,28 @@ get_raw_page reads the specified block of the named table and returns a copy as a bytea value. This allows a single time-consistent copy of the block to be obtained. + forknum should be 0 for the main data fork, or 1 for + the FSM. 
- page_header(bytea) returns record + get_raw_page(relname text, blkno int) returns bytea + + + + + A shorthand of above, for reading from the main fork. Equal to + get_raw_page(relname, 0, blkno) + + + + + + + page_header(page bytea) returns record @@ -63,7 +78,7 @@ test=# SELECT * FROM page_header(get_raw_page('pg_class', 0)); - heap_page_items(bytea) returns setof record + heap_page_items(page bytea) returns setof record @@ -90,7 +105,7 @@ test=# SELECT * FROM heap_page_items(get_raw_page('pg_class', 0)); - bt_metap(text) returns record + bt_metap(relname text) returns record @@ -113,7 +128,7 @@ fastlevel | 0 - bt_page_stats(text, int) returns record + bt_page_stats(relname text, blkno int) returns record @@ -141,7 +156,7 @@ btpo_flags | 3 - bt_page_items(text, int) returns setof record + bt_page_items(relname text, blkno int) returns setof record @@ -164,6 +179,26 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1); + + + + fsm_page_contents(page bytea) returns text + + + + + fsm_page_contents shows the internal node structure + of a FSM page. The output is a multi-line string, with one line per + node in the binary tree within the page. Only those nodes that are not + zero are printed. The so-called "next" pointer, which points to the + next slot to be returned from the page, is also printed. + + + See src/backend/storage/freespace/README for more + information on the structure of an FSM page. + + + diff --git a/doc/src/sgml/pgfreespacemap.sgml b/doc/src/sgml/pgfreespacemap.sgml index bc821ead6b..3d749a953d 100644 --- a/doc/src/sgml/pgfreespacemap.sgml +++ b/doc/src/sgml/pgfreespacemap.sgml @@ -1,4 +1,4 @@ - + pg_freespacemap @@ -9,183 +9,66 @@ The pg_freespacemap module provides a means for examining the - free space map (FSM). 
It provides two C functions: - pg_freespacemap_relations and - pg_freespacemap_pages that each return a set of - records, plus two views pg_freespacemap_relations - and pg_freespacemap_pages that wrap the functions - for convenient use. + free space map (FSM). It provides a function called + pg_freespace, or two overloaded functions, to be + precise. The functions show the value recorded in the free space map for + a given page, or for all pages in the relation. - By default public access is revoked from the functions and views, just in - case there are security issues lurking. + By default public access is revoked from the functions, just in case + there are security issues lurking. - The <filename>pg_freespacemap</> views + Functions + + + + + pg_freespace(rel regclass IN, blkno int4 IN) returns int2 + + + + + Returns the amount of free space on the page of the relation + specified by blkno, according to the + FSM. + + + + + + + + pg_freespace(rel regclass IN, blkno OUT int4, avail OUT int2) + + + + + Displays the amount of free space on each page of the relation, + according to the FSM. A set of (blkno int4, avail int2) + tuples is returned, one tuple for each page in the relation. + + + + - The definitions of the columns exposed by the views are: - - - - <structname>pg_freespacemap_relations</> Columns - - - - - Name - Type - References - Description - - - - - - reltablespace - oid - pg_tablespace.oid - Tablespace OID of the relation - - - reldatabase - oid - pg_database.oid - Database OID of the relation - - - relfilenode - oid - pg_class.relfilenode - Relfilenode of the relation - - - avgrequest - integer - - Moving average of free space requests (NULL for indexes) - - - interestingpages - integer - - Count of pages last reported as containing useful free space - - - storedpages - integer - - Count of pages actually stored in free space map - - - nextpage - integer - - Page index (from 0) to start next search at - - - - -
- - - <structname>pg_freespacemap_pages</> Columns - - - - - Name - Type - References - Description - - - - - - reltablespace - oid - pg_tablespace.oid - Tablespace OID of the relation - - - reldatabase - oid - pg_database.oid - Database OID of the relation - - - relfilenode - oid - pg_class.relfilenode - Relfilenode of the relation - - - relblocknumber - bigint - - Page number within the relation - - - bytes - integer - - Free bytes in the page, or NULL for an index page (see below) - - - - -
- - - For pg_freespacemap_relations, there is one row - for each relation in the free space map. - storedpages is the number of pages actually - stored in the map, while interestingpages is the - number of pages the last VACUUM thought had useful amounts of - free space. - - - - If storedpages is consistently less than - interestingpages then it'd be a good idea to increase - max_fsm_pages. Also, if the number of rows in - pg_freespacemap_relations is close to - max_fsm_relations, then you should consider increasing - max_fsm_relations. - - - - For pg_freespacemap_pages, there is one row for - each page in the free space map. The number of rows for a relation will - match the storedpages column in - pg_freespacemap_relations. + The values stored in the free space map are not exact. They're rounded + to precision of 1/256th of BLCKSZ (32 bytes with default BLCKSZ), and + they're not kept fully up-to-date as tuples are inserted and updated. For indexes, what is tracked is entirely-unused pages, rather than free - space within pages. Therefore, the average request size and free bytes - within a page are not meaningful, and are shown as NULL. + space within pages. Therefore, the values are not meaningful, just + whether a page is full or empty. - Because the map is shared by all the databases, there will normally be - entries for relations not belonging to the current database. This means - that there may not be matching join rows in pg_class for - some rows, or that there could even be incorrect joins. If you are - trying to join against pg_class, it's a good idea to - restrict the join to rows having reldatabase equal to - the current database's OID or zero. - - - - When either of the views is accessed, internal free space map locks are - taken for long enough to copy all the state data that the view will display. - This ensures that the views produce a consistent set of results, while not - blocking normal activity longer than necessary. 
Nonetheless there - could be some impact on database performance if they are read often. + NOTE: The interface was changed in version 8.4, to reflect the new FSM + implementation introduced in the same version.
@@ -193,45 +76,37 @@ Sample output -regression=# SELECT c.relname, r.avgrequest, r.interestingpages, r.storedpages - FROM pg_freespacemap_relations r INNER JOIN pg_class c - ON r.relfilenode = c.relfilenode AND - r.reldatabase IN (0, (SELECT oid FROM pg_database - WHERE datname = current_database())) - ORDER BY r.storedpages DESC LIMIT 10; - relname | avgrequest | interestingpages | storedpages ----------------------------------+------------+------------------+------------- - onek | 256 | 109 | 109 - pg_attribute | 167 | 93 | 93 - pg_class | 191 | 49 | 49 - pg_attribute_relid_attnam_index | | 48 | 48 - onek2 | 256 | 37 | 37 - pg_depend | 95 | 26 | 26 - pg_type | 199 | 16 | 16 - pg_rewrite | 1011 | 13 | 13 - pg_class_relname_nsp_index | | 10 | 10 - pg_proc | 302 | 8 | 8 -(10 rows) +postgres=# SELECT * FROM pg_freespace('foo'); + blkno | avail +-------+------- + 0 | 0 + 1 | 0 + 2 | 0 + 3 | 32 + 4 | 704 + 5 | 704 + 6 | 704 + 7 | 1216 + 8 | 704 + 9 | 704 + 10 | 704 + 11 | 704 + 12 | 704 + 13 | 704 + 14 | 704 + 15 | 704 + 16 | 704 + 17 | 704 + 18 | 704 + 19 | 3648 +(20 rows) + +postgres=# SELECT * FROM pg_freespace('foo', 7); + pg_freespace +-------------- + 1216 +(1 row) -regression=# SELECT c.relname, p.relblocknumber, p.bytes - FROM pg_freespacemap_pages p INNER JOIN pg_class c - ON p.relfilenode = c.relfilenode AND - p.reldatabase IN (0, (SELECT oid FROM pg_database - WHERE datname = current_database())) - ORDER BY c.relname LIMIT 10; - relname | relblocknumber | bytes ---------------+----------------+------- - a_star | 0 | 8040 - abstime_tbl | 0 | 7908 - aggtest | 0 | 8008 - altinhoid | 0 | 8128 - altstartwith | 0 | 8128 - arrtest | 0 | 7172 - b_star | 0 | 7976 - box_tbl | 0 | 7912 - bt_f8_heap | 54 | 7728 - bt_i4_heap | 49 | 8008 -(10 rows) @@ -239,7 +114,9 @@ regression=# SELECT c.relname, p.relblocknumber, p.bytes Author - Mark Kirkwood markir@paradise.net.nz + Original version by Mark Kirkwood markir@paradise.net.nz. 
+ Rewritten in version 8.4 to suit new FSM implementation by Heikki + Linnakangas heikki@enterprisedb.com diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml index 082473c069..0568fd4eeb 100644 --- a/doc/src/sgml/ref/vacuum.sgml +++ b/doc/src/sgml/ref/vacuum.sgml @@ -1,5 +1,5 @@ @@ -96,11 +96,7 @@ VACUUM [ FULL ] [ FREEZE ] [ VERBOSE ] ANALYZE [ VERBOSE - Prints a detailed vacuum activity report for each table. Can be used - to help determine appropriate settings for - , - , and - . + Prints a detailed vacuum activity report for each table. diff --git a/doc/src/sgml/release.sgml b/doc/src/sgml/release.sgml index 6f3daac2b8..eea942d1a2 100644 --- a/doc/src/sgml/release.sgml +++ b/doc/src/sgml/release.sgml @@ -1,4 +1,4 @@ - + + Operating System Environment @@ -1117,16 +1117,6 @@ set semsys:seminfo_semmsl=32 8200 (assuming 8 kB XLOG_BLCKSZ) - - - 70 - - - - - 6 - - Fixed space requirements 770 kB diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml index e564fd2be9..51f8a2fe16 100644 --- a/doc/src/sgml/storage.sgml +++ b/doc/src/sgml/storage.sgml @@ -1,4 +1,4 @@ - + @@ -130,7 +130,12 @@ there. Each table and index is stored in a separate file, named after the table or index's filenode number, which can be found in -pg_class.relfilenode. +pg_class.relfilenode. In addition to the +main file (aka. main fork), a free space map (see +) that stores information about free space +available in the relation, is stored in a file named after the filenode +number, with the _1 suffix. For example, if the table's filenode number +is 12345, the FSM file is named 12345_1. @@ -367,6 +372,48 @@ comparison table, in which all the HTML pages were cut down to 7 kB to fit.
+ + +Free Space Map + + + Free Space Map + + FSMFree Space Map + + +A Free Space Map is stored with every heap and index relation, except for +hash indexes, to keep track of available space in the relation. It's stored +alongside the main relation data, in a separate FSM relation fork, named after +relfilenode of the relation, but with a _1 suffix. For example, +if the relfilenode of a relation is 12345, the FSM is stored in a file called +12345_1, in the same directory as the main relation file. + + + +The Free Space Map is organized as a tree of FSM pages. The +bottom level FSM pages store the free space available on every +heap (or index) page, using one byte to represent each heap page. The upper +levels aggregate information from the lower levels. + + + +Within each FSM page is a binary tree, stored in an array with +one byte per node. Each leaf node represents a heap page, or a lower level +FSM page. In each non-leaf node, the higher of its children's +values is stored. The maximum value in the leaf nodes is therefore stored +at the root. + + + +See src/backend/storage/freespace/README for more details on +how the FSM is structured, and how it's updated and searched. +The pg_freespacemap contrib module can be used to view the +information stored in free space maps. 
+ + + + Database Page Layout diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index ac35069d7f..64099cd1e5 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.14 2008/07/11 21:06:29 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.15 2008/09/30 10:52:10 heikki Exp $ *------------------------------------------------------------------------- */ @@ -19,6 +19,7 @@ #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" +#include "storage/indexfsm.h" #include "utils/memutils.h" @@ -283,6 +284,9 @@ ginbuild(PG_FUNCTION_ARGS) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + /* Initialize FSM */ + InitIndexFreeSpaceMap(index); + initGinState(&buildstate.ginstate, index); /* initialize the root page */ diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 86b2650c75..587add92e9 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginutil.c,v 1.16 2008/07/11 21:06:29 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginutil.c,v 1.17 2008/09/30 10:52:10 heikki Exp $ *------------------------------------------------------------------------- */ @@ -19,6 +19,7 @@ #include "catalog/pg_type.h" #include "storage/bufmgr.h" #include "storage/freespace.h" +#include "storage/indexfsm.h" #include "storage/lmgr.h" void @@ -151,7 +152,7 @@ GinNewBuffer(Relation index) /* First, try to get a page from FSM */ for (;;) { - BlockNumber blkno = GetFreeIndexPage(&index->rd_node); + BlockNumber blkno = GetFreeIndexPage(index); if (blkno == InvalidBlockNumber) 
break; diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 249f612dd1..c3e6f4e6f2 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.21 2008/07/11 21:06:29 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.22 2008/09/30 10:52:10 heikki Exp $ *------------------------------------------------------------------------- */ @@ -20,6 +20,7 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/freespace.h" +#include "storage/indexfsm.h" #include "storage/lmgr.h" typedef struct @@ -678,10 +679,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) bool needLock; BlockNumber npages, blkno; - BlockNumber totFreePages, - nFreePages, - *freePages, - maxFreePages; + BlockNumber totFreePages; BlockNumber lastBlock = GIN_ROOT_BLKNO, lastFilledBlock = GIN_ROOT_BLKNO; @@ -711,12 +709,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) if (needLock) UnlockRelationForExtension(index, ExclusiveLock); - maxFreePages = npages; - if (maxFreePages > MaxFSMPages) - maxFreePages = MaxFSMPages; - - totFreePages = nFreePages = 0; - freePages = (BlockNumber *) palloc(sizeof(BlockNumber) * maxFreePages); + totFreePages = 0; for (blkno = GIN_ROOT_BLKNO + 1; blkno < npages; blkno++) { @@ -731,8 +724,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) if (GinPageIsDeleted(page)) { - if (nFreePages < maxFreePages) - freePages[nFreePages++] = blkno; + RecordFreeIndexPage(index, blkno); totFreePages++; } else @@ -742,25 +734,16 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) } lastBlock = npages - 1; - if (info->vacuum_full && nFreePages > 0) + if (info->vacuum_full && lastBlock > lastFilledBlock) { /* try to truncate index */ - int i; - - for (i = 0; i < nFreePages; i++) - if (freePages[i] >= lastFilledBlock) - { - totFreePages = nFreePages = i; - break; - } - - if 
(lastBlock > lastFilledBlock) - RelationTruncate(index, lastFilledBlock + 1); + FreeSpaceMapTruncateRel(index, lastFilledBlock + 1); + RelationTruncate(index, lastFilledBlock + 1); stats->pages_removed = lastBlock - lastFilledBlock; + totFreePages = totFreePages - stats->pages_removed; } - RecordIndexFreeSpace(&index->rd_node, totFreePages, nFreePages, freePages); stats->pages_free = totFreePages; if (needLock) diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 7dd981a490..2f75c3fa2a 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.151 2008/06/12 09:12:29 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.152 2008/09/30 10:52:10 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -19,6 +19,7 @@ #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" +#include "storage/indexfsm.h" #include "utils/memutils.h" const XLogRecPtr XLogRecPtrForTemp = {1, 1}; @@ -102,6 +103,9 @@ gistbuild(PG_FUNCTION_ARGS) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + /* Initialize FSM */ + InitIndexFreeSpaceMap(index); + /* no locking is needed */ initGISTstate(&buildstate.giststate, index); diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 248ec25956..22f50c19ac 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.30 2008/07/13 20:45:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.31 2008/09/30 10:52:10 heikki Exp $ 
*------------------------------------------------------------------------- */ #include "postgres.h" @@ -16,6 +16,7 @@ #include "access/gist_private.h" #include "access/reloptions.h" #include "storage/freespace.h" +#include "storage/indexfsm.h" #include "storage/lmgr.h" #include "storage/bufmgr.h" #include "utils/rel.h" @@ -617,7 +618,7 @@ gistNewBuffer(Relation r) /* First, try to get a page from FSM */ for (;;) { - BlockNumber blkno = GetFreeIndexPage(&r->rd_node); + BlockNumber blkno = GetFreeIndexPage(r); if (blkno == InvalidBlockNumber) break; /* nothing left in FSM */ diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index d929962af2..b545922ccc 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.36 2008/06/12 09:12:30 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.37 2008/09/30 10:52:10 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -20,6 +20,7 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/freespace.h" +#include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/memutils.h" @@ -518,10 +519,7 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) Relation rel = info->index; BlockNumber npages, blkno; - BlockNumber totFreePages, - nFreePages, - *freePages, - maxFreePages; + BlockNumber totFreePages; BlockNumber lastBlock = GIST_ROOT_BLKNO, lastFilledBlock = GIST_ROOT_BLKNO; bool needLock; @@ -589,13 +587,7 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); - maxFreePages = npages; - if (maxFreePages > MaxFSMPages) - maxFreePages = MaxFSMPages; - - totFreePages = nFreePages = 0; - freePages = (BlockNumber *) palloc(sizeof(BlockNumber) * maxFreePages); - + totFreePages 
= 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; @@ -609,9 +601,8 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) if (PageIsNew(page) || GistPageIsDeleted(page)) { - if (nFreePages < maxFreePages) - freePages[nFreePages++] = blkno; totFreePages++; + RecordFreeIndexPage(rel, blkno); } else lastFilledBlock = blkno; @@ -619,25 +610,15 @@ gistvacuumcleanup(PG_FUNCTION_ARGS) } lastBlock = npages - 1; - if (info->vacuum_full && nFreePages > 0) + if (info->vacuum_full && lastFilledBlock < lastBlock) { /* try to truncate index */ - int i; + FreeSpaceMapTruncateRel(rel, lastFilledBlock + 1); + RelationTruncate(rel, lastFilledBlock + 1); - for (i = 0; i < nFreePages; i++) - if (freePages[i] >= lastFilledBlock) - { - totFreePages = nFreePages = i; - break; - } - - if (lastBlock > lastFilledBlock) - RelationTruncate(rel, lastFilledBlock + 1); stats->std.pages_removed = lastBlock - lastFilledBlock; + totFreePages = totFreePages - stats->std.pages_removed; } - RecordIndexFreeSpace(&rel->rd_node, totFreePages, nFreePages, freePages); - pfree(freePages); - /* return statistics */ stats->std.pages_free = totFreePages; if (needLock) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index eb9f8701ae..0fd61fe9ce 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.263 2008/09/11 14:01:09 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.264 2008/09/30 10:52:10 heikki Exp $ * * * INTERFACE ROUTINES @@ -4721,6 +4721,9 @@ heap_sync(Relation rel) /* FlushRelationBuffers will have opened rd_smgr */ smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM); + /* sync FSM as well */ + smgrimmedsync(rel->rd_smgr, FSM_FORKNUM); + /* toast heap, if any */ if (OidIsValid(rel->rd_rel->reltoastrelid)) { @@ -4729,6 +4732,7 @@ heap_sync(Relation rel) toastrel = heap_open(rel->rd_rel->reltoastrelid, 
AccessShareLock); FlushRelationBuffers(toastrel); smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM); + smgrimmedsync(toastrel->rd_smgr, FSM_FORKNUM); heap_close(toastrel, AccessShareLock); } } diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 6db80590de..3723977fe0 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.72 2008/07/13 20:45:47 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.73 2008/09/30 10:52:10 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -163,8 +163,7 @@ RelationGetBufferForTuple(Relation relation, Size len, * We have no cached target page, so ask the FSM for an initial * target. */ - targetBlock = GetPageWithFreeSpace(&relation->rd_node, - len + saveFreeSpace); + targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace); /* * If the FSM knows nothing of the rel, try the last page before we @@ -250,7 +249,7 @@ RelationGetBufferForTuple(Relation relation, Size len, * Update FSM as to condition of this page, and ask for another page * to try. */ - targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node, + targetBlock = RecordAndGetPageWithFreeSpace(relation, targetBlock, pageFreeSpace, len + saveFreeSpace); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 2cc5ebe844..8ac9f538fc 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.110 2008/07/13 20:45:47 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.111 2008/09/30 10:52:10 heikki Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. 
The opaque @@ -27,6 +27,7 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/freespace.h" +#include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/inval.h" #include "utils/snapmgr.h" @@ -501,7 +502,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) */ for (;;) { - blkno = GetFreeIndexPage(&rel->rd_node); + blkno = GetFreeIndexPage(rel); if (blkno == InvalidBlockNumber) break; buf = ReadBuffer(rel, blkno); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 64a719f827..abb6bd5c5d 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.161 2008/06/19 00:46:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.162 2008/09/30 10:52:10 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -26,6 +26,7 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/freespace.h" +#include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" #include "utils/memutils.h" @@ -56,9 +57,7 @@ typedef struct IndexBulkDeleteCallback callback; void *callback_state; BTCycleId cycleid; - BlockNumber *freePages; - int nFreePages; /* number of entries in freePages[] */ - int maxFreePages; /* allocated size of freePages[] */ + BlockNumber lastUsedPage; BlockNumber totFreePages; /* true total # of free pages */ MemoryContext pagedelcontext; } BTVacState; @@ -110,6 +109,9 @@ btbuild(PG_FUNCTION_ARGS) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + /* Initialize FSM */ + InitIndexFreeSpaceMap(index); + buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false); /* @@ -623,9 +625,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.callback = 
callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; - vstate.freePages = NULL; /* temporarily */ - vstate.nFreePages = 0; - vstate.maxFreePages = 0; + vstate.lastUsedPage = BTREE_METAPAGE; vstate.totFreePages = 0; /* Create a temporary memory context to run _bt_pagedel in */ @@ -670,17 +670,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); - /* Allocate freePages after we read num_pages the first time */ - if (vstate.freePages == NULL) - { - /* No point in remembering more than MaxFSMPages pages */ - vstate.maxFreePages = MaxFSMPages; - if ((BlockNumber) vstate.maxFreePages > num_pages) - vstate.maxFreePages = (int) num_pages; - vstate.freePages = (BlockNumber *) - palloc(vstate.maxFreePages * sizeof(BlockNumber)); - } - /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; @@ -697,42 +686,22 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * acquiring exclusive lock on the index and then rechecking all the * pages; doesn't seem worth it. */ - if (info->vacuum_full && vstate.nFreePages > 0) + if (info->vacuum_full && vstate.lastUsedPage < num_pages - 1) { - BlockNumber new_pages = num_pages; + BlockNumber new_pages = vstate.lastUsedPage + 1; - while (vstate.nFreePages > 0 && - vstate.freePages[vstate.nFreePages - 1] == new_pages - 1) - { - new_pages--; - stats->pages_deleted--; - vstate.nFreePages--; - vstate.totFreePages = vstate.nFreePages; /* can't be more */ - } - if (new_pages != num_pages) - { - /* - * Okay to truncate. - */ - RelationTruncate(rel, new_pages); + /* + * Okay to truncate. 
+ */ + FreeSpaceMapTruncateRel(rel, new_pages); + RelationTruncate(rel, new_pages); - /* update statistics */ - stats->pages_removed += num_pages - new_pages; - - num_pages = new_pages; - } + /* update statistics */ + stats->pages_removed += num_pages - new_pages; + vstate.totFreePages -= (num_pages - new_pages); + num_pages = new_pages; } - /* - * Update the shared Free Space Map with the info we now have about free - * pages in the index, discarding any old info the map may have. We do not - * need to sort the page numbers; they're in order already. - */ - RecordIndexFreeSpace(&rel->rd_node, vstate.totFreePages, - vstate.nFreePages, vstate.freePages); - - pfree(vstate.freePages); - MemoryContextDelete(vstate.pagedelcontext); /* update statistics */ @@ -788,8 +757,7 @@ restart: /* * If we are recursing, the only case we want to do anything with is a * live leaf page having the current vacuum cycle ID. Any other state - * implies we already saw the page (eg, deleted it as being empty). In - * particular, we don't want to risk adding it to freePages twice. + * implies we already saw the page (eg, deleted it as being empty). 
*/ if (blkno != orig_blkno) { @@ -803,12 +771,15 @@ restart: } } + /* If the page is in use, update lastUsedPage */ + if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno) + vstate->lastUsedPage = blkno; + /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { /* Okay to recycle this page */ - if (vstate->nFreePages < vstate->maxFreePages) - vstate->freePages[vstate->nFreePages++] = blkno; + RecordFreeIndexPage(rel, blkno); vstate->totFreePages++; stats->pages_deleted++; } @@ -944,8 +915,7 @@ restart: */ if (ndel && info->vacuum_full) { - if (vstate->nFreePages < vstate->maxFreePages) - vstate->freePages[vstate->nFreePages++] = blkno; + RecordFreeIndexPage(rel, blkno); vstate->totFreePages++; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 7dcfa10eee..eb1653e2f3 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -52,12 +52,14 @@ * we log the completed index pages to WAL if and only if WAL archiving is * active. * + * This code isn't concerned about the FSM at all. The caller is responsible + * for initializing that. 
* * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.117 2008/08/11 11:05:10 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.118 2008/09/30 10:52:10 heikki Exp $ * *------------------------------------------------------------------------- */ diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 08de22eaa4..7c62ec3854 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -3,7 +3,7 @@ * * Resource managers definition * - * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.25 2006/11/05 22:42:07 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.26 2008/09/30 10:52:11 heikki Exp $ */ #include "postgres.h" @@ -19,6 +19,7 @@ #include "commands/dbcommands.h" #include "commands/sequence.h" #include "commands/tablespace.h" +#include "storage/freespace.h" #include "storage/smgr.h" @@ -30,7 +31,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"Database", dbase_redo, dbase_desc, NULL, NULL, NULL}, {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL}, {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL}, - {"Reserved 7", NULL, NULL, NULL, NULL, NULL}, + {"FreeSpaceMap", fsm_redo, fsm_desc, NULL, NULL, NULL}, {"Reserved 8", NULL, NULL, NULL, NULL, NULL}, {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL}, {"Heap", heap_redo, heap_desc, NULL, NULL, NULL}, diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 59124e349e..9abcce6548 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -11,7 +11,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: 
pgsql/src/backend/access/transam/xlogutils.c,v 1.58 2008/08/11 11:05:10 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.59 2008/09/30 10:52:11 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -359,6 +359,7 @@ CreateFakeRelcacheEntry(RelFileNode rnode) rel->rd_lockInfo.lockRelId.relId = rnode.relNode; rel->rd_targblock = InvalidBlockNumber; + rel->rd_fsm_nblocks_cache = InvalidBlockNumber; rel->rd_smgr = NULL; return rel; diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 0689fb1f1a..04194acd3f 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.245 2008/09/01 20:42:43 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.246 2008/09/30 10:52:11 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -33,7 +33,6 @@ #include "postmaster/bgwriter.h" #include "postmaster/walwriter.h" #include "storage/bufmgr.h" -#include "storage/freespace.h" #include "storage/ipc.h" #include "storage/proc.h" #include "tcop/tcopprot.h" @@ -419,7 +418,6 @@ AuxiliaryProcessMain(int argc, char *argv[]) case StartupProcess: bootstrap_signals(); StartupXLOG(); - LoadFreeSpaceMap(); BuildFlatFiles(false); proc_exit(0); /* startup done */ diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 5b26b91b69..50a2a98bbb 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.339 2008/08/28 23:09:45 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.340 2008/09/30 10:52:12 heikki Exp $ * * * INTERFACE ROUTINES @@ -56,6 +56,7 @@ #include "parser/parse_expr.h" #include "parser/parse_relation.h" 
#include "storage/bufmgr.h" +#include "storage/freespace.h" #include "storage/smgr.h" #include "utils/builtins.h" #include "utils/fmgroids.h" @@ -294,14 +295,22 @@ heap_create(const char *relname, /* * Have the storage manager create the relation's disk file, if needed. * - * We only create storage for the main fork here. The caller is - * responsible for creating any additional forks if needed. + * We create storage for the main fork here, and also for the FSM for a + * heap or toast relation. The caller is responsible for creating any + * additional forks if needed. */ if (create_storage) { Assert(rel->rd_smgr == NULL); RelationOpenSmgr(rel); smgrcreate(rel->rd_smgr, MAIN_FORKNUM, rel->rd_istemp, false); + + /* + * For a real heap, create FSM fork as well. Indexams are + * responsible for creating any extra forks themselves. + */ + if (relkind == RELKIND_RELATION || relkind == RELKIND_TOASTVALUE) + smgrcreate(rel->rd_smgr, FSM_FORKNUM, rel->rd_istemp, false); } return rel; @@ -2256,7 +2265,11 @@ RelationTruncateIndexes(Relation heapRelation) /* Fetch info needed for index_build */ indexInfo = BuildIndexInfo(currentIndex); - /* Now truncate the actual file (and discard buffers) */ + /* + * Now truncate the actual file (and discard buffers). The indexam + * is responsible for truncating the FSM in index_build(), if + * applicable. 
+ */ RelationTruncate(currentIndex, 0); /* Initialize the index and rebuild */ @@ -2310,7 +2323,8 @@ heap_truncate(List *relids) { Relation rel = lfirst(cell); - /* Truncate the actual file (and discard buffers) */ + /* Truncate the FSM and actual file (and discard buffers) */ + FreeSpaceMapTruncateRel(rel, 0); RelationTruncate(rel, 0); /* If this relation has indexes, truncate the indexes too */ diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 301e7d1f2d..e8063476ad 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.304 2008/09/15 18:43:41 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.305 2008/09/30 10:52:12 heikki Exp $ * * * INTERFACE ROUTINES @@ -920,7 +920,7 @@ index_drop(Oid indexId) RelationOpenSmgr(userIndexRelation); for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) if (smgrexists(userIndexRelation->rd_smgr, forknum)) - smgrscheduleunlink(userIndexRelation->rd_smgr, forknum, + smgrscheduleunlink(userIndexRelation->rd_smgr, forknum, userIndexRelation->rd_istemp); RelationCloseSmgr(userIndexRelation); @@ -1322,7 +1322,7 @@ setNewRelfilenode(Relation relation, TransactionId freezeXid) /* * ... and create storage for corresponding forks in the new relfilenode. * - * NOTE: any conflict in relfilenode value will be caught here + * NOTE: any conflict in relfilenode value will be caught here */ newrnode = relation->rd_node; newrnode.relNode = newrelfilenode; @@ -1331,6 +1331,14 @@ setNewRelfilenode(Relation relation, TransactionId freezeXid) /* Create the main fork, like heap_create() does */ smgrcreate(srel, MAIN_FORKNUM, relation->rd_istemp, false); + /* + * For a heap, create FSM fork as well. Indexams are responsible for + * creating any extra forks themselves. 
+ */ + if (relation->rd_rel->relkind == RELKIND_RELATION || + relation->rd_rel->relkind == RELKIND_TOASTVALUE) + smgrcreate(srel, FSM_FORKNUM, relation->rd_istemp, false); + /* schedule unlinking old files */ for (i = 0; i <= MAX_FORKNUM; i++) { @@ -2310,7 +2318,10 @@ reindex_index(Oid indexId) if (inplace) { - /* Truncate the actual file (and discard buffers) */ + /* + * Truncate the actual file (and discard buffers). The indexam + * is responsible for truncating the FSM, if applicable + */ RelationTruncate(iRel, 0); } else diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 29dc0733a7..37c2f45c72 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.212 2008/09/23 10:58:03 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.213 2008/09/30 10:52:12 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -43,7 +43,6 @@ #include "postmaster/bgwriter.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" -#include "storage/freespace.h" #include "storage/ipc.h" #include "storage/procarray.h" #include "storage/smgr.h" @@ -796,11 +795,6 @@ dropdb(const char *dbname, bool missing_ok) */ DropDatabaseBuffers(db_id); - /* - * Also, clean out any entries in the shared free space map. - */ - FreeSpaceMapForgetDatabase(db_id); - /* * Tell the stats collector to forget it immediately, too. 
*/ @@ -1640,9 +1634,6 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record) /* Drop pages for this database that are in the shared buffer cache */ DropDatabaseBuffers(xlrec->db_id); - /* Also, clean out any entries in the shared free space map */ - FreeSpaceMapForgetDatabase(xlrec->db_id); - /* Also, clean out any fsync requests that might be pending in md.c */ ForgetDatabaseFsyncRequests(xlrec->db_id); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index af7b6646d2..925a8d8abd 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.377 2008/09/11 14:01:09 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.378 2008/09/30 10:52:12 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -505,14 +505,6 @@ vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast, * (autovacuum.c does this for itself.) */ vac_update_datfrozenxid(); - - /* - * If it was a database-wide VACUUM, print FSM usage statistics (we - * don't make you be superuser to see these). We suppress this in - * autovacuum, too. 
- */ - if (all_rels) - PrintFreeSpaceMapStatistics(elevel); } /* @@ -1272,8 +1264,9 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) } } - /* update shared free space map with final free space info */ + /* update the free space map with final free space info, and vacuum it */ vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages); + FreeSpaceMapVacuum(onerel); /* update statistics in pg_class */ vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages, @@ -2849,6 +2842,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, /* Truncate relation, if needed */ if (blkno < nblocks) { + FreeSpaceMapTruncateRel(onerel, blkno); RelationTruncate(onerel, blkno); vacrelstats->rel_pages = blkno; /* set new number of blocks */ } @@ -3243,6 +3237,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages) (errmsg("\"%s\": truncated %u to %u pages", RelationGetRelationName(onerel), vacrelstats->rel_pages, relblocks))); + FreeSpaceMapTruncateRel(onerel, relblocks); RelationTruncate(onerel, relblocks); vacrelstats->rel_pages = relblocks; /* set new number of blocks */ } @@ -3475,8 +3470,8 @@ tid_reaped(ItemPointer itemptr, void *state) } /* - * Update the shared Free Space Map with the info we now have about - * free space in the relation, discarding any old info the map may have. + * Update the Free Space Map with the info we now have about free space in + * the relation. */ static void vac_update_fsm(Relation onerel, VacPageList fraged_pages, @@ -3484,26 +3479,8 @@ vac_update_fsm(Relation onerel, VacPageList fraged_pages, { int nPages = fraged_pages->num_pages; VacPage *pagedesc = fraged_pages->pagedesc; - Size threshold; - FSMPageData *pageSpaces; - int outPages; int i; - /* - * We only report pages with free space at least equal to the average - * request size --- this avoids cluttering FSM with uselessly-small bits - * of space.
Although FSM would discard pages with little free space - * anyway, it's important to do this prefiltering because (a) it reduces - * the time spent holding the FSM lock in RecordRelationFreeSpace, and (b) - * FSM uses the number of pages reported as a statistic for guiding space - * management. If we didn't threshold our reports the same way - * vacuumlazy.c does, we'd be skewing that statistic. - */ - threshold = GetAvgFSMRequestSize(&onerel->rd_node); - - pageSpaces = (FSMPageData *) palloc(nPages * sizeof(FSMPageData)); - outPages = 0; - for (i = 0; i < nPages; i++) { /* @@ -3514,17 +3491,9 @@ vac_update_fsm(Relation onerel, VacPageList fraged_pages, if (pagedesc[i]->blkno >= rel_pages) break; - if (pagedesc[i]->free >= threshold) - { - FSMPageSetPageNum(&pageSpaces[outPages], pagedesc[i]->blkno); - FSMPageSetSpace(&pageSpaces[outPages], pagedesc[i]->free); - outPages++; - } + RecordPageWithFreeSpace(onerel, pagedesc[i]->blkno, pagedesc[i]->free); } - RecordRelationFreeSpace(&onerel->rd_node, outPages, outPages, pageSpaces); - - pfree(pageSpaces); } /* Copy a VacPage structure */ diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 6ebf25933b..fbaeb8d602 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -18,15 +18,6 @@ * index cleanup and page compaction, then resume the heap scan with an empty * TID array. * - * We can limit the storage for page free space to MaxFSMPages entries, - * since that's the most the free space map will be willing to remember - * anyway. If the relation has fewer than that many pages with free space, - * life is easy: just build an array of per-page info. If it has more, - * we store the free space info as a heap ordered by amount of free space, - * so that we can discard the pages with least free space to ensure we never - * have more than MaxFSMPages entries in all. The surviving page entries - * are passed to the free space map at conclusion of the scan. 
- * * If we're processing a table with no indexes, we can just vacuum each page * as we go; there's no need to save up multiple tuples to minimize the number * of index scans performed. So we don't use maintenance_work_mem memory for @@ -38,7 +29,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.107 2008/05/12 00:00:48 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.108 2008/09/30 10:52:12 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -90,19 +81,11 @@ typedef struct LVRelStats BlockNumber pages_removed; double tuples_deleted; BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ - Size threshold; /* minimum interesting free space */ /* List of TIDs of tuples we intend to delete */ /* NB: this list is ordered by TID address */ int num_dead_tuples; /* current # of entries */ int max_dead_tuples; /* # slots allocated in array */ ItemPointer dead_tuples; /* array of ItemPointerData */ - /* Array or heap of per-page info about free space */ - /* We use a simple array until it fills up, then convert to heap */ - bool fs_is_heap; /* are we using heap organization? 
*/ - int num_free_pages; /* current # of entries */ - int max_free_pages; /* # slots allocated in array */ - FSMPageData *free_pages; /* array or heap of blkno/avail */ - BlockNumber tot_free_pages; /* total pages with >= threshold space */ int num_index_scans; } LVRelStats; @@ -134,12 +117,8 @@ static BlockNumber count_nondeletable_pages(Relation onerel, static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks); static void lazy_record_dead_tuple(LVRelStats *vacrelstats, ItemPointer itemptr); -static void lazy_record_free_space(LVRelStats *vacrelstats, - BlockNumber page, Size avail); static bool lazy_tid_reaped(ItemPointer itemptr, void *state); -static void lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats); static int vac_cmp_itemptr(const void *left, const void *right); -static int vac_cmp_page_spaces(const void *left, const void *right); /* @@ -180,10 +159,6 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); - /* Set threshold for interesting free space = average request size */ - /* XXX should we scale it up or down? Adjust vacuum.c too, if so */ - vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node); - vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ @@ -207,18 +182,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) lazy_truncate_heap(onerel, vacrelstats); - /* Update shared free space map with final free space info */ - lazy_update_fsm(onerel, vacrelstats); - - if (vacrelstats->tot_free_pages > MaxFSMPages) - ereport(WARNING, - (errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space", - get_namespace_name(RelationGetNamespace(onerel)), - RelationGetRelationName(onerel)), - /* Only suggest VACUUM FULL if > 20% free */ - (vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20) ? 
- errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".") : - errhint("Consider increasing the configuration parameter \"max_fsm_pages\"."))); + /* Vacuum the Free Space Map */ + FreeSpaceMapVacuum(onerel); /* Update statistics in pg_class */ vac_update_relstats(RelationGetRelid(onerel), @@ -313,6 +278,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; + Size freespace; vacuum_delay_point(); @@ -375,20 +341,21 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; - lazy_record_free_space(vacrelstats, blkno, - PageGetHeapFreeSpace(page)); } + freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); + + RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; - lazy_record_free_space(vacrelstats, blkno, - PageGetHeapFreeSpace(page)); + freespace = PageGetHeapFreeSpace(page); UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } @@ -556,6 +523,14 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vacuumed_pages++; } + freespace = PageGetHeapFreeSpace(page); + + /* Remember the location of the last page with nonremovable tuples */ + if (hastup) + vacrelstats->nonempty_pages = blkno + 1; + + UnlockReleaseBuffer(buf); + /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record @@ -564,16 +539,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * taken if there are no indexes.) 
*/ if (vacrelstats->num_dead_tuples == prev_dead_count) - { - lazy_record_free_space(vacrelstats, blkno, - PageGetHeapFreeSpace(page)); - } - - /* Remember the location of the last page with nonremovable tuples */ - if (hastup) - vacrelstats->nonempty_pages = blkno + 1; - - UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ @@ -611,12 +577,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, tups_vacuumed, num_tuples, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" - "%u pages contain useful free space.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, - vacrelstats->tot_free_pages, empty_pages, pg_rusage_show(&ru0)))); } @@ -649,6 +613,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) BlockNumber tblk; Buffer buf; Page page; + Size freespace; vacuum_delay_point(); @@ -656,11 +621,13 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy); LockBufferForCleanup(buf); tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats); + /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); - lazy_record_free_space(vacrelstats, tblk, - PageGetHeapFreeSpace(page)); + freespace = PageGetHeapFreeSpace(page); + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(onerel, tblk, freespace); npages++; } @@ -816,10 +783,6 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) { BlockNumber old_rel_pages = vacrelstats->rel_pages; BlockNumber new_rel_pages; - FSMPageData *pageSpaces; - int n; - int i, - j; PGRUsage ru0; pg_rusage_init(&ru0); @@ -865,6 +828,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) /* * Okay to truncate. 
*/ + FreeSpaceMapTruncateRel(onerel, new_rel_pages); RelationTruncate(onerel, new_rel_pages); /* @@ -875,34 +839,6 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) * the table again. */ - /* - * Drop free-space info for removed blocks; these must not get entered - * into the FSM! - */ - pageSpaces = vacrelstats->free_pages; - n = vacrelstats->num_free_pages; - j = 0; - for (i = 0; i < n; i++) - { - if (FSMPageGetPageNum(&pageSpaces[i]) < new_rel_pages) - { - pageSpaces[j] = pageSpaces[i]; - j++; - } - } - vacrelstats->num_free_pages = j; - - /* - * If tot_free_pages was more than num_free_pages, we can't tell for sure - * what its correct value is now, because we don't know which of the - * forgotten pages are getting truncated. Conservatively set it equal to - * num_free_pages. - */ - vacrelstats->tot_free_pages = j; - - /* We destroyed the heap ordering, so mark array unordered */ - vacrelstats->fs_is_heap = false; - /* update statistics */ vacrelstats->rel_pages = new_rel_pages; vacrelstats->pages_removed = old_rel_pages - new_rel_pages; @@ -1005,7 +941,6 @@ static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) { long maxtuples; - int maxpages; if (vacrelstats->hasindex) { @@ -1029,19 +964,6 @@ lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) vacrelstats->max_dead_tuples = (int) maxtuples; vacrelstats->dead_tuples = (ItemPointer) palloc(maxtuples * sizeof(ItemPointerData)); - - maxpages = MaxFSMPages; - maxpages = Min(maxpages, MaxAllocSize / sizeof(FSMPageData)); - /* No need to allocate more pages than the relation has blocks */ - if (relblocks < (BlockNumber) maxpages) - maxpages = (int) relblocks; - - vacrelstats->fs_is_heap = false; - vacrelstats->num_free_pages = 0; - vacrelstats->max_free_pages = maxpages; - vacrelstats->free_pages = (FSMPageData *) - palloc(maxpages * sizeof(FSMPageData)); - vacrelstats->tot_free_pages = 0; } /* @@ -1063,127 +985,6 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats, } 
} -/* - * lazy_record_free_space - remember free space on one page - */ -static void -lazy_record_free_space(LVRelStats *vacrelstats, - BlockNumber page, - Size avail) -{ - FSMPageData *pageSpaces; - int n; - - /* - * A page with less than stats->threshold free space will be forgotten - * immediately, and never passed to the free space map. Removing the - * uselessly small entries early saves cycles, and in particular reduces - * the amount of time we spend holding the FSM lock when we finally call - * RecordRelationFreeSpace. Since the FSM will probably drop pages with - * little free space anyway, there's no point in making this really small. - * - * XXX Is it worth trying to measure average tuple size, and using that to - * adjust the threshold? Would be worthwhile if FSM has no stats yet for - * this relation. But changing the threshold as we scan the rel might - * lead to bizarre behavior, too. Also, it's probably better if vacuum.c - * has the same thresholding behavior as we do here. - */ - if (avail < vacrelstats->threshold) - return; - - /* Count all pages over threshold, even if not enough space in array */ - vacrelstats->tot_free_pages++; - - /* Copy pointers to local variables for notational simplicity */ - pageSpaces = vacrelstats->free_pages; - n = vacrelstats->max_free_pages; - - /* If we haven't filled the array yet, just keep adding entries */ - if (vacrelstats->num_free_pages < n) - { - FSMPageSetPageNum(&pageSpaces[vacrelstats->num_free_pages], page); - FSMPageSetSpace(&pageSpaces[vacrelstats->num_free_pages], avail); - vacrelstats->num_free_pages++; - return; - } - - /*---------- - * The rest of this routine works with "heap" organization of the - * free space arrays, wherein we maintain the heap property - * avail[(j-1) div 2] <= avail[j] for 0 < j < n. - * In particular, the zero'th element always has the smallest available - * space and can be discarded to make room for a new page with more space. 
- * See Knuth's discussion of heap-based priority queues, sec 5.2.3; - * but note he uses 1-origin array subscripts, not 0-origin. - *---------- - */ - - /* If we haven't yet converted the array to heap organization, do it */ - if (!vacrelstats->fs_is_heap) - { - /* - * Scan backwards through the array, "sift-up" each value into its - * correct position. We can start the scan at n/2-1 since each entry - * above that position has no children to worry about. - */ - int l = n / 2; - - while (--l >= 0) - { - BlockNumber R = FSMPageGetPageNum(&pageSpaces[l]); - Size K = FSMPageGetSpace(&pageSpaces[l]); - int i; /* i is where the "hole" is */ - - i = l; - for (;;) - { - int j = 2 * i + 1; - - if (j >= n) - break; - if (j + 1 < n && FSMPageGetSpace(&pageSpaces[j]) > FSMPageGetSpace(&pageSpaces[j + 1])) - j++; - if (K <= FSMPageGetSpace(&pageSpaces[j])) - break; - pageSpaces[i] = pageSpaces[j]; - i = j; - } - FSMPageSetPageNum(&pageSpaces[i], R); - FSMPageSetSpace(&pageSpaces[i], K); - } - - vacrelstats->fs_is_heap = true; - } - - /* If new page has more than zero'th entry, insert it into heap */ - if (avail > FSMPageGetSpace(&pageSpaces[0])) - { - /* - * Notionally, we replace the zero'th entry with the new data, and - * then sift-up to maintain the heap property. Physically, the new - * data doesn't get stored into the arrays until we find the right - * location for it. - */ - int i = 0; /* i is where the "hole" is */ - - for (;;) - { - int j = 2 * i + 1; - - if (j >= n) - break; - if (j + 1 < n && FSMPageGetSpace(&pageSpaces[j]) > FSMPageGetSpace(&pageSpaces[j + 1])) - j++; - if (avail <= FSMPageGetSpace(&pageSpaces[j])) - break; - pageSpaces[i] = pageSpaces[j]; - i = j; - } - FSMPageSetPageNum(&pageSpaces[i], page); - FSMPageSetSpace(&pageSpaces[i], avail); - } -} - /* * lazy_tid_reaped() -- is a particular tid deletable? 
* @@ -1206,27 +1007,6 @@ lazy_tid_reaped(ItemPointer itemptr, void *state) return (res != NULL); } -/* - * Update the shared Free Space Map with the info we now have about - * free space in the relation, discarding any old info the map may have. - */ -static void -lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats) -{ - FSMPageData *pageSpaces = vacrelstats->free_pages; - int nPages = vacrelstats->num_free_pages; - - /* - * Sort data into order, as required by RecordRelationFreeSpace. - */ - if (nPages > 1) - qsort(pageSpaces, nPages, sizeof(FSMPageData), - vac_cmp_page_spaces); - - RecordRelationFreeSpace(&onerel->rd_node, vacrelstats->tot_free_pages, - nPages, pageSpaces); -} - /* * Comparator routines for use with qsort() and bsearch(). */ @@ -1256,18 +1036,3 @@ vac_cmp_itemptr(const void *left, const void *right) return 0; } - -static int -vac_cmp_page_spaces(const void *left, const void *right) -{ - FSMPageData *linfo = (FSMPageData *) left; - FSMPageData *rinfo = (FSMPageData *) right; - BlockNumber lblkno = FSMPageGetPageNum(linfo); - BlockNumber rblkno = FSMPageGetPageNum(rinfo); - - if (lblkno < rblkno) - return -1; - else if (lblkno > rblkno) - return 1; - return 0; -} diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 823c524379..bf805e977e 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.51 2008/08/11 11:05:11 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.52 2008/09/30 10:52:13 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -55,7 +55,6 @@ #include "postmaster/bgwriter.h" #include "storage/bufmgr.h" #include "storage/fd.h" -#include "storage/freespace.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pmsignal.h" @@ -398,7 +397,6 @@ BackgroundWriterMain(void) ExitOnAnyError = 
true; /* Close down the database */ ShutdownXLOG(0, 0); - DumpFreeSpaceMap(0, 0); /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } diff --git a/src/backend/storage/freespace/Makefile b/src/backend/storage/freespace/Makefile index 553131d8f0..bc9cae622c 100644 --- a/src/backend/storage/freespace/Makefile +++ b/src/backend/storage/freespace/Makefile @@ -4,7 +4,7 @@ # Makefile for storage/freespace # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/storage/freespace/Makefile,v 1.4 2008/02/19 10:30:08 petere Exp $ +# $PostgreSQL: pgsql/src/backend/storage/freespace/Makefile,v 1.5 2008/09/30 10:52:13 heikki Exp $ # #------------------------------------------------------------------------- @@ -12,6 +12,6 @@ subdir = src/backend/storage/freespace top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = freespace.o +OBJS = freespace.o fsmpage.o indexfsm.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README new file mode 100644 index 0000000000..b5a37f9cb5 --- /dev/null +++ b/src/backend/storage/freespace/README @@ -0,0 +1,195 @@ +$PostgreSQL: pgsql/src/backend/storage/freespace/README,v 1.1 2008/09/30 10:52:13 heikki Exp $ + +Free Space Map +-------------- + +The purpose of the free space map is to quickly locate a page with enough +free space to hold a tuple to be stored; or to determine that no such page +exists and the relation must be extended by one page. As of PostgreSQL 8.4 +each relation has its own, extensible free space map stored in a separate +"fork" of its relation. This eliminates the disadvantages of the former +fixed-size FSM. + +It is important to keep the map small so that it can be searched rapidly. +Therefore, we don't attempt to record the exact free space on a page. +We allocate one map byte to each page, allowing us to record free space +at a granularity of 1/256th of a page. 
Another way to say it is that +the stored value is the free space divided by BLCKSZ/256 (rounding down). +We assume that the free space must always be less than BLCKSZ, since +all pages have some overhead; so the maximum map value is 255. + +To assist in fast searching, the map isn't simply an array of per-page +entries, but has a tree structure above those entries. There is a tree +structure of pages, and a tree structure within each page, as described +below. + +FSM page structure +------------------ + +Within each FSM page, we use a binary tree structure where leaf nodes store +the amount of free space on heap pages (or lower level FSM pages, see +"Higher-level structure" below), with one leaf node per heap page. A non-leaf +node stores the max amount of free space on any of its children. + +For example: + + 4 + 4 2 +3 4 0 2 <- This level represents heap pages + +We need two basic operations: search and update. + +To search for a page with X amount of free space, traverse down the tree +along a path where n >= X, until you hit the bottom. If both children of a +node satisfy the condition, you can pick either one arbitrarily. + +To update the amount of free space on a page to X, first update the leaf node +corresponding to the heap page, then "bubble up" the change to upper nodes, +by walking up to each parent and recomputing its value as the max of its +two children. Repeat until reaching the root or a parent whose value +doesn't change. + +This data structure has a couple of nice properties: +- to discover that there is no page with X bytes of free space, you only + need to look at the root node +- by varying which child to traverse to in the search algorithm, when you have + a choice, we can implement various strategies, like preferring pages closer + to a given page, or spreading the load across the table. + +Higher-level routines that use FSM pages access them through the fsm_set_avail() +and fsm_search_avail() functions. 
The interface to those functions hides the +page's internal tree structure, treating the FSM page as a black box that has +a certain number of "slots" for storing free space information. (However, +the higher routines have to be aware of the tree structure of the whole map.) + +The binary tree is stored on each FSM page as an array. Because the page +header takes some space on a page, the binary tree isn't perfect. That is, +a few right-most leaf nodes are missing, and there are some useless non-leaf +nodes at the right. So the tree looks something like this: + + 0 + 1 2 + 3 4 5 6 +7 8 9 A B + +where the numbers denote each node's position in the array. Note that the +tree is guaranteed complete above the leaf level; only some leaf nodes are +missing. This is reflected in the number of usable "slots" per page not +being an exact power of 2. + +A FSM page also has a next slot pointer, fp_next_slot, that determines where +to start the next search for free space within that page. The reason for that +is to spread out the pages that are returned by FSM searches. When several +backends are concurrently inserting into a relation, contention can be avoided +by having them insert into different pages. But it is also desirable to fill +up pages in sequential order, to get the benefit of OS prefetching and batched +writes. The FSM is responsible for making that happen, and the next slot +pointer helps provide the desired behavior. + +Higher-level structure +---------------------- + +To scale up the data structure described above beyond a single page, we +maintain a similar tree-structure across pages. Leaf nodes in higher level +pages correspond to lower level FSM pages. The root node within each page +has the same value as the corresponding leaf node on its parent page. + +The root page is always stored at physical block 0. 
+ +For example, assuming each FSM page can hold information about 4 pages (in +reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ), +we get a disk layout like this: + + 0 <-- page 0 at level 2 (root page) + 0 <-- page 0 at level 1 + 0 <-- page 0 at level 0 + 1 <-- page 1 at level 0 + 2 <-- ... + 3 + 1 <-- page 1 at level 1 + 4 + 5 + 6 + 7 + 2 + 8 + 9 + 10 + 11 + 3 + 12 + 13 + 14 + 15 + +where the numbers are page numbers *at that level*, starting from 0. + +To find the physical block # corresponding to leaf page n, we need to +count the number of leaf and upper-level pages preceding page n. +This turns out to be + +y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1 + +where F is the fanout (4 in the above example). The first term n is the number +of preceding leaf pages, the second term is the number of pages at level 1, +and so forth. + +To keep things simple, the tree is always constant height. To cover the +maximum relation size of 2^32-1 blocks, three levels is enough with the default +BLCKSZ (4000^3 > 2^32). + +Addressing +---------- + +The higher-level routines operate on "logical" addresses, consisting of +- level, +- logical page number, and +- slot (if applicable) + +Bottom level FSM pages have level of 0, the level above that 1, and root 2. +As in the diagram above, logical page number is the page number at that level, +starting from 0. + +Locking +------- + +When traversing down to search for free space, only one page is locked at a +time: the parent page is released before locking the child. If the child page +is concurrently modified, and there no longer is free space on the child page +when you land on it, you need to start from scratch (after correcting the +parent page, so that you don't get into an infinite loop). + +We use shared buffer locks when searching, but exclusive buffer lock when +updating a page. However, the next slot search pointer is updated during +searches even though we have only a shared lock.
fp_next_slot is just a hint +and we can easily reset it if it gets corrupted; so it seems better to accept +some risk of that type than to pay the overhead of exclusive locking. + +Recovery +-------- + +The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of +self-correcting measures to repair possible corruption. + +First of all, whenever a value is set on an FSM page, the root node of the +page is compared against the new value after bubbling up the change is +finished. It should be greater than or equal to the value just set, or we +have a corrupted page, with a parent somewhere with too small a value. +Secondly, we detect corrupted pages while we search, traversing down +the tree. That check will notice if a parent node is set to too high a value. +In both cases, the upper nodes on the page are immediately rebuilt, fixing +the corruption. + +Vacuum updates all the bottom level pages with correct amount of free space +on the heap pages, fixing any outdated values there. After the heap and +index passes are done, FreeSpaceMapVacuum is called, and the FSM tree is +scanned in depth-first order. This fixes any discrepancies between upper +and lower level FSM pages. + +TODO +---- + +- fastroot to avoid traversing upper nodes with just 1 child +- use a different system for tables that fit into one FSM page, with a + mechanism to switch to the real thing as it grows.
+ diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 9373675b8c..1602ec0cc9 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -8,245 +8,123 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/freespace/freespace.c,v 1.60 2008/03/10 02:04:09 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/freespace/freespace.c,v 1.61 2008/09/30 10:52:13 heikki Exp $ * * * NOTES: * - * The only really interesting aspect of this code is the heuristics for - * deciding how much information we can afford to keep about each relation, - * given that we have a limited amount of workspace in shared memory. - * These currently work as follows: - * - * The number of distinct relations tracked is limited by a configuration - * variable (MaxFSMRelations). When this would be exceeded, we discard the - * least recently used relation. A doubly-linked list with move-to-front - * behavior keeps track of which relation is least recently used. - * - * For each known relation, we track the average request size given to - * GetPageWithFreeSpace() as well as the most recent number of pages reported - * to RecordRelationFreeSpace(). The average request size is not directly - * used in this module, but we expect VACUUM to use it to filter out - * uninteresting amounts of space before calling RecordRelationFreeSpace(). - * The sum of the RRFS page counts is thus the total number of "interesting" - * pages that we would like to track; this is called DesiredFSMPages. - * - * The number of pages actually tracked is limited by a configuration variable - * (MaxFSMPages). When this is less than DesiredFSMPages, each relation - * gets to keep a fraction MaxFSMPages/DesiredFSMPages of its free pages. - * We discard pages with less free space to reach this target. 
- * - * Actually, our space allocation is done in "chunks" of CHUNKPAGES pages, - * with each relation guaranteed at least one chunk. This reduces thrashing - * of the storage allocations when there are small changes in the RRFS page - * counts from one VACUUM to the next. (XXX it might also be worthwhile to - * impose some kind of moving-average smoothing on the RRFS page counts?) - * - * So the actual arithmetic is: for each relation compute myRequest as the - * number of chunks needed to hold its RRFS page count (not counting the - * first, guaranteed chunk); compute sumRequests as the sum of these values - * over all relations; then for each relation figure its target allocation - * as - * 1 + round(spareChunks * myRequest / sumRequests) - * where spareChunks = totalChunks - numRels is the number of chunks we have - * a choice what to do with. We round off these numbers because truncating - * all of them would waste significant space. But because of roundoff, it's - * possible for the last few relations to get less space than they should; - * the target allocation must be checked against remaining available space. + * Free Space Map keeps track of the amount of free space on pages, and + * allows quickly searching for a page with enough free space. The FSM is + * stored in a dedicated relation fork of all heap relations, and those + * index access methods that need it (see also indexfsm.c). See README for + * more information. 
* *------------------------------------------------------------------------- */ #include "postgres.h" -#include -#include -#include - -#include "storage/fd.h" +#include "access/htup.h" +#include "access/xlogutils.h" +#include "storage/bufpage.h" +#include "storage/bufmgr.h" #include "storage/freespace.h" +#include "storage/fsm_internals.h" +#include "storage/lmgr.h" #include "storage/lwlock.h" -#include "storage/shmem.h" +#include "storage/smgr.h" +#include "utils/rel.h" +#include "utils/inval.h" +#include "miscadmin.h" - -/*---------- - * During database shutdown, we store the contents of FSM into a disk file, - * which is re-read during startup. This way we don't have a startup - * transient condition where FSM isn't really functioning. +/* + * We use just one byte to store the amount of free space on a page, so we + * divide the amount of free space a page can have into 256 different + * categories. The highest category, 255, represents a page with at least + * MaxFSMRequestSize bytes of free space, and the second highest category + * represents the range from 254 * FSM_CAT_STEP, inclusive, to + * MaxFSMRequestSize, exclusive. 
* - * The file format is: - * label "FSM\0" - * endian constant 0x01020304 for detecting endianness problems - * version# - * numRels - * -- for each rel, in *reverse* usage order: - * relfilenode - * isIndex - * avgRequest - * interestingPages - * storedPages - * arena data array of storedPages FSMPageData or IndexFSMPageData - *---------- - */ - -/* Name of FSM cache file (relative to $PGDATA) */ -#define FSM_CACHE_FILENAME "global/pg_fsm.cache" - -/* Fixed values in header */ -#define FSM_CACHE_LABEL "FSM" -#define FSM_CACHE_ENDIAN 0x01020304 -#define FSM_CACHE_VERSION 20030305 - -/* File header layout */ -typedef struct FsmCacheFileHeader -{ - char label[4]; - uint32 endian; - uint32 version; - int32 numRels; -} FsmCacheFileHeader; - -/* Per-relation header */ -typedef struct FsmCacheRelHeader -{ - RelFileNode key; /* hash key (must be first) */ - bool isIndex; /* if true, we store only page numbers */ - uint32 avgRequest; /* moving average of space requests */ - BlockNumber interestingPages; /* # of pages with useful free space */ - int32 storedPages; /* # of pages stored in arena */ -} FsmCacheRelHeader; - -int MaxFSMRelations; /* these are set by guc.c */ -int MaxFSMPages; - -static FSMHeader *FreeSpaceMap; /* points to FSMHeader in shared memory */ -static HTAB *FreeSpaceMapRelHash; /* points to (what used to be) - * FSMHeader->relHash */ - - -static void CheckFreeSpaceMapStatistics(int elevel, int numRels, - double needed); -static FSMRelation *lookup_fsm_rel(RelFileNode *rel); -static FSMRelation *create_fsm_rel(RelFileNode *rel); -static void delete_fsm_rel(FSMRelation *fsmrel); -static int realloc_fsm_rel(FSMRelation *fsmrel, BlockNumber interestingPages, - bool isIndex); -static void link_fsm_rel_usage(FSMRelation *fsmrel); -static void unlink_fsm_rel_usage(FSMRelation *fsmrel); -static void link_fsm_rel_storage(FSMRelation *fsmrel); -static void unlink_fsm_rel_storage(FSMRelation *fsmrel); -static BlockNumber find_free_space(FSMRelation *fsmrel, Size 
spaceNeeded); -static BlockNumber find_index_free_space(FSMRelation *fsmrel); -static void fsm_record_free_space(FSMRelation *fsmrel, BlockNumber page, - Size spaceAvail); -static bool lookup_fsm_page_entry(FSMRelation *fsmrel, BlockNumber page, - int *outPageIndex); -static void compact_fsm_storage(void); -static void push_fsm_rels_after(FSMRelation *afterRel); -static void pack_incoming_pages(FSMPageData *newLocation, int newPages, - FSMPageData *pageSpaces, int nPages); -static void pack_existing_pages(FSMPageData *newLocation, int newPages, - FSMPageData *oldLocation, int oldPages); -static int fsm_calc_request(FSMRelation *fsmrel); -static int fsm_calc_request_unclamped(FSMRelation *fsmrel); -static int fsm_calc_target_allocation(int myRequest); -static int fsm_current_chunks(FSMRelation *fsmrel); -static int fsm_current_allocation(FSMRelation *fsmrel); - - -/* - * Exported routines - */ - - -/* - * InitFreeSpaceMap -- Initialize the freespace module. + * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming + * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the categories + * look like this + * * - * This must be called once during shared memory initialization. - * It builds the empty free space map table. FreeSpaceLock must also be - * initialized at some point, but is not touched here --- we assume there is - * no need for locking, since only the calling process can be accessing shared - * memory as yet. + * Range Category + * 0 - 31 0 + * 32 - 63 1 + * ... ... ... + * 8096 - 8127 253 + * 8128 - 8163 254 + * 8164 - 8192 255 + * + * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize + * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize + * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize + * bytes.
If there isn't more than MaxFSMRequestSize bytes of free space on a + * completely empty page, that would mean that we could never satisfy a + * request of exactly MaxFSMRequestSize bytes. */ -void -InitFreeSpaceMap(void) -{ - HASHCTL info; - int nchunks; - bool found; - - /* Create table header */ - FreeSpaceMap = (FSMHeader *) ShmemInitStruct("Free Space Map Header", - sizeof(FSMHeader), - &found); - if (FreeSpaceMap == NULL) - ereport(FATAL, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("insufficient shared memory for free space map"))); - if (!found) - MemSet(FreeSpaceMap, 0, sizeof(FSMHeader)); - - /* Create hashtable for FSMRelations */ - info.keysize = sizeof(RelFileNode); - info.entrysize = sizeof(FSMRelation); - info.hash = tag_hash; - - FreeSpaceMapRelHash = ShmemInitHash("Free Space Map Hash", - MaxFSMRelations + 1, - MaxFSMRelations + 1, - &info, - (HASH_ELEM | HASH_FUNCTION)); - - if (!FreeSpaceMapRelHash) - ereport(FATAL, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("insufficient shared memory for free space map"))); - - if (found) - return; - - - /* Allocate page-storage arena */ - nchunks = (MaxFSMPages - 1) / CHUNKPAGES + 1; - /* This check ensures spareChunks will be greater than zero */ - if (nchunks <= MaxFSMRelations) - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("max_fsm_pages must exceed max_fsm_relations * %d", - CHUNKPAGES))); - - FreeSpaceMap->arena = (char *) ShmemAlloc((Size) nchunks * CHUNKBYTES); - if (FreeSpaceMap->arena == NULL) - ereport(FATAL, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("insufficient shared memory for free space map"))); - - FreeSpaceMap->totalChunks = nchunks; - FreeSpaceMap->usedChunks = 0; - FreeSpaceMap->sumRequests = 0; -} +#define FSM_CATEGORIES 256 +#define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES) +#define MaxFSMRequestSize MaxHeapTupleSize /* - * Estimate amount of shmem space needed for FSM. + * Depth of the on-disk tree. 
We need to be able to address 2^32-1 blocks, + * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise, + * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice, + * this means that 4096 bytes is the smallest BLCKSZ that we can get away + * with a 3-level tree, and 512 is the smallest we support. */ -Size -FreeSpaceShmemSize(void) +#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4) + +#define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1) +#define FSM_BOTTOM_LEVEL 0 + +/* + * The internal FSM routines work on a logical addressing scheme. Each + * level of the tree can be thought of as a separately addressable file. + */ +typedef struct { - Size size; - int nchunks; + int level; /* level */ + int logpageno; /* page number within the level */ +} FSMAddress; - /* table header */ - size = MAXALIGN(sizeof(FSMHeader)); +/* Address of the root page. */ +static const FSMAddress FSM_ROOT_ADDRESS = { FSM_ROOT_LEVEL, 0 }; - /* hash table, including the FSMRelation objects */ - size = add_size(size, hash_estimate_size(MaxFSMRelations + 1, - sizeof(FSMRelation))); +/* XLOG record types */ +#define XLOG_FSM_TRUNCATE 0x00 /* truncate */ - /* page-storage arena */ - nchunks = (MaxFSMPages - 1) / CHUNKPAGES + 1; - size = add_size(size, mul_size(nchunks, CHUNKBYTES)); +typedef struct +{ + RelFileNode node; /* truncated relation */ + BlockNumber nheapblocks; /* new number of blocks in the heap */ +} xl_fsm_truncate; - return size; -} +/* functions to navigate the tree */ +static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot); +static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot); +static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot); +static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot); +static BlockNumber fsm_logical_to_physical(FSMAddress addr); + +static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend); +static void fsm_extend(Relation rel, BlockNumber nfsmblocks); + +/* functions to
convert amount of free space to a FSM category */ +static uint8 fsm_space_avail_to_cat(Size avail); +static uint8 fsm_space_needed_to_cat(Size needed); +static Size fsm_space_cat_to_avail(uint8 cat); + +/* workhorse functions for various operations */ +static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, + uint8 newValue, uint8 minValue); +static BlockNumber fsm_search(Relation rel, uint8 min_cat); +static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof); + + +/******** Public API ********/ /* * GetPageWithFreeSpace - try to find a page in the given relation with @@ -262,1608 +140,668 @@ FreeSpaceShmemSize(void) * extend the relation. */ BlockNumber -GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded) +GetPageWithFreeSpace(Relation rel, Size spaceNeeded) { - FSMRelation *fsmrel; - BlockNumber freepage; - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - - /* - * We always add a rel to the hashtable when it is inquired about. - */ - fsmrel = create_fsm_rel(rel); - - /* - * Update the moving average of space requests. This code implements an - * exponential moving average with an equivalent period of about 63 - * requests. Ignore silly requests, however, to ensure that the average - * stays sane. - */ - if (spaceNeeded > 0 && spaceNeeded < BLCKSZ) - { - int cur_avg = (int) fsmrel->avgRequest; - - cur_avg += ((int) spaceNeeded - cur_avg) / 32; - fsmrel->avgRequest = (Size) cur_avg; - } - freepage = find_free_space(fsmrel, spaceNeeded); - LWLockRelease(FreeSpaceLock); - return freepage; + uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded); + return fsm_search(rel, min_cat); } /* * RecordAndGetPageWithFreeSpace - update info about a page and try again. * - * We provide this combo form, instead of a separate Record operation, - * to save one lock and hash table lookup cycle. + * We provide this combo form to save some locking overhead, compared to + * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. 
There's + * also some effort to return a page close to the old page; if there's a + * page with enough free space on the same FSM page where the old page + * is located, it is preferred. */ BlockNumber -RecordAndGetPageWithFreeSpace(RelFileNode *rel, - BlockNumber oldPage, - Size oldSpaceAvail, - Size spaceNeeded) +RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, + Size oldSpaceAvail, Size spaceNeeded) { - FSMRelation *fsmrel; - BlockNumber freepage; + int old_cat = fsm_space_avail_to_cat(oldSpaceAvail); + int search_cat = fsm_space_needed_to_cat(spaceNeeded); + FSMAddress addr; + uint16 slot; + int search_slot; - /* Sanity check: ensure spaceAvail will fit into OffsetNumber */ - AssertArg(oldSpaceAvail < BLCKSZ); + /* Get the location of the FSM byte representing the heap block */ + addr = fsm_get_location(oldPage, &slot); - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat); /* - * We always add a rel to the hashtable when it is inquired about. + * If fsm_set_and_search found a suitable new block, return that. + * Otherwise, search as usual. */ - fsmrel = create_fsm_rel(rel); - - /* Do the Record */ - fsm_record_free_space(fsmrel, oldPage, oldSpaceAvail); - - /* - * Update the moving average of space requests, same as in - * GetPageWithFreeSpace. - */ - if (spaceNeeded > 0 && spaceNeeded < BLCKSZ) - { - int cur_avg = (int) fsmrel->avgRequest; - - cur_avg += ((int) spaceNeeded - cur_avg) / 32; - fsmrel->avgRequest = (Size) cur_avg; - } - /* Do the Get */ - freepage = find_free_space(fsmrel, spaceNeeded); - LWLockRelease(FreeSpaceLock); - return freepage; + if (search_slot != -1) + return fsm_get_heap_blk(addr, search_slot); + else + return fsm_search(rel, search_cat); } /* - * GetAvgFSMRequestSize - get average FSM request size for a relation. + * RecordPageWithFreeSpace - update info about a page. * - * If the relation is not known to FSM, return a default value.
+ * Note that if the new spaceAvail value is higher than the old value stored + * in the FSM, the space might not become visible to searchers until the next + * FreeSpaceMapVacuum call, which updates the upper level pages. + */ +void +RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail) +{ + int new_cat = fsm_space_avail_to_cat(spaceAvail); + FSMAddress addr; + uint16 slot; + + /* Get the location of the FSM byte representing the heap block */ + addr = fsm_get_location(heapBlk, &slot); + + fsm_set_and_search(rel, addr, slot, new_cat, 0); +} + +/* + * GetRecordedFreeSpace - return the amount of free space on a particular page, + * according to the FSM. */ Size -GetAvgFSMRequestSize(RelFileNode *rel) +GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk) { - Size result; - FSMRelation *fsmrel; + FSMAddress addr; + uint16 slot; + Buffer buf; + uint8 cat; - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - fsmrel = lookup_fsm_rel(rel); - if (fsmrel) - result = fsmrel->avgRequest; - else - result = INITIAL_AVERAGE; - LWLockRelease(FreeSpaceLock); - return result; -} + /* Get the location of the FSM byte representing the heap block */ + addr = fsm_get_location(heapBlk, &slot); -/* - * RecordRelationFreeSpace - record available-space info about a relation. - * - * Any pre-existing info about the relation is assumed obsolete and discarded. - * - * interestingPages is the total number of pages in the relation that have - * at least threshold free space; nPages is the number actually reported in - * pageSpaces[] (may be less --- in particular, callers typically clamp their - * space usage to MaxFSMPages). - * - * The given pageSpaces[] array must be sorted in order by blkno. Note that - * the FSM is at liberty to discard some or all of the data.
- */ -void -RecordRelationFreeSpace(RelFileNode *rel, - BlockNumber interestingPages, - int nPages, - FSMPageData *pageSpaces) -{ - FSMRelation *fsmrel; + buf = fsm_readbuf(rel, addr, false); + if (!BufferIsValid(buf)) + return 0; + cat = fsm_get_avail(BufferGetPage(buf), slot); + ReleaseBuffer(buf); - /* Limit nPages to something sane */ - if (nPages < 0) - nPages = 0; - else if (nPages > MaxFSMPages) - nPages = MaxFSMPages; - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - - /* - * Note we don't record info about a relation unless there's already an - * FSM entry for it, implying someone has done GetPageWithFreeSpace for - * it. Inactive rels thus will not clutter the map simply by being - * vacuumed. - */ - fsmrel = lookup_fsm_rel(rel); - if (fsmrel) - { - int curAlloc; - int curAllocPages; - FSMPageData *newLocation; - - curAlloc = realloc_fsm_rel(fsmrel, interestingPages, false); - curAllocPages = curAlloc * CHUNKPAGES; - - /* - * If the data fits in our current allocation, just copy it; otherwise - * must compress. - */ - newLocation = (FSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - if (nPages <= curAllocPages) - { - int i; - - for (i = 0; i < nPages; i++) - { - BlockNumber page = FSMPageGetPageNum(&pageSpaces[i]); - - /* Check caller provides sorted data */ - if (i > 0 && page <= FSMPageGetPageNum(&pageSpaces[i - 1])) - elog(ERROR, "free-space data is not in page order"); - *newLocation = pageSpaces[i]; - newLocation++; - } - fsmrel->storedPages = nPages; - } - else - { - pack_incoming_pages(newLocation, curAllocPages, - pageSpaces, nPages); - fsmrel->storedPages = curAllocPages; - } - } - LWLockRelease(FreeSpaceLock); -} - -/* - * GetFreeIndexPage - like GetPageWithFreeSpace, but for indexes - */ -BlockNumber -GetFreeIndexPage(RelFileNode *rel) -{ - FSMRelation *fsmrel; - BlockNumber freepage; - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - - /* - * We always add a rel to the hashtable when it is inquired about. 
- */ - fsmrel = create_fsm_rel(rel); - - freepage = find_index_free_space(fsmrel); - LWLockRelease(FreeSpaceLock); - return freepage; -} - -/* - * RecordIndexFreeSpace - like RecordRelationFreeSpace, but for indexes - */ -void -RecordIndexFreeSpace(RelFileNode *rel, - BlockNumber interestingPages, - int nPages, - BlockNumber *pages) -{ - FSMRelation *fsmrel; - - /* Limit nPages to something sane */ - if (nPages < 0) - nPages = 0; - else if (nPages > MaxFSMPages) - nPages = MaxFSMPages; - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - - /* - * Note we don't record info about a relation unless there's already an - * FSM entry for it, implying someone has done GetFreeIndexPage for it. - * Inactive rels thus will not clutter the map simply by being vacuumed. - */ - fsmrel = lookup_fsm_rel(rel); - if (fsmrel) - { - int curAlloc; - int curAllocPages; - int i; - IndexFSMPageData *newLocation; - - curAlloc = realloc_fsm_rel(fsmrel, interestingPages, true); - curAllocPages = curAlloc * INDEXCHUNKPAGES; - - /* - * If the data fits in our current allocation, just copy it; otherwise - * must compress. But compression is easy: we merely forget extra - * pages. - */ - newLocation = (IndexFSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - if (nPages > curAllocPages) - nPages = curAllocPages; - - for (i = 0; i < nPages; i++) - { - BlockNumber page = pages[i]; - - /* Check caller provides sorted data */ - if (i > 0 && page <= pages[i - 1]) - elog(ERROR, "free-space data is not in page order"); - IndexFSMPageSetPageNum(newLocation, page); - newLocation++; - } - fsmrel->storedPages = nPages; - } - LWLockRelease(FreeSpaceLock); + return fsm_space_cat_to_avail(cat); } /* * FreeSpaceMapTruncateRel - adjust for truncation of a relation. * - * We need to delete any stored data past the new relation length, so that - * we don't bogusly return removed block numbers. 
- */ -void -FreeSpaceMapTruncateRel(RelFileNode *rel, BlockNumber nblocks) -{ - FSMRelation *fsmrel; - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - fsmrel = lookup_fsm_rel(rel); - if (fsmrel) - { - int pageIndex; - - /* Use lookup to locate first entry >= nblocks */ - (void) lookup_fsm_page_entry(fsmrel, nblocks, &pageIndex); - /* Delete all such entries */ - fsmrel->storedPages = pageIndex; - /* XXX should we adjust rel's interestingPages and sumRequests? */ - } - LWLockRelease(FreeSpaceLock); -} - -/* - * FreeSpaceMapForgetRel - forget all about a relation. + * The caller must hold AccessExclusiveLock on the relation, to ensure + * that other backends receive the relcache invalidation event that this + * function sends, before accessing the FSM again. * - * This is called when a relation is deleted. Although we could just let - * the rel age out of the map, it's better to reclaim and reuse the space - * sooner. + * nblocks is the new size of the heap. */ void -FreeSpaceMapForgetRel(RelFileNode *rel) +FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) { - FSMRelation *fsmrel; + BlockNumber new_nfsmblocks; + FSMAddress first_removed_address; + uint16 first_removed_slot; + Buffer buf; - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - fsmrel = lookup_fsm_rel(rel); - if (fsmrel) - delete_fsm_rel(fsmrel); - LWLockRelease(FreeSpaceLock); -} + RelationOpenSmgr(rel); -/* - * FreeSpaceMapForgetDatabase - forget all relations of a database. - * - * This is called during DROP DATABASE. As above, might as well reclaim - * map space sooner instead of later. 
- */ -void -FreeSpaceMapForgetDatabase(Oid dbid) -{ - FSMRelation *fsmrel, - *nextrel; - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - for (fsmrel = FreeSpaceMap->usageList; fsmrel; fsmrel = nextrel) - { - nextrel = fsmrel->nextUsage; /* in case we delete it */ - if (fsmrel->key.dbNode == dbid) - delete_fsm_rel(fsmrel); - } - LWLockRelease(FreeSpaceLock); -} - -/* - * PrintFreeSpaceMapStatistics - print statistics about FSM contents - * - * The info is sent to ereport() with the specified message level. This is - * intended for use during VACUUM. - */ -void -PrintFreeSpaceMapStatistics(int elevel) -{ - FSMRelation *fsmrel; - int storedPages = 0; - double sumRequests = 0; - int numRels; - double needed; - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + /* Get the location in the FSM of the first removed heap block */ + first_removed_address = fsm_get_location(nblocks, &first_removed_slot); /* - * Count total space actually used, as well as the unclamped request total + * Zero out the tail of the last remaining FSM page. If the slot + * representing the first removed heap block is at a page boundary, as + * the first slot on the FSM page that first_removed_address points to, + * we can just truncate that page altogether. 
*/ - for (fsmrel = FreeSpaceMap->firstRel; - fsmrel != NULL; - fsmrel = fsmrel->nextPhysical) + if (first_removed_slot > 0) { - storedPages += fsmrel->storedPages; - sumRequests += fsm_calc_request_unclamped(fsmrel); + buf = fsm_readbuf(rel, first_removed_address, false); + if (!BufferIsValid(buf)) + return; /* nothing to do; the FSM was already smaller */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); + + new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1; + } + else + { + new_nfsmblocks = fsm_logical_to_physical(first_removed_address); + if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks) + return; /* nothing to do; the FSM was already smaller */ } - /* Copy other stats before dropping lock */ - numRels = FreeSpaceMap->numRels; - LWLockRelease(FreeSpaceLock); + /* Truncate the unused FSM pages */ + smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks, rel->rd_istemp); - /* Convert stats to actual number of page slots needed */ - needed = (sumRequests + numRels) * CHUNKPAGES; - - ereport(elevel, - (errmsg("free space map contains %d pages in %d relations", - storedPages, numRels), - errdetail("A total of %.0f page slots are in use (including overhead).\n" - "%.0f page slots are required to track all free space.\n" - "Current limits are: %d page slots, %d relations, using %.0f kB.", - Min(needed, MaxFSMPages), - needed, - MaxFSMPages, MaxFSMRelations, - (double) FreeSpaceShmemSize() / 1024.0))); - - CheckFreeSpaceMapStatistics(NOTICE, numRels, needed); - /* Print to server logs too because is deals with a config variable. 
*/ - CheckFreeSpaceMapStatistics(LOG, numRels, needed); -} - -static void -CheckFreeSpaceMapStatistics(int elevel, int numRels, double needed) -{ - if (numRels == MaxFSMRelations) - ereport(elevel, - (errmsg("max_fsm_relations(%d) equals the number of relations checked", - MaxFSMRelations), - errhint("You have at least %d relations. " - "Consider increasing the configuration parameter \"max_fsm_relations\".", - numRels))); - else if (needed > MaxFSMPages) - ereport(elevel, - (errmsg("number of page slots needed (%.0f) exceeds max_fsm_pages (%d)", - needed, MaxFSMPages), - errhint("Consider increasing the configuration parameter \"max_fsm_pages\" " - "to a value over %.0f.", needed))); -} - -/* - * DumpFreeSpaceMap - dump contents of FSM into a disk file for later reload - * - * This is expected to be called during database shutdown, after updates to - * the FSM have stopped. We lock the FreeSpaceLock but that's purely pro - * forma --- if anyone else is still accessing FSM, there's a problem. - */ -void -DumpFreeSpaceMap(int code, Datum arg) -{ - FILE *fp; - FsmCacheFileHeader header; - FSMRelation *fsmrel; - - /* Try to create file */ - unlink(FSM_CACHE_FILENAME); /* in case it exists w/wrong permissions */ - - fp = AllocateFile(FSM_CACHE_FILENAME, PG_BINARY_W); - if (fp == NULL) + /* + * FSM truncations are WAL-logged, because we must never return a block + * that doesn't exist in the heap, not even if we crash before the FSM + * truncation has made it to disk. smgrtruncate() writes its own WAL + * record, but that's not enough to zero out the last remaining FSM page. 
+ * (if we didn't need to zero out anything above, we can skip this) + */ + if (!rel->rd_istemp && !InRecovery && first_removed_slot != 0) { - elog(LOG, "could not write \"%s\": %m", FSM_CACHE_FILENAME); - return; - } + xl_fsm_truncate xlrec; + XLogRecData rdata; + XLogRecPtr recptr; - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); + xlrec.node = rel->rd_node; + xlrec.nheapblocks = nblocks; - /* Write file header */ - MemSet(&header, 0, sizeof(header)); - strcpy(header.label, FSM_CACHE_LABEL); - header.endian = FSM_CACHE_ENDIAN; - header.version = FSM_CACHE_VERSION; - header.numRels = FreeSpaceMap->numRels; - if (fwrite(&header, 1, sizeof(header), fp) != sizeof(header)) - goto write_failed; + rdata.data = (char *) &xlrec; + rdata.len = sizeof(xl_fsm_truncate); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; - /* For each relation, in order from least to most recently used... */ - for (fsmrel = FreeSpaceMap->usageListTail; - fsmrel != NULL; - fsmrel = fsmrel->priorUsage) - { - FsmCacheRelHeader relheader; - int nPages; - - /* Write relation header */ - MemSet(&relheader, 0, sizeof(relheader)); - relheader.key = fsmrel->key; - relheader.isIndex = fsmrel->isIndex; - relheader.avgRequest = fsmrel->avgRequest; - relheader.interestingPages = fsmrel->interestingPages; - relheader.storedPages = fsmrel->storedPages; - if (fwrite(&relheader, 1, sizeof(relheader), fp) != sizeof(relheader)) - goto write_failed; - - /* Write the per-page data directly from the arena */ - nPages = fsmrel->storedPages; - if (nPages > 0) - { - Size len; - char *data; - - if (fsmrel->isIndex) - len = nPages * sizeof(IndexFSMPageData); - else - len = nPages * sizeof(FSMPageData); - data = (char *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - if (fwrite(data, 1, len, fp) != len) - goto write_failed; - } - } - - /* Clean up */ - LWLockRelease(FreeSpaceLock); - - if (FreeFile(fp)) - { - elog(LOG, "could not write \"%s\": %m", FSM_CACHE_FILENAME); - /* Remove busted cache file */ - 
unlink(FSM_CACHE_FILENAME); - } - - return; - -write_failed: - elog(LOG, "could not write \"%s\": %m", FSM_CACHE_FILENAME); - - /* Clean up */ - LWLockRelease(FreeSpaceLock); - - FreeFile(fp); - - /* Remove busted cache file */ - unlink(FSM_CACHE_FILENAME); -} - -/* - * LoadFreeSpaceMap - load contents of FSM from a disk file - * - * This is expected to be called during database startup, before any FSM - * updates begin. We lock the FreeSpaceLock but that's purely pro - * forma --- if anyone else is accessing FSM yet, there's a problem. - * - * Notes: no complaint is issued if no cache file is found. If the file is - * found, it is deleted after reading. Thus, if we crash without a clean - * shutdown, the next cycle of life starts with no FSM data. To do otherwise, - * we'd need to do significantly more validation in this routine, because of - * the likelihood that what is in the dump file would be out-of-date, eg - * there might be entries for deleted or truncated rels. - */ -void -LoadFreeSpaceMap(void) -{ - FILE *fp; - FsmCacheFileHeader header; - int relno; - - /* Try to open file */ - fp = AllocateFile(FSM_CACHE_FILENAME, PG_BINARY_R); - if (fp == NULL) - { - if (errno != ENOENT) - elog(LOG, "could not read \"%s\": %m", FSM_CACHE_FILENAME); - return; - } - - LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE); - - /* Read and verify file header */ - if (fread(&header, 1, sizeof(header), fp) != sizeof(header) || - strcmp(header.label, FSM_CACHE_LABEL) != 0 || - header.endian != FSM_CACHE_ENDIAN || - header.version != FSM_CACHE_VERSION || - header.numRels < 0) - { - elog(LOG, "bogus file header in \"%s\"", FSM_CACHE_FILENAME); - goto read_failed; - } - - /* For each relation, in order from least to most recently used... 
*/ - for (relno = 0; relno < header.numRels; relno++) - { - FsmCacheRelHeader relheader; - Size len; - char *data; - FSMRelation *fsmrel; - int nPages; - int curAlloc; - int curAllocPages; - - /* Read and verify relation header, as best we can */ - if (fread(&relheader, 1, sizeof(relheader), fp) != sizeof(relheader) || - (relheader.isIndex != false && relheader.isIndex != true) || - relheader.avgRequest >= BLCKSZ || - relheader.storedPages < 0) - { - elog(LOG, "bogus rel header in \"%s\"", FSM_CACHE_FILENAME); - goto read_failed; - } - - /* Read the per-page data */ - nPages = relheader.storedPages; - if (relheader.isIndex) - len = nPages * sizeof(IndexFSMPageData); - else - len = nPages * sizeof(FSMPageData); - data = (char *) palloc(len); - if (fread(data, 1, len, fp) != len) - { - elog(LOG, "premature EOF in \"%s\"", FSM_CACHE_FILENAME); - pfree(data); - goto read_failed; - } + recptr = XLogInsert(RM_FREESPACE_ID, XLOG_FSM_TRUNCATE, &rdata); /* - * Okay, create the FSM entry and insert data into it. Since the rels - * were stored in reverse usage order, at the end of the loop they - * will be correctly usage-ordered in memory; and if MaxFSMRelations - * is less than it used to be, we will correctly drop the least - * recently used ones. + * Flush, because otherwise the truncation of the main relation + * might hit the disk before the WAL record of truncating the + * FSM is flushed. If we crashed during that window, we'd be + * left with a truncated heap, without a truncated FSM. */ - fsmrel = create_fsm_rel(&relheader.key); - fsmrel->avgRequest = relheader.avgRequest; - - curAlloc = realloc_fsm_rel(fsmrel, relheader.interestingPages, - relheader.isIndex); - if (relheader.isIndex) - { - IndexFSMPageData *newLocation; - - curAllocPages = curAlloc * INDEXCHUNKPAGES; - - /* - * If the data fits in our current allocation, just copy it; - * otherwise must compress. But compression is easy: we merely - * forget extra pages. 
- */ - newLocation = (IndexFSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - if (nPages > curAllocPages) - nPages = curAllocPages; - memcpy(newLocation, data, nPages * sizeof(IndexFSMPageData)); - fsmrel->storedPages = nPages; - } - else - { - FSMPageData *newLocation; - - curAllocPages = curAlloc * CHUNKPAGES; - - /* - * If the data fits in our current allocation, just copy it; - * otherwise must compress. - */ - newLocation = (FSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - if (nPages <= curAllocPages) - { - memcpy(newLocation, data, nPages * sizeof(FSMPageData)); - fsmrel->storedPages = nPages; - } - else - { - pack_existing_pages(newLocation, curAllocPages, - (FSMPageData *) data, nPages); - fsmrel->storedPages = curAllocPages; - } - } - - pfree(data); + XLogFlush(recptr); } -read_failed: - - /* Clean up */ - LWLockRelease(FreeSpaceLock); - - FreeFile(fp); - - /* Remove cache file before it can become stale; see notes above */ - unlink(FSM_CACHE_FILENAME); -} - - -/* - * Internal routines. These all assume the caller holds the FreeSpaceLock. - */ - -/* - * Lookup a relation in the hash table. If not present, return NULL. - * - * The relation's position in the LRU list is not changed. - */ -static FSMRelation * -lookup_fsm_rel(RelFileNode *rel) -{ - FSMRelation *fsmrel; - - fsmrel = (FSMRelation *) hash_search(FreeSpaceMapRelHash, - (void *) rel, - HASH_FIND, - NULL); - if (!fsmrel) - return NULL; - - return fsmrel; + /* + * Need to invalidate the relcache entry, because rd_fsm_nblocks_cache + * seen by other backends is no longer valid. + */ + if (!InRecovery) + CacheInvalidateRelcache(rel); + rel->rd_fsm_nblocks_cache = new_nfsmblocks; } /* - * Lookup a relation in the hash table, creating an entry if not present. - * - * On successful lookup, the relation is moved to the front of the LRU list. 
+ * FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM */ -static FSMRelation * -create_fsm_rel(RelFileNode *rel) +void +FreeSpaceMapVacuum(Relation rel) { - FSMRelation *fsmrel; - bool found; + bool dummy; - fsmrel = (FSMRelation *) hash_search(FreeSpaceMapRelHash, - (void *) rel, - HASH_ENTER, - &found); + /* + * Traverse the tree in depth-first order. The tree is stored physically + * in depth-first order, so this should be pretty I/O efficient. + */ + fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy); +} - if (!found) - { - /* New hashtable entry, initialize it (hash_search set the key) */ - fsmrel->isIndex = false; /* until we learn different */ - fsmrel->avgRequest = INITIAL_AVERAGE; - fsmrel->interestingPages = 0; - fsmrel->firstChunk = -1; /* no space allocated */ - fsmrel->storedPages = 0; - fsmrel->nextPage = 0; +/******** Internal routines ********/ - /* Discard lowest-priority existing rel, if we are over limit */ - if (FreeSpaceMap->numRels >= MaxFSMRelations) - delete_fsm_rel(FreeSpaceMap->usageListTail); +/* + * Return category corresponding to x bytes of free space + */ +static uint8 +fsm_space_avail_to_cat(Size avail) +{ + int cat; - /* Add new entry at front of LRU list */ - link_fsm_rel_usage(fsmrel); - fsmrel->nextPhysical = NULL; /* not in physical-storage list */ - fsmrel->priorPhysical = NULL; - FreeSpaceMap->numRels++; - /* sumRequests is unchanged because request must be zero */ - } + Assert(avail < BLCKSZ); + + if (avail >= MaxFSMRequestSize) + return 255; + + cat = avail / FSM_CAT_STEP; + + /* + * The highest category, 255, is reserved for MaxFSMRequestSize bytes or + * more. + */ + if (cat > 254) + cat = 254; + + return (uint8) cat; +} + +/* + * Return the lower bound of the range of free space represented by given + * category. + */ +static Size +fsm_space_cat_to_avail(uint8 cat) +{ + /* The highest category represents exactly MaxFSMRequestSize bytes.
*/ + if (cat == 255) + return MaxFSMRequestSize; else - { - /* Existing entry, move to front of LRU list */ - if (fsmrel->priorUsage != NULL) - { - unlink_fsm_rel_usage(fsmrel); - link_fsm_rel_usage(fsmrel); - } - } - - return fsmrel; + return cat * FSM_CAT_STEP; } /* - * Remove an existing FSMRelation entry. + * Which category does a page need to have, to accommodate x bytes of data? + * While fsm_size_to_avail_cat() rounds down, this needs to round up. + */ +static uint8 +fsm_space_needed_to_cat(Size needed) +{ + int cat; + + /* Can't ask for more space than the highest category represents */ + if (needed > MaxFSMRequestSize) + elog(ERROR, "invalid FSM request size %d", needed); + + if (needed == 0) + return 1; + + cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP; + + if (cat > 255) + cat = 255; + + return (uint8) cat; +} + +/* + * Returns the physical block number an FSM page + */ +static BlockNumber +fsm_logical_to_physical(FSMAddress addr) +{ + BlockNumber pages; + int leafno; + int l; + + /* + * Calculate the logical page number of the first leaf page below the + * given page. + */ + leafno = addr.logpageno; + for (l = 0; l < addr.level; l++) + leafno *= SlotsPerFSMPage; + + /* Count upper level nodes required to address the leaf page */ + pages = 0; + for (l = 0; l < FSM_TREE_DEPTH; l++) + { + pages += leafno + 1; + leafno /= SlotsPerFSMPage; + } + + /* + * If the page we were asked for wasn't at the bottom level, subtract + * the additional lower level pages we counted above. + */ + pages -= addr.level; + + /* Turn the page count into 0-based block number */ + return pages - 1; +} + +/* + * Return the FSM location corresponding to given heap block. 
+ */ +static FSMAddress +fsm_get_location(BlockNumber heapblk, uint16 *slot) +{ + FSMAddress addr; + + addr.level = FSM_BOTTOM_LEVEL; + addr.logpageno = heapblk / SlotsPerFSMPage; + *slot = heapblk % SlotsPerFSMPage; + + return addr; +} + +/* + * Return the heap block number corresponding to given location in the FSM. + */ +static BlockNumber +fsm_get_heap_blk(FSMAddress addr, uint16 slot) +{ + Assert(addr.level == FSM_BOTTOM_LEVEL); + return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot; +} + +/* + * Given a logical address of a child page, get the logical page number of + * the parent, and the slot within the parent corresponding to the child. + */ +static FSMAddress +fsm_get_parent(FSMAddress child, uint16 *slot) +{ + FSMAddress parent; + + Assert(child.level < FSM_ROOT_LEVEL); + + parent.level = child.level + 1; + parent.logpageno = child.logpageno / SlotsPerFSMPage; + *slot = child.logpageno % SlotsPerFSMPage; + + return parent; +} + +/* + * Given a logical address of a parent page, and a slot number get the + * logical address of the corresponding child page. + */ +static FSMAddress +fsm_get_child(FSMAddress parent, uint16 slot) +{ + FSMAddress child; + + Assert(parent.level > FSM_BOTTOM_LEVEL); + + child.level = parent.level - 1; + child.logpageno = parent.logpageno * SlotsPerFSMPage + slot; + + return child; +} + +/* + * Read a FSM page. + * + * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is + * true, the FSM file is extended. 
+ */ +static Buffer +fsm_readbuf(Relation rel, FSMAddress addr, bool extend) +{ + BlockNumber blkno = fsm_logical_to_physical(addr); + + RelationOpenSmgr(rel); + + if (rel->rd_fsm_nblocks_cache == InvalidBlockNumber || + rel->rd_fsm_nblocks_cache <= blkno) + rel->rd_fsm_nblocks_cache = smgrnblocks(rel->rd_smgr, FSM_FORKNUM); + + if (blkno >= rel->rd_fsm_nblocks_cache) + { + if (extend) + fsm_extend(rel, blkno + 1); + else + return InvalidBuffer; + } + return ReadBufferWithFork(rel, FSM_FORKNUM, blkno); +} + +/* + * Ensure that the FSM fork is at least n_fsmblocks long, extending + * it if necessary with empty pages. And by empty, I mean pages filled + * with zeros, meaning there's no free space. */ static void -delete_fsm_rel(FSMRelation *fsmrel) +fsm_extend(Relation rel, BlockNumber n_fsmblocks) { - FSMRelation *result; + BlockNumber n_fsmblocks_now; + Page pg; - FreeSpaceMap->sumRequests -= fsm_calc_request(fsmrel); - unlink_fsm_rel_usage(fsmrel); - unlink_fsm_rel_storage(fsmrel); - FreeSpaceMap->numRels--; - result = (FSMRelation *) hash_search(FreeSpaceMapRelHash, - (void *) &(fsmrel->key), - HASH_REMOVE, - NULL); - if (!result) - elog(ERROR, "FreeSpaceMap hashtable corrupted"); + pg = (Page) palloc(BLCKSZ); + PageInit(pg, BLCKSZ, 0); + + /* + * We use the relation extension lock to lock out other backends + * trying to extend the FSM at the same time. It also locks out + * extension of the main fork, unnecessarily, but extending the + * FSM happens seldom enough that it doesn't seem worthwhile to + * have a separate lock tag type for it. + * + * Note that another backend might have extended the relation + * before we get the lock. 
+ */ + LockRelationForExtension(rel, ExclusiveLock); + + n_fsmblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM); + while (n_fsmblocks_now < n_fsmblocks) + { + smgrextend(rel->rd_smgr, FSM_FORKNUM, n_fsmblocks_now, + (char *) pg, rel->rd_istemp); + n_fsmblocks_now++; + } + + UnlockRelationForExtension(rel, ExclusiveLock); + + pfree(pg); + + /* update the cache with the up-to-date size */ + rel->rd_fsm_nblocks_cache = n_fsmblocks_now; } /* - * Reallocate space for a FSMRelation. + * Set value in given FSM page and slot. * - * This is shared code for RecordRelationFreeSpace and RecordIndexFreeSpace. - * The return value is the actual new allocation, in chunks. + * If minValue > 0, the updated page is also searched for a page with at + * least minValue of free space. If one is found, its slot number is + * returned, -1 otherwise. */ static int -realloc_fsm_rel(FSMRelation *fsmrel, BlockNumber interestingPages, - bool isIndex) +fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, + uint8 newValue, uint8 minValue) { - int myRequest; - int myAlloc; - int curAlloc; + Buffer buf; + Page page; + int newslot = -1; - /* - * Delete any existing entries, and update request status. - */ - fsmrel->storedPages = 0; - FreeSpaceMap->sumRequests -= fsm_calc_request(fsmrel); - fsmrel->interestingPages = interestingPages; - fsmrel->isIndex = isIndex; - myRequest = fsm_calc_request(fsmrel); - FreeSpaceMap->sumRequests += myRequest; - myAlloc = fsm_calc_target_allocation(myRequest); + buf = fsm_readbuf(rel, addr, true); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - /* - * Need to reallocate space if (a) my target allocation is more than my - * current allocation, AND (b) my actual immediate need (myRequest+1 - * chunks) is more than my current allocation. Otherwise just store the - * new data in-place. 
- */ - curAlloc = fsm_current_allocation(fsmrel); - if (myAlloc > curAlloc && (myRequest + 1) > curAlloc && interestingPages > 0) + page = BufferGetPage(buf); + + if (fsm_set_avail(page, slot, newValue)) + MarkBufferDirty(buf); + + if (minValue != 0) { - /* Remove entry from storage list, and compact */ - unlink_fsm_rel_storage(fsmrel); - compact_fsm_storage(); - /* Reattach to end of storage list */ - link_fsm_rel_storage(fsmrel); - /* And allocate storage */ - fsmrel->firstChunk = FreeSpaceMap->usedChunks; - FreeSpaceMap->usedChunks += myAlloc; - curAlloc = myAlloc; - /* Watch out for roundoff error */ - if (FreeSpaceMap->usedChunks > FreeSpaceMap->totalChunks) - { - FreeSpaceMap->usedChunks = FreeSpaceMap->totalChunks; - curAlloc = FreeSpaceMap->totalChunks - fsmrel->firstChunk; - } + /* Search while we still hold the lock */ + newslot = fsm_search_avail(buf, minValue, + addr.level == FSM_BOTTOM_LEVEL, + true); } - return curAlloc; + + UnlockReleaseBuffer(buf); + + return newslot; } /* - * Link a FSMRelation into the LRU list (always at the head). - */ -static void -link_fsm_rel_usage(FSMRelation *fsmrel) -{ - fsmrel->priorUsage = NULL; - fsmrel->nextUsage = FreeSpaceMap->usageList; - FreeSpaceMap->usageList = fsmrel; - if (fsmrel->nextUsage != NULL) - fsmrel->nextUsage->priorUsage = fsmrel; - else - FreeSpaceMap->usageListTail = fsmrel; -} - -/* - * Delink a FSMRelation from the LRU list. - */ -static void -unlink_fsm_rel_usage(FSMRelation *fsmrel) -{ - if (fsmrel->priorUsage != NULL) - fsmrel->priorUsage->nextUsage = fsmrel->nextUsage; - else - FreeSpaceMap->usageList = fsmrel->nextUsage; - if (fsmrel->nextUsage != NULL) - fsmrel->nextUsage->priorUsage = fsmrel->priorUsage; - else - FreeSpaceMap->usageListTail = fsmrel->priorUsage; - - /* - * We don't bother resetting fsmrel's links, since it's about to be - * deleted or relinked at the head. - */ -} - -/* - * Link a FSMRelation into the storage-order list (always at the tail). 
- */ -static void -link_fsm_rel_storage(FSMRelation *fsmrel) -{ - fsmrel->nextPhysical = NULL; - fsmrel->priorPhysical = FreeSpaceMap->lastRel; - if (FreeSpaceMap->lastRel != NULL) - FreeSpaceMap->lastRel->nextPhysical = fsmrel; - else - FreeSpaceMap->firstRel = fsmrel; - FreeSpaceMap->lastRel = fsmrel; -} - -/* - * Delink a FSMRelation from the storage-order list, if it's in it. - */ -static void -unlink_fsm_rel_storage(FSMRelation *fsmrel) -{ - if (fsmrel->priorPhysical != NULL || FreeSpaceMap->firstRel == fsmrel) - { - if (fsmrel->priorPhysical != NULL) - fsmrel->priorPhysical->nextPhysical = fsmrel->nextPhysical; - else - FreeSpaceMap->firstRel = fsmrel->nextPhysical; - if (fsmrel->nextPhysical != NULL) - fsmrel->nextPhysical->priorPhysical = fsmrel->priorPhysical; - else - FreeSpaceMap->lastRel = fsmrel->priorPhysical; - } - /* mark as not in list, since we may not put it back immediately */ - fsmrel->nextPhysical = NULL; - fsmrel->priorPhysical = NULL; - /* Also mark it as having no storage */ - fsmrel->firstChunk = -1; - fsmrel->storedPages = 0; -} - -/* - * Look to see if a page with at least the specified amount of space is - * available in the given FSMRelation. If so, return its page number, - * and advance the nextPage counter so that the next inquiry will return - * a different page if possible; also update the entry to show that the - * requested space is not available anymore. Return InvalidBlockNumber - * if no success. 
+ * Search the tree for a heap page with at least min_cat of free space */ static BlockNumber -find_free_space(FSMRelation *fsmrel, Size spaceNeeded) +fsm_search(Relation rel, uint8 min_cat) { - FSMPageData *info; - int pagesToCheck, /* outer loop counter */ - pageIndex; /* current page index */ + int restarts = 0; + FSMAddress addr = FSM_ROOT_ADDRESS; - if (fsmrel->isIndex) - elog(ERROR, "find_free_space called for an index relation"); - info = (FSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - pageIndex = fsmrel->nextPage; - /* Last operation may have left nextPage pointing past end */ - if (pageIndex >= fsmrel->storedPages) - pageIndex = 0; - - for (pagesToCheck = fsmrel->storedPages; pagesToCheck > 0; pagesToCheck--) + for (;;) { - FSMPageData *page = info + pageIndex; - Size spaceAvail = FSMPageGetSpace(page); + int slot; + Buffer buf; + uint8 max_avail; - /* Check this page */ - if (spaceAvail >= spaceNeeded) + /* + * Read the FSM page. The root page is created if it doesn't exist + * yet, to save future searchers the effort of having to call + * smgrnblocks() in fsm_readbuf(), only to see that the FSM is + * completely empty. + */ + buf = fsm_readbuf(rel, addr, (addr.level != FSM_ROOT_LEVEL)); + + /* Search within the page */ + if (BufferIsValid(buf)) + { + LockBuffer(buf, BUFFER_LOCK_SHARE); + slot = fsm_search_avail(buf, min_cat, + (addr.level == FSM_BOTTOM_LEVEL), + false); + if (slot == -1) + max_avail = fsm_get_max_avail(BufferGetPage(buf)); + UnlockReleaseBuffer(buf); + } + else + { + slot = -1; + max_avail = 0; + } + + if (slot != -1) { /* - * Found what we want --- adjust the entry, and update nextPage. + * Descend the tree, or return the found block if we're at the + * bottom. 
*/ - FSMPageSetSpace(page, spaceAvail - spaceNeeded); - fsmrel->nextPage = pageIndex + 1; - return FSMPageGetPageNum(page); + if (addr.level == FSM_BOTTOM_LEVEL) + return fsm_get_heap_blk(addr, slot); + + addr = fsm_get_child(addr, slot); } - /* Advance pageIndex, wrapping around if needed */ - if (++pageIndex >= fsmrel->storedPages) - pageIndex = 0; - } - - return InvalidBlockNumber; /* nothing found */ -} - -/* - * As above, but for index case --- we only deal in whole pages. - */ -static BlockNumber -find_index_free_space(FSMRelation *fsmrel) -{ - IndexFSMPageData *info; - BlockNumber result; - - /* - * If isIndex isn't set, it could be that RecordIndexFreeSpace() has never - * yet been called on this relation, and we're still looking at the - * default setting from create_fsm_rel(). If so, just act as though - * there's no space. - */ - if (!fsmrel->isIndex) - { - if (fsmrel->storedPages == 0) + else if (addr.level == FSM_ROOT_LEVEL) + { + /* + * At the root, failure means there's no page with enough free + * space in the FSM. Give up. + */ return InvalidBlockNumber; - elog(ERROR, "find_index_free_space called for a non-index relation"); - } - - /* - * For indexes, there's no need for the nextPage state variable; we just - * remove and return the first available page. (We could save cycles here - * by returning the last page, but it seems better to encourage re-use of - * lower-numbered pages.) - */ - if (fsmrel->storedPages <= 0) - return InvalidBlockNumber; /* no pages available */ - info = (IndexFSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - result = IndexFSMPageGetPageNum(info); - fsmrel->storedPages--; - memmove(info, info + 1, fsmrel->storedPages * sizeof(IndexFSMPageData)); - return result; -} - -/* - * fsm_record_free_space - guts of RecordFreeSpace operation (now only - * provided as part of RecordAndGetPageWithFreeSpace). 
- */ -static void -fsm_record_free_space(FSMRelation *fsmrel, BlockNumber page, Size spaceAvail) -{ - int pageIndex; - - if (fsmrel->isIndex) - elog(ERROR, "fsm_record_free_space called for an index relation"); - if (lookup_fsm_page_entry(fsmrel, page, &pageIndex)) - { - /* Found an existing entry for page; update it */ - FSMPageData *info; - - info = (FSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - info += pageIndex; - FSMPageSetSpace(info, spaceAvail); - } - else - { - /* - * No existing entry; ignore the call. We used to add the page to the - * FSM --- but in practice, if the page hasn't got enough space to - * satisfy the caller who's kicking it back to us, then it's probably - * uninteresting to everyone else as well. - */ - } -} - -/* - * Look for an entry for a specific page (block number) in a FSMRelation. - * Returns TRUE if a matching entry exists, else FALSE. - * - * The output argument *outPageIndex is set to indicate where the entry exists - * (if TRUE result) or could be inserted (if FALSE result). 
- */ -static bool -lookup_fsm_page_entry(FSMRelation *fsmrel, BlockNumber page, - int *outPageIndex) -{ - /* Check for empty relation */ - if (fsmrel->storedPages <= 0) - { - *outPageIndex = 0; - return false; - } - - /* Do binary search */ - if (fsmrel->isIndex) - { - IndexFSMPageData *info; - int low, - high; - - info = (IndexFSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - low = 0; - high = fsmrel->storedPages - 1; - while (low <= high) - { - int middle; - BlockNumber probe; - - middle = low + (high - low) / 2; - probe = IndexFSMPageGetPageNum(info + middle); - if (probe == page) - { - *outPageIndex = middle; - return true; - } - else if (probe < page) - low = middle + 1; - else - high = middle - 1; } - *outPageIndex = low; - return false; - } - else - { - FSMPageData *info; - int low, - high; - - info = (FSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - low = 0; - high = fsmrel->storedPages - 1; - while (low <= high) - { - int middle; - BlockNumber probe; - - middle = low + (high - low) / 2; - probe = FSMPageGetPageNum(info + middle); - if (probe == page) - { - *outPageIndex = middle; - return true; - } - else if (probe < page) - low = middle + 1; - else - high = middle - 1; - } - *outPageIndex = low; - return false; - } -} - -/* - * Re-pack the FSM storage arena, dropping data if necessary to meet the - * current allocation target for each relation. At conclusion, all available - * space in the arena will be coalesced at the end. 
- */ -static void -compact_fsm_storage(void) -{ - int nextChunkIndex = 0; - bool did_push = false; - FSMRelation *fsmrel; - - for (fsmrel = FreeSpaceMap->firstRel; - fsmrel != NULL; - fsmrel = fsmrel->nextPhysical) - { - int newAlloc; - int newAllocPages; - int newChunkIndex; - int oldChunkIndex; - int curChunks; - char *newLocation; - char *oldLocation; - - /* - * Calculate target allocation, make sure we don't overrun due to - * roundoff error - */ - newAlloc = fsm_calc_target_allocation(fsm_calc_request(fsmrel)); - if (newAlloc > FreeSpaceMap->totalChunks - nextChunkIndex) - newAlloc = FreeSpaceMap->totalChunks - nextChunkIndex; - if (fsmrel->isIndex) - newAllocPages = newAlloc * INDEXCHUNKPAGES; else - newAllocPages = newAlloc * CHUNKPAGES; - - /* - * Determine current size, current and new locations - */ - curChunks = fsm_current_chunks(fsmrel); - oldChunkIndex = fsmrel->firstChunk; - oldLocation = FreeSpaceMap->arena + oldChunkIndex * CHUNKBYTES; - newChunkIndex = nextChunkIndex; - newLocation = FreeSpaceMap->arena + newChunkIndex * CHUNKBYTES; - - /* - * It's possible that we have to move data down, not up, if the - * allocations of previous rels expanded. This normally means that - * our allocation expanded too (or at least got no worse), and ditto - * for later rels. So there should be room to move all our data down - * without dropping any --- but we might have to push down following - * rels to acquire the room. We don't want to do the push more than - * once, so pack everything against the end of the arena if so. - * - * In corner cases where we are on the short end of a roundoff choice - * that we were formerly on the long end of, it's possible that we - * have to move down and compress our data too. In fact, even after - * pushing down the following rels, there might not be as much space - * as we computed for this rel above --- that would imply that some - * following rel(s) are also on the losing end of roundoff choices. 
We - * could handle this fairly by doing the per-rel compactions - * out-of-order, but that seems like way too much complexity to deal - * with a very infrequent corner case. Instead, we simply drop pages - * from the end of the current rel's data until it fits. - */ - if (newChunkIndex > oldChunkIndex) { - int limitChunkIndex; + uint16 parentslot; + FSMAddress parent; - if (newAllocPages < fsmrel->storedPages) - { - /* move and compress --- just drop excess pages */ - fsmrel->storedPages = newAllocPages; - curChunks = fsm_current_chunks(fsmrel); - } - /* is there enough space? */ - if (fsmrel->nextPhysical != NULL) - limitChunkIndex = fsmrel->nextPhysical->firstChunk; - else - limitChunkIndex = FreeSpaceMap->totalChunks; - if (newChunkIndex + curChunks > limitChunkIndex) - { - /* not enough space, push down following rels */ - if (!did_push) - { - push_fsm_rels_after(fsmrel); - did_push = true; - } - /* now is there enough space? */ - if (fsmrel->nextPhysical != NULL) - limitChunkIndex = fsmrel->nextPhysical->firstChunk; - else - limitChunkIndex = FreeSpaceMap->totalChunks; - if (newChunkIndex + curChunks > limitChunkIndex) - { - /* uh-oh, forcibly cut the allocation to fit */ - newAlloc = limitChunkIndex - newChunkIndex; - - /* - * If newAlloc < 0 at this point, we are moving the rel's - * firstChunk into territory currently assigned to a later - * rel. This is okay so long as we do not copy any data. - * The rels will be back in nondecreasing firstChunk order - * at completion of the compaction pass. - */ - if (newAlloc < 0) - newAlloc = 0; - if (fsmrel->isIndex) - newAllocPages = newAlloc * INDEXCHUNKPAGES; - else - newAllocPages = newAlloc * CHUNKPAGES; - fsmrel->storedPages = newAllocPages; - curChunks = fsm_current_chunks(fsmrel); - } - } - memmove(newLocation, oldLocation, curChunks * CHUNKBYTES); - } - else if (newAllocPages < fsmrel->storedPages) - { /* - * Need to compress the page data. 
For an index, "compression" - * just means dropping excess pages; otherwise we try to keep the - * ones with the most space. + * At lower level, failure can happen if the value in the upper- + * level node didn't reflect the value on the lower page. Update + * the upper node, to avoid falling into the same trap again, and + * start over. + * + * There's a race condition here, if another backend updates this + * page right after we release it, and gets the lock on the parent + * page before us. We'll then update the parent page with the now + * stale information we had. It's OK, because it should happen + * rarely, and will be fixed by the next vacuum. */ - if (fsmrel->isIndex) - { - fsmrel->storedPages = newAllocPages; - /* may need to move data */ - if (newChunkIndex != oldChunkIndex) - memmove(newLocation, oldLocation, newAlloc * CHUNKBYTES); - } - else - { - pack_existing_pages((FSMPageData *) newLocation, - newAllocPages, - (FSMPageData *) oldLocation, - fsmrel->storedPages); - fsmrel->storedPages = newAllocPages; - } - } - else if (newChunkIndex != oldChunkIndex) - { + parent = fsm_get_parent(addr, &parentslot); + fsm_set_and_search(rel, parent, parentslot, max_avail, 0); + /* - * No compression needed, but must copy the data up + * If the upper pages are badly out of date, we might need to + * loop quite a few times, updating them as we go. Any + * inconsistencies should eventually be corrected and the loop + * should end. Looping indefinitely is nevertheless scary, so + * provide an emergency valve. */ - memmove(newLocation, oldLocation, curChunks * CHUNKBYTES); + if (restarts++ > 10000) + return InvalidBlockNumber; + + /* Start search all over from the root */ + addr = FSM_ROOT_ADDRESS; } - fsmrel->firstChunk = newChunkIndex; - nextChunkIndex += newAlloc; } - Assert(nextChunkIndex <= FreeSpaceMap->totalChunks); - FreeSpaceMap->usedChunks = nextChunkIndex; } -/* - * Push all FSMRels physically after afterRel to the end of the storage arena. 
- * - * We sometimes have to do this when deletion or truncation of a relation - * causes the allocations of remaining rels to expand markedly. We must - * temporarily push existing data down to the end so that we can move it - * back up in an orderly fashion. - */ -static void -push_fsm_rels_after(FSMRelation *afterRel) -{ - int nextChunkIndex = FreeSpaceMap->totalChunks; - FSMRelation *fsmrel; - - FreeSpaceMap->usedChunks = FreeSpaceMap->totalChunks; - - for (fsmrel = FreeSpaceMap->lastRel; - fsmrel != NULL; - fsmrel = fsmrel->priorPhysical) - { - int chunkCount; - int newChunkIndex; - int oldChunkIndex; - char *newLocation; - char *oldLocation; - - if (fsmrel == afterRel) - break; - - chunkCount = fsm_current_chunks(fsmrel); - nextChunkIndex -= chunkCount; - newChunkIndex = nextChunkIndex; - oldChunkIndex = fsmrel->firstChunk; - if (newChunkIndex < oldChunkIndex) - { - /* we're pushing down, how can it move up? */ - elog(PANIC, "inconsistent entry sizes in FSM"); - } - else if (newChunkIndex > oldChunkIndex) - { - /* need to move it */ - newLocation = FreeSpaceMap->arena + newChunkIndex * CHUNKBYTES; - oldLocation = FreeSpaceMap->arena + oldChunkIndex * CHUNKBYTES; - memmove(newLocation, oldLocation, chunkCount * CHUNKBYTES); - fsmrel->firstChunk = newChunkIndex; - } - } - Assert(nextChunkIndex >= 0); -} /* - * Pack a set of per-page freespace data into a smaller amount of space. - * - * The method is to compute a low-resolution histogram of the free space - * amounts, then determine which histogram bin contains the break point. - * We then keep all pages above that bin, none below it, and just enough - * of the pages in that bin to fill the output area exactly. 
+ * Recursive guts of FreeSpaceMapVacuum */ -#define HISTOGRAM_BINS 64 - -static void -pack_incoming_pages(FSMPageData *newLocation, int newPages, - FSMPageData *pageSpaces, int nPages) +static uint8 +fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) { - int histogram[HISTOGRAM_BINS]; - int above, - binct, - i; - Size thresholdL, - thresholdU; + Buffer buf; + Page page; + uint8 max_avail; - Assert(newPages < nPages); /* else I shouldn't have been called */ - /* Build histogram */ - MemSet(histogram, 0, sizeof(histogram)); - for (i = 0; i < nPages; i++) + /* Read the page if it exists, or return EOF */ + buf = fsm_readbuf(rel, addr, false); + if (!BufferIsValid(buf)) { - Size avail = FSMPageGetSpace(&pageSpaces[i]); - - if (avail >= BLCKSZ) - elog(ERROR, "bogus freespace amount"); - avail /= (BLCKSZ / HISTOGRAM_BINS); - histogram[avail]++; - } - /* Find the breakpoint bin */ - above = 0; - for (i = HISTOGRAM_BINS - 1; i >= 0; i--) - { - int sum = above + histogram[i]; - - if (sum > newPages) - break; - above = sum; - } - Assert(i >= 0); - thresholdL = i * BLCKSZ / HISTOGRAM_BINS; /* low bound of bp bin */ - thresholdU = (i + 1) * BLCKSZ / HISTOGRAM_BINS; /* hi bound */ - binct = newPages - above; /* number to take from bp bin */ - /* And copy the appropriate data */ - for (i = 0; i < nPages; i++) - { - BlockNumber page = FSMPageGetPageNum(&pageSpaces[i]); - Size avail = FSMPageGetSpace(&pageSpaces[i]); - - /* Check caller provides sorted data */ - if (i > 0 && page <= FSMPageGetPageNum(&pageSpaces[i - 1])) - elog(ERROR, "free-space data is not in page order"); - /* Save this page? */ - if (avail >= thresholdU || - (avail >= thresholdL && (--binct >= 0))) - { - *newLocation = pageSpaces[i]; - newLocation++; - newPages--; - } - } - Assert(newPages == 0); -} - -/* - * Pack a set of per-page freespace data into a smaller amount of space. - * - * This is algorithmically identical to pack_incoming_pages(), but accepts - * a different input representation. 
Also, we assume the input data has - * previously been checked for validity (size in bounds, pages in order). - * - * Note: it is possible for the source and destination arrays to overlap. - * The caller is responsible for making sure newLocation is at lower addresses - * so that we can copy data moving forward in the arrays without problem. - */ -static void -pack_existing_pages(FSMPageData *newLocation, int newPages, - FSMPageData *oldLocation, int oldPages) -{ - int histogram[HISTOGRAM_BINS]; - int above, - binct, - i; - Size thresholdL, - thresholdU; - - Assert(newPages < oldPages); /* else I shouldn't have been called */ - /* Build histogram */ - MemSet(histogram, 0, sizeof(histogram)); - for (i = 0; i < oldPages; i++) - { - Size avail = FSMPageGetSpace(oldLocation + i); - - /* Shouldn't happen, but test to protect against stack clobber */ - if (avail >= BLCKSZ) - elog(ERROR, "bogus freespace amount"); - avail /= (BLCKSZ / HISTOGRAM_BINS); - histogram[avail]++; - } - /* Find the breakpoint bin */ - above = 0; - for (i = HISTOGRAM_BINS - 1; i >= 0; i--) - { - int sum = above + histogram[i]; - - if (sum > newPages) - break; - above = sum; - } - Assert(i >= 0); - thresholdL = i * BLCKSZ / HISTOGRAM_BINS; /* low bound of bp bin */ - thresholdU = (i + 1) * BLCKSZ / HISTOGRAM_BINS; /* hi bound */ - binct = newPages - above; /* number to take from bp bin */ - /* And copy the appropriate data */ - for (i = 0; i < oldPages; i++) - { - BlockNumber page = FSMPageGetPageNum(oldLocation + i); - Size avail = FSMPageGetSpace(oldLocation + i); - - /* Save this page? */ - if (avail >= thresholdU || - (avail >= thresholdL && (--binct >= 0))) - { - FSMPageSetPageNum(newLocation, page); - FSMPageSetSpace(newLocation, avail); - newLocation++; - newPages--; - } - } - Assert(newPages == 0); -} - -/* - * Calculate number of chunks "requested" by a rel. The "request" is - * anything beyond the rel's one guaranteed chunk. 
- * - * Rel's interestingPages and isIndex settings must be up-to-date when called. - * - * See notes at top of file for details. - */ -static int -fsm_calc_request(FSMRelation *fsmrel) -{ - int req; - - /* Convert page count to chunk count */ - if (fsmrel->isIndex) - { - /* test to avoid unsigned underflow at zero */ - if (fsmrel->interestingPages <= INDEXCHUNKPAGES) - return 0; - /* quotient will fit in int, even if interestingPages doesn't */ - req = (fsmrel->interestingPages - 1) / INDEXCHUNKPAGES; + *eof_p = true; + return 0; } else - { - if (fsmrel->interestingPages <= CHUNKPAGES) - return 0; - req = (fsmrel->interestingPages - 1) / CHUNKPAGES; - } + *eof_p = false; + + page = BufferGetPage(buf); /* - * We clamp the per-relation requests to at most half the arena size; this - * is intended to prevent a single bloated relation from crowding out FSM - * service for every other rel. + * Recurse into children, and fix the information stored about them + * at this level. */ - req = Min(req, FreeSpaceMap->totalChunks / 2); - - return req; -} - -/* - * Same as above, but without the clamp ... this is just intended for - * reporting the total space needed to store all information. 
- */ -static int -fsm_calc_request_unclamped(FSMRelation *fsmrel) -{ - int req; - - /* Convert page count to chunk count */ - if (fsmrel->isIndex) + if (addr.level > FSM_BOTTOM_LEVEL) { - /* test to avoid unsigned underflow at zero */ - if (fsmrel->interestingPages <= INDEXCHUNKPAGES) - return 0; - /* quotient will fit in int, even if interestingPages doesn't */ - req = (fsmrel->interestingPages - 1) / INDEXCHUNKPAGES; - } - else - { - if (fsmrel->interestingPages <= CHUNKPAGES) - return 0; - req = (fsmrel->interestingPages - 1) / CHUNKPAGES; + int slot; + bool eof = false; + + for (slot = 0; slot < SlotsPerFSMPage; slot++) + { + int child_avail; + + /* After we hit end-of-file, just clear the rest of the slots */ + if (!eof) + child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof); + else + child_avail = 0; + + /* Update information about the child */ + if (fsm_get_avail(page, slot) != child_avail) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + fsm_set_avail(BufferGetPage(buf), slot, child_avail); + MarkBufferDirty(buf); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + } } - return req; -} + max_avail = fsm_get_max_avail(BufferGetPage(buf)); -/* - * Calculate target allocation (number of chunks) for a rel - * - * Parameter is the result from fsm_calc_request(). The global sumRequests - * and numRels totals must be up-to-date already. - * - * See notes at top of file for details. - */ -static int -fsm_calc_target_allocation(int myRequest) -{ - double spareChunks; - int extra; + /* + * Reset the next slot pointer. This encourages the use of low-numbered + * pages, increasing the chances that a later vacuum can truncate the + * relation. 
+ */ + ((FSMPage) PageGetContents(page))->fp_next_slot = 0; - spareChunks = FreeSpaceMap->totalChunks - FreeSpaceMap->numRels; - Assert(spareChunks > 0); - if (spareChunks >= FreeSpaceMap->sumRequests) - { - /* We aren't oversubscribed, so allocate exactly the request */ - extra = myRequest; - } - else - { - extra = (int) rint(spareChunks * myRequest / FreeSpaceMap->sumRequests); - if (extra < 0) /* shouldn't happen, but make sure */ - extra = 0; - } - return 1 + extra; -} + ReleaseBuffer(buf); -/* - * Calculate number of chunks actually used to store current data - */ -static int -fsm_current_chunks(FSMRelation *fsmrel) -{ - int chunkCount; - - /* Make sure storedPages==0 produces right answer */ - if (fsmrel->storedPages <= 0) - return 0; - /* Convert page count to chunk count */ - if (fsmrel->isIndex) - chunkCount = (fsmrel->storedPages - 1) / INDEXCHUNKPAGES + 1; - else - chunkCount = (fsmrel->storedPages - 1) / CHUNKPAGES + 1; - return chunkCount; -} - -/* - * Calculate current actual allocation (number of chunks) for a rel - */ -static int -fsm_current_allocation(FSMRelation *fsmrel) -{ - if (fsmrel->nextPhysical != NULL) - return fsmrel->nextPhysical->firstChunk - fsmrel->firstChunk; - else if (fsmrel == FreeSpaceMap->lastRel) - return FreeSpaceMap->usedChunks - fsmrel->firstChunk; - else - { - /* it's not in the storage-order list */ - Assert(fsmrel->firstChunk < 0 && fsmrel->storedPages == 0); - return 0; - } + return max_avail; } -/* - * Return the FreeSpaceMap structure for examination. - */ -FSMHeader * -GetFreeSpaceMap(void) -{ +/****** WAL-logging ******/ - return FreeSpaceMap; -} - - -#ifdef FREESPACE_DEBUG -/* - * Dump contents of freespace map for debugging. - * - * We assume caller holds the FreeSpaceLock, or is otherwise unconcerned - * about other processes. 
- */ void -DumpFreeSpace(void) +fsm_redo(XLogRecPtr lsn, XLogRecord *record) { - FSMRelation *fsmrel; - FSMRelation *prevrel = NULL; - int relNum = 0; - int nPages; + uint8 info = record->xl_info & ~XLR_INFO_MASK; - for (fsmrel = FreeSpaceMap->usageList; fsmrel; fsmrel = fsmrel->nextUsage) + switch (info) { - relNum++; - fprintf(stderr, "Map %d: rel %u/%u/%u isIndex %d avgRequest %u interestingPages %u nextPage %d\nMap= ", - relNum, - fsmrel->key.spcNode, fsmrel->key.dbNode, fsmrel->key.relNode, - (int) fsmrel->isIndex, fsmrel->avgRequest, - fsmrel->interestingPages, fsmrel->nextPage); - if (fsmrel->isIndex) - { - IndexFSMPageData *page; - - page = (IndexFSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - for (nPages = 0; nPages < fsmrel->storedPages; nPages++) + case XLOG_FSM_TRUNCATE: { - fprintf(stderr, " %u", - IndexFSMPageGetPageNum(page)); - page++; - } - } - else - { - FSMPageData *page; + xl_fsm_truncate *xlrec; + Relation rel; - page = (FSMPageData *) - (FreeSpaceMap->arena + fsmrel->firstChunk * CHUNKBYTES); - for (nPages = 0; nPages < fsmrel->storedPages; nPages++) - { - fprintf(stderr, " %u:%u", - FSMPageGetPageNum(page), - FSMPageGetSpace(page)); - page++; + xlrec = (xl_fsm_truncate *) XLogRecGetData(record); + rel = CreateFakeRelcacheEntry(xlrec->node); + FreeSpaceMapTruncateRel(rel, xlrec->nheapblocks); + FreeFakeRelcacheEntry(rel); } - } - fprintf(stderr, "\n"); - /* Cross-check list links */ - if (prevrel != fsmrel->priorUsage) - fprintf(stderr, "DumpFreeSpace: broken list links\n"); - prevrel = fsmrel; + break; + default: + elog(PANIC, "fsm_redo: unknown op code %u", info); } - if (prevrel != FreeSpaceMap->usageListTail) - fprintf(stderr, "DumpFreeSpace: broken list links\n"); - /* Cross-check global counters */ - if (relNum != FreeSpaceMap->numRels) - fprintf(stderr, "DumpFreeSpace: %d rels in list, but numRels = %d\n", - relNum, FreeSpaceMap->numRels); } -#endif /* FREESPACE_DEBUG */ +void +fsm_desc(StringInfo buf, uint8 
xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_FSM_TRUNCATE: + { + xl_fsm_truncate *xlrec = (xl_fsm_truncate *) rec; + + appendStringInfo(buf, "truncate: rel %u/%u/%u; nheapblocks %u;", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->nheapblocks); + break; + } + default: + appendStringInfo(buf, "UNKNOWN"); + break; + } +} diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c new file mode 100644 index 0000000000..ce6f47e8b9 --- /dev/null +++ b/src/backend/storage/freespace/fsmpage.c @@ -0,0 +1,352 @@ +/*------------------------------------------------------------------------- + * + * fsmpage.c + * routines to search and manipulate one FSM page. + * + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/storage/freespace/fsmpage.c,v 1.1 2008/09/30 10:52:13 heikki Exp $ + * + * NOTES: + * + * The public functions in this file form an API that hides the internal + * structure of a FSM page. This allows freespace.c to treat each FSM page + * as a black box with SlotsPerFSMPage "slots". fsm_set_avail() and + * fsm_get_avail() let you set/get the value of a slot, and + * fsm_search_avail() lets you search for a slot with value >= X. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/fsm_internals.h" + +/* macros to navigate the tree within a page. */ +#define leftchild(x) (2 * (x) + 1) +#define rightchild(x) (2 * (x) + 2) +#define parentof(x) (((x) - 1) / 2) + +/* returns right sibling of x, wrapping around within the level */ +static int +rightsibling(int x) +{ + /* + * Move right. This might wrap around, stepping to the leftmost node at + * the next level.
+ */ + x++; + + /* + * Check if we stepped to the leftmost node at next level, and correct + * if so. The leftmost nodes at each level are of form x = 2^level - 1, so + * check if (x + 1) is a power of two. + */ + if (((x + 1) & x) == 0) + x = parentof(x); + + return x; +} + +/* + * Sets the value of a slot on page. Returns true if the page was + * modified. + * + * The caller must hold an exclusive lock on the page. + */ +bool +fsm_set_avail(Page page, int slot, uint8 value) +{ + int nodeno = NonLeafNodesPerPage + slot; + FSMPage fsmpage = (FSMPage) PageGetContents(page); + uint8 oldvalue; + + Assert(slot < LeafNodesPerPage); + + oldvalue = fsmpage->fp_nodes[nodeno]; + + /* If the value hasn't changed, we don't need to do anything */ + if (oldvalue == value && value <= fsmpage->fp_nodes[0]) + return false; + + fsmpage->fp_nodes[nodeno] = value; + + /* + * Propagate up, until we hit the root or a node that doesn't + * need to be updated. + */ + do + { + uint8 newvalue = 0; + int lchild; + int rchild; + + nodeno = parentof(nodeno); + lchild = leftchild(nodeno); + rchild = lchild + 1; + + newvalue = fsmpage->fp_nodes[lchild]; + if (rchild < NodesPerPage) + newvalue = Max(newvalue, + fsmpage->fp_nodes[rchild]); + + oldvalue = fsmpage->fp_nodes[nodeno]; + if (oldvalue == newvalue) + break; + + fsmpage->fp_nodes[nodeno] = newvalue; + } while (nodeno > 0); + + /* + * sanity check: if the new value is higher than the value + * at the top, the tree is corrupt. + */ + if (value > fsmpage->fp_nodes[0]) + fsm_rebuild_page(page); + + return true; +} + +/* + * Returns the value of given slot on page. + * + * Since this is just a read-only access of a single byte, the page doesn't + * need to be locked. + */ +uint8 +fsm_get_avail(Page page, int slot) +{ + FSMPage fsmpage = (FSMPage) PageGetContents(page); + + return fsmpage->fp_nodes[NonLeafNodesPerPage + slot]; +} + +/* + * Returns the value at the root of a page.
+ * Since this is just a read-only access of a single byte, the page doesn't + * need to be locked. + */ +uint8 +fsm_get_max_avail(Page page) +{ + FSMPage fsmpage = (FSMPage) PageGetContents(page); + return fsmpage->fp_nodes[0]; +} + +/* + * Searches for a slot with value of at least minvalue. Returns slot number, or -1 if + * none found. + * + * The caller must hold at least a shared lock on the page, and this + * function can unlock and lock the page again in exclusive mode if it + * needs to be updated. exclusive_lock_held should be set to true if the + * caller is already holding an exclusive lock, to avoid extra work. + * + * If advancenext is false, fp_next_slot is set to point to the returned + * slot, and if it's true, to the slot next to the returned slot. + */ +int +fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, + bool exclusive_lock_held) +{ + Page page = BufferGetPage(buf); + FSMPage fsmpage = (FSMPage) PageGetContents(page); + int nodeno; + int target; + uint16 slot; + + restart: + /* + * Check the root first, and exit quickly if there's no page with + * enough free space + */ + if (fsmpage->fp_nodes[0] < minvalue) + return -1; + + + /* fp_next_slot is just a hint, so check that it's sane */ + target = fsmpage->fp_next_slot; + if (target < 0 || target >= LeafNodesPerPage) + target = 0; + target += NonLeafNodesPerPage; + + /* + * Start the search from the target slot. At every step, move one + * node to the right, and climb up to the parent. Stop when we reach a + * node with enough free space. (note that moving to the right only + * makes a difference if we're on the right child of the parent) + * + * The idea is to gradually expand our "search triangle", that is, all + * nodes covered by the current node. In the beginning, just the target + * node is included, and more nodes to the right of the target node, + * taking wrap-around into account, are included at each step.
Nodes are + * added to the search triangle in left-to-right order, starting from + * the target node. This ensures that we'll find the first suitable node + * to the right of the target node, and not some other node with enough + * free space. + * + * For example, consider this tree: + * + * 7 + * 7 6 + * 5 7 6 5 + * 4 5 5 7 2 6 5 2 + * T + * + * Imagine that target node is the node indicated by the letter T, and + * we're searching for a node with value of 6 or higher. The search + * begins at T. At first iteration, we move to the right, and to the + * parent, arriving the rightmost 5. At the 2nd iteration, we move to the + * right, wrapping around, and climb up, arriving at the 7 at the 2nd + * level. 7 satisfies our search, so we descend down to the bottom, + * following the path of sevens. + */ + nodeno = target; + while (nodeno > 0) + { + if (fsmpage->fp_nodes[nodeno] >= minvalue) + break; + + /* + * Move to the right, wrapping around at the level if necessary, and + * climb up. + */ + nodeno = parentof(rightsibling(nodeno)); + } + + /* + * We're now at a node with enough free space, somewhere in the middle of + * the tree. Descend to the bottom, following a path with enough free + * space, preferring to move left if there's a choice. + */ + while (nodeno < NonLeafNodesPerPage) + { + int leftnodeno = leftchild(nodeno); + int rightnodeno = leftnodeno + 1; + bool leftok = (leftnodeno < NodesPerPage) && + (fsmpage->fp_nodes[leftnodeno] >= minvalue); + bool rightok = (rightnodeno < NodesPerPage) && + (fsmpage->fp_nodes[rightnodeno] >= minvalue); + + if (leftok) + nodeno = leftnodeno; + else if (rightok) + nodeno = rightnodeno; + else + { + /* + * Oops. The parent node promised that either left or right + * child has enough space, but neither actually did. This can + * happen in case of a "torn page", IOW if we crashed earlier + * while writing the page to disk, and only part of the page + * made it to disk. + * + * Fix the corruption and restart. 
+ */ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + + BufferGetTag(buf, &rnode, &forknum, &blknum); + elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", + blknum, rnode.spcNode, rnode.dbNode, rnode.relNode); + + /* make sure we hold an exclusive lock */ + if (!exclusive_lock_held) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + exclusive_lock_held = true; + } + fsm_rebuild_page(page); + MarkBufferDirty(buf); + goto restart; + } + } + + /* We're now at the bottom level, at a node with enough space. */ + slot = nodeno - NonLeafNodesPerPage; + + /* + * Update the next slot pointer. Note that we do this even if we're only + * holding a shared lock, on the grounds that it's better to use a shared + * lock and get a garbled next pointer every now and then, than take the + * concurrency hit of an exclusive lock. + * + * Wrap-around is handled at the beginning of this function. + */ + fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); + + return slot; +} + +/* + * Sets the available space to zero for all slots numbered >= nslots. + * Returns true if the page was modified. + */ +bool +fsm_truncate_avail(Page page, int nslots) +{ + FSMPage fsmpage = (FSMPage) PageGetContents(page); + uint8 *ptr; + bool changed = false; + + Assert(nslots >= 0 && nslots < LeafNodesPerPage); + + /* Clear all truncated leaf nodes */ + ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots]; + for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++) + { + if (*ptr != 0) + changed = true; + *ptr = 0; + } + + /* Fix upper nodes. */ + if (changed) + fsm_rebuild_page(page); + + return changed; +} + +/* + * Reconstructs the upper levels of a page. Returns true if the page + * was modified.
+ */ +bool +fsm_rebuild_page(Page page) +{ + FSMPage fsmpage = (FSMPage) PageGetContents(page); + bool changed = false; + int nodeno; + + /* + * Start from the lowest non-leaf level, at the last node, working our way + * backwards, through all non-leaf nodes at all levels, up to the root. + */ + for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--) + { + int lchild = leftchild(nodeno); + int rchild = lchild + 1; + uint8 newvalue = 0; + + if (lchild < NodesPerPage) + newvalue = fsmpage->fp_nodes[lchild]; + + if (rchild < NodesPerPage) + newvalue = Max(newvalue, + fsmpage->fp_nodes[rchild]); + + if (fsmpage->fp_nodes[nodeno] != newvalue) + { + fsmpage->fp_nodes[nodeno] = newvalue; + changed = true; + } + } + + return changed; +} + diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c new file mode 100644 index 0000000000..62fd3d3794 --- /dev/null +++ b/src/backend/storage/freespace/indexfsm.c @@ -0,0 +1,92 @@ +/*------------------------------------------------------------------------- + * + * indexfsm.c + * POSTGRES free space map for quickly finding free pages in relations + * + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/storage/freespace/indexfsm.c,v 1.1 2008/09/30 10:52:13 heikki Exp $ + * + * + * NOTES: + * + * This is similar to the FSM used for heap, in freespace.c, but instead + * of tracking the amount of free space on pages, we only track whether + * pages are completely free or in-use. We use the same FSM implementation + * as for heaps, using BLCKSZ - 1 to denote free (unused) pages, and 0 for used ones.
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/freespace.h" +#include "storage/indexfsm.h" +#include "storage/smgr.h" + +/* + * Exported routines + */ + +/* + * InitIndexFreeSpaceMap - Create or reset the FSM fork for relation. + */ +void +InitIndexFreeSpaceMap(Relation rel) +{ + /* Create FSM fork if it doesn't exist yet, or truncate it if it does */ + RelationOpenSmgr(rel); + if (!smgrexists(rel->rd_smgr, FSM_FORKNUM)) + smgrcreate(rel->rd_smgr, FSM_FORKNUM, rel->rd_istemp, false); + else + smgrtruncate(rel->rd_smgr, FSM_FORKNUM, 0, rel->rd_istemp); +} + +/* + * GetFreeIndexPage - return a free page from the FSM + * + * As a side effect, the page is marked as used in the FSM. + */ +BlockNumber +GetFreeIndexPage(Relation rel) +{ + BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ/2); + + if (blkno != InvalidBlockNumber) + RecordUsedIndexPage(rel, blkno); + + return blkno; +} + +/* + * RecordFreeIndexPage - mark a page as free in the FSM + */ +void +RecordFreeIndexPage(Relation rel, BlockNumber freeBlock) +{ + RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1); +} + + +/* + * RecordUsedIndexPage - mark a page as used in the FSM + */ +void +RecordUsedIndexPage(Relation rel, BlockNumber usedBlock) +{ + RecordPageWithFreeSpace(rel, usedBlock, 0); +} + +/* + * IndexFreeSpaceMapTruncate - adjust for truncation of a relation. + * + * We need to delete any stored data past the new relation length, so that + * we don't bogusly return removed block numbers. 
+ */ +void +IndexFreeSpaceMapTruncate(Relation rel, BlockNumber nblocks) +{ + FreeSpaceMapTruncateRel(rel, nblocks); +} diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index d388321e9c..0365e56609 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.96 2008/05/12 00:00:50 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.97 2008/09/30 10:52:13 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -26,7 +26,6 @@ #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" #include "storage/bufmgr.h" -#include "storage/freespace.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -110,7 +109,6 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, ProcArrayShmemSize()); size = add_size(size, BackendStatusShmemSize()); size = add_size(size, SInvalShmemSize()); - size = add_size(size, FreeSpaceShmemSize()); size = add_size(size, BgWriterShmemSize()); size = add_size(size, AutoVacuumShmemSize()); size = add_size(size, BTreeShmemSize()); @@ -203,11 +201,6 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) */ CreateSharedInvalidationState(); - /* - * Set up free-space map - */ - InitFreeSpaceMap(); - /* * Set up interprocess signaling mechanisms */ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index da4a9766ca..4909256cc1 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.111 2008/08/11 11:05:11 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.112 2008/09/30 10:52:13 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -21,7 +21,6 @@ #include 
"access/xlogutils.h" #include "commands/tablespace.h" #include "storage/bufmgr.h" -#include "storage/freespace.h" #include "storage/ipc.h" #include "storage/smgr.h" #include "utils/hsearch.h" @@ -474,13 +473,6 @@ smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum, */ DropRelFileNodeBuffers(rnode, forknum, isTemp, 0); - /* - * Tell the free space map to forget this relation. It won't be accessed - * any more anyway, but we may as well recycle the map space quickly. - */ - if (forknum == MAIN_FORKNUM) - FreeSpaceMapForgetRel(&rnode); - /* * It'd be nice to tell the stats collector to forget it immediately, too. * But we can't because we don't know the OID (and in cases involving @@ -577,13 +569,6 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks, */ DropRelFileNodeBuffers(reln->smgr_rnode, forknum, isTemp, nblocks); - /* - * Tell the free space map to forget anything it may have stored for the - * about-to-be-deleted blocks. We want to be sure it won't return bogus - * block numbers later on. - */ - FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks); - /* Do the truncation */ (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, forknum, nblocks, isTemp); @@ -905,13 +890,6 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record) DropRelFileNodeBuffers(xlrec->rnode, xlrec->forknum, false, xlrec->blkno); - /* - * Tell the free space map to forget anything it may have stored for - * the about-to-be-deleted blocks. We want to be sure it won't return - * bogus block numbers later on. 
- */ - FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno); - /* Do the truncation */ (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, xlrec->forknum, diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 8449cb4d4c..c9e7b5e626 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.556 2008/08/19 18:30:04 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.557 2008/09/30 10:52:13 heikki Exp $ * * NOTES * this is the "main" module of the postgres backend and @@ -57,7 +57,6 @@ #include "postmaster/autovacuum.h" #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" -#include "storage/freespace.h" #include "storage/ipc.h" #include "storage/proc.h" #include "storage/sinval.h" @@ -3258,13 +3257,6 @@ PostgresMain(int argc, char *argv[], const char *username) StartupXLOG(); on_shmem_exit(ShutdownXLOG, 0); - /* - * Read any existing FSM cache file, and register to write one out at - * exit. - */ - LoadFreeSpaceMap(); - on_shmem_exit(DumpFreeSpaceMap, 0); - /* * We have to build the flat file for pg_database, but not for the * user and group tables, since we won't try to do authentication. 
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 7bfb23aaf0..cec75ada72 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.273 2008/08/10 19:02:33 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.274 2008/09/30 10:52:13 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -304,6 +304,7 @@ AllocateRelationDesc(Relation relation, Form_pg_class relp) */ MemSet(relation, 0, sizeof(RelationData)); relation->rd_targblock = InvalidBlockNumber; + relation->rd_fsm_nblocks_cache = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; @@ -1364,6 +1365,7 @@ formrdesc(const char *relationName, Oid relationReltype, */ relation = (Relation) palloc0(sizeof(RelationData)); relation->rd_targblock = InvalidBlockNumber; + relation->rd_fsm_nblocks_cache = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; @@ -1652,8 +1654,9 @@ RelationReloadIndexInfo(Relation relation) heap_freetuple(pg_class_tuple); /* We must recalculate physical address in case it changed */ RelationInitPhysicalAddr(relation); - /* Make sure targblock is reset in case rel was truncated */ + /* Must reset targblock and fsm_nblocks_cache in case rel was truncated */ relation->rd_targblock = InvalidBlockNumber; + relation->rd_fsm_nblocks_cache = InvalidBlockNumber; /* Must free any AM cached data, too */ if (relation->rd_amcache) pfree(relation->rd_amcache); @@ -1736,6 +1739,7 @@ RelationClearRelation(Relation relation, bool rebuild) if (relation->rd_isnailed) { relation->rd_targblock = InvalidBlockNumber; + relation->rd_fsm_nblocks_cache = InvalidBlockNumber; if (relation->rd_rel->relkind == RELKIND_INDEX) { relation->rd_isvalid = false; /* needs to be revalidated */ 
@@ -2330,6 +2334,7 @@ RelationBuildLocalRelation(const char *relname, rel = (Relation) palloc0(sizeof(RelationData)); rel->rd_targblock = InvalidBlockNumber; + rel->rd_fsm_nblocks_cache = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ rel->rd_smgr = NULL; @@ -3586,6 +3591,7 @@ load_relcache_init_file(void) */ rel->rd_smgr = NULL; rel->rd_targblock = InvalidBlockNumber; + rel->rd_fsm_nblocks_cache = InvalidBlockNumber; if (rel->rd_isnailed) rel->rd_refcnt = 1; else diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f0f49538e7..93f20eef35 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -10,7 +10,7 @@ * Written by Peter Eisentraut . * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.473 2008/09/23 21:12:03 mha Exp $ + * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.474 2008/09/30 10:52:13 heikki Exp $ * *-------------------------------------------------------------------- */ @@ -57,7 +57,6 @@ #include "regex/regex.h" #include "storage/bufmgr.h" #include "storage/fd.h" -#include "storage/freespace.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" @@ -446,8 +445,6 @@ const char *const config_group_names[] = gettext_noop("Resource Usage"), /* RESOURCES_MEM */ gettext_noop("Resource Usage / Memory"), - /* RESOURCES_FSM */ - gettext_noop("Resource Usage / Free Space Map"), /* RESOURCES_KERNEL */ gettext_noop("Resource Usage / Kernel Resources"), /* WAL */ @@ -1528,23 +1525,6 @@ static struct config_int ConfigureNamesInt[] = 100000000, 0, 1000000000, NULL, NULL }, - { - {"max_fsm_relations", PGC_POSTMASTER, RESOURCES_FSM, - gettext_noop("Sets the maximum number of tables and indexes for which free space is tracked."), - NULL - }, - &MaxFSMRelations, - 1000, 100, INT_MAX, NULL, NULL - }, - { - {"max_fsm_pages", PGC_POSTMASTER, RESOURCES_FSM, - gettext_noop("Sets the maximum number of disk pages for which free space 
is tracked."), - NULL - }, - &MaxFSMPages, - 20000, 1000, INT_MAX, NULL, NULL - }, - { {"max_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT, gettext_noop("Sets the maximum number of locks per transaction."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 342be9d6c3..56afb2e488 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -114,13 +114,6 @@ #maintenance_work_mem = 16MB # min 1MB #max_stack_depth = 2MB # min 100kB -# - Free Space Map - - -#max_fsm_pages = 204800 # min max_fsm_relations*16, 6 bytes each - # (change requires restart) -#max_fsm_relations = 1000 # min 100, ~70 bytes each - # (change requires restart) - # - Kernel Resource Usage - #max_files_per_process = 1000 # min 25 diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 4caee3e215..02105ac57e 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -42,7 +42,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * Portions taken from FreeBSD. * - * $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.161 2008/09/23 10:58:03 heikki Exp $ + * $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.162 2008/09/30 10:52:13 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -119,7 +119,6 @@ static int output_errno = 0; /* defaults */ static int n_connections = 10; static int n_buffers = 50; -static int n_fsm_pages = 20000; /* * Warning messages for authentication methods @@ -1041,13 +1040,10 @@ static void test_config_settings(void) { /* - * These macros define the minimum shared_buffers we want for a given - * max_connections value, and the max_fsm_pages setting to be used for a - * given shared_buffers value. The arrays show the settings to try. + * This macro defines the minimum shared_buffers we want for a given + * max_connections value. The arrays show the settings to try. 
*/ - #define MIN_BUFS_FOR_CONNS(nconns) ((nconns) * 10) -#define FSM_FOR_BUFS(nbuffers) ((nbuffers) > 1000 ? 50 * (nbuffers) : 20000) static const int trial_conns[] = { 100, 50, 40, 30, 20, 10 @@ -1065,7 +1061,6 @@ test_config_settings(void) status, test_conns, test_buffs, - test_max_fsm, ok_buffers = 0; @@ -1076,16 +1071,14 @@ test_config_settings(void) { test_conns = trial_conns[i]; test_buffs = MIN_BUFS_FOR_CONNS(test_conns); - test_max_fsm = FSM_FOR_BUFS(test_buffs); snprintf(cmd, sizeof(cmd), SYSTEMQUOTE "\"%s\" --boot -x0 %s " "-c max_connections=%d " "-c shared_buffers=%d " - "-c max_fsm_pages=%d " "< \"%s\" > \"%s\" 2>&1" SYSTEMQUOTE, backend_exec, boot_options, - test_conns, test_buffs, test_max_fsm, + test_conns, test_buffs, DEVNULL, DEVNULL); status = system(cmd); if (status == 0) @@ -1100,7 +1093,7 @@ test_config_settings(void) printf("%d\n", n_connections); - printf(_("selecting default shared_buffers/max_fsm_pages ... ")); + printf(_("selecting default shared_buffers ... ")); fflush(stdout); for (i = 0; i < bufslen; i++) @@ -1112,28 +1105,25 @@ test_config_settings(void) test_buffs = ok_buffers; break; } - test_max_fsm = FSM_FOR_BUFS(test_buffs); snprintf(cmd, sizeof(cmd), SYSTEMQUOTE "\"%s\" --boot -x0 %s " "-c max_connections=%d " "-c shared_buffers=%d " - "-c max_fsm_pages=%d " "< \"%s\" > \"%s\" 2>&1" SYSTEMQUOTE, backend_exec, boot_options, - n_connections, test_buffs, test_max_fsm, + n_connections, test_buffs, DEVNULL, DEVNULL); status = system(cmd); if (status == 0) break; } n_buffers = test_buffs; - n_fsm_pages = FSM_FOR_BUFS(n_buffers); if ((n_buffers * (BLCKSZ / 1024)) % 1024 == 0) - printf("%dMB/%d\n", (n_buffers * (BLCKSZ / 1024)) / 1024, n_fsm_pages); + printf("%dMB\n", (n_buffers * (BLCKSZ / 1024)) / 1024); else - printf("%dkB/%d\n", n_buffers * (BLCKSZ / 1024), n_fsm_pages); + printf("%dkB\n", n_buffers * (BLCKSZ / 1024)); } /* @@ -1164,9 +1154,6 @@ setup_config(void) n_buffers * (BLCKSZ / 1024)); conflines = replace_token(conflines, 
"#shared_buffers = 32MB", repltok); - snprintf(repltok, sizeof(repltok), "max_fsm_pages = %d", n_fsm_pages); - conflines = replace_token(conflines, "#max_fsm_pages = 204800", repltok); - #if DEF_PGPORT != 5432 snprintf(repltok, sizeof(repltok), "#port = %d", DEF_PGPORT); conflines = replace_token(conflines, "#port = 5432", repltok); diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h index 7be2dfc9f6..6f018f0bee 100644 --- a/src/include/access/rmgr.h +++ b/src/include/access/rmgr.h @@ -3,7 +3,7 @@ * * Resource managers definition * - * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.17 2006/11/05 22:42:10 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.18 2008/09/30 10:52:13 heikki Exp $ */ #ifndef RMGR_H #define RMGR_H @@ -23,6 +23,7 @@ typedef uint8 RmgrId; #define RM_DBASE_ID 4 #define RM_TBLSPC_ID 5 #define RM_MULTIXACT_ID 6 +#define RM_FREESPACE_ID 7 #define RM_HEAP2_ID 9 #define RM_HEAP_ID 10 #define RM_BTREE_ID 11 diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h index 86dd22647c..d417e8c980 100644 --- a/src/include/storage/freespace.h +++ b/src/include/storage/freespace.h @@ -7,152 +7,32 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/freespace.h,v 1.28 2008/03/10 02:04:10 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/freespace.h,v 1.29 2008/09/30 10:52:13 heikki Exp $ * *------------------------------------------------------------------------- */ #ifndef FREESPACE_H_ #define FREESPACE_H_ -#include "storage/relfilenode.h" -#include "storage/itemptr.h" +#include "utils/rel.h" +#include "storage/bufpage.h" +#include "access/xlog.h" - -/* Initial value for average-request moving average */ -#define INITIAL_AVERAGE ((Size) (BLCKSZ / 32)) - -/* - * Number of pages and bytes per allocation chunk. 
Indexes can squeeze 50% - * more pages into the same space because they don't need to remember how much - * free space on each page. The nominal number of pages, CHUNKPAGES, is for - * regular rels, and INDEXCHUNKPAGES is for indexes. CHUNKPAGES should be - * even so that no space is wasted in the index case. - */ -#define CHUNKPAGES 16 -#define CHUNKBYTES (CHUNKPAGES * sizeof(FSMPageData)) -#define INDEXCHUNKPAGES ((int) (CHUNKBYTES / sizeof(IndexFSMPageData))) - - -/* - * Typedefs and macros for items in the page-storage arena. We use the - * existing ItemPointer and BlockId data structures, which are designed - * to pack well (they should be 6 and 4 bytes apiece regardless of machine - * alignment issues). Unfortunately we can't use the ItemPointer access - * macros, because they include Asserts insisting that ip_posid != 0. - */ -typedef ItemPointerData FSMPageData; -typedef BlockIdData IndexFSMPageData; - -#define FSMPageGetPageNum(ptr) \ - BlockIdGetBlockNumber(&(ptr)->ip_blkid) -#define FSMPageGetSpace(ptr) \ - ((Size) (ptr)->ip_posid) -#define FSMPageSetPageNum(ptr, pg) \ - BlockIdSet(&(ptr)->ip_blkid, pg) -#define FSMPageSetSpace(ptr, sz) \ - ((ptr)->ip_posid = (OffsetNumber) (sz)) -#define IndexFSMPageGetPageNum(ptr) \ - BlockIdGetBlockNumber(ptr) -#define IndexFSMPageSetPageNum(ptr, pg) \ - BlockIdSet(ptr, pg) - -/* - * Shared free-space-map objects - * - * The per-relation objects are indexed by a hash table, and are also members - * of two linked lists: one ordered by recency of usage (most recent first), - * and the other ordered by physical location of the associated storage in - * the page-info arena. - * - * Each relation owns one or more chunks of per-page storage in the "arena". - * The chunks for each relation are always consecutive, so that it can treat - * its page storage as a simple array. We further insist that its page data - * be ordered by block number, so that binary search is possible. 
- * - * Note: we handle pointers to these items as pointers, not as SHMEM_OFFSETs. - * This assumes that all processes accessing the map will have the shared - * memory segment mapped at the same place in their address space. - */ -typedef struct FSMHeader FSMHeader; -typedef struct FSMRelation FSMRelation; - -/* Header for whole map */ -struct FSMHeader -{ - FSMRelation *usageList; /* FSMRelations in usage-recency order */ - FSMRelation *usageListTail; /* tail of usage-recency list */ - FSMRelation *firstRel; /* FSMRelations in arena storage order */ - FSMRelation *lastRel; /* tail of storage-order list */ - int numRels; /* number of FSMRelations now in use */ - double sumRequests; /* sum of requested chunks over all rels */ - char *arena; /* arena for page-info storage */ - int totalChunks; /* total size of arena, in chunks */ - int usedChunks; /* # of chunks assigned */ - /* NB: there are totalChunks - usedChunks free chunks at end of arena */ -}; - -/* - * Per-relation struct --- this is an entry in the shared hash table. - * The hash key is the RelFileNode value (hence, we look at the physical - * relation ID, not the logical ID, which is appropriate). 
- */ -struct FSMRelation -{ - RelFileNode key; /* hash key (must be first) */ - FSMRelation *nextUsage; /* next rel in usage-recency order */ - FSMRelation *priorUsage; /* prior rel in usage-recency order */ - FSMRelation *nextPhysical; /* next rel in arena-storage order */ - FSMRelation *priorPhysical; /* prior rel in arena-storage order */ - bool isIndex; /* if true, we store only page numbers */ - Size avgRequest; /* moving average of space requests */ - BlockNumber interestingPages; /* # of pages with useful free space */ - int firstChunk; /* chunk # of my first chunk in arena */ - int storedPages; /* # of pages stored in arena */ - int nextPage; /* index (from 0) to start next search at */ -}; - - - -/* GUC variables */ -extern PGDLLIMPORT int MaxFSMRelations; -extern PGDLLIMPORT int MaxFSMPages; - - -/* - * function prototypes - */ -extern void InitFreeSpaceMap(void); -extern Size FreeSpaceShmemSize(void); -extern FSMHeader *GetFreeSpaceMap(void); - -extern BlockNumber GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded); -extern BlockNumber RecordAndGetPageWithFreeSpace(RelFileNode *rel, +/* prototypes for public functions in freespace.c */ +extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk); +extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded); +extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, Size oldSpaceAvail, Size spaceNeeded); -extern Size GetAvgFSMRequestSize(RelFileNode *rel); -extern void RecordRelationFreeSpace(RelFileNode *rel, - BlockNumber interestingPages, - int nPages, - FSMPageData *pageSpaces); +extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, + Size spaceAvail); -extern BlockNumber GetFreeIndexPage(RelFileNode *rel); -extern void RecordIndexFreeSpace(RelFileNode *rel, - BlockNumber interestingPages, - int nPages, - BlockNumber *pages); +extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks); +extern void 
FreeSpaceMapVacuum(Relation rel); -extern void FreeSpaceMapTruncateRel(RelFileNode *rel, BlockNumber nblocks); -extern void FreeSpaceMapForgetRel(RelFileNode *rel); -extern void FreeSpaceMapForgetDatabase(Oid dbid); - -extern void PrintFreeSpaceMapStatistics(int elevel); - -extern void DumpFreeSpaceMap(int code, Datum arg); -extern void LoadFreeSpaceMap(void); - -#ifdef FREESPACE_DEBUG -extern void DumpFreeSpace(void); -#endif +/* WAL prototypes */ +extern void fsm_desc(StringInfo buf, uint8 xl_info, char *rec); +extern void fsm_redo(XLogRecPtr lsn, XLogRecord *record); #endif /* FREESPACE_H */ diff --git a/src/include/storage/fsm_internals.h b/src/include/storage/fsm_internals.h new file mode 100644 index 0000000000..e7fbbf2b9b --- /dev/null +++ b/src/include/storage/fsm_internals.h @@ -0,0 +1,73 @@ +/*------------------------------------------------------------------------- + * + * fsm_internals.h + * internal functions for free space map + * + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL: pgsql/src/include/storage/fsm_internals.h,v 1.1 2008/09/30 10:52:14 heikki Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef FSM_INTERNALS_H +#define FSM_INTERNALS_H + +#include "storage/buf.h" +#include "storage/bufpage.h" +#include "lib/stringinfo.h" + +/* + * Structure of a FSM page. See src/backend/storage/freespace/README for + * details. + */ +typedef struct +{ + /* + * fsm_search_avail() tries to spread the load of multiple backends + * by returning different pages to different backends in a round-robin + * fashion. fp_next_slot points to the next slot to be returned + * (assuming there's enough space on it for the request). It's defined + * as an int, because it's updated without an exclusive lock. uint16 + * would be more appropriate, but int is more likely to be atomically + * fetchable/storable.
+ */ + int fp_next_slot; + + /* + * fp_nodes contains the binary tree, stored in array. The first + * NonLeafNodesPerPage elements are upper nodes, and the following + * LeafNodesPerPage elements are leaf nodes. Unused nodes are zero. + */ + uint8 fp_nodes[1]; +} FSMPageData; + +typedef FSMPageData *FSMPage; + +/* + * Number of non-leaf and leaf nodes, and nodes in total, on an FSM page. + * These definitions are internal to fsmpage.c. + */ +#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \ + offsetof(FSMPageData, fp_nodes)) + +#define NonLeafNodesPerPage (BLCKSZ / 2 - 1) +#define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage) + +/* + * Number of FSM "slots" on a FSM page. This is what should be used + * outside fsmpage.c. + */ +#define SlotsPerFSMPage LeafNodesPerPage + +/* Prototypes for functions in fsmpage.c */ +extern int fsm_search_avail(Buffer buf, uint8 min_cat, bool advancenext, + bool exclusive_lock_held); +extern uint8 fsm_get_avail(Page page, int slot); +extern uint8 fsm_get_max_avail(Page page); +extern bool fsm_set_avail(Page page, int slot, uint8 value); +extern bool fsm_truncate_avail(Page page, int nslots); +extern bool fsm_rebuild_page(Page page); + +#endif /* FSM_INTERNALS_H */ diff --git a/src/include/storage/indexfsm.h b/src/include/storage/indexfsm.h new file mode 100644 index 0000000000..76bb26f7bc --- /dev/null +++ b/src/include/storage/indexfsm.h @@ -0,0 +1,27 @@ +/*------------------------------------------------------------------------- + * + * indexfsm.h + * POSTGRES free space map for quickly finding an unused page in index + * + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL: pgsql/src/include/storage/indexfsm.h,v 1.1 2008/09/30 10:52:14 heikki Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef INDEXFSM_H_ +#define INDEXFSM_H_ + +#include 
"utils/rel.h" + +extern void InitIndexFreeSpaceMap(Relation rel); + +extern BlockNumber GetFreeIndexPage(Relation rel); +extern void RecordFreeIndexPage(Relation rel, BlockNumber page); +extern void RecordUsedIndexPage(Relation rel, BlockNumber page); + +extern void IndexFreeSpaceMapTruncate(Relation rel, BlockNumber nblocks); + +#endif /* INDEXFSM_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index b1088fcd33..5f993fa2ba 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.39 2008/06/19 21:32:56 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.40 2008/09/30 10:52:14 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -35,6 +35,10 @@ * by allowing values not listed in the enum declaration to be assigned. * The extra value MaxDynamicLWLock is there to keep the compiler from * deciding that the enum can be represented as char or short ... + * + * If you remove a lock, please replace it with a placeholder like was done + * for FreeSpaceMapLock. This retains the lock numbering, which is helpful for + * DTrace and other external debugging scripts. 
*/ typedef enum LWLockId { @@ -45,7 +49,7 @@ typedef enum LWLockId ProcArrayLock, SInvalReadLock, SInvalWriteLock, - FreeSpaceLock, + UnusedLock1, /* FreeSpaceMapLock used to be here */ WALInsertLock, WALWriteLock, ControlFileLock, diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index 8ac8147ed9..571f261c3d 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.16 2008/08/11 11:05:11 heikki Exp $ + * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.17 2008/09/30 10:52:14 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -23,11 +23,12 @@ typedef enum ForkNumber { InvalidForkNumber = -1, - MAIN_FORKNUM = 0 - /* NOTE: change NUM_FORKS below when you add new forks */ + MAIN_FORKNUM = 0, + FSM_FORKNUM + /* NOTE: change MAX_FORKNUM below when you add new forks */ } ForkNumber; -#define MAX_FORKNUM MAIN_FORKNUM +#define MAX_FORKNUM FSM_FORKNUM /* * RelFileNode must provide all that we need to know to physically access diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 436b003286..0eca0f54a3 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -7,7 +7,7 @@ * * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/utils/guc_tables.h,v 1.42 2008/09/10 18:09:20 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/utils/guc_tables.h,v 1.43 2008/09/30 10:52:14 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -49,7 +49,6 @@ enum config_group CONN_AUTH_SECURITY, RESOURCES, RESOURCES_MEM, - RESOURCES_FSM, RESOURCES_KERNEL, WAL, WAL_SETTINGS, diff --git a/src/include/utils/rel.h 
b/src/include/utils/rel.h index 0d9d75dd8b..71ad936d27 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.107 2008/06/19 00:46:06 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.108 2008/09/30 10:52:14 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -195,6 +195,9 @@ typedef struct RelationData List *rd_indpred; /* index predicate tree, if any */ void *rd_amcache; /* available for use by index AM */ + /* Cached last-seen size of the FSM */ + BlockNumber rd_fsm_nblocks_cache; + /* use "struct" here to avoid needing to include pgstat.h: */ struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ } RelationData;