Arrange to cache btree metapage data in the relcache entry for the index,
thereby saving a visit to the metapage in most index searches/updates.
This wouldn't actually save any I/O (since in the old regime the metapage
generally stayed in cache anyway), but it does provide a useful decrease
in bufmgr traffic in high-contention scenarios.  Per my recent proposal.
This commit is contained in:
Tom Lane 2006-04-25 22:46:05 +00:00
parent 89083876c9
commit d2896a9ed1
6 changed files with 122 additions and 9 deletions

View File

@ -1,4 +1,4 @@
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.9 2006/01/17 00:09:00 tgl Exp $
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@ -316,7 +316,17 @@ Other things that are handy to know
Page zero of every btree is a meta-data page. This page stores the
location of the root page --- both the true root and the current effective
root ("fast" root).
root ("fast" root). To avoid fetching the metapage for every single index
search, we cache a copy of the meta-data information in the index's
relcache entry (rd_amcache). This is a bit ticklish since using the cache
implies following a root page pointer that could be stale. We require
every metapage update to send out an SI (shared invalidation) "relcache
inval" message on the index relation. That ensures that each backend will
flush its cached copy no later than the start of its next transaction.
Therefore, stale
pointers cannot be used for longer than the current transaction, which
reduces the problem to the same one already dealt with for concurrent
VACUUM --- we can just imagine that each open transaction is potentially
"already in flight" to the old root.
The algorithm assumes we can fit at least three items per page
(a "high key" and two real data items). Therefore it's unsafe

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.135 2006/04/13 03:53:05 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.136 2006/04/25 22:46:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -18,6 +18,7 @@
#include "access/heapam.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "utils/inval.h"
typedef struct
@ -638,9 +639,12 @@ _bt_insertonpg(Relation rel,
END_CRIT_SECTION();
/* release pin/lock */
/* release buffers; send out relcache inval if metapage changed */
if (BufferIsValid(metabuf))
{
CacheInvalidateRelcache(rel);
_bt_relbuf(rel, metabuf);
}
_bt_relbuf(rel, buf);
}
@ -1526,6 +1530,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
END_CRIT_SECTION();
/* send out relcache inval for metapage change */
CacheInvalidateRelcache(rel);
/* done with metapage */
_bt_relbuf(rel, metabuf);

View File

@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.95 2006/04/01 03:03:36 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.96 2006/04/25 22:46:05 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@ -26,6 +26,7 @@
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "utils/inval.h"
/*
@ -99,6 +100,49 @@ _bt_getroot(Relation rel, int access)
uint32 rootlevel;
BTMetaPageData *metad;
/*
* Try to use previously-cached metapage data to find the root. This
* normally saves one buffer access per index search, which is a very
* helpful savings in bufmgr traffic and hence contention.
*/
if (rel->rd_amcache != NULL)
{
metad = (BTMetaPageData *) rel->rd_amcache;
/* We shouldn't have cached it if any of these fail */
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version == BTREE_VERSION);
Assert(metad->btm_root != P_NONE);
rootblkno = metad->btm_fastroot;
Assert(rootblkno != P_NONE);
rootlevel = metad->btm_fastlevel;
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
rootpage = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
/*
* Since the cache might be stale, we check the page more carefully
* here than normal. We *must* check that it's not deleted.
* If it's not alone on its level, then we reject too --- this
* may be overly paranoid but better safe than sorry. Note we
* don't check P_ISROOT, because that's not set in a "fast root".
*/
if (!P_IGNORE(rootopaque) &&
rootopaque->btpo.level == rootlevel &&
P_LEFTMOST(rootopaque) &&
P_RIGHTMOST(rootopaque))
{
/* OK, accept cached page as the root */
return rootbuf;
}
_bt_relbuf(rel, rootbuf);
/* Cache is stale, throw it away */
if (rel->rd_amcache)
pfree(rel->rd_amcache);
rel->rd_amcache = NULL;
}
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
@ -200,6 +244,12 @@ _bt_getroot(Relation rel, int access)
END_CRIT_SECTION();
/*
* Send out relcache inval for metapage change (probably unnecessary
* here, but let's be safe).
*/
CacheInvalidateRelcache(rel);
/*
* swap root write lock for read lock. There is no danger of anyone
* else accessing the new root page while it's unlocked, since no one
@ -217,6 +267,13 @@ _bt_getroot(Relation rel, int access)
Assert(rootblkno != P_NONE);
rootlevel = metad->btm_fastlevel;
/*
* Cache the metapage data for next time
*/
rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
sizeof(BTMetaPageData));
memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
/*
* We are done with the metapage; arrange to release it via first
* _bt_relandgetbuf call
@ -280,6 +337,16 @@ _bt_gettrueroot(Relation rel)
uint32 rootlevel;
BTMetaPageData *metad;
/*
* We don't try to use cached metapage data here, since (a) this path is
* not performance-critical, and (b) if we are here it suggests our cache
* is out-of-date anyway. In light of point (b), it's probably safest to
* actively flush any cached metapage info.
*/
if (rel->rd_amcache)
pfree(rel->rd_amcache);
rel->rd_amcache = NULL;
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
@ -1052,9 +1119,12 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
END_CRIT_SECTION();
/* release buffers */
/* release buffers; send out relcache inval if metapage changed */
if (BufferIsValid(metabuf))
{
CacheInvalidateRelcache(rel);
_bt_relbuf(rel, metabuf);
}
_bt_relbuf(rel, pbuf);
_bt_relbuf(rel, rbuf);
_bt_relbuf(rel, buf);

View File

@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.144 2006/04/01 03:03:37 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.145 2006/04/25 22:46:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -26,6 +26,7 @@
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
#include "utils/inval.h"
#include "utils/memutils.h"
@ -127,6 +128,17 @@ btbuild(PG_FUNCTION_ARGS)
}
#endif /* BTREE_BUILD_STATS */
/*
* If we are reindexing a pre-existing index, it is critical to send out
* a relcache invalidation SI message to ensure all backends re-read the
* index metapage. In most circumstances the update-stats operation will
* cause that to happen, but at the moment there are corner cases where
* no pg_class update will occur, so force an inval here. XXX FIXME:
* the upper levels of CREATE INDEX should handle the stats update as
* well as guaranteeing relcache inval.
*/
CacheInvalidateRelcache(index);
/* since we just counted the # of tuples, may as well update stats */
IndexCloseAndUpdateStats(heap, reltuples, index, buildstate.indtuples);

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.238 2006/03/05 15:58:45 momjian Exp $
* $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.239 2006/04/25 22:46:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -948,6 +948,7 @@ RelationInitIndexAccessInfo(Relation relation)
*/
relation->rd_indexprs = NIL;
relation->rd_indpred = NIL;
relation->rd_amcache = NULL;
}
/*
@ -1481,6 +1482,10 @@ RelationReloadClassinfo(Relation relation)
RelationInitPhysicalAddr(relation);
/* Make sure targblock is reset in case rel was truncated */
relation->rd_targblock = InvalidBlockNumber;
/* Must free any AM cached data, too */
if (relation->rd_amcache)
pfree(relation->rd_amcache);
relation->rd_amcache = NULL;
/* Okay, now it's valid again */
relation->rd_isvalid = true;
}
@ -3141,6 +3146,7 @@ load_relcache_init_file(void)
rel->rd_indexlist = NIL;
rel->rd_oidindex = InvalidOid;
rel->rd_createSubid = InvalidSubTransactionId;
rel->rd_amcache = NULL;
MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));
/*

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.88 2006/03/05 15:59:07 momjian Exp $
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.89 2006/04/25 22:46:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -167,6 +167,13 @@ typedef struct RelationData
* cached, namely those with subtype zero. The arrays are indexed by
* strategy or support number, which is a sufficient identifier given that
* restriction.
*
* Note: rd_amcache is available for index AMs to cache private data about
* an index. This must be just a cache since it may get reset at any time
* (in particular, it will get reset by a relcache inval message for the
* index). If used, it must point to a single memory chunk palloc'd in
* rd_indexcxt. A relcache reset will include freeing that chunk and
* setting rd_amcache = NULL.
*/
MemoryContext rd_indexcxt; /* private memory cxt for this stuff */
RelationAmInfo *rd_aminfo; /* lookup info for funcs found in pg_am */
@ -175,6 +182,7 @@ typedef struct RelationData
FmgrInfo *rd_supportinfo; /* lookup info for support procedures */
List *rd_indexprs; /* index expression trees, if any */
List *rd_indpred; /* index predicate tree, if any */
void *rd_amcache; /* available for use by index AM */
/* statistics collection area */
PgStat_Info pgstat_info;