postgresql/src/backend/storage/smgr/smgr.c
Tom Lane 499abb0c0f Implement new 'lightweight lock manager' that's intermediate between
existing lock manager and spinlocks: it understands exclusive vs shared
lock but has few other fancy features.  Replace most uses of spinlocks
with lightweight locks.  All remaining uses of spinlocks have very short
lock hold times (a few dozen instructions), so tweak spinlock backoff
code to work efficiently given this assumption.  All per my proposal on
pghackers 26-Sep-01.
2001-09-29 04:02:27 +00:00

621 lines
16 KiB
C

/*-------------------------------------------------------------------------
*
* smgr.c
* public interface routines to storage manager switch.
*
* All file system operations in POSTGRES dispatch through these
* routines.
*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.53 2001/09/29 04:02:25 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/ipc.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
/* File-private shutdown routine; smgrinit() registers it with on_proc_exit(). */
static void smgrshutdown(void);
/*
 * f_smgr -- one row of the storage manager switch.
 *
 * Each storage manager supplies one of these structs, holding pointers to
 * its implementation of every operation.  The smgrXXX() wrappers in this
 * file pick a row of smgrsw[] and call through the matching pointer.
 * Pointers marked "may be NULL" are tested before being called; the others
 * are called unconditionally.
 */
typedef struct f_smgr
{
int (*smgr_init) (void); /* may be NULL */
int (*smgr_shutdown) (void); /* may be NULL */
int (*smgr_create) (Relation reln);
int (*smgr_unlink) (RelFileNode rnode);
int (*smgr_extend) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_open) (Relation reln);
int (*smgr_close) (Relation reln);
int (*smgr_read) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_write) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_flush) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_blindwrt) (RelFileNode rnode, BlockNumber blkno,
char *buffer, bool dofsync);
int (*smgr_markdirty) (Relation reln, BlockNumber blkno);
int (*smgr_blindmarkdirty) (RelFileNode, BlockNumber blkno);
BlockNumber (*smgr_nblocks) (Relation reln);
/* smgr_truncate may be NULL: smgrtruncate() tests it before calling */
BlockNumber (*smgr_truncate) (Relation reln, BlockNumber nblocks);
int (*smgr_commit) (void); /* may be NULL */
int (*smgr_abort) (void); /* may be NULL */
/* smgr_sync may be NULL: smgrsync() tests it before calling */
int (*smgr_sync) (void);
} f_smgr;
/*
* The weird placement of commas in this init block is to keep the compiler
* happy, regardless of what storage managers we have (or don't have).
*
* The initializers are positional, so each row must list its routines in
* exactly the order the fields are declared in f_smgr.  The index of a row
* in this array is the "which" value passed to the smgrXXX() wrappers.
*
* Note that the main memory row supplies NULL for smgr_truncate and stops
* after smgr_abort, so its smgr_sync field is left implicitly zero (NULL).
*/
static f_smgr smgrsw[] = {
/* magnetic disk */
{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
},
#ifdef STABLE_MEMORY_STORAGE
/* main memory */
{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
mmnblocks, NULL, mmcommit, mmabort},
#endif
};
/*
* This array records which storage managers are write-once, and which
* support overwrite. A 'true' entry means that the storage manager is
* write-once. In the best of all possible worlds, there would be no
* write-once storage managers.
*
* The array is compiled only when NOT_USED is defined; its sole reader in
* this file is smgriswo(), which indexes it by storage manager number.
*/
#ifdef NOT_USED
static bool smgrwo[] = {
false, /* magnetic disk */
#ifdef STABLE_MEMORY_STORAGE
false, /* main memory */
#endif
};
#endif
/* Number of rows in smgrsw[]; upper bound for loops over all storage managers. */
static int NSmgr = lengthof(smgrsw);
/*
* We keep a list of all relations (represented as RelFileNode values)
* that have been created or deleted in the current transaction. When
* a relation is created, we create the physical file immediately, but
* remember it so that we can delete the file again if the current
* transaction is aborted. Conversely, a deletion request is NOT
* executed immediately, but is just entered in the list. When and if
* the transaction commits, we can delete the physical file.
*
* NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
* unbetimes. It'd probably be OK to keep it in TopTransactionContext,
* but I'm being paranoid.
*/
typedef struct PendingRelDelete
{
RelFileNode relnode; /* relation that may need to be deleted */
int16 which; /* which storage manager? (index into smgrsw[]) */
bool atCommit; /* T=delete at commit; F=delete at abort */
struct PendingRelDelete *next; /* linked-list link */
} PendingRelDelete;
/*
 * smgrcreate() and smgrunlink() push new entries onto the front of this
 * list; smgrDoPendingDeletes() removes and frees every entry.
 */
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
/*
 *	smgrinit() -- Initialize all storage managers.
 *
 *		Calls the smgr_init routine of every entry in smgrsw[] that
 *		provides one.  If any of them returns SM_FAIL the failure is
 *		reported with elog(FATAL).  Afterwards smgrshutdown() is
 *		registered via on_proc_exit().
 *
 *		Always returns SM_SUCCESS when it returns at all.
 */
int
smgrinit(void)
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		/* smgr_init is optional; skip managers that do not supply one */
		if (smgrsw[i].smgr_init)
		{
			if ((*(smgrsw[i].smgr_init)) () == SM_FAIL)
				elog(FATAL, "initialization failed on %s: %m",
					 DatumGetCString(DirectFunctionCall1(smgrout,
													 Int16GetDatum(i))));
		}
	}

	/* register the shutdown proc */
	on_proc_exit(smgrshutdown, 0);

	return SM_SUCCESS;
}
static void
smgrshutdown(void)
{
int i;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_shutdown)
{
if ((*(smgrsw[i].smgr_shutdown)) () == SM_FAIL)
elog(FATAL, "shutdown failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
}
}
/*
 *	smgrcreate() -- Create a new relation.
 *
 *		Asks storage manager "which" to create the relation described by
 *		reln and returns the file descriptor it hands back.  A negative
 *		result from the storage manager is reported with elog(ERROR).
 *
 *		The new relation is also recorded in the pending-delete list so
 *		that its file is removed again if the transaction aborts.
 */
int
smgrcreate(int16 which, Relation reln)
{
	PendingRelDelete *entry;
	int			newfd;

	newfd = (*(smgrsw[which].smgr_create)) (reln);
	if (newfd < 0)
		elog(ERROR, "cannot create %s: %m", RelationGetRelationName(reln));

	/* Remember the relation: it must be deleted if we abort */
	entry = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(*entry));
	entry->which = which;
	entry->relnode = reln->rd_node;
	entry->atCommit = false;	/* delete if abort */

	/* push onto the front of the list */
	entry->next = pendingDeletes;
	pendingDeletes = entry;

	return newfd;
}
/*
 *	smgrunlink() -- Unlink a relation.
 *
 *		Nothing is physically removed here.  The relation is closed if it
 *		is open, and a request to delete it at transaction commit is
 *		queued on the pending-delete list.
 *
 *		Always returns SM_SUCCESS.
 */
int
smgrunlink(int16 which, Relation reln)
{
	PendingRelDelete *entry;

	/* The file must not stay open */
	if (reln->rd_fd >= 0)
		smgrclose(which, reln);

	/* Queue the relation for deletion at commit */
	entry = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(*entry));
	entry->which = which;
	entry->relnode = reln->rd_node;
	entry->atCommit = true;		/* delete if commit */

	/* push onto the front of the list */
	entry->next = pendingDeletes;
	pendingDeletes = entry;

	/*
	 * NOTE: a relation created earlier in this same transaction now has
	 * two list entries, one with atCommit false and one with atCommit
	 * true.  Whichever way the transaction ends, exactly one of them
	 * matches and the file is deleted; smgrDoPendingDeletes simply
	 * discards the other, so no error results.  Removing the older entry
	 * and deleting the file right away would also work, but this keeps
	 * the logic simple.
	 */

	return SM_SUCCESS;
}
/*
 *	smgrextend() -- Add a new block to a file.
 *
 *		Works like smgrwrite(): the buffer is written at the given block
 *		position.  The difference is that the caller expects to grow the
 *		relation, i.e. blocknum is the current EOF.
 *
 *		Returns the storage manager's status on success; a status of
 *		SM_FAIL is reported with elog(ERROR).
 */
int
smgrextend(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
	int			result = (*(smgrsw[which].smgr_extend)) (reln, blocknum, buffer);

	if (result == SM_FAIL)
		elog(ERROR, "cannot extend %s: %m.\n\tCheck free disk space.",
			 RelationGetRelationName(reln));

	return result;
}
/*
 *	smgropen() -- Open a relation using a particular storage manager.
 *
 *		Returns the fd for the open relation on success.
 *
 *		For a view (relkind RELKIND_VIEW) -1 is returned without calling
 *		the storage manager.  If the storage manager's open routine
 *		returns a negative value, that value is passed back to the caller
 *		when failOK is true; otherwise elog(ERROR) is raised.
 */
int
smgropen(int16 which, Relation reln, bool failOK)
{
	int			result;

	if (reln->rd_rel->relkind == RELKIND_VIEW)
		return -1;

	result = (*(smgrsw[which].smgr_open)) (reln);

	/* a failed open is only an error when the caller did not allow it */
	if (result < 0 && !failOK)
		elog(ERROR, "cannot open %s: %m", RelationGetRelationName(reln));

	return result;
}
/*
 *	smgrclose() -- Close a relation.
 *
 *		Dispatches to the close routine of storage manager "which".
 *		Returns SM_SUCCESS; an SM_FAIL result from the storage manager is
 *		reported with elog(ERROR).
 */
int
smgrclose(int16 which, Relation reln)
{
	int			result = (*(smgrsw[which].smgr_close)) (reln);

	if (result == SM_FAIL)
		elog(ERROR, "cannot close %s: %m", RelationGetRelationName(reln));

	return SM_SUCCESS;
}
/*
 *	smgrread() -- read a particular block from a relation into the supplied
 *				  buffer.
 *
 *		This routine is called from the buffer manager in order to
 *		instantiate pages in the shared buffer cache.  All storage managers
 *		return pages in the format that POSTGRES expects.  This routine
 *		dispatches the read.  On success, it returns SM_SUCCESS.  On
 *		failure, the current transaction is aborted via elog(ERROR).
 */
int
smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;

	status = (*(smgrsw[which].smgr_read)) (reln, blocknum, buffer);

	/* BlockNumber is unsigned, so it must be printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot read block %u of %s: %m",
			 blocknum, RelationGetRelationName(reln));

	return status;
}
/*
 *	smgrwrite() -- Write the supplied buffer out.
 *
 *		This is not a synchronous write -- the interface for that is
 *		smgrflush().  The buffer is written out via the appropriate
 *		storage manager.  This routine returns SM_SUCCESS or aborts
 *		the current transaction via elog(ERROR).
 */
int
smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;

	status = (*(smgrsw[which].smgr_write)) (reln, blocknum, buffer);

	/* BlockNumber is unsigned, so it must be printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot write block %u of %s: %m",
			 blocknum, RelationGetRelationName(reln));

	return status;
}
/*
 *	smgrflush() -- A synchronous smgrwrite().
 *
 *		Dispatches to the flush routine of storage manager "which" and
 *		returns its status.  A status of SM_FAIL is reported with
 *		elog(ERROR).
 */
int
smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;

	status = (*(smgrsw[which].smgr_flush)) (reln, blocknum, buffer);

	/* BlockNumber is unsigned, so it must be printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot flush block %u of %s to stable store: %m",
			 blocknum, RelationGetRelationName(reln));

	return status;
}
/*
 *	smgrblindwrt() -- Write a page out blind.
 *
 *		In some cases, we may find a page in the buffer cache that we
 *		can't make a reldesc for.  This happens, for example, when we
 *		want to reuse a dirty page that was written by a transaction
 *		that has not yet committed, which created a new relation.  In
 *		this case, the buffer manager will call smgrblindwrt() with
 *		the RelFileNode of the relation to which the buffer belongs.
 *		Every storage manager must be able to force this page down to
 *		stable storage in this circumstance.  The write should be
 *		synchronous if dofsync is true.
 *
 *		Returns the storage manager's status; a status of SM_FAIL is
 *		reported with elog(ERROR).
 */
int
smgrblindwrt(int16 which,
			 RelFileNode rnode,
			 BlockNumber blkno,
			 char *buffer,
			 bool dofsync)
{
	int			status;

	status = (*(smgrsw[which].smgr_blindwrt)) (rnode, blkno, buffer, dofsync);

	/* BlockNumber is unsigned, so it must be printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot write block %u of %u/%u blind: %m",
			 blkno, rnode.tblNode, rnode.relNode);

	return status;
}
/*
 *	smgrmarkdirty() -- Mark a page dirty (needs fsync).
 *
 *		Mark the specified page as needing to be fsync'd before commit.
 *		Ordinarily, the storage manager will do this implicitly during
 *		smgrwrite().  However, the buffer manager may discover that some
 *		other backend has written a buffer that we dirtied in the current
 *		transaction.  In that case, we still need to fsync the file to be
 *		sure the page is down to disk before we commit.
 *
 *		Returns the storage manager's status; a status of SM_FAIL is
 *		reported with elog(ERROR).
 */
int
smgrmarkdirty(int16 which,
			  Relation reln,
			  BlockNumber blkno)
{
	int			status;

	status = (*(smgrsw[which].smgr_markdirty)) (reln, blkno);

	/* BlockNumber is unsigned, so it must be printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot mark block %u of %s: %m",
			 blkno, RelationGetRelationName(reln));

	return status;
}
/*
 *	smgrblindmarkdirty() -- Mark a page dirty, "blind".
 *
 *		Just like smgrmarkdirty, except we don't have a reldesc; the
 *		relation is identified by its RelFileNode instead.
 *
 *		Returns the storage manager's status; a status of SM_FAIL is
 *		reported with elog(ERROR).
 */
int
smgrblindmarkdirty(int16 which,
				   RelFileNode rnode,
				   BlockNumber blkno)
{
	int			status;

	status = (*(smgrsw[which].smgr_blindmarkdirty)) (rnode, blkno);

	/* BlockNumber is unsigned, so it must be printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot mark block %u of %u/%u blind: %m",
			 blkno, rnode.tblNode, rnode.relNode);

	return status;
}
/*
 *	smgrnblocks() -- Calculate the number of POSTGRES blocks in the
 *					 supplied relation.
 *
 *		Returns the block count reported by storage manager "which".
 *		A result of InvalidBlockNumber is treated as failure and reported
 *		with elog(ERROR).
 */
BlockNumber
smgrnblocks(int16 which, Relation reln)
{
	BlockNumber count = (*(smgrsw[which].smgr_nblocks)) (reln);

	/*
	 * NOTE: a relation that really contained 2^32-1 blocks would be
	 * rejected here.  That is intentional: it prevents extending the rel
	 * by one more block, which would create a block whose number actually
	 * is InvalidBlockNumber.
	 */
	if (count == InvalidBlockNumber)
		elog(ERROR, "cannot count blocks for %s: %m",
			 RelationGetRelationName(reln));

	return count;
}
/*
 *	smgrtruncate() -- Truncate supplied relation to a specified number
 *					  of blocks
 *
 *		Returns the number of blocks on success.  If the storage manager
 *		returns InvalidBlockNumber the failure is reported with
 *		elog(ERROR).  When the storage manager has no truncate routine,
 *		nothing is done and nblocks is returned unchanged.
 */
BlockNumber
smgrtruncate(int16 which, Relation reln, BlockNumber nblocks)
{
	BlockNumber result;

	/* Storage managers without a truncate routine: nothing to do */
	if (!smgrsw[which].smgr_truncate)
		return nblocks;

	/*
	 * Make the free space map drop whatever it has recorded for the blocks
	 * that are about to go away, so it cannot hand out bogus block numbers
	 * later on.
	 */
	MultiRecordFreeSpace(&reln->rd_node,
						 nblocks, MaxBlockNumber,
						 0, NULL, NULL);

	result = (*(smgrsw[which].smgr_truncate)) (reln, nblocks);
	if (result == InvalidBlockNumber)
		elog(ERROR, "cannot truncate %s to %u blocks: %m",
			 RelationGetRelationName(reln), nblocks);

	return result;
}
/*
 *	smgrDoPendingDeletes() -- take care of relation deletes at end of xact.
 *
 *		Empties the pending-delete list.  Entries whose atCommit flag
 *		equals isCommit have their relation physically removed; all other
 *		entries are simply discarded.  Every entry is freed.
 *
 *		Always returns SM_SUCCESS.
 */
int
smgrDoPendingDeletes(bool isCommit)
{
	PendingRelDelete *entry;

	while ((entry = pendingDeletes) != NULL)
	{
		/* take the entry off the list before doing anything with it */
		pendingDeletes = entry->next;

		if (entry->atCommit == isCommit)
		{
			/*
			 * Discard any buffers still held for the rel.  None are
			 * expected when committing, but an abort can leave some.
			 */
			DropRelFileNodeBuffers(entry->relnode);

			/*
			 * Have the free space map forget the relation.  Nobody will
			 * look it up again, but this recycles the map space sooner.
			 */
			FreeSpaceMapForgetRel(&entry->relnode);

			/*
			 * Finally remove the physical files.
			 *
			 * Note: a failed deletion is only a NOTICE, not an error,
			 * since the decision to commit or abort the current xact has
			 * already been made.
			 */
			if ((*(smgrsw[entry->which].smgr_unlink)) (entry->relnode) == SM_FAIL)
				elog(NOTICE, "cannot unlink %u/%u: %m",
					 entry->relnode.tblNode, entry->relnode.relNode);
		}

		pfree(entry);
	}

	return SM_SUCCESS;
}
/*
 *	smgrcommit() -- Commit changes made during the current transaction.
 *
 *		Calls the smgr_commit routine of every storage manager that
 *		provides one.  A routine returning SM_FAIL is reported with
 *		elog(FATAL).  Always returns SM_SUCCESS when it returns at all.
 */
int
smgrcommit(void)
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		/* smgr_commit is optional; skip managers that do not supply one */
		if (smgrsw[i].smgr_commit)
		{
			if ((*(smgrsw[i].smgr_commit)) () == SM_FAIL)
				elog(FATAL, "transaction commit failed on %s: %m",
					 DatumGetCString(DirectFunctionCall1(smgrout,
													 Int16GetDatum(i))));
		}
	}

	return SM_SUCCESS;
}
/*
 *	smgrabort() -- Abort changes made during the current transaction.
 *
 *		Calls the smgr_abort routine of every storage manager that
 *		provides one.  A routine returning SM_FAIL is reported with
 *		elog(FATAL).  Always returns SM_SUCCESS when it returns at all.
 */
int
smgrabort(void)
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		/* smgr_abort is optional; skip managers that do not supply one */
		if (smgrsw[i].smgr_abort)
		{
			if ((*(smgrsw[i].smgr_abort)) () == SM_FAIL)
				elog(FATAL, "transaction abort failed on %s: %m",
					 DatumGetCString(DirectFunctionCall1(smgrout,
													 Int16GetDatum(i))));
		}
	}

	return SM_SUCCESS;
}
/*
 *	smgrsync() -- Invoke the sync routine of every storage manager.
 *
 *		Storage managers whose smgr_sync pointer is NULL are skipped.
 *		A routine returning SM_FAIL is reported with elog(STOP).
 *		Always returns SM_SUCCESS when it returns at all.
 */
int
smgrsync(void)
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		/* skip managers that do not supply a sync routine */
		if (smgrsw[i].smgr_sync)
		{
			if ((*(smgrsw[i].smgr_sync)) () == SM_FAIL)
				elog(STOP, "storage sync failed on %s: %m",
					 DatumGetCString(DirectFunctionCall1(smgrout,
													 Int16GetDatum(i))));
		}
	}

	return SM_SUCCESS;
}
#ifdef NOT_USED
/*
 *	smgriswo() -- Report whether a storage manager is write-once.
 *
 *		smgrno must be a valid storage manager number (0 .. NSmgr-1);
 *		anything else is reported with elog(ERROR).  Returns the matching
 *		smgrwo[] entry.
 */
bool
smgriswo(int16 smgrno)
{
	bool		in_range = (smgrno >= 0 && smgrno < NSmgr);

	if (!in_range)
		elog(ERROR, "illegal storage manager number %d", smgrno);

	return smgrwo[smgrno];
}
#endif
/*
 * smgr_redo() -- currently an empty stub: both arguments are ignored and
 * nothing is done.
 * NOTE(review): presumably the XLOG redo hook for this module -- confirm
 * against the caller.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
}
/*
 * smgr_undo() -- currently an empty stub: both arguments are ignored and
 * nothing is done.
 * NOTE(review): presumably the XLOG undo hook for this module -- confirm
 * against the caller.
 */
void
smgr_undo(XLogRecPtr lsn, XLogRecord *record)
{
}
/*
 * smgr_desc() -- currently an empty stub: buf is left untouched and the
 * other arguments are ignored.
 * NOTE(review): presumably meant to write a description of an XLOG record
 * into buf -- confirm against the caller.
 */
void
smgr_desc(char *buf, uint8 xl_info, char *rec)
{
}