/*-------------------------------------------------------------------------
 *
 * smgr.c
 *	  public interface routines to storage manager switch.
 *
 *	  All file system operations in POSTGRES dispatch through these
 *	  routines.
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.49 2001/05/10 20:38:49 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/memutils.h"

/* Registered with on_proc_exit() by smgrinit(); defined further down. */
static void smgrshutdown(void);

/*
 * f_smgr -- the set of entry points one storage manager provides.
 *
 * The smgrXXX() wrappers in this file select an entry of this type by
 * storage manager number and call through the matching function pointer.
 * Members marked "may be NULL" are tested by their wrapper before being
 * called; all other members are called unconditionally.
 */
typedef struct f_smgr
{
	int			(*smgr_init) (void);	/* may be NULL */
	int			(*smgr_shutdown) (void);		/* may be NULL */
	int			(*smgr_create) (Relation reln); /* negative result = failure */
	int			(*smgr_unlink) (RelFileNode rnode);		/* called only from
														 * smgrDoPendingDeletes */
	int			(*smgr_extend) (Relation reln, BlockNumber blocknum,
											char *buffer);
	int			(*smgr_open) (Relation reln);	/* negative result = failure */
	int			(*smgr_close) (Relation reln);
	int			(*smgr_read) (Relation reln, BlockNumber blocknum,
										  char *buffer);
	int			(*smgr_write) (Relation reln, BlockNumber blocknum,
										   char *buffer);
	int			(*smgr_flush) (Relation reln, BlockNumber blocknum,
										   char *buffer);
	int			(*smgr_blindwrt) (RelFileNode rnode, BlockNumber blkno,
										  char *buffer, bool dofsync);
	int			(*smgr_markdirty) (Relation reln, BlockNumber blkno);
	int			(*smgr_blindmarkdirty) (RelFileNode, BlockNumber blkno);
	int			(*smgr_nblocks) (Relation reln);		/* negative result =
														 * failure */
	int			(*smgr_truncate) (Relation reln, int nblocks);	/* checked for NULL
																 * by smgrtruncate */
	int			(*smgr_commit) (void);	/* may be NULL */
	int			(*smgr_abort) (void);	/* may be NULL */
	int			(*smgr_sync) (void);	/* checked for NULL by smgrsync */
} f_smgr;

/*
 * The weird placement of commas in this init block is to keep the compiler
 * happy, regardless of what storage managers we have (or don't have).
*/
static f_smgr smgrsw[] = {

	/* magnetic disk */
	{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
		mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
		mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
	},

#ifdef STABLE_MEMORY_STORAGE

	/*
	 * main memory
	 *
	 * This entry has no truncate routine (explicit NULL) and lists only 17
	 * of the 18 members: the trailing smgr_sync slot is left out and is
	 * therefore zero-initialized, i.e. NULL.
	 */
	{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
		mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
	mmnblocks, NULL, mmcommit, mmabort},
#endif
};

/*
 * This array records which storage managers are write-once, and which
 * support overwrite.  A 'true' entry means that the storage manager is
 * write-once.	In the best of all possible worlds, there would be no
 * write-once storage managers.
 *
 * The array is compiled only under NOT_USED, together with its sole
 * reader in this file, smgriswo().
 */

#ifdef NOT_USED
static bool smgrwo[] = {
	false,						/* magnetic disk */
#ifdef STABLE_MEMORY_STORAGE
	false,						/* main memory */
#endif
};

#endif

/* Number of entries in smgrsw[]; bounds the loops over all managers. */
static int	NSmgr = lengthof(smgrsw);

/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
 * transaction is aborted.	Conversely, a deletion request is NOT
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * prematurely.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
	int16		which;			/* which storage manager? */
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
	struct PendingRelDelete *next;		/* linked-list link */
} PendingRelDelete;

/*
 * New entries are pushed at the head by smgrcreate() and smgrunlink();
 * smgrDoPendingDeletes() consumes and frees the whole list.
 */
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */


/*
 *	smgrinit(), smgrshutdown() -- Initialize or shut down all storage
 *								  managers.
* */ int smgrinit() { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_init) { if ((*(smgrsw[i].smgr_init)) () == SM_FAIL) elog(FATAL, "initialization failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } /* register the shutdown proc */ on_proc_exit(smgrshutdown, 0); return SM_SUCCESS; } static void smgrshutdown(void) { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_shutdown) { if ((*(smgrsw[i].smgr_shutdown)) () == SM_FAIL) elog(FATAL, "shutdown failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } } /* * smgrcreate() -- Create a new relation. * * This routine takes a reldesc, creates the relation on the appropriate * device, and returns a file descriptor for it. */ int smgrcreate(int16 which, Relation reln) { int fd; PendingRelDelete *pending; if ((fd = (*(smgrsw[which].smgr_create)) (reln)) < 0) elog(ERROR, "cannot create %s: %m", RelationGetRelationName(reln)); /* Add the relation to the list of stuff to delete at abort */ pending = (PendingRelDelete *) MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete)); pending->relnode = reln->rd_node; pending->which = which; pending->atCommit = false; /* delete if abort */ pending->next = pendingDeletes; pendingDeletes = pending; return fd; } /* * smgrunlink() -- Unlink a relation. * * The relation is removed from the store. Actually, we just remember * that we want to do this at transaction commit. 
*/ int smgrunlink(int16 which, Relation reln) { PendingRelDelete *pending; /* Make sure the file is closed */ if (reln->rd_fd >= 0) smgrclose(which, reln); /* Add the relation to the list of stuff to delete at commit */ pending = (PendingRelDelete *) MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete)); pending->relnode = reln->rd_node; pending->which = which; pending->atCommit = true; /* delete if commit */ pending->next = pendingDeletes; pendingDeletes = pending; /* * NOTE: if the relation was created in this transaction, it will now * be present in the pending-delete list twice, once with atCommit * true and once with atCommit false. Hence, it will be physically * deleted at end of xact in either case (and the other entry will be * ignored by smgrDoPendingDeletes, so no error will occur). We could * instead remove the existing list entry and delete the physical file * immediately, but for now I'll keep the logic simple. */ return SM_SUCCESS; } /* * smgrextend() -- Add a new block to a file. * * The semantics are basically the same as smgrwrite(): write at the * specified position. However, we are expecting to extend the * relation (ie, blocknum is the current EOF), and so in case of * failure we clean up by truncating. * * Returns SM_SUCCESS on success; aborts the current transaction on * failure. */ int smgrextend(int16 which, Relation reln, BlockNumber blocknum, char *buffer) { int status; status = (*(smgrsw[which].smgr_extend)) (reln, blocknum, buffer); if (status == SM_FAIL) elog(ERROR, "cannot extend %s: %m.\n\tCheck free disk space.", RelationGetRelationName(reln)); return status; } /* * smgropen() -- Open a relation using a particular storage manager. * * Returns the fd for the open relation on success. * * On failure, returns -1 if failOK, else aborts the transaction. 
*/ int smgropen(int16 which, Relation reln, bool failOK) { int fd; if (reln->rd_rel->relkind == RELKIND_VIEW) return -1; if ((fd = (*(smgrsw[which].smgr_open)) (reln)) < 0) if (!failOK) elog(ERROR, "cannot open %s: %m", RelationGetRelationName(reln)); return fd; } /* * smgrclose() -- Close a relation. * * Returns SM_SUCCESS on success, aborts on failure. */ int smgrclose(int16 which, Relation reln) { if ((*(smgrsw[which].smgr_close)) (reln) == SM_FAIL) elog(ERROR, "cannot close %s: %m", RelationGetRelationName(reln)); return SM_SUCCESS; } /* * smgrread() -- read a particular block from a relation into the supplied * buffer. * * This routine is called from the buffer manager in order to * instantiate pages in the shared buffer cache. All storage managers * return pages in the format that POSTGRES expects. This routine * dispatches the read. On success, it returns SM_SUCCESS. On failure, * the current transaction is aborted. */ int smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer) { int status; status = (*(smgrsw[which].smgr_read)) (reln, blocknum, buffer); if (status == SM_FAIL) elog(ERROR, "cannot read block %d of %s: %m", blocknum, RelationGetRelationName(reln)); return status; } /* * smgrwrite() -- Write the supplied buffer out. * * This is not a synchronous write -- the interface for that is * smgrflush(). The buffer is written out via the appropriate * storage manager. This routine returns SM_SUCCESS or aborts * the current transaction. */ int smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer) { int status; status = (*(smgrsw[which].smgr_write)) (reln, blocknum, buffer); if (status == SM_FAIL) elog(ERROR, "cannot write block %d of %s: %m", blocknum, RelationGetRelationName(reln)); return status; } /* * smgrflush() -- A synchronous smgrwrite(). 
*/ int smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer) { int status; status = (*(smgrsw[which].smgr_flush)) (reln, blocknum, buffer); if (status == SM_FAIL) elog(ERROR, "cannot flush block %d of %s to stable store: %m", blocknum, RelationGetRelationName(reln)); return status; } /* * smgrblindwrt() -- Write a page out blind. * * In some cases, we may find a page in the buffer cache that we * can't make a reldesc for. This happens, for example, when we * want to reuse a dirty page that was written by a transaction * that has not yet committed, which created a new relation. In * this case, the buffer manager will call smgrblindwrt() with * the name and OID of the database and the relation to which the * buffer belongs. Every storage manager must be able to force * this page down to stable storage in this circumstance. The * write should be synchronous if dofsync is true. */ int smgrblindwrt(int16 which, RelFileNode rnode, BlockNumber blkno, char *buffer, bool dofsync) { int status; status = (*(smgrsw[which].smgr_blindwrt)) (rnode, blkno, buffer, dofsync); if (status == SM_FAIL) elog(ERROR, "cannot write block %d of %u/%u blind: %m", blkno, rnode.tblNode, rnode.relNode); return status; } /* * smgrmarkdirty() -- Mark a page dirty (needs fsync). * * Mark the specified page as needing to be fsync'd before commit. * Ordinarily, the storage manager will do this implicitly during * smgrwrite(). However, the buffer manager may discover that some * other backend has written a buffer that we dirtied in the current * transaction. In that case, we still need to fsync the file to be * sure the page is down to disk before we commit. */ int smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno) { int status; status = (*(smgrsw[which].smgr_markdirty)) (reln, blkno); if (status == SM_FAIL) elog(ERROR, "cannot mark block %d of %s: %m", blkno, RelationGetRelationName(reln)); return status; } /* * smgrblindmarkdirty() -- Mark a page dirty, "blind". 
* * Just like smgrmarkdirty, except we don't have a reldesc. */ int smgrblindmarkdirty(int16 which, RelFileNode rnode, BlockNumber blkno) { int status; status = (*(smgrsw[which].smgr_blindmarkdirty)) (rnode, blkno); if (status == SM_FAIL) elog(ERROR, "cannot mark block %d of %u/%u blind: %m", blkno, rnode.tblNode, rnode.relNode); return status; } /* * smgrnblocks() -- Calculate the number of POSTGRES blocks in the * supplied relation. * * Returns the number of blocks on success, aborts the current * transaction on failure. */ int smgrnblocks(int16 which, Relation reln) { int nblocks; if ((nblocks = (*(smgrsw[which].smgr_nblocks)) (reln)) < 0) elog(ERROR, "cannot count blocks for %s: %m", RelationGetRelationName(reln)); return nblocks; } /* * smgrtruncate() -- Truncate supplied relation to a specified number * of blocks * * Returns the number of blocks on success, aborts the current * transaction on failure. */ int smgrtruncate(int16 which, Relation reln, int nblocks) { int newblks; newblks = nblocks; if (smgrsw[which].smgr_truncate) { if ((newblks = (*(smgrsw[which].smgr_truncate)) (reln, nblocks)) < 0) elog(ERROR, "cannot truncate %s to %d blocks: %m", RelationGetRelationName(reln), nblocks); } return newblks; } /* * smgrDoPendingDeletes() -- take care of relation deletes at end of xact. */ int smgrDoPendingDeletes(bool isCommit) { while (pendingDeletes != NULL) { PendingRelDelete *pending = pendingDeletes; pendingDeletes = pending->next; if (pending->atCommit == isCommit) { /* * Get rid of any leftover buffers for the rel (shouldn't be * any in the commit case, but there can be in the abort * case). */ DropRelFileNodeBuffers(pending->relnode); /* * And delete the physical files. * * Note: we treat deletion failure as a NOTICE, not an error, * because we've already decided to commit or abort the * current xact. 
*/ if ((*(smgrsw[pending->which].smgr_unlink)) (pending->relnode) == SM_FAIL) elog(NOTICE, "cannot unlink %u/%u: %m", pending->relnode.tblNode, pending->relnode.relNode); } pfree(pending); } return SM_SUCCESS; } /* * smgrcommit(), smgrabort() -- Commit or abort changes made during the * current transaction. */ int smgrcommit() { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_commit) { if ((*(smgrsw[i].smgr_commit)) () == SM_FAIL) elog(FATAL, "transaction commit failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } return SM_SUCCESS; } int smgrabort() { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_abort) { if ((*(smgrsw[i].smgr_abort)) () == SM_FAIL) elog(FATAL, "transaction abort failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } return SM_SUCCESS; } int smgrsync() { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_sync) { if ((*(smgrsw[i].smgr_sync)) () == SM_FAIL) elog(STOP, "storage sync failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } return SM_SUCCESS; } #ifdef NOT_USED bool smgriswo(int16 smgrno) { if (smgrno < 0 || smgrno >= NSmgr) elog(ERROR, "illegal storage manager number %d", smgrno); return smgrwo[smgrno]; } #endif void smgr_redo(XLogRecPtr lsn, XLogRecord *record) { } void smgr_undo(XLogRecPtr lsn, XLogRecord *record) { } void smgr_desc(char *buf, uint8 xl_info, char *rec) { }