/*------------------------------------------------------------------------- * * smgr.c * public interface routines to storage manager switch. * * All file system operations in POSTGRES dispatch through these * routines. * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.69 2004/02/10 01:55:26 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/ipc.h" #include "storage/smgr.h" #include "utils/hsearch.h" #include "utils/memutils.h" /* * This struct of function pointers defines the API between smgr.c and * any individual storage manager module. Note that smgr subfunctions are * generally expected to return TRUE on success, FALSE on error. (For * nblocks and truncate we instead say that returning InvalidBlockNumber * indicates an error.) 
*/ typedef struct f_smgr { bool (*smgr_init) (void); /* may be NULL */ bool (*smgr_shutdown) (void); /* may be NULL */ bool (*smgr_close) (SMgrRelation reln); bool (*smgr_create) (SMgrRelation reln, bool isRedo); bool (*smgr_unlink) (RelFileNode rnode, bool isRedo); bool (*smgr_extend) (SMgrRelation reln, BlockNumber blocknum, char *buffer); bool (*smgr_read) (SMgrRelation reln, BlockNumber blocknum, char *buffer); bool (*smgr_write) (SMgrRelation reln, BlockNumber blocknum, char *buffer); BlockNumber (*smgr_nblocks) (SMgrRelation reln); BlockNumber (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks); bool (*smgr_commit) (void); /* may be NULL */ bool (*smgr_abort) (void); /* may be NULL */ bool (*smgr_sync) (void); /* may be NULL */ } f_smgr; static const f_smgr smgrsw[] = { /* magnetic disk */ {mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend, mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync } }; static const int NSmgr = lengthof(smgrsw); /* * Each backend has a hashtable that stores all extant SMgrRelation objects. */ static HTAB *SMgrRelationHash = NULL; /* * We keep a list of all relations (represented as RelFileNode values) * that have been created or deleted in the current transaction. When * a relation is created, we create the physical file immediately, but * remember it so that we can delete the file again if the current * transaction is aborted. Conversely, a deletion request is NOT * executed immediately, but is just entered in the list. When and if * the transaction commits, we can delete the physical file. * * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear * unbetimes. It'd probably be OK to keep it in TopTransactionContext, * but I'm being paranoid. */ typedef struct PendingRelDelete { RelFileNode relnode; /* relation that may need to be deleted */ int which; /* which storage manager? */ bool isTemp; /* is it a temporary relation? 
*/ bool atCommit; /* T=delete at commit; F=delete at abort */ struct PendingRelDelete *next; /* linked-list link */ } PendingRelDelete; static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ /* local function prototypes */ static void smgrshutdown(int code, Datum arg); static void smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo); /* * smgrinit(), smgrshutdown() -- Initialize or shut down all storage * managers. * * Note: in the normal multiprocess scenario with a postmaster, these are * called at postmaster start and stop, not per-backend. */ void smgrinit(void) { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_init) { if (! (*(smgrsw[i].smgr_init)) ()) elog(FATAL, "smgr initialization failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } /* register the shutdown proc */ on_proc_exit(smgrshutdown, 0); } static void smgrshutdown(int code, Datum arg) { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_shutdown) { if (! (*(smgrsw[i].smgr_shutdown)) ()) elog(FATAL, "smgr shutdown failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } } /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * * This does not attempt to actually open the object. 
*/
SMgrRelation
smgropen(RelFileNode rnode)
{
	SMgrRelation entry;
	bool		existed;

	/* Build the hash table lazily, on the first call in this backend */
	if (SMgrRelationHash == NULL)
	{
		HASHCTL		info;

		MemSet(&info, 0, sizeof(info));
		info.keysize = sizeof(RelFileNode);
		info.entrysize = sizeof(SMgrRelationData);
		info.hash = tag_hash;
		SMgrRelationHash = hash_create("smgr relation table", 400,
									   &info, HASH_ELEM | HASH_FUNCTION);
	}

	/* Find the entry for this rnode, making a new one if there is none */
	entry = (SMgrRelation) hash_search(SMgrRelationHash,
									   (void *) &rnode,
									   HASH_ENTER, &existed);
	if (entry == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	/* A pre-existing entry is returned exactly as we found it */
	if (existed)
		return entry;

	/*
	 * Brand-new entry: hash_search already filled in the lookup key, so
	 * only the remaining fields need to be set up.
	 */
	entry->smgr_which = 0;		/* we only have md.c at present */
	entry->md_fd = NULL;		/* mark it not open */

	return entry;
}

/*
 *	smgrclose() -- Close and delete an SMgrRelation object.
 *
 * The storage manager's close routine runs first; only if it succeeds is
 * the object removed from the hash table.  Callers must therefore clear
 * their own pointers to the object after a successful return, and must
 * expect the object to still exist if an error is raised by the close.
 */
void
smgrclose(SMgrRelation reln)
{
	const f_smgr *mgr = &smgrsw[reln->smgr_which];
	bool		closed_ok;
	void	   *removed;

	closed_ok = mgr->smgr_close(reln);
	if (!closed_ok)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close relation %u/%u: %m",
						reln->smgr_rnode.tblNode,
						reln->smgr_rnode.relNode)));

	/* The object is its own hash entry; look it up by its key to drop it */
	removed = hash_search(SMgrRelationHash,
						  (void *) &(reln->smgr_rnode),
						  HASH_REMOVE, NULL);
	if (removed == NULL)
		elog(ERROR, "SMgrRelation hashtable corrupted");
}

/*
 *	smgrcloseall() -- Close all existing SMgrRelation objects.
 *
 * It is the caller's responsibility not to leave any dangling references.
*/ void smgrcloseall(void) { HASH_SEQ_STATUS status; SMgrRelation reln; /* Nothing to do if hashtable not set up */ if (SMgrRelationHash == NULL) return; hash_seq_init(&status, SMgrRelationHash); while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL) { smgrclose(reln); } } /* * smgrclosenode() -- Close SMgrRelation object for given RelFileNode, * if one exists. * * This has the same effects as smgrclose(smgropen(rnode)), but it avoids * uselessly creating a hashtable entry only to drop it again when no * such entry exists already. * * It is the caller's responsibility not to leave any dangling references. */ void smgrclosenode(RelFileNode rnode) { SMgrRelation reln; /* Nothing to do if hashtable not set up */ if (SMgrRelationHash == NULL) return; reln = (SMgrRelation) hash_search(SMgrRelationHash, (void *) &rnode, HASH_FIND, NULL); if (reln != NULL) smgrclose(reln); } /* * smgrcreate() -- Create a new relation. * * Given an already-created (but presumably unused) SMgrRelation, * cause the underlying disk file or other storage to be created. * * If isRedo is true, it is okay for the underlying file to exist * already because we are in a WAL replay sequence. In this case * we should make no PendingRelDelete entry; the WAL sequence will * tell whether to drop the file. */ void smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo) { PendingRelDelete *pending; if (! 
(*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create relation %u/%u: %m", reln->smgr_rnode.tblNode, reln->smgr_rnode.relNode))); if (isRedo) return; /* Add the relation to the list of stuff to delete at abort */ pending = (PendingRelDelete *) MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete)); pending->relnode = reln->smgr_rnode; pending->which = reln->smgr_which; pending->isTemp = isTemp; pending->atCommit = false; /* delete if abort */ pending->next = pendingDeletes; pendingDeletes = pending; } /* * smgrscheduleunlink() -- Schedule unlinking a relation at xact commit. * * The relation is marked to be removed from the store if we * successfully commit the current transaction. * * This also implies smgrclose() on the SMgrRelation object. */ void smgrscheduleunlink(SMgrRelation reln, bool isTemp) { PendingRelDelete *pending; /* Add the relation to the list of stuff to delete at commit */ pending = (PendingRelDelete *) MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete)); pending->relnode = reln->smgr_rnode; pending->which = reln->smgr_which; pending->isTemp = isTemp; pending->atCommit = true; /* delete if commit */ pending->next = pendingDeletes; pendingDeletes = pending; /* * NOTE: if the relation was created in this transaction, it will now * be present in the pending-delete list twice, once with atCommit * true and once with atCommit false. Hence, it will be physically * deleted at end of xact in either case (and the other entry will be * ignored by smgrDoPendingDeletes, so no error will occur). We could * instead remove the existing list entry and delete the physical file * immediately, but for now I'll keep the logic simple. */ /* Now close the file and throw away the hashtable entry */ smgrclose(reln); } /* * smgrdounlink() -- Immediately unlink a relation. * * The relation is removed from the store. 
This should not be used * during transactional operations, since it can't be undone. * * If isRedo is true, it is okay for the underlying file to be gone * already. (In practice isRedo will always be true.) * * This also implies smgrclose() on the SMgrRelation object. */ void smgrdounlink(SMgrRelation reln, bool isTemp, bool isRedo) { RelFileNode rnode = reln->smgr_rnode; int which = reln->smgr_which; /* Close the file and throw away the hashtable entry */ smgrclose(reln); smgr_internal_unlink(rnode, which, isTemp, isRedo); } /* * Shared subroutine that actually does the unlink ... */ static void smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo) { /* * Get rid of any leftover buffers for the rel (shouldn't be any in the * commit case, but there can be in the abort case). */ DropRelFileNodeBuffers(rnode, isTemp); /* * Tell the free space map to forget this relation. It won't be accessed * any more anyway, but we may as well recycle the map space quickly. */ FreeSpaceMapForgetRel(&rnode); /* * And delete the physical files. * * Note: we treat deletion failure as a WARNING, not an error, * because we've already decided to commit or abort the current xact. */ if (! (*(smgrsw[which].smgr_unlink)) (rnode, isRedo)) ereport(WARNING, (errcode_for_file_access(), errmsg("could not unlink relation %u/%u: %m", rnode.tblNode, rnode.relNode))); } /* * smgrextend() -- Add a new block to a file. * * The semantics are basically the same as smgrwrite(): write at the * specified position. However, we are expecting to extend the * relation (ie, blocknum is the current EOF), and so in case of * failure we clean up by truncating. */ void smgrextend(SMgrRelation reln, BlockNumber blocknum, char *buffer) { if (! 
(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, blocknum, buffer)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not extend relation %u/%u: %m", reln->smgr_rnode.tblNode, reln->smgr_rnode.relNode), errhint("Check free disk space."))); } /* * smgrread() -- read a particular block from a relation into the supplied * buffer. * * This routine is called from the buffer manager in order to * instantiate pages in the shared buffer cache. All storage managers * return pages in the format that POSTGRES expects. */ void smgrread(SMgrRelation reln, BlockNumber blocknum, char *buffer) { if (! (*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read block %u of relation %u/%u: %m", blocknum, reln->smgr_rnode.tblNode, reln->smgr_rnode.relNode))); } /* * smgrwrite() -- Write the supplied buffer out. * * This is not a synchronous write -- the block is not necessarily * on disk at return, only dumped out to the kernel. */ void smgrwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer) { if (! (*(smgrsw[reln->smgr_which].smgr_write)) (reln, blocknum, buffer)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write block %u of relation %u/%u: %m", blocknum, reln->smgr_rnode.tblNode, reln->smgr_rnode.relNode))); } /* * smgrnblocks() -- Calculate the number of blocks in the * supplied relation. * * Returns the number of blocks on success, aborts the current * transaction on failure. */ BlockNumber smgrnblocks(SMgrRelation reln) { BlockNumber nblocks; nblocks = (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln); /* * NOTE: if a relation ever did grow to 2^32-1 blocks, this code would * fail --- but that's a good thing, because it would stop us from * extending the rel another block and having a block whose number * actually is InvalidBlockNumber. 
*/ if (nblocks == InvalidBlockNumber) ereport(ERROR, (errcode_for_file_access(), errmsg("could not count blocks of relation %u/%u: %m", reln->smgr_rnode.tblNode, reln->smgr_rnode.relNode))); return nblocks; } /* * smgrtruncate() -- Truncate supplied relation to the specified number * of blocks * * Returns the number of blocks on success, aborts the current * transaction on failure. */ BlockNumber smgrtruncate(SMgrRelation reln, BlockNumber nblocks) { BlockNumber newblks; /* * Tell the free space map to forget anything it may have stored * for the about-to-be-deleted blocks. We want to be sure it * won't return bogus block numbers later on. */ FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks); newblks = (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, nblocks); if (newblks == InvalidBlockNumber) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate relation %u/%u to %u blocks: %m", reln->smgr_rnode.tblNode, reln->smgr_rnode.relNode, nblocks))); return newblks; } /* * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact. */ void smgrDoPendingDeletes(bool isCommit) { while (pendingDeletes != NULL) { PendingRelDelete *pending = pendingDeletes; pendingDeletes = pending->next; if (pending->atCommit == isCommit) smgr_internal_unlink(pending->relnode, pending->which, pending->isTemp, false); pfree(pending); } } /* * smgrcommit() -- Prepare to commit changes made during the current * transaction. * * This is called before we actually commit. */ void smgrcommit(void) { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_commit) { if (! (*(smgrsw[i].smgr_commit)) ()) elog(FATAL, "transaction commit failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } } /* * smgrabort() -- Abort changes made during the current transaction. */ void smgrabort(void) { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_abort) { if (! 
(*(smgrsw[i].smgr_abort)) ()) elog(FATAL, "transaction abort failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } } /* * smgrsync() -- Sync files to disk at checkpoint time. */ void smgrsync(void) { int i; for (i = 0; i < NSmgr; i++) { if (smgrsw[i].smgr_sync) { if (! (*(smgrsw[i].smgr_sync)) ()) elog(PANIC, "storage sync failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } } } void smgr_redo(XLogRecPtr lsn, XLogRecord *record) { } void smgr_undo(XLogRecPtr lsn, XLogRecord *record) { } void smgr_desc(char *buf, uint8 xl_info, char *rec) { }