/*------------------------------------------------------------------------- * * md.c * This code manages relations that reside on magnetic disk. * * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.66 2000/04/10 23:41:51 tgl Exp $ * *------------------------------------------------------------------------- */ #include #include #include #include "postgres.h" #include "catalog/catalog.h" #include "miscadmin.h" #include "storage/smgr.h" #include "utils/inval.h" /* ImmediateSharedRelationCacheInvalidate() */ #undef DIAGNOSTIC /* * The magnetic disk storage manager keeps track of open file descriptors * in its own descriptor pool. This happens for two reasons. First, at * transaction boundaries, we walk the list of descriptors and flush * anything that we've dirtied in the current transaction. Second, we want * to support relations larger than the OS' file size limit (often 2GBytes). * In order to do that, we break relations up into chunks of < 2GBytes * and store one chunk in each of several files that represent the relation. * See the BLCKSZ and RELSEG_SIZE configuration constants in include/config.h. * * The file descriptor stored in the relation cache (see RelationGetFile()) * is actually an index into the Md_fdvec array. -1 indicates not open. * * When a relation is broken into multiple chunks, only the first chunk * has its own entry in the Md_fdvec array; the remaining chunks have * palloc'd MdfdVec objects that are chained onto the first chunk via the * mdfd_chain links. All chunks except the last MUST have size exactly * equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate(). */ typedef struct _MdfdVec { int mdfd_vfd; /* fd number in vfd pool */ int mdfd_flags; /* fd status flags */ /* these are the assigned bits in mdfd_flags: */ #define MDFD_FREE (1 << 0)/* unused entry */ int mdfd_lstbcnt; /* most recent block count */ int mdfd_nextFree; /* next free vector */ #ifndef LET_OS_MANAGE_FILESIZE struct _MdfdVec *mdfd_chain;/* for large relations */ #endif } MdfdVec; static int Nfds = 100; /* initial/current size of Md_fdvec array */ static MdfdVec *Md_fdvec = (MdfdVec *) NULL; static int Md_Free = -1; /* head of freelist of unused fdvec entries */ static int CurFd = 0; /* first never-used fdvec index */ static MemoryContext MdCxt; /* context for all my allocations */ /* routines declared here */ static void mdclose_fd(int fd); static int _mdfd_getrelnfd(Relation reln); static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags); static MdfdVec *_mdfd_getseg(Relation reln, int blkno); static int _mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid, int blkno); static int _fdvec_alloc(void); static void _fdvec_free(int); static BlockNumber _mdnblocks(File file, Size blcksz); /* * mdinit() -- Initialize private state for magnetic disk storage manager. * * We keep a private table of all file descriptors. Whenever we do * a write to one, we mark it dirty in our table. Whenever we force * changes to disk, we mark the file descriptor clean. At transaction * commit, we force changes to disk for all dirty file descriptors. * This routine allocates and initializes the table. * * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. */ int mdinit() { MemoryContext oldcxt; int i; MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr"); if (MdCxt == (MemoryContext) NULL) return SM_FAIL; oldcxt = MemoryContextSwitchTo(MdCxt); Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec)); MemoryContextSwitchTo(oldcxt); if (Md_fdvec == (MdfdVec *) NULL) return SM_FAIL; MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec)); /* Set free list */ for (i = 0; i < Nfds; i++) { Md_fdvec[i].mdfd_nextFree = i + 1; Md_fdvec[i].mdfd_flags = MDFD_FREE; } Md_Free = 0; Md_fdvec[Nfds - 1].mdfd_nextFree = -1; return SM_SUCCESS; } int mdcreate(Relation reln) { int fd, vfd; char *path; Assert(reln->rd_unlinked && reln->rd_fd < 0); path = relpath(RelationGetPhysicalRelationName(reln)); #ifndef __CYGWIN32__ fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600); #else fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600); #endif /* * During bootstrap processing, we skip that check, because pg_time, * pg_variable, and pg_log get created before their .bki file entries * are processed. * * For cataloged relations,pg_class is guaranteed to have an unique * record with the same relname by the unique index. * So we are able to reuse existent files for new catloged relations. * Currently we reuse them in the following cases. * 1. they are empty. * 2. they are used for Index relations and their size == BLCKSZ * 2. */ if (fd < 0) { if (!IsBootstrapProcessingMode() && reln->rd_rel->relkind == RELKIND_UNCATALOGED) return -1; #ifndef __CYGWIN32__ fd = FileNameOpenFile(path, O_RDWR, 0600); #else fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600); #endif if (fd < 0) return -1; if (!IsBootstrapProcessingMode()) { bool reuse = false; int len = FileSeek(fd, 0L, SEEK_END); if (len == 0) reuse = true; else if (reln->rd_rel->relkind == RELKIND_INDEX && len == BLCKSZ * 2) reuse = true; if (!reuse) { FileClose(fd); return -1; } } } reln->rd_unlinked = false; vfd = _fdvec_alloc(); if (vfd < 0) return -1; Md_fdvec[vfd].mdfd_vfd = fd; Md_fdvec[vfd].mdfd_flags = (uint16) 0; #ifndef LET_OS_MANAGE_FILESIZE Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL; #endif Md_fdvec[vfd].mdfd_lstbcnt = 0; pfree(path); return vfd; } /* * mdunlink() -- Unlink a relation. */ int mdunlink(Relation reln) { int nblocks; int fd; MdfdVec *v; MemoryContext oldcxt; /* If the relation is already unlinked,we have nothing to do * any more. */ if (reln->rd_unlinked && reln->rd_fd < 0) return SM_SUCCESS; /* * Force all segments of the relation to be opened, so that we * won't miss deleting any of them. */ nblocks = mdnblocks(reln); /* * Clean out the mdfd vector, letting fd.c unlink the physical files. * * NOTE: We truncate the file(s) before deleting 'em, because if other * backends are holding the files open, the unlink will fail on some * platforms (think Microsoft). Better a zero-size file gets left around * than a big file. Those other backends will be forced to close the * relation by cache invalidation, but that probably hasn't happened yet. */ fd = RelationGetFile(reln); if (fd < 0) /* should not happen */ elog(ERROR, "mdunlink: mdnblocks didn't open relation"); Md_fdvec[fd].mdfd_flags = (uint16) 0; oldcxt = MemoryContextSwitchTo(MdCxt); #ifndef LET_OS_MANAGE_FILESIZE for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;) { MdfdVec *ov = v; FileTruncate(v->mdfd_vfd, 0); FileUnlink(v->mdfd_vfd); v = v->mdfd_chain; if (ov != &Md_fdvec[fd]) pfree(ov); } Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL; #else v = &Md_fdvec[fd]; FileTruncate(v->mdfd_vfd, 0); FileUnlink(v->mdfd_vfd); #endif MemoryContextSwitchTo(oldcxt); _fdvec_free(fd); /* be sure to mark relation closed && unlinked */ reln->rd_fd = -1; reln->rd_unlinked = true; return SM_SUCCESS; } /* * mdextend() -- Add a block to the specified relation. * * This routine returns SM_FAIL or SM_SUCCESS, with errno set as * appropriate. */ int mdextend(Relation reln, char *buffer) { long pos, nbytes; int nblocks; MdfdVec *v; nblocks = mdnblocks(reln); v = _mdfd_getseg(reln, nblocks); if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0) return SM_FAIL; if (pos % BLCKSZ != 0) /* the last block is incomplete */ { pos -= pos % BLCKSZ; if (FileSeek(v->mdfd_vfd, pos, SEEK_SET) < 0) return SM_FAIL; } if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) { if (nbytes > 0) { FileTruncate(v->mdfd_vfd, pos); FileSeek(v->mdfd_vfd, pos, SEEK_SET); } return SM_FAIL; } /* try to keep the last block count current, though it's just a hint */ #ifndef LET_OS_MANAGE_FILESIZE if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0) v->mdfd_lstbcnt = RELSEG_SIZE; #ifdef DIAGNOSTIC if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE || v->mdfd_lstbcnt > RELSEG_SIZE) elog(FATAL, "segment too big!"); #endif #else v->mdfd_lstbcnt = ++nblocks; #endif return SM_SUCCESS; } /* * mdopen() -- Open the specified relation. */ int mdopen(Relation reln) { char *path; int fd; int vfd; Assert(reln->rd_fd < 0); path = relpath(RelationGetPhysicalRelationName(reln)); #ifndef __CYGWIN32__ fd = FileNameOpenFile(path, O_RDWR, 0600); #else fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600); #endif if (fd < 0) { /* in bootstrap mode, accept mdopen as substitute for mdcreate */ if (IsBootstrapProcessingMode()) { #ifndef __CYGWIN32__ fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600); #else fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600); #endif } if (fd < 0) { elog(NOTICE, "mdopen: couldn't open %s: %m", path); /* mark relation closed and unlinked */ reln->rd_fd = -1; reln->rd_unlinked = true; return -1; } } reln->rd_unlinked = false; vfd = _fdvec_alloc(); if (vfd < 0) return -1; Md_fdvec[vfd].mdfd_vfd = fd; Md_fdvec[vfd].mdfd_flags = (uint16) 0; Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); #ifndef LET_OS_MANAGE_FILESIZE Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL; #ifdef DIAGNOSTIC if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE) elog(FATAL, "segment too big on relopen!"); #endif #endif pfree(path); return vfd; } /* * mdclose() -- Close the specified relation, if it isn't closed already. * * AND FREE fd vector! It may be re-used for other relation! * reln should be flushed from cache after closing !.. * * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. */ int mdclose(Relation reln) { int fd; fd = RelationGetFile(reln); if (fd < 0) return SM_SUCCESS; /* already closed, so no work */ mdclose_fd(fd); reln->rd_fd = -1; return SM_SUCCESS; } static void mdclose_fd(int fd) { MdfdVec *v; MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(MdCxt); #ifndef LET_OS_MANAGE_FILESIZE for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;) { MdfdVec *ov = v; /* if not closed already */ if (v->mdfd_vfd >= 0) { /* * We sync the file descriptor so that we don't need to reopen * it at transaction commit to force changes to disk. (This * is not really optional, because we are about to forget that * the file even exists...) */ FileSync(v->mdfd_vfd); FileClose(v->mdfd_vfd); } /* Now free vector */ v = v->mdfd_chain; if (ov != &Md_fdvec[fd]) pfree(ov); } Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL; #else v = &Md_fdvec[fd]; if (v != (MdfdVec *) NULL) { if (v->mdfd_vfd >= 0) { /* * We sync the file descriptor so that we don't need to reopen * it at transaction commit to force changes to disk. (This * is not really optional, because we are about to forget that * the file even exists...) */ FileSync(v->mdfd_vfd); FileClose(v->mdfd_vfd); } } #endif MemoryContextSwitchTo(oldcxt); _fdvec_free(fd); } /* * mdread() -- Read the specified block from a relation. * * Returns SM_SUCCESS or SM_FAIL. */ int mdread(Relation reln, BlockNumber blocknum, char *buffer) { int status; long seekpos; int nbytes; MdfdVec *v; v = _mdfd_getseg(reln, blocknum); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); #ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big!"); #endif #else seekpos = (long) (BLCKSZ * (blocknum)); #endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; status = SM_SUCCESS; if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) { if (nbytes == 0) MemSet(buffer, 0, BLCKSZ); else if (blocknum == 0 && nbytes > 0 && mdnblocks(reln) == 0) MemSet(buffer, 0, BLCKSZ); else status = SM_FAIL; } return status; } /* * mdwrite() -- Write the supplied block at the appropriate location. * * Returns SM_SUCCESS or SM_FAIL. */ int mdwrite(Relation reln, BlockNumber blocknum, char *buffer) { int status; long seekpos; MdfdVec *v; v = _mdfd_getseg(reln, blocknum); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); #ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big!"); #endif #else seekpos = (long) (BLCKSZ * (blocknum)); #endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; status = SM_SUCCESS; if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) status = SM_FAIL; return status; } /* * mdflush() -- Synchronously write a block to disk. * * This is exactly like mdwrite(), but doesn't return until the file * system buffer cache has been flushed. */ int mdflush(Relation reln, BlockNumber blocknum, char *buffer) { int status; long seekpos; MdfdVec *v; v = _mdfd_getseg(reln, blocknum); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); #ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big!"); #endif #else seekpos = (long) (BLCKSZ * (blocknum)); #endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; /* write and sync the block */ status = SM_SUCCESS; if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ || FileSync(v->mdfd_vfd) < 0) status = SM_FAIL; return status; } /* * mdblindwrt() -- Write a block to disk blind. * * We have to be able to do this using only the name and OID of * the database and relation in which the block belongs. Otherwise * this is much like mdwrite(). If dofsync is TRUE, then we fsync * the file, making it more like mdflush(). */ int mdblindwrt(char *dbname, char *relname, Oid dbid, Oid relid, BlockNumber blkno, char *buffer, bool dofsync) { int status; long seekpos; int fd; fd = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); if (fd < 0) return SM_FAIL; #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE)); #ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big!"); #endif #else seekpos = (long) (BLCKSZ * (blkno)); #endif if (lseek(fd, seekpos, SEEK_SET) != seekpos) { close(fd); return SM_FAIL; } status = SM_SUCCESS; /* write and optionally sync the block */ if (write(fd, buffer, BLCKSZ) != BLCKSZ) status = SM_FAIL; else if (dofsync && pg_fsync(fd) < 0) status = SM_FAIL; if (close(fd) < 0) status = SM_FAIL; return status; } /* * mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync). * * Returns SM_SUCCESS or SM_FAIL. */ int mdmarkdirty(Relation reln, BlockNumber blkno) { MdfdVec *v; v = _mdfd_getseg(reln, blkno); FileMarkDirty(v->mdfd_vfd); return SM_SUCCESS; } /* * mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync). * * We have to be able to do this using only the name and OID of * the database and relation in which the block belongs. Otherwise * this is much like mdmarkdirty(). However, we do the fsync immediately * rather than building md/fd datastructures to postpone it till later. */ int mdblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid, BlockNumber blkno) { int status; int fd; fd = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); if (fd < 0) return SM_FAIL; status = SM_SUCCESS; if (pg_fsync(fd) < 0) status = SM_FAIL; if (close(fd) < 0) status = SM_FAIL; return status; } /* * mdnblocks() -- Get the number of blocks stored in a relation. * * Important side effect: all segments of the relation are opened * and added to the mdfd_chain list. If this routine has not been * called, then only segments up to the last one actually touched * are present in the chain... * * Returns # of blocks, elog's on error. */ int mdnblocks(Relation reln) { int fd; MdfdVec *v; #ifndef LET_OS_MANAGE_FILESIZE int nblocks; int segno; #endif fd = _mdfd_getrelnfd(reln); v = &Md_fdvec[fd]; #ifndef LET_OS_MANAGE_FILESIZE segno = 0; for (;;) { nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ); if (nblocks > RELSEG_SIZE) elog(FATAL, "segment too big in mdnblocks!"); v->mdfd_lstbcnt = nblocks; if (nblocks == RELSEG_SIZE) { segno++; if (v->mdfd_chain == (MdfdVec *) NULL) { v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT); if (v->mdfd_chain == (MdfdVec *) NULL) elog(ERROR, "cannot count blocks for %s -- open failed", RelationGetRelationName(reln)); } v = v->mdfd_chain; } else return (segno * RELSEG_SIZE) + nblocks; } #else return _mdnblocks(v->mdfd_vfd, BLCKSZ); #endif } /* * mdtruncate() -- Truncate relation to specified number of blocks. * * Returns # of blocks or -1 on error. */ int mdtruncate(Relation reln, int nblocks) { int curnblk; int fd; MdfdVec *v; #ifndef LET_OS_MANAGE_FILESIZE MemoryContext oldcxt; int priorblocks; #endif /* NOTE: mdnblocks makes sure we have opened all existing segments, * so that truncate/delete loop will get them all! */ curnblk = mdnblocks(reln); if (nblocks < 0 || nblocks > curnblk) return -1; /* bogus request */ if (nblocks == curnblk) return nblocks; /* no work */ fd = _mdfd_getrelnfd(reln); v = &Md_fdvec[fd]; #ifndef LET_OS_MANAGE_FILESIZE oldcxt = MemoryContextSwitchTo(MdCxt); priorblocks = 0; while (v != (MdfdVec *) NULL) { MdfdVec *ov = v; if (priorblocks > nblocks) { /* This segment is no longer wanted at all (and has already been * unlinked from the mdfd_chain). * We truncate the file before deleting it because if other * backends are holding the file open, the unlink will fail on * some platforms. Better a zero-size file gets left around than * a big file... */ FileTruncate(v->mdfd_vfd, 0); FileUnlink(v->mdfd_vfd); v = v->mdfd_chain; Assert(ov != &Md_fdvec[fd]); /* we never drop the 1st segment */ pfree(ov); } else if (priorblocks + RELSEG_SIZE > nblocks) { /* This is the last segment we want to keep. * Truncate the file to the right length, and clear chain link * that points to any remaining segments (which we shall zap). * NOTE: if nblocks is exactly a multiple K of RELSEG_SIZE, * we will truncate the K+1st segment to 0 length but keep it. * This is mainly so that the right thing happens if nblocks=0. */ int lastsegblocks = nblocks - priorblocks; if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0) return -1; v->mdfd_lstbcnt = lastsegblocks; v = v->mdfd_chain; ov->mdfd_chain = (MdfdVec *) NULL; } else { /* We still need this segment and 0 or more blocks beyond it, * so nothing to do here. */ v = v->mdfd_chain; } priorblocks += RELSEG_SIZE; } MemoryContextSwitchTo(oldcxt); #else if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0) return -1; v->mdfd_lstbcnt = nblocks; #endif return nblocks; } /* mdtruncate */ /* * mdcommit() -- Commit a transaction. * * All changes to magnetic disk relations must be forced to stable * storage. This routine makes a pass over the private table of * file descriptors. Any descriptors to which we have done writes, * but not synced, are synced here. * * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. */ int mdcommit() { #ifdef XLOG sync(); sleep(1); sync(); return SM_SUCCESS; #else int i; MdfdVec *v; for (i = 0; i < CurFd; i++) { v = &Md_fdvec[i]; if (v->mdfd_flags & MDFD_FREE) continue; /* Sync the file entry */ #ifndef LET_OS_MANAGE_FILESIZE for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain) #else if (v != (MdfdVec *) NULL) #endif { if (FileSync(v->mdfd_vfd) < 0) return SM_FAIL; } } return SM_SUCCESS; #endif /* XLOG */ } /* * mdabort() -- Abort a transaction. * * Changes need not be forced to disk at transaction abort. We mark * all file descriptors as clean here. Always returns SM_SUCCESS. */ int mdabort() { /* We don't actually have to do anything here. fd.c will discard * fsync-needed bits in its AtEOXact_Files() routine. */ return SM_SUCCESS; } /* * _fdvec_alloc () -- grab a free (or new) md file descriptor vector. * */ static int _fdvec_alloc() { MdfdVec *nvec; int fdvec, i; MemoryContext oldcxt; if (Md_Free >= 0) /* get from free list */ { fdvec = Md_Free; Md_Free = Md_fdvec[fdvec].mdfd_nextFree; Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE); Md_fdvec[fdvec].mdfd_flags = 0; if (fdvec >= CurFd) { Assert(fdvec == CurFd); CurFd++; } return fdvec; } /* Must allocate more room */ if (Nfds != CurFd) elog(FATAL, "_fdvec_alloc error"); Nfds *= 2; oldcxt = MemoryContextSwitchTo(MdCxt); nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec)); MemSet(nvec, 0, Nfds * sizeof(MdfdVec)); memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec)); pfree(Md_fdvec); MemoryContextSwitchTo(oldcxt); Md_fdvec = nvec; /* Set new free list */ for (i = CurFd; i < Nfds; i++) { Md_fdvec[i].mdfd_nextFree = i + 1; Md_fdvec[i].mdfd_flags = MDFD_FREE; } Md_fdvec[Nfds - 1].mdfd_nextFree = -1; Md_Free = CurFd + 1; fdvec = CurFd; CurFd++; Md_fdvec[fdvec].mdfd_flags = 0; return fdvec; } /* * _fdvec_free () -- free md file descriptor vector. * */ static void _fdvec_free(int fdvec) { Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE); Assert(Md_fdvec[fdvec].mdfd_flags != MDFD_FREE); Md_fdvec[fdvec].mdfd_nextFree = Md_Free; Md_fdvec[fdvec].mdfd_flags = MDFD_FREE; Md_Free = fdvec; } static MdfdVec * _mdfd_openseg(Relation reln, int segno, int oflags) { MemoryContext oldcxt; MdfdVec *v; int fd; char *path, *fullpath; /* be sure we have enough space for the '.segno', if any */ path = relpath(RelationGetPhysicalRelationName(reln)); if (segno > 0) { fullpath = (char *) palloc(strlen(path) + 12); sprintf(fullpath, "%s.%d", path, segno); pfree(path); } else fullpath = path; /* open the file */ #ifndef __CYGWIN32__ fd = FileNameOpenFile(fullpath, O_RDWR | oflags, 0600); #else fd = FileNameOpenFile(fullpath, O_RDWR | O_BINARY | oflags, 0600); #endif pfree(fullpath); if (fd < 0) return (MdfdVec *) NULL; /* allocate an mdfdvec entry for it */ oldcxt = MemoryContextSwitchTo(MdCxt); v = (MdfdVec *) palloc(sizeof(MdfdVec)); MemoryContextSwitchTo(oldcxt); /* fill the entry */ v->mdfd_vfd = fd; v->mdfd_flags = (uint16) 0; v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); #ifndef LET_OS_MANAGE_FILESIZE v->mdfd_chain = (MdfdVec *) NULL; #ifdef DIAGNOSTIC if (v->mdfd_lstbcnt > RELSEG_SIZE) elog(FATAL, "segment too big on open!"); #endif #endif /* all done */ return v; } /* Get the fd for the relation, opening it if it's not already open */ static int _mdfd_getrelnfd(Relation reln) { int fd; fd = RelationGetFile(reln); if (fd < 0) { if ((fd = mdopen(reln)) < 0) elog(ERROR, "cannot open relation %s", RelationGetRelationName(reln)); reln->rd_fd = fd; } return fd; } /* Find the segment of the relation holding the specified block */ static MdfdVec * _mdfd_getseg(Relation reln, int blkno) { MdfdVec *v; int segno; int fd; int i; fd = _mdfd_getrelnfd(reln); #ifndef LET_OS_MANAGE_FILESIZE for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1; segno > 0; i++, segno--) { if (v->mdfd_chain == (MdfdVec *) NULL) { v->mdfd_chain = _mdfd_openseg(reln, i, O_CREAT); if (v->mdfd_chain == (MdfdVec *) NULL) elog(ERROR, "cannot open segment %d of relation %s", i, RelationGetRelationName(reln)); } v = v->mdfd_chain; } #else v = &Md_fdvec[fd]; #endif return v; } /* * Find the segment of the relation holding the specified block. * * This performs the same work as _mdfd_getseg() except that we must work * "blind" with no Relation struct. We assume that we are not likely to * touch the same relation again soon, so we do not create an FD entry for * the relation --- we just open a kernel file descriptor which will be * used and promptly closed. The return value is the kernel descriptor, * or -1 on failure. */ static int _mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid, int blkno) { char *path; int fd; #ifndef LET_OS_MANAGE_FILESIZE int segno; #endif /* construct the path to the relation */ path = relpath_blind(dbname, relname, dbid, relid); #ifndef LET_OS_MANAGE_FILESIZE /* append the '.segno', if needed */ segno = blkno / RELSEG_SIZE; if (segno > 0) { char *segpath = (char *) palloc(strlen(path) + 12); sprintf(segpath, "%s.%d", path, segno); pfree(path); path = segpath; } #endif #ifndef __CYGWIN32__ fd = open(path, O_RDWR, 0600); #else fd = open(path, O_RDWR | O_BINARY, 0600); #endif pfree(path); return fd; } static BlockNumber _mdnblocks(File file, Size blcksz) { long len; len = FileSeek(file, 0L, SEEK_END); if (len < 0) return 0; /* on failure, assume file is empty */ return (BlockNumber) (len / blcksz); }