diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 0887f3d1ec..b5eb53b03a 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.78 2000/04/09 04:43:18 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.79 2000/04/10 23:41:49 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1127,7 +1127,8 @@ BufferSync() bufHdr->blind.relname, bufdb, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); + (char *) MAKE_PTR(bufHdr->data), + true); /* must fsync */ } else { @@ -1529,7 +1530,8 @@ BufferReplace(BufferDesc *bufHdr) status = smgrblindwrt(DEFAULT_SMGR, bufHdr->blind.dbname, bufHdr->blind.relname, bufdb, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); + (char *) MAKE_PTR(bufHdr->data), + false); /* no fsync */ } #ifndef OPTIMIZE_SINGLE @@ -1544,9 +1546,11 @@ BufferReplace(BufferDesc *bufHdr) return FALSE; /* If we had marked this buffer as needing to be fsync'd, we can forget - * about that, because it's now the storage manager's responsibility. + * about that, because it's now the storage manager's responsibility + * (but only if we called smgrwrite, not smgrblindwrt). */ - ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr); + if (reln != (Relation) NULL) + ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr); BufferFlushCount++; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 233bbb0ac2..b30b0386af 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.65 2000/04/09 04:43:20 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.66 2000/04/10 23:41:51 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -48,11 +48,10 @@ typedef struct _MdfdVec { int mdfd_vfd; /* fd number in vfd pool */ - int mdfd_flags; /* free, temporary */ + int mdfd_flags; /* fd status flags */ /* these are the assigned bits in mdfd_flags: */ #define MDFD_FREE (1 << 0)/* unused entry */ -#define MDFD_TEMP (1 << 1)/* close this entry at transaction end */ int mdfd_lstbcnt; /* most recent block count */ int mdfd_nextFree; /* next free vector */ @@ -72,8 +71,8 @@ static void mdclose_fd(int fd); static int _mdfd_getrelnfd(Relation reln); static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags); static MdfdVec *_mdfd_getseg(Relation reln, int blkno); -static MdfdVec *_mdfd_blind_getseg(char *dbname, char *relname, - Oid dbid, Oid relid, int blkno); +static int _mdfd_blind_getseg(char *dbname, char *relname, + Oid dbid, Oid relid, int blkno); static int _fdvec_alloc(void); static void _fdvec_free(int); static BlockNumber _mdnblocks(File file, Size blcksz); @@ -572,7 +571,8 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer) * * We have to be able to do this using only the name and OID of * the database and relation in which the block belongs. Otherwise - * this is just like mdwrite(). + * this is much like mdwrite(). If dofsync is TRUE, then we fsync + * the file, making it more like mdflush(). */ int mdblindwrt(char *dbname, @@ -580,15 +580,16 @@ mdblindwrt(char *dbname, Oid dbid, Oid relid, BlockNumber blkno, - char *buffer) + char *buffer, + bool dofsync) { int status; long seekpos; - MdfdVec *v; + int fd; - v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); + fd = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); - if (v == NULL) + if (fd < 0) return SM_FAIL; #ifndef LET_OS_MANAGE_FILESIZE @@ -601,11 +602,22 @@ mdblindwrt(char *dbname, seekpos = (long) (BLCKSZ * (blkno)); #endif - if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) + if (lseek(fd, seekpos, SEEK_SET) != seekpos) + { + close(fd); return SM_FAIL; + } status = SM_SUCCESS; - if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) + + /* write and optionally sync the block */ + if (write(fd, buffer, BLCKSZ) != BLCKSZ) + status = SM_FAIL; + else if (dofsync && + pg_fsync(fd) < 0) + status = SM_FAIL; + + if (close(fd) < 0) status = SM_FAIL; return status; @@ -633,7 +645,8 @@ mdmarkdirty(Relation reln, BlockNumber blkno) * * We have to be able to do this using only the name and OID of * the database and relation in which the block belongs. Otherwise - * this is just like mdmarkdirty(). + * this is much like mdmarkdirty(). However, we do the fsync immediately + * rather than building md/fd datastructures to postpone it till later. */ int mdblindmarkdirty(char *dbname, @@ -642,16 +655,23 @@ mdblindmarkdirty(char *dbname, Oid relid, BlockNumber blkno) { - MdfdVec *v; + int status; + int fd; - v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); + fd = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); - if (v == NULL) + if (fd < 0) return SM_FAIL; - FileMarkDirty(v->mdfd_vfd); + status = SM_SUCCESS; - return SM_SUCCESS; + if (pg_fsync(fd) < 0) + status = SM_FAIL; + + if (close(fd) < 0) + status = SM_FAIL; + + return status; } /* @@ -820,24 +840,15 @@ mdcommit() v = &Md_fdvec[i]; if (v->mdfd_flags & MDFD_FREE) continue; - if (v->mdfd_flags & MDFD_TEMP) - { - /* Sync and close the file */ - mdclose_fd(i); - } - else - { - /* Sync, but keep the file entry */ - + /* Sync the file entry */ #ifndef LET_OS_MANAGE_FILESIZE - for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain) + for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain) #else - if (v != (MdfdVec *) NULL) + if (v != (MdfdVec *) NULL) #endif - { - if (FileSync(v->mdfd_vfd) < 0) - return SM_FAIL; - } + { + if (FileSync(v->mdfd_vfd) < 0) + return SM_FAIL; } } @@ -854,21 +865,9 @@ mdcommit() int mdabort() { - int i; - MdfdVec *v; - - for (i = 0; i < CurFd; i++) - { - v = &Md_fdvec[i]; - if (v->mdfd_flags & MDFD_FREE) - continue; - if (v->mdfd_flags & MDFD_TEMP) - { - /* Close the file */ - mdclose_fd(i); - } - } - + /* We don't actually have to do anything here. fd.c will discard + * fsync-needed bits in its AtEOXact_Files() routine. + */ return SM_SUCCESS; } @@ -1057,102 +1056,52 @@ _mdfd_getseg(Relation reln, int blkno) return v; } -/* Find the segment of the relation holding the specified block. - * This is the same as _mdfd_getseg() except that we must work - * "blind" with no Relation struct. +/* + * Find the segment of the relation holding the specified block. * - * NOTE: we have no easy way to tell whether a FD already exists for the - * target relation, so we always make a new one. This should probably - * be improved somehow, but I doubt it's a significant performance issue - * under normal circumstances. The FD is marked to be closed at end of xact - * so that we don't accumulate a lot of dead FDs. + * This performs the same work as _mdfd_getseg() except that we must work + * "blind" with no Relation struct. We assume that we are not likely to + * touch the same relation again soon, so we do not create an FD entry for + * the relation --- we just open a kernel file descriptor which will be + * used and promptly closed. The return value is the kernel descriptor, + * or -1 on failure. */ -static MdfdVec * +static int _mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid, int blkno) { - MdfdVec *v; char *path; int fd; - int vfd; #ifndef LET_OS_MANAGE_FILESIZE int segno; - int targsegno; #endif - /* construct the path to the file and open it */ + /* construct the path to the relation */ path = relpath_blind(dbname, relname, dbid, relid); -#ifndef __CYGWIN32__ - fd = FileNameOpenFile(path, O_RDWR, 0600); -#else - fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600); -#endif - - if (fd < 0) - return NULL; - - vfd = _fdvec_alloc(); - if (vfd < 0) - return NULL; - - Md_fdvec[vfd].mdfd_vfd = fd; - Md_fdvec[vfd].mdfd_flags = MDFD_TEMP; - Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); #ifndef LET_OS_MANAGE_FILESIZE - Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL; - -#ifdef DIAGNOSTIC - if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE) - elog(FATAL, "segment too big on relopen!"); -#endif - - targsegno = blkno / RELSEG_SIZE; - for (v = &Md_fdvec[vfd], segno = 1; segno <= targsegno; segno++) + /* append the '.segno', if needed */ + segno = blkno / RELSEG_SIZE; + if (segno > 0) { - char *segpath; - MdfdVec *newv; - MemoryContext oldcxt; + char *segpath = (char *) palloc(strlen(path) + 12); - segpath = (char *) palloc(strlen(path) + 12); sprintf(segpath, "%s.%d", path, segno); + pfree(path); + path = segpath; + } +#endif #ifndef __CYGWIN32__ - fd = FileNameOpenFile(segpath, O_RDWR | O_CREAT, 0600); + fd = open(path, O_RDWR, 0600); #else - fd = FileNameOpenFile(segpath, O_RDWR | O_BINARY | O_CREAT, 0600); -#endif - - pfree(segpath); - - if (fd < 0) - return (MdfdVec *) NULL; - - /* allocate an mdfdvec entry for it */ - oldcxt = MemoryContextSwitchTo(MdCxt); - newv = (MdfdVec *) palloc(sizeof(MdfdVec)); - MemoryContextSwitchTo(oldcxt); - - /* fill the entry */ - newv->mdfd_vfd = fd; - newv->mdfd_flags = MDFD_TEMP; - newv->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); - newv->mdfd_chain = (MdfdVec *) NULL; -#ifdef DIAGNOSTIC - if (newv->mdfd_lstbcnt > RELSEG_SIZE) - elog(FATAL, "segment too big on open!"); -#endif - v->mdfd_chain = newv; - v = newv; - } -#else - v = &Md_fdvec[vfd]; + fd = open(path, O_RDWR | O_BINARY, 0600); #endif pfree(path); - return v; + return fd; } static BlockNumber diff --git a/src/backend/storage/smgr/mm.c b/src/backend/storage/smgr/mm.c index fc3acead66..a5b22cbcc5 100644 --- a/src/backend/storage/smgr/mm.c +++ b/src/backend/storage/smgr/mm.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.18 2000/01/26 05:57:05 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.19 2000/04/10 23:41:51 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -478,7 +478,8 @@ mmblindwrt(char *dbstr, Oid dbid, Oid relid, BlockNumber blkno, - char *buffer) + char *buffer, + bool dofsync) { return SM_FAIL; } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 839636b118..27cad952ae 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.33 2000/04/09 04:43:20 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.34 2000/04/10 23:41:52 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -38,7 +38,8 @@ typedef struct f_smgr char *buffer); int (*smgr_blindwrt) (char *dbname, char *relname, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); + BlockNumber blkno, char *buffer, + bool dofsync); int (*smgr_markdirty) (Relation reln, BlockNumber blkno); int (*smgr_blindmarkdirty) (char *dbname, char *relname, Oid dbid, Oid relid, @@ -293,7 +294,8 @@ smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer) * this case, the buffer manager will call smgrblindwrt() with * the name and OID of the database and the relation to which the * buffer belongs. Every storage manager must be able to force - * this page down to stable storage in this circumstance. + * this page down to stable storage in this circumstance. The + * write should be synchronous if dofsync is true. */ int smgrblindwrt(int16 which, @@ -302,7 +304,8 @@ smgrblindwrt(int16 which, Oid dbid, Oid relid, BlockNumber blkno, - char *buffer) + char *buffer, + bool dofsync) { char *dbstr; char *relstr; @@ -313,7 +316,7 @@ smgrblindwrt(int16 which, relstr = pstrdup(relname); status = (*(smgrsw[which].smgr_blindwrt)) (dbstr, relstr, dbid, relid, - blkno, buffer); + blkno, buffer, dofsync); if (status == SM_FAIL) elog(ERROR, "cannot write block %d of %s [%s] blind", diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 053a63196e..bc0ec04bb2 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: smgr.h,v 1.18 2000/04/09 04:43:18 tgl Exp $ + * $Id: smgr.h,v 1.19 2000/04/10 23:41:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -37,7 +37,8 @@ extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer); extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); + BlockNumber blkno, char *buffer, + bool dofsync); extern int smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno); extern int smgrblindmarkdirty(int16 which, char *dbname, char *relname, Oid dbid, Oid relid, @@ -62,7 +63,8 @@ extern int mdread(Relation reln, BlockNumber blocknum, char *buffer); extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer); extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer); extern int mdblindwrt(char *dbname, char *relname, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); + BlockNumber blkno, char *buffer, + bool dofsync); extern int mdmarkdirty(Relation reln, BlockNumber blkno); extern int mdblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid, BlockNumber blkno); @@ -84,7 +86,8 @@ extern int mmread(Relation reln, BlockNumber blocknum, char *buffer); extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer); extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer); extern int mmblindwrt(char *dbname, char *relname, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); + BlockNumber blkno, char *buffer, + bool dofsync); extern int mmmarkdirty(Relation reln, BlockNumber blkno); extern int mmblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid, BlockNumber blkno);