diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 7d19bf9f42..2665025336 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -2,7 +2,7 @@ * * bgwriter.c * - * The background writer (bgwriter) is new in Postgres 8.0. It attempts + * The background writer (bgwriter) is new as of Postgres 8.0. It attempts * to keep regular backends from having to write out dirty shared buffers * (which they would only do when needing to free a shared buffer to read in * another page). In the best scenario all writes from shared buffers will @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.34 2007/01/05 22:19:36 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.35 2007/01/17 00:17:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -103,8 +103,8 @@ typedef struct { RelFileNode rnode; - BlockNumber segno; - /* might add a request-type field later */ + BlockNumber segno; /* InvalidBlockNumber means "revoke" */ + /* might add a real request-type field later; not needed yet */ } BgWriterRequest; typedef struct @@ -695,6 +695,11 @@ RequestCheckpoint(bool waitforit, bool warnontime) * ForwardFsyncRequest * Forward a file-fsync request from a backend to the bgwriter * + * segno specifies which segment (not block!) of the relation needs to be + * fsync'd. If segno == InvalidBlockNumber, the meaning is to revoke any + * pending fsync requests for the entire relation (this message is sent + * when the relation is about to be deleted). + * * Whenever a backend is compelled to write directly to a relation * (which should be seldom, if the bgwriter is getting its job done), * the backend calls this routine to pass over knowledge that the relation diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 27a7c3d19a..e3cc00ca1a 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.125 2007/01/05 22:19:38 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.126 2007/01/17 00:17:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -31,6 +31,19 @@ /* interval for calling AbsorbFsyncRequests in mdsync */ #define FSYNCS_PER_ABSORB 10 +/* + * On Windows, we have to interpret EACCES as possibly meaning the same as + * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform, + * that's what you get. Ugh. This code is designed so that we don't + * actually believe these cases are okay without further evidence (namely, + * a pending fsync request getting revoked ... see mdsync). + */ +#ifndef WIN32 +#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT) +#else +#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES) +#endif + /* * The magnetic disk storage manager keeps track of open file * descriptors in its own descriptor pool. This is done to make it @@ -93,9 +106,8 @@ static MemoryContext MdCxt; /* context for all md.c allocations */ * we keep track of pending fsync operations: we need to remember all relation * segments that have been written since the last checkpoint, so that we can * fsync them down to disk before completing the next checkpoint. This hash - * table remembers the pending operations. We use a hash table not because - * we want to look up individual operations, but simply as a convenient way - * of eliminating duplicate requests. + * table remembers the pending operations. We use a hash table mostly as + * a convenient way of eliminating duplicate requests. * * (Regular backends do not track pending operations locally, but forward * them to the bgwriter.) @@ -104,6 +116,12 @@ typedef struct { RelFileNode rnode; /* the targeted relation */ BlockNumber segno; /* which segment */ +} PendingOperationTag; + +typedef struct +{ + PendingOperationTag tag; /* hash table key (must be first!) */ + int failures; /* number of failed attempts to fsync */ } PendingOperationEntry; static HTAB *pendingOpsTable = NULL; @@ -153,7 +171,7 @@ mdinit(void) HASHCTL hash_ctl; MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(PendingOperationEntry); + hash_ctl.keysize = sizeof(PendingOperationTag); hash_ctl.entrysize = sizeof(PendingOperationEntry); hash_ctl.hash = tag_hash; hash_ctl.hcxt = MdCxt; @@ -236,6 +254,35 @@ mdunlink(RelFileNode rnode, bool isRedo) { char *path; + /* + * We have to clean out any pending fsync requests for the doomed relation, + * else the next mdsync() will fail. + */ + if (pendingOpsTable) + { + /* standalone backend or startup process: fsync state is local */ + RememberFsyncRequest(rnode, InvalidBlockNumber); + } + else if (IsUnderPostmaster) + { + /* + * Notify the bgwriter about it. If we fail to queue the revoke + * message, we have to sleep and try again ... ugly, but hopefully + * won't happen often. + * + * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with + * an error would leave the no-longer-used file still present on + * disk, which would be bad, so I'm inclined to assume that the + * bgwriter will always empty the queue soon. + */ + while (!ForwardFsyncRequest(rnode, InvalidBlockNumber)) + pg_usleep(10000L); /* 10 msec seems a good number */ + /* + * Note we don't wait for the bgwriter to actually absorb the + * revoke message; see mdsync() for the implications. + */ + } + path = relpath(rnode); /* Delete the first segment, or only segment if not doing segmenting */ @@ -414,7 +461,8 @@ mdopen(SMgrRelation reln, ExtensionBehavior behavior) if (fd < 0) { pfree(path); - if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT) + if (behavior == EXTENSION_RETURN_NULL && + FILE_POSSIBLY_DELETED(errno)) return NULL; ereport(ERROR, (errcode_for_file_access(), @@ -834,94 +882,126 @@ mdimmedsync(SMgrRelation reln) void mdsync(void) { - HASH_SEQ_STATUS hstat; - PendingOperationEntry *entry; - int absorb_counter; + bool need_retry; if (!pendingOpsTable) elog(ERROR, "cannot sync without a pendingOpsTable"); /* - * If we are in the bgwriter, the sync had better include all fsync - * requests that were queued by backends before the checkpoint REDO point - * was determined. We go that a little better by accepting all requests - * queued up to the point where we start fsync'ing. + * The fsync table could contain requests to fsync relations that have + * been deleted (unlinked) by the time we get to them. Rather than + * just hoping an ENOENT (or EACCES on Windows) error can be ignored, + * what we will do is retry the whole process after absorbing fsync + * request messages again. Since mdunlink() queues a "revoke" message + * before actually unlinking, the fsync request is guaranteed to be gone + * the second time if it really was this case. */ - AbsorbFsyncRequests(); + do { + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; + int absorb_counter; + + need_retry = false; - absorb_counter = FSYNCS_PER_ABSORB; - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) - { /* - * If fsync is off then we don't have to bother opening the file at - * all. (We delay checking until this point so that changing fsync on - * the fly behaves sensibly.) + * If we are in the bgwriter, the sync had better include all fsync + * requests that were queued by backends before the checkpoint REDO + * point was determined. We go that a little better by accepting all + * requests queued up to the point where we start fsync'ing. */ - if (enableFsync) + AbsorbFsyncRequests(); + + absorb_counter = FSYNCS_PER_ABSORB; + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { - SMgrRelation reln; - MdfdVec *seg; - /* - * If in bgwriter, absorb pending requests every so often to - * prevent overflow of the fsync request queue. The hashtable - * code does not specify whether entries added by this will be - * visited by our search, but we don't really care: it's OK if we - * do, and OK if we don't. + * If fsync is off then we don't have to bother opening the file + * at all. (We delay checking until this point so that changing + * fsync on the fly behaves sensibly.) */ - if (--absorb_counter <= 0) + if (enableFsync) { - AbsorbFsyncRequests(); - absorb_counter = FSYNCS_PER_ABSORB; + SMgrRelation reln; + MdfdVec *seg; + + /* + * If in bgwriter, we want to absorb pending requests every so + * often to prevent overflow of the fsync request queue. This + * could result in deleting the current entry out from under + * our hashtable scan, so the procedure is to fall out of the + * scan and start over from the top of the function. + */ + if (--absorb_counter <= 0) + { + need_retry = true; + break; + } + + /* + * Find or create an smgr hash entry for this relation. This + * may seem a bit unclean -- md calling smgr? But it's really + * the best solution. It ensures that the open file reference + * isn't permanently leaked if we get an error here. (You may + * say "but an unreferenced SMgrRelation is still a leak!" Not + * really, because the only case in which a checkpoint is done + * by a process that isn't about to shut down is in the + * bgwriter, and it will periodically do smgrcloseall(). This + * fact justifies our not closing the reln in the success path + * either, which is a good thing since in non-bgwriter cases + * we couldn't safely do that.) Furthermore, in many cases + * the relation will have been dirtied through this same smgr + * relation, and so we can save a file open/close cycle. + */ + reln = smgropen(entry->tag.rnode); + + /* + * It is possible that the relation has been dropped or + * truncated since the fsync request was entered. Therefore, + * allow ENOENT, but only if we didn't fail once already on + * this file. This applies both during _mdfd_getseg() and + * during FileSync, since fd.c might have closed the file + * behind our back. + */ + seg = _mdfd_getseg(reln, + entry->tag.segno * ((BlockNumber) RELSEG_SIZE), + false, EXTENSION_RETURN_NULL); + if (seg == NULL || + FileSync(seg->mdfd_vfd) < 0) + { + /* + * XXX is there any point in allowing more than one try? + * Don't see one at the moment, but easy to change the + * test here if so. + */ + if (!FILE_POSSIBLY_DELETED(errno) || + ++(entry->failures) > 1) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync segment %u of relation %u/%u/%u: %m", + entry->tag.segno, + entry->tag.rnode.spcNode, + entry->tag.rnode.dbNode, + entry->tag.rnode.relNode))); + else + ereport(DEBUG1, + (errcode_for_file_access(), + errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m", + entry->tag.segno, + entry->tag.rnode.spcNode, + entry->tag.rnode.dbNode, + entry->tag.rnode.relNode))); + need_retry = true; + continue; /* don't delete the hashtable entry */ + } } - /* - * Find or create an smgr hash entry for this relation. This may - * seem a bit unclean -- md calling smgr? But it's really the - * best solution. It ensures that the open file reference isn't - * permanently leaked if we get an error here. (You may say "but - * an unreferenced SMgrRelation is still a leak!" Not really, - * because the only case in which a checkpoint is done by a - * process that isn't about to shut down is in the bgwriter, and - * it will periodically do smgrcloseall(). This fact justifies - * our not closing the reln in the success path either, which is a - * good thing since in non-bgwriter cases we couldn't safely do - * that.) Furthermore, in many cases the relation will have been - * dirtied through this same smgr relation, and so we can save a - * file open/close cycle. - */ - reln = smgropen(entry->rnode); - - /* - * It is possible that the relation has been dropped or truncated - * since the fsync request was entered. Therefore, we have to - * allow file-not-found errors. This applies both during - * _mdfd_getseg() and during FileSync, since fd.c might have - * closed the file behind our back. - */ - seg = _mdfd_getseg(reln, - entry->segno * ((BlockNumber) RELSEG_SIZE), - false, EXTENSION_RETURN_NULL); - if (seg) - { - if (FileSync(seg->mdfd_vfd) < 0 && - errno != ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync segment %u of relation %u/%u/%u: %m", - entry->segno, - entry->rnode.spcNode, - entry->rnode.dbNode, - entry->rnode.relNode))); - } + /* Okay, delete this entry */ + if (hash_search(pendingOpsTable, &entry->tag, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOpsTable corrupted"); } - - /* Okay, delete this entry */ - if (hash_search(pendingOpsTable, entry, - HASH_REMOVE, NULL) == NULL) - elog(ERROR, "pendingOpsTable corrupted"); - } + } while (need_retry); } /* @@ -938,14 +1018,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg) { if (pendingOpsTable) { - PendingOperationEntry entry; - - /* ensure any pad bytes in the struct are zeroed */ - MemSet(&entry, 0, sizeof(entry)); - entry.rnode = reln->smgr_rnode; - entry.segno = seg->mdfd_segno; - - (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL); + /* push it into local pending-ops table */ + RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno); } else { @@ -968,20 +1042,55 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg) * * We stuff the fsync request into the local hash table for execution * during the bgwriter's next checkpoint. + * + * segno == InvalidBlockNumber is a "revoke" request: remove any pending + * fsync requests for the whole relation. */ void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno) { - PendingOperationEntry entry; - Assert(pendingOpsTable); - /* ensure any pad bytes in the struct are zeroed */ - MemSet(&entry, 0, sizeof(entry)); - entry.rnode = rnode; - entry.segno = segno; + if (segno != InvalidBlockNumber) + { + /* Enter a request to fsync this segment */ + PendingOperationTag key; + PendingOperationEntry *entry; + bool found; - (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL); + /* ensure any pad bytes in the hash key are zeroed */ + MemSet(&key, 0, sizeof(key)); + key.rnode = rnode; + key.segno = segno; + + entry = (PendingOperationEntry *) hash_search(pendingOpsTable, + &key, + HASH_ENTER, + &found); + if (!found) /* new entry, so initialize it */ + entry->failures = 0; + } + else + { + /* + * Remove any pending requests for the entire relation. (This is a + * tad slow but it doesn't seem worth rethinking the table structure.) + */ + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; + + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + { + if (RelFileNodeEquals(entry->tag.rnode, rnode)) + { + /* Okay, delete this entry */ + if (hash_search(pendingOpsTable, &entry->tag, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOpsTable corrupted"); + } + } + } } /* @@ -1102,7 +1211,8 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp, } if (v->mdfd_chain == NULL) { - if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT) + if (behavior == EXTENSION_RETURN_NULL && + FILE_POSSIBLY_DELETED(errno)) return NULL; ereport(ERROR, (errcode_for_file_access(),