diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 7d19bf9f42..2665025336 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -2,7 +2,7 @@
  *
  * bgwriter.c
  *
- * The background writer (bgwriter) is new in Postgres 8.0.  It attempts
+ * The background writer (bgwriter) is new as of Postgres 8.0.  It attempts
  * to keep regular backends from having to write out dirty shared buffers
  * (which they would only do when needing to free a shared buffer to read in
  * another page).  In the best scenario all writes from shared buffers will
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.34 2007/01/05 22:19:36 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.35 2007/01/17 00:17:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -103,8 +103,8 @@
 typedef struct
 {
 	RelFileNode rnode;
-	BlockNumber segno;
-	/* might add a request-type field later */
+	BlockNumber segno;			/* InvalidBlockNumber means "revoke" */
+	/* might add a real request-type field later; not needed yet */
 } BgWriterRequest;
 
 typedef struct
@@ -695,6 +695,11 @@ RequestCheckpoint(bool waitforit, bool warnontime)
  * ForwardFsyncRequest
  *		Forward a file-fsync request from a backend to the bgwriter
  *
+ * segno specifies which segment (not block!) of the relation needs to be
+ * fsync'd.  If segno == InvalidBlockNumber, the meaning is to revoke any
+ * pending fsync requests for the entire relation (this message is sent
+ * when the relation is about to be deleted).
+ *
  * Whenever a backend is compelled to write directly to a relation
  * (which should be seldom, if the bgwriter is getting its job done),
  * the backend calls this routine to pass over knowledge that the relation
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 27a7c3d19a..e3cc00ca1a 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.125 2007/01/05 22:19:38 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.126 2007/01/17 00:17:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -31,6 +31,19 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB		10
 
+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get.  Ugh.  This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting revoked ... see mdsync).
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
+#endif
+
 /*
  *	The magnetic disk storage manager keeps track of open file
  *	descriptors in its own descriptor pool.  This is done to make it
@@ -93,9 +106,8 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
  * we keep track of pending fsync operations: we need to remember all relation
  * segments that have been written since the last checkpoint, so that we can
  * fsync them down to disk before completing the next checkpoint.  This hash
- * table remembers the pending operations.	We use a hash table not because
- * we want to look up individual operations, but simply as a convenient way
- * of eliminating duplicate requests.
+ * table remembers the pending operations.	We use a hash table mostly as
+ * a convenient way of eliminating duplicate requests.
  *
  * (Regular backends do not track pending operations locally, but forward
  * them to the bgwriter.)
@@ -104,6 +116,12 @@ typedef struct
 {
 	RelFileNode rnode;			/* the targeted relation */
 	BlockNumber segno;			/* which segment */
+} PendingOperationTag;
+
+typedef struct
+{
+	PendingOperationTag tag;	/* hash table key (must be first!) */
+	int			failures;		/* number of failed attempts to fsync */
 } PendingOperationEntry;
 
 static HTAB *pendingOpsTable = NULL;
@@ -153,7 +171,7 @@ mdinit(void)
 		HASHCTL		hash_ctl;
 
 		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(PendingOperationEntry);
+		hash_ctl.keysize = sizeof(PendingOperationTag);
 		hash_ctl.entrysize = sizeof(PendingOperationEntry);
 		hash_ctl.hash = tag_hash;
 		hash_ctl.hcxt = MdCxt;
@@ -236,6 +254,35 @@ mdunlink(RelFileNode rnode, bool isRedo)
 {
 	char	   *path;
 
+	/*
+	 * We have to clean out any pending fsync requests for the doomed relation,
+	 * else the next mdsync() will fail.
+	 */
+	if (pendingOpsTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(rnode, InvalidBlockNumber);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/*
+		 * Notify the bgwriter about it.  If we fail to queue the revoke
+		 * message, we have to sleep and try again ... ugly, but hopefully
+		 * won't happen often.
+		 *
+		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
+		 * an error would leave the no-longer-used file still present on
+		 * disk, which would be bad, so I'm inclined to assume that the
+		 * bgwriter will always empty the queue soon.
+		 */
+		while (!ForwardFsyncRequest(rnode, InvalidBlockNumber))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+		/*
+		 * Note we don't wait for the bgwriter to actually absorb the
+		 * revoke message; see mdsync() for the implications.
+		 */
+	}
+
 	path = relpath(rnode);
 
 	/* Delete the first segment, or only segment if not doing segmenting */
@@ -414,7 +461,8 @@ mdopen(SMgrRelation reln, ExtensionBehavior behavior)
 		if (fd < 0)
 		{
 			pfree(path);
-			if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
+			if (behavior == EXTENSION_RETURN_NULL &&
+				FILE_POSSIBLY_DELETED(errno))
 				return NULL;
 			ereport(ERROR,
 					(errcode_for_file_access(),
@@ -834,94 +882,126 @@ mdimmedsync(SMgrRelation reln)
 void
 mdsync(void)
 {
-	HASH_SEQ_STATUS hstat;
-	PendingOperationEntry *entry;
-	int			absorb_counter;
+	bool		need_retry;
 
 	if (!pendingOpsTable)
 		elog(ERROR, "cannot sync without a pendingOpsTable");
 
 	/*
-	 * If we are in the bgwriter, the sync had better include all fsync
-	 * requests that were queued by backends before the checkpoint REDO point
-	 * was determined.	We go that a little better by accepting all requests
-	 * queued up to the point where we start fsync'ing.
+	 * The fsync table could contain requests to fsync relations that have
+	 * been deleted (unlinked) by the time we get to them.  Rather than
+	 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
+	 * what we will do is retry the whole process after absorbing fsync
+	 * request messages again.  Since mdunlink() queues a "revoke" message
+	 * before actually unlinking, the fsync request is guaranteed to be gone
+	 * the second time if it really was this case.
 	 */
-	AbsorbFsyncRequests();
+	do {
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
+		int			absorb_counter;
+
+		need_retry = false;
 
-	absorb_counter = FSYNCS_PER_ABSORB;
-	hash_seq_init(&hstat, pendingOpsTable);
-	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-	{
 		/*
-		 * If fsync is off then we don't have to bother opening the file at
-		 * all.  (We delay checking until this point so that changing fsync on
-		 * the fly behaves sensibly.)
+		 * If we are in the bgwriter, the sync had better include all fsync
+		 * requests that were queued by backends before the checkpoint REDO
+		 * point was determined. We go that a little better by accepting all
+		 * requests queued up to the point where we start fsync'ing.
 		 */
-		if (enableFsync)
+		AbsorbFsyncRequests();
+
+		absorb_counter = FSYNCS_PER_ABSORB;
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 		{
-			SMgrRelation reln;
-			MdfdVec    *seg;
-
 			/*
-			 * If in bgwriter, absorb pending requests every so often to
-			 * prevent overflow of the fsync request queue.  The hashtable
-			 * code does not specify whether entries added by this will be
-			 * visited by our search, but we don't really care: it's OK if we
-			 * do, and OK if we don't.
+			 * If fsync is off then we don't have to bother opening the file
+			 * at all.  (We delay checking until this point so that changing
+			 * fsync on the fly behaves sensibly.)
 			 */
-			if (--absorb_counter <= 0)
+			if (enableFsync)
 			{
-				AbsorbFsyncRequests();
-				absorb_counter = FSYNCS_PER_ABSORB;
+				SMgrRelation reln;
+				MdfdVec    *seg;
+
+				/*
+				 * If in bgwriter, we want to absorb pending requests every so
+				 * often to prevent overflow of the fsync request queue.  This
+				 * could result in deleting the current entry out from under
+				 * our hashtable scan, so the procedure is to fall out of the
+				 * scan and start over from the top of the function.
+				 */
+				if (--absorb_counter <= 0)
+				{
+					need_retry = true;
+					break;
+				}
+
+				/*
+				 * Find or create an smgr hash entry for this relation. This
+				 * may seem a bit unclean -- md calling smgr?  But it's really
+				 * the best solution.  It ensures that the open file reference
+				 * isn't permanently leaked if we get an error here. (You may
+				 * say "but an unreferenced SMgrRelation is still a leak!" Not
+				 * really, because the only case in which a checkpoint is done
+				 * by a process that isn't about to shut down is in the
+				 * bgwriter, and it will periodically do smgrcloseall(). This
+				 * fact justifies our not closing the reln in the success path
+				 * either, which is a good thing since in non-bgwriter cases
+				 * we couldn't safely do that.)  Furthermore, in many cases
+				 * the relation will have been dirtied through this same smgr
+				 * relation, and so we can save a file open/close cycle.
+				 */
+				reln = smgropen(entry->tag.rnode);
+
+				/*
+				 * It is possible that the relation has been dropped or
+				 * truncated since the fsync request was entered.  Therefore,
+				 * allow ENOENT, but only if we didn't fail once already on
+				 * this file.  This applies both during _mdfd_getseg() and
+				 * during FileSync, since fd.c might have closed the file
+				 * behind our back.
+				 */
+				seg = _mdfd_getseg(reln,
+								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
+								   false, EXTENSION_RETURN_NULL);
+				if (seg == NULL ||
+					FileSync(seg->mdfd_vfd) < 0)
+				{
+					/*
+					 * XXX is there any point in allowing more than one try?
+					 * Don't see one at the moment, but easy to change the
+					 * test here if so.
+					 */
+					if (!FILE_POSSIBLY_DELETED(errno) ||
+						++(entry->failures) > 1)
+						ereport(ERROR,
+								(errcode_for_file_access(),
+								 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+										entry->tag.segno,
+										entry->tag.rnode.spcNode,
+										entry->tag.rnode.dbNode,
+										entry->tag.rnode.relNode)));
+					else
+						ereport(DEBUG1,
+								(errcode_for_file_access(),
+								 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
+										entry->tag.segno,
+										entry->tag.rnode.spcNode,
+										entry->tag.rnode.dbNode,
+										entry->tag.rnode.relNode)));
+					need_retry = true;
+					continue;	/* don't delete the hashtable entry */
+				}
 			}
 
-			/*
-			 * Find or create an smgr hash entry for this relation. This may
-			 * seem a bit unclean -- md calling smgr?  But it's really the
-			 * best solution.  It ensures that the open file reference isn't
-			 * permanently leaked if we get an error here. (You may say "but
-			 * an unreferenced SMgrRelation is still a leak!" Not really,
-			 * because the only case in which a checkpoint is done by a
-			 * process that isn't about to shut down is in the bgwriter, and
-			 * it will periodically do smgrcloseall().	This fact justifies
-			 * our not closing the reln in the success path either, which is a
-			 * good thing since in non-bgwriter cases we couldn't safely do
-			 * that.)  Furthermore, in many cases the relation will have been
-			 * dirtied through this same smgr relation, and so we can save a
-			 * file open/close cycle.
-			 */
-			reln = smgropen(entry->rnode);
-
-			/*
-			 * It is possible that the relation has been dropped or truncated
-			 * since the fsync request was entered.  Therefore, we have to
-			 * allow file-not-found errors.  This applies both during
-			 * _mdfd_getseg() and during FileSync, since fd.c might have
-			 * closed the file behind our back.
-			 */
-			seg = _mdfd_getseg(reln,
-							   entry->segno * ((BlockNumber) RELSEG_SIZE),
-							   false, EXTENSION_RETURN_NULL);
-			if (seg)
-			{
-				if (FileSync(seg->mdfd_vfd) < 0 &&
-					errno != ENOENT)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
-									entry->segno,
-									entry->rnode.spcNode,
-									entry->rnode.dbNode,
-									entry->rnode.relNode)));
-			}
+			/* Okay, delete this entry */
+			if (hash_search(pendingOpsTable, &entry->tag,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "pendingOpsTable corrupted");
 		}
-
-		/* Okay, delete this entry */
-		if (hash_search(pendingOpsTable, entry,
-						HASH_REMOVE, NULL) == NULL)
-			elog(ERROR, "pendingOpsTable corrupted");
-	}
+	} while (need_retry);
 }
 
 /*
@@ -938,14 +1018,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 {
 	if (pendingOpsTable)
 	{
-		PendingOperationEntry entry;
-
-		/* ensure any pad bytes in the struct are zeroed */
-		MemSet(&entry, 0, sizeof(entry));
-		entry.rnode = reln->smgr_rnode;
-		entry.segno = seg->mdfd_segno;
-
-		(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+		/* push it into local pending-ops table */
+		RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
 	}
 	else
 	{
@@ -968,20 +1042,55 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
  *
  * We stuff the fsync request into the local hash table for execution
  * during the bgwriter's next checkpoint.
+ *
+ * segno == InvalidBlockNumber is a "revoke" request: remove any pending
+ * fsync requests for the whole relation.
  */
 void
 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 {
-	PendingOperationEntry entry;
-
 	Assert(pendingOpsTable);
 
-	/* ensure any pad bytes in the struct are zeroed */
-	MemSet(&entry, 0, sizeof(entry));
-	entry.rnode = rnode;
-	entry.segno = segno;
+	if (segno != InvalidBlockNumber)
+	{
+		/* Enter a request to fsync this segment */
+		PendingOperationTag key;
+		PendingOperationEntry *entry;
+		bool		found;
 
-	(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+		/* ensure any pad bytes in the hash key are zeroed */
+		MemSet(&key, 0, sizeof(key));
+		key.rnode = rnode;
+		key.segno = segno;
+
+		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
+													  &key,
+													  HASH_ENTER,
+													  &found);
+		if (!found)				/* new entry, so initialize it */
+			entry->failures = 0;
+	}
+	else
+	{
+		/*
+		 * Remove any pending requests for the entire relation.  (This is a
+		 * tad slow but it doesn't seem worth rethinking the table structure.)
+		 */
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
+
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (RelFileNodeEquals(entry->tag.rnode, rnode))
+			{
+				/* Okay, delete this entry */
+				if (hash_search(pendingOpsTable, &entry->tag,
+								HASH_REMOVE, NULL) == NULL)
+					elog(ERROR, "pendingOpsTable corrupted");
+			}
+		}
+	}
 }
 
 /*
@@ -1102,7 +1211,8 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
 			}
 			if (v->mdfd_chain == NULL)
 			{
-				if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
+				if (behavior == EXTENSION_RETURN_NULL &&
+					FILE_POSSIBLY_DELETED(errno))
 					return NULL;
 				ereport(ERROR,
 						(errcode_for_file_access(),