Don't error out if recycling or removing an old WAL segment fails at the end

of checkpoint. Although the checkpoint has been written to WAL at that point
already, so that all data is safe, and we'll retry removing the WAL segment at
the next checkpoint, if such a failure persists we won't be able to remove any
other old WAL segments either and will eventually run out of disk space. It's
better to treat the failure as non-fatal, and move on to clean any other WAL
segment and continue with any other end-of-checkpoint cleanup.

We don't normally expect any such failures, but on Windows it can happen with
some anti-virus or backup software that lock files without FILE_SHARE_DELETE
flag.

Also, the loop in pgrename() to retry when the file is locked was broken. If a
file is locked on Windows, you get ERROR_SHARE_VIOLATION, not
ERROR_ACCESS_DENIED, at least on modern versions. Fix that, although I left
the check for ERROR_ACCESS_DENIED in there as well (presumably it was correct
in some environment), and added ERROR_LOCK_VIOLATION to be consistent with
similar checks in pgwin32_open(). Reduce the timeout on the loop from 30s to
10s, on the grounds that since it's been broken, we've effectively had a
timeout of 0s and no-one has complained, so a smaller timeout is actually
closer to the old behavior. A longer timeout would mean that if recycling a
WAL file fails because it's locked for some reason, InstallXLogFileSegment()
will hold ControlFileLock for longer, potentially blocking other backends, so
a long timeout isn't totally harmless.

While we're at it, set errno correctly in pgrename().

Backpatch to 8.2, which is the oldest version supported on Windows. The xlog.c
changes would make sense on other platforms and thus on older versions as
well, but since there's no such locking issues on other platforms, it's not
worth it.
This commit is contained in:
Heikki Linnakangas 2009-09-13 18:32:08 +00:00
parent d6119d8091
commit 7f2a10fecd
2 changed files with 55 additions and 35 deletions

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.352 2009/09/10 09:42:10 heikki Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.353 2009/09/13 18:32:07 heikki Exp $
*
*-------------------------------------------------------------------------
*/
@ -2262,12 +2262,14 @@ XLogFileInit(uint32 log, uint32 seg,
*use_existent, &max_advance,
use_lock))
{
/* No need for any more future segments... */
/*
* No need for any more future segments, or InstallXLogFileSegment()
* failed to rename the file into place. If the rename failed, opening
* the file below will fail.
*/
unlink(tmppath);
}
elog(DEBUG2, "done creating and filling new WAL file");
/* Set flag to tell caller there was no existent file */
*use_existent = false;
@ -2280,6 +2282,8 @@ XLogFileInit(uint32 log, uint32 seg,
errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
path, log, seg)));
elog(DEBUG2, "done creating and filling new WAL file");
return fd;
}
@ -2409,10 +2413,9 @@ XLogFileCopy(uint32 log, uint32 seg,
* place. This should be TRUE except during bootstrap log creation. The
* caller must *not* hold the lock at call.
*
* Returns TRUE if file installed, FALSE if not installed because of
* exceeding max_advance limit. On Windows, we also return FALSE if we
* can't rename the file into place because someone's got it open.
* (Any other kind of failure causes ereport().)
* Returns TRUE if the file was installed successfully. FALSE indicates that
* max_advance limit was exceeded, or an error occurred while renaming the
* file into place.
*/
static bool
InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
@ -2460,31 +2463,26 @@ InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
*/
#if HAVE_WORKING_LINK
if (link(tmppath, path) < 0)
ereport(ERROR,
{
if (use_lock)
LWLockRelease(ControlFileLock);
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
tmppath, path, *log, *seg)));
return false;
}
unlink(tmppath);
#else
if (rename(tmppath, path) < 0)
{
#ifdef WIN32
#if !defined(__CYGWIN__)
if (GetLastError() == ERROR_ACCESS_DENIED)
#else
if (errno == EACCES)
#endif
{
if (use_lock)
LWLockRelease(ControlFileLock);
return false;
}
#endif /* WIN32 */
ereport(ERROR,
if (use_lock)
LWLockRelease(ControlFileLock);
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
tmppath, path, *log, *seg)));
return false;
}
#endif
@ -3128,19 +3126,25 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
*/
snprintf(newpath, MAXPGPATH, "%s.deleted", path);
if (rename(path, newpath) != 0)
ereport(ERROR,
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not rename old transaction log file \"%s\"",
errmsg("could not rename old transaction log file \"%s\": %m",
path)));
continue;
}
rc = unlink(newpath);
#else
rc = unlink(path);
#endif
if (rc != 0)
ereport(ERROR,
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not remove old transaction log file \"%s\": %m",
path)));
continue;
}
CheckpointStats.ckpt_segs_removed++;
}

View File

@ -10,7 +10,7 @@
* Win32 (NT4 and newer).
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/port/dirmod.c,v 1.58 2009/06/11 14:49:15 momjian Exp $
* $PostgreSQL: pgsql/src/port/dirmod.c,v 1.59 2009/09/13 18:32:08 heikki Exp $
*
*-------------------------------------------------------------------------
*/
@ -120,7 +120,8 @@ pgrename(const char *from, const char *to)
* We need to loop because even though PostgreSQL uses flags that allow
* rename while the file is open, other applications might have the file
* open without those flags. However, we won't wait indefinitely for
* someone else to close the file.
* someone else to close the file, as the caller might be holding locks
* and blocking other backends.
*/
#if defined(WIN32) && !defined(__CYGWIN__)
while (!MoveFileEx(from, to, MOVEFILE_REPLACE_EXISTING))
@ -129,13 +130,28 @@ pgrename(const char *from, const char *to)
#endif
{
#if defined(WIN32) && !defined(__CYGWIN__)
if (GetLastError() != ERROR_ACCESS_DENIED)
DWORD err = GetLastError();
_dosmaperr(err);
/*
* Modern NT-based Windows versions return ERROR_SHARING_VIOLATION
* if another process has the file open without FILE_SHARE_DELETE.
* ERROR_LOCK_VIOLATION has also been seen with some anti-virus
* software. This used to check for just ERROR_ACCESS_DENIED, so
* presumably you can get that too with some OS versions. We don't
* expect real permission errors where we currently use rename().
*/
if (err != ERROR_ACCESS_DENIED &&
err != ERROR_SHARING_VIOLATION &&
err != ERROR_LOCK_VIOLATION)
return -1;
#else
if (errno != EACCES)
#endif
/* set errno? */
return -1;
if (++loops > 300) /* time out after 30 sec */
#endif
if (++loops > 100) /* time out after 10 sec */
return -1;
pg_usleep(100000); /* us */
}
@ -155,14 +171,14 @@ pgunlink(const char *path)
* We need to loop because even though PostgreSQL uses flags that allow
* unlink while the file is open, other applications might have the file
* open without those flags. However, we won't wait indefinitely for
* someone else to close the file.
* someone else to close the file, as the caller might be holding locks
* and blocking other backends.
*/
while (unlink(path))
{
if (errno != EACCES)
/* set errno? */
return -1;
if (++loops > 300) /* time out after 30 sec */
if (++loops > 100) /* time out after 10 sec */
return -1;
pg_usleep(100000); /* us */
}