diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 844b9d1b0e..0dcf609f19 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -59,8 +60,18 @@ #define PGARCH_RESTART_INTERVAL 10 /* How often to attempt to restart a * failed archiver; in seconds. */ +/* + * Maximum number of retries allowed when attempting to archive a WAL + * file. + */ #define NUM_ARCHIVE_RETRIES 3 +/* + * Maximum number of retries allowed when attempting to remove an + * orphan archive status file. + */ +#define NUM_ORPHAN_CLEANUP_RETRIES 3 + /* ---------- * Local data @@ -424,9 +435,13 @@ pgarch_ArchiverCopyLoop(void) while (pgarch_readyXlog(xlog)) { int failures = 0; + int failures_orphan = 0; for (;;) { + struct stat stat_buf; + char pathname[MAXPGPATH]; + /* * Do not initiate any more archive commands after receiving * SIGTERM, nor after the postmaster has died unexpectedly. The @@ -456,6 +471,46 @@ pgarch_ArchiverCopyLoop(void) return; } + /* + * Since archive status files are not removed in a durable manner, + * a system crash could leave behind .ready files for WAL segments + * that have already been recycled or removed. In this case, + * simply remove the orphan status file and move on. unlink() is + * used here as even on subsequent crashes the same orphan files + * would get removed, so there is no need to worry about + * durability. + */ + snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog); + if (stat(pathname, &stat_buf) != 0 && errno == ENOENT) + { + char xlogready[MAXPGPATH]; + + StatusFilePath(xlogready, xlog, ".ready"); + if (unlink(xlogready) == 0) + { + ereport(WARNING, + (errmsg("removed orphan archive status file \"%s\"", + xlogready))); + + /* leave loop and move to the next status file */ + break; + } + + if (++failures_orphan >= NUM_ORPHAN_CLEANUP_RETRIES) + { + ereport(WARNING, + (errmsg("removal of orphan archive status file \"%s\" failed too many times, will try again later", + xlogready))); + + /* give up cleanup of orphan status files */ + return; + } + + /* wait a bit before retrying */ + pg_usleep(1000000L); + continue; + } + if (pgarch_archiveXlog(xlog)) { /* successful */