diff --git a/changelog/unreleased/issue-4627 b/changelog/unreleased/issue-4627
index 95c9d4bc4..6f6a00098 100644
--- a/changelog/unreleased/issue-4627
+++ b/changelog/unreleased/issue-4627
@@ -9,7 +9,9 @@ downloading are now forcibly interrupted. This ensures that stuck requests are
 retried after a short timeout.
 
 Attempts to access a missing file or a truncated file will no longer be retried.
-This avoids unnecessary retries in those cases.
+This avoids unnecessary retries in those cases. All other backend requests are
+retried for up to 15 minutes. This ensures that temporarily interrupted network
+connections can be tolerated.
 
 If a download yields a corrupt file or blob, then the download will be
 retried once.
@@ -26,3 +28,4 @@ https://github.com/restic/restic/issues/4515
 https://github.com/restic/restic/issues/1523
 https://github.com/restic/restic/pull/4520
 https://github.com/restic/restic/pull/4800
+https://github.com/restic/restic/pull/4784
diff --git a/cmd/restic/global.go b/cmd/restic/global.go
index 6920caa8d..c954a4270 100644
--- a/cmd/restic/global.go
+++ b/cmd/restic/global.go
@@ -416,12 +416,16 @@ func OpenRepository(ctx context.Context, opts GlobalOptions) (*repository.Reposi
 	}
 
 	report := func(msg string, err error, d time.Duration) {
-		Warnf("%v returned error, retrying after %v: %v\n", msg, d, err)
+		if d >= 0 {
+			Warnf("%v returned error, retrying after %v: %v\n", msg, d, err)
+		} else {
+			Warnf("%v failed: %v\n", msg, err)
+		}
 	}
 	success := func(msg string, retries int) {
 		Warnf("%v operation successful after %d retries\n", msg, retries)
 	}
-	be = retry.New(be, 10, report, success)
+	be = retry.New(be, 15*time.Minute, report, success)
 
 	// wrap backend if a test specified a hook
 	if opts.backendTestHook != nil {
diff --git a/internal/backend/retry/backend_retry.go b/internal/backend/retry/backend_retry.go
index 31934ec96..d5134d433 100644
--- a/internal/backend/retry/backend_retry.go
+++ b/internal/backend/retry/backend_retry.go
@@ -18,9 +18,9 @@ import (
 // backoff.
 type Backend struct {
 	backend.Backend
-	MaxTries int
-	Report   func(string, error, time.Duration)
-	Success  func(string, int)
+	MaxElapsedTime time.Duration
+	Report         func(string, error, time.Duration)
+	Success        func(string, int)
 
 	failedLoads sync.Map
 }
@@ -32,32 +32,64 @@ var _ backend.Backend = &Backend{}
 // backoff. report is called with a description and the error, if one occurred.
 // success is called with the number of retries before a successful operation
 // (it is not called if it succeeded on the first try)
-func New(be backend.Backend, maxTries int, report func(string, error, time.Duration), success func(string, int)) *Backend {
+func New(be backend.Backend, maxElapsedTime time.Duration, report func(string, error, time.Duration), success func(string, int)) *Backend {
 	return &Backend{
-		Backend:  be,
-		MaxTries: maxTries,
-		Report:   report,
-		Success:  success,
+		Backend:        be,
+		MaxElapsedTime: maxElapsedTime,
+		Report:         report,
+		Success:        success,
 	}
 }
 
 // retryNotifyErrorWithSuccess is an extension of backoff.RetryNotify with notification of success after an error.
 // success is NOT notified on the first run of operation (only after an error).
 func retryNotifyErrorWithSuccess(operation backoff.Operation, b backoff.BackOff, notify backoff.Notify, success func(retries int)) error {
+	var operationWrapper backoff.Operation
 	if success == nil {
-		return backoff.RetryNotify(operation, b, notify)
-	}
-	retries := 0
-	operationWrapper := func() error {
-		err := operation()
-		if err != nil {
-			retries++
-		} else if retries > 0 {
-			success(retries)
+		operationWrapper = operation
+	} else {
+		retries := 0
+		operationWrapper = func() error {
+			err := operation()
+			if err != nil {
+				retries++
+			} else if retries > 0 {
+				success(retries)
+			}
+			return err
 		}
-		return err
 	}
-	return backoff.RetryNotify(operationWrapper, b, notify)
+	err := backoff.RetryNotify(operationWrapper, b, notify)
+
+	if err != nil && notify != nil {
+		// log final error
+		notify(err, -1)
+	}
+	return err
+}
+
+func withRetryAtLeastOnce(delegate *backoff.ExponentialBackOff) *retryAtLeastOnce {
+	return &retryAtLeastOnce{delegate: delegate}
+}
+
+type retryAtLeastOnce struct {
+	delegate *backoff.ExponentialBackOff
+	numTries uint64
+}
+
+func (b *retryAtLeastOnce) NextBackOff() time.Duration {
+	delay := b.delegate.NextBackOff()
+
+	b.numTries++
+	if b.numTries == 1 && b.delegate.Stop == delay {
+		return b.delegate.InitialInterval
+	}
+	return delay
+}
+
+func (b *retryAtLeastOnce) Reset() {
+	b.numTries = 0
+	b.delegate.Reset()
 }
 
 var fastRetries = false
@@ -74,9 +106,25 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 	}
 
 	bo := backoff.NewExponentialBackOff()
+	bo.MaxElapsedTime = be.MaxElapsedTime
+
+	if feature.Flag.Enabled(feature.BackendErrorRedesign) {
+		bo.InitialInterval = 1 * time.Second
+		bo.Multiplier = 2
+	}
 	if fastRetries {
 		// speed up integration tests
 		bo.InitialInterval = 1 * time.Millisecond
+		maxElapsedTime := 200 * time.Millisecond
+		if bo.MaxElapsedTime > maxElapsedTime {
+			bo.MaxElapsedTime = maxElapsedTime
+		}
+	}
+
+	var b backoff.BackOff = withRetryAtLeastOnce(bo)
+	if !feature.Flag.Enabled(feature.BackendErrorRedesign) {
+		// deprecated behavior
+		b = backoff.WithMaxRetries(b, 10)
 	}
 
 	err := retryNotifyErrorWithSuccess(
@@ -89,7 +137,7 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 			}
 			return err
 		},
-		backoff.WithContext(backoff.WithMaxRetries(bo, uint64(be.MaxTries)), ctx),
+		backoff.WithContext(b, ctx),
 		func(err error, d time.Duration) {
 			if be.Report != nil {
 				be.Report(msg, err, d)
diff --git a/internal/backend/retry/backend_retry_test.go b/internal/backend/retry/backend_retry_test.go
index a515b0b7d..cd0c4d48b 100644
--- a/internal/backend/retry/backend_retry_test.go
+++ b/internal/backend/retry/backend_retry_test.go
@@ -193,8 +193,9 @@ func TestBackendListRetryErrorBackend(t *testing.T) {
 	}
 
 	TestFastRetries(t)
-	const maxRetries = 2
-	retryBackend := New(be, maxRetries, nil, nil)
+	const maxElapsedTime = 10 * time.Millisecond
+	now := time.Now()
+	retryBackend := New(be, maxElapsedTime, nil, nil)
 
 	var listed []string
 	err := retryBackend.List(context.TODO(), backend.PackFile, func(fi backend.FileInfo) error {
@@ -207,8 +208,9 @@ func TestBackendListRetryErrorBackend(t *testing.T) {
 		t.Fatalf("wrong error returned, want %v, got %v", ErrBackendTest, err)
 	}
 
-	if retries != maxRetries+1 {
-		t.Fatalf("List was called %d times, wanted %v", retries, maxRetries+1)
+	duration := time.Since(now)
+	if duration > 100*time.Millisecond {
+		t.Fatalf("list retries took %v, expected at most 10ms", duration)
 	}
 
 	test.Equals(t, names[:2], listed)
@@ -327,7 +329,7 @@ func TestBackendLoadCircuitBreaker(t *testing.T) {
 	// trip the circuit breaker for file "other"
 	err := retryBackend.Load(context.TODO(), backend.Handle{Name: "other"}, 0, 0, nilRd)
 	test.Equals(t, otherError, err, "unexpected error")
-	test.Equals(t, 3, attempt)
+	test.Equals(t, 2, attempt)
 
 	attempt = 0
 	err = retryBackend.Load(context.TODO(), backend.Handle{Name: "other"}, 0, 0, nilRd)
@@ -407,7 +409,7 @@ func TestBackendRetryPermanent(t *testing.T) {
 		return errors.New("something")
 	})
 	test.Assert(t, !be.IsPermanentErrorFn(err), "error unexpectedly considered permanent %v", err)
-	test.Equals(t, 3, attempt)
+	test.Equals(t, 2, attempt)
 
 }
 
@@ -497,3 +499,64 @@ func TestNotifyWithSuccessIsCalled(t *testing.T) {
 		t.Fatalf("Success should have been called only once, but was called %d times instead", successCalled)
 	}
 }
+
+func TestNotifyWithSuccessFinalError(t *testing.T) {
+	operation := func() error {
+		return errors.New("expected error in test")
+	}
+
+	notifyCalled := 0
+	notify := func(error, time.Duration) {
+		notifyCalled++
+	}
+
+	successCalled := 0
+	success := func(retries int) {
+		successCalled++
+	}
+
+	err := retryNotifyErrorWithSuccess(operation, backoff.WithMaxRetries(&backoff.ZeroBackOff{}, 5), notify, success)
+	test.Assert(t, err.Error() == "expected error in test", "wrong error message %v", err)
+	test.Equals(t, 6, notifyCalled, "notify should have been called 6 times")
+	test.Equals(t, 0, successCalled, "success should not have been called")
+}
+
+type testClock struct {
+	Time time.Time
+}
+
+func (c *testClock) Now() time.Time {
+	return c.Time
+}
+
+func TestRetryAtLeastOnce(t *testing.T) {
+	expBackOff := backoff.NewExponentialBackOff()
+	expBackOff.InitialInterval = 500 * time.Millisecond
+	expBackOff.RandomizationFactor = 0
+	expBackOff.MaxElapsedTime = 5 * time.Second
+	expBackOff.Multiplier = 2 // guarantee numerical stability
+	clock := &testClock{Time: time.Now()}
+	expBackOff.Clock = clock
+	expBackOff.Reset()
+
+	retry := withRetryAtLeastOnce(expBackOff)
+
+	// expire backoff
+	clock.Time = clock.Time.Add(10 * time.Second)
+	delay := retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval, delay, "must retry at least once")
+
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.Stop, delay, "must not retry more than once")
+
+	// test reset behavior
+	retry.Reset()
+	test.Equals(t, uint64(0), retry.numTries, "numTries should be reset to 0")
+
+	// Verify that after reset, NextBackOff returns the initial interval again
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval, delay, "retries must work after reset")
+
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval*time.Duration(expBackOff.Multiplier), delay, "retries must work after reset")
+}
diff --git a/internal/repository/lock.go b/internal/repository/lock.go
index 7035e3c59..fd46066d1 100644
--- a/internal/repository/lock.go
+++ b/internal/repository/lock.go
@@ -132,7 +132,7 @@ func (l *locker) refreshLocks(ctx context.Context, backend backend.Backend, lock
 
 	// remove the lock from the repo
 	debug.Log("unlocking repository with lock %v", lock)
-	if err := lock.Unlock(); err != nil {
+	if err := lock.Unlock(ctx); err != nil {
 		debug.Log("error while unlocking: %v", err)
 		logger("error while unlocking: %v", err)
 	}
diff --git a/internal/restic/lock.go b/internal/restic/lock.go
index 1e393c7ed..49c7cedf2 100644
--- a/internal/restic/lock.go
+++ b/internal/restic/lock.go
@@ -17,6 +17,10 @@ import (
 	"github.com/restic/restic/internal/debug"
 )
 
+// UnlockCancelDelay bounds how long lock cleanup operations will wait
+// if the passed-in context was canceled.
+const UnlockCancelDelay time.Duration = 1 * time.Minute
+
 // Lock represents a process locking the repository for an operation.
 //
 // There are two types of locks: exclusive and non-exclusive. There may be many
@@ -136,7 +140,7 @@ func newLock(ctx context.Context, repo Unpacked, excl bool) (*Lock, error) {
 	time.Sleep(waitBeforeLockCheck)
 
 	if err = lock.checkForOtherLocks(ctx); err != nil {
-		_ = lock.Unlock()
+		_ = lock.Unlock(ctx)
 		return nil, err
 	}
 
@@ -220,12 +224,15 @@ func (l *Lock) createLock(ctx context.Context) (ID, error) {
 }
 
 // Unlock removes the lock from the repository.
-func (l *Lock) Unlock() error {
+func (l *Lock) Unlock(ctx context.Context) error {
 	if l == nil || l.lockID == nil {
 		return nil
 	}
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *l.lockID)
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
+	return l.repo.RemoveUnpacked(ctx, LockFile, *l.lockID)
 }
 
 var StaleLockTimeout = 30 * time.Minute
@@ -266,6 +273,23 @@ func (l *Lock) Stale() bool {
 
 	return false
 }
 
+func delayedCancelContext(parentCtx context.Context, delay time.Duration) (context.Context, context.CancelFunc) {
+	ctx, cancel := context.WithCancel(context.Background())
+
+	go func() {
+		select {
+		case <-parentCtx.Done():
+		case <-ctx.Done():
+			return
+		}
+
+		time.Sleep(delay)
+		cancel()
+	}()
+
+	return ctx, cancel
+}
+
 // Refresh refreshes the lock by creating a new file in the backend with a new
 // timestamp. Afterwards the old lock is removed.
@@ -285,7 +309,10 @@ func (l *Lock) Refresh(ctx context.Context) error {
 	oldLockID := l.lockID
 	l.lockID = &id
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *oldLockID)
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
+	return l.repo.RemoveUnpacked(ctx, LockFile, *oldLockID)
 }
 
 // RefreshStaleLock is an extended variant of Refresh that can also refresh stale lock files.
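Note: the following snippet is an illustrative sketch and not part of the patch. It re-implements the grace-period pattern of the delayedCancelContext helper added above (the real helper is unexported in package restic, and the name delayedCancel plus the demo timings are assumptions) to show why lock cleanup can still reach the backend for a bounded time after the surrounding context is canceled.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// delayedCancel mirrors the delayedCancelContext pattern from the patch: the
// returned context is deliberately NOT derived from parent, so it survives
// parent cancellation, but it is canceled `delay` after the parent is done.
func delayedCancel(parent context.Context, delay time.Duration) (context.Context, context.CancelFunc) {
	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		select {
		case <-parent.Done():
		case <-ctx.Done():
			return
		}
		time.Sleep(delay)
		cancel()
	}()
	return ctx, cancel
}

func main() {
	parent, cancelParent := context.WithCancel(context.Background())
	ctx, cancel := delayedCancel(parent, 200*time.Millisecond)
	defer cancel()

	cancelParent() // simulates the user interrupting the command

	select {
	case <-ctx.Done():
		fmt.Println("unexpected: cleanup context canceled immediately")
	case <-time.After(50 * time.Millisecond):
		fmt.Println("cleanup context still usable during the grace period")
	}

	<-ctx.Done() // fires roughly `delay` after the parent was canceled
	fmt.Println("cleanup context canceled after the grace period")
}
```

In the patch this grace period is capped by UnlockCancelDelay (one minute), so a canceled command still tries to remove its lock file but cannot hang indefinitely.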
@@ -312,15 +339,19 @@ func (l *Lock) RefreshStaleLock(ctx context.Context) error {
 	time.Sleep(waitBeforeLockCheck)
 
 	exists, err = l.checkExistence(ctx)
+
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
 	if err != nil {
 		// cleanup replacement lock
-		_ = l.repo.RemoveUnpacked(context.TODO(), LockFile, id)
+		_ = l.repo.RemoveUnpacked(ctx, LockFile, id)
 		return err
 	}
 
 	if !exists {
 		// cleanup replacement lock
-		_ = l.repo.RemoveUnpacked(context.TODO(), LockFile, id)
+		_ = l.repo.RemoveUnpacked(ctx, LockFile, id)
 		return ErrRemovedLock
 	}
 
@@ -331,7 +362,7 @@ func (l *Lock) RefreshStaleLock(ctx context.Context) error {
 	oldLockID := l.lockID
 	l.lockID = &id
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *oldLockID)
+	return l.repo.RemoveUnpacked(ctx, LockFile, *oldLockID)
 }
 
 func (l *Lock) checkExistence(ctx context.Context) (bool, error) {
diff --git a/internal/restic/lock_test.go b/internal/restic/lock_test.go
index b96b11e35..606ed210d 100644
--- a/internal/restic/lock_test.go
+++ b/internal/restic/lock_test.go
@@ -22,7 +22,7 @@ func TestLock(t *testing.T) {
 	lock, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 }
 
 func TestDoubleUnlock(t *testing.T) {
@@ -32,9 +32,9 @@ func TestDoubleUnlock(t *testing.T) {
 	lock, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 
-	err = lock.Unlock()
+	err = lock.Unlock(context.TODO())
 	rtest.Assert(t, err != nil,
 		"double unlock didn't return an error, got %v", err)
 }
@@ -49,8 +49,8 @@ func TestMultipleLock(t *testing.T) {
 	lock2, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock1.Unlock())
-	rtest.OK(t, lock2.Unlock())
+	rtest.OK(t, lock1.Unlock(context.TODO()))
+	rtest.OK(t, lock2.Unlock(context.TODO()))
 }
 
 type failLockLoadingBackend struct {
@@ -75,7 +75,7 @@ func TestMultipleLockFailure(t *testing.T) {
 	_, err = restic.NewLock(context.TODO(), repo)
 	rtest.Assert(t, err != nil, "unreadable lock file did not result in an error")
 
-	rtest.OK(t, lock1.Unlock())
+	rtest.OK(t, lock1.Unlock(context.TODO()))
 }
 
 func TestLockExclusive(t *testing.T) {
@@ -83,7 +83,7 @@ func TestLockExclusive(t *testing.T) {
 	elock, err := restic.NewExclusiveLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func TestLockOnExclusiveLockedRepo(t *testing.T) {
@@ -99,8 +99,8 @@ func TestLockOnExclusiveLockedRepo(t *testing.T) {
 	rtest.Assert(t, restic.IsAlreadyLocked(err),
 		"create normal lock with exclusively locked repo didn't return the correct error")
 
-	rtest.OK(t, lock.Unlock())
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func TestExclusiveLockOnLockedRepo(t *testing.T) {
@@ -116,8 +116,8 @@ func TestExclusiveLockOnLockedRepo(t *testing.T) {
 	rtest.Assert(t, restic.IsAlreadyLocked(err),
 		"create normal lock with exclusively locked repo didn't return the correct error")
 
-	rtest.OK(t, lock.Unlock())
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func createFakeLock(repo restic.SaverUnpacked, t time.Time, pid int) (restic.ID, error) {
@@ -296,7 +296,7 @@ func testLockRefresh(t *testing.T, refresh func(lock *restic.Lock) error) {
 	rtest.OK(t, err)
 	rtest.Assert(t, lock2.Time.After(time0),
 		"expected a later timestamp after lock refresh")
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 }
 
 func TestLockRefresh(t *testing.T) {
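As a wrap-up, here is a hedged sketch of how a caller wires up the reworked retry backend after this change. It condenses the cmd/restic/global.go hunk above: the wrapWithRetry helper and the package name are made up for illustration, the fmt.Printf calls stand in for restic's Warnf, and the snippet only builds inside the restic module because the backend packages are internal.

```go
package example

import (
	"fmt"
	"time"

	"github.com/restic/restic/internal/backend"
	"github.com/restic/restic/internal/backend/retry"
)

// wrapWithRetry mirrors what OpenRepository does after this patch: the second
// argument of retry.New is now a total retry budget (15 minutes) rather than
// a fixed number of attempts.
func wrapWithRetry(be backend.Backend) backend.Backend {
	report := func(msg string, err error, d time.Duration) {
		if d >= 0 {
			fmt.Printf("%v returned error, retrying after %v: %v\n", msg, d, err)
		} else {
			// a negative duration marks the final, no-longer-retried failure
			fmt.Printf("%v failed: %v\n", msg, err)
		}
	}
	success := func(msg string, retries int) {
		fmt.Printf("%v operation successful after %d retries\n", msg, retries)
	}
	return retry.New(be, 15*time.Minute, report, success)
}
```

The negative-duration branch corresponds to the final notify(err, -1) call added in retryNotifyErrorWithSuccess, so the last error is reported as a failure instead of as yet another retry announcement.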