From a3633cad9e23d0bd936a394837874c7accbe97ce Mon Sep 17 00:00:00 2001
From: Michael Eischer
Date: Mon, 29 Apr 2024 21:07:17 +0200
Subject: [PATCH 1/7] retry: explicitly log failed requests

This simplifies finding the request in the log output that caused an
operation to fail.
---
 cmd/restic/global.go                         |  6 +++-
 internal/backend/retry/backend_retry.go      | 30 +++++++++++++-------
 internal/backend/retry/backend_retry_test.go | 21 ++++++++++++++
 3 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/cmd/restic/global.go b/cmd/restic/global.go
index 6920caa8d..d0facc674 100644
--- a/cmd/restic/global.go
+++ b/cmd/restic/global.go
@@ -416,7 +416,11 @@ func OpenRepository(ctx context.Context, opts GlobalOptions) (*repository.Reposi
 	}
 
 	report := func(msg string, err error, d time.Duration) {
-		Warnf("%v returned error, retrying after %v: %v\n", msg, d, err)
+		if d >= 0 {
+			Warnf("%v returned error, retrying after %v: %v\n", msg, d, err)
+		} else {
+			Warnf("%v failed: %v\n", msg, err)
+		}
 	}
 	success := func(msg string, retries int) {
 		Warnf("%v operation successful after %d retries\n", msg, retries)
diff --git a/internal/backend/retry/backend_retry.go b/internal/backend/retry/backend_retry.go
index 31934ec96..c6815413c 100644
--- a/internal/backend/retry/backend_retry.go
+++ b/internal/backend/retry/backend_retry.go
@@ -44,20 +44,28 @@ func New(be backend.Backend, maxTries int, report func(string, error, time.Durat
 // retryNotifyErrorWithSuccess is an extension of backoff.RetryNotify with notification of success after an error.
 // success is NOT notified on the first run of operation (only after an error).
 func retryNotifyErrorWithSuccess(operation backoff.Operation, b backoff.BackOff, notify backoff.Notify, success func(retries int)) error {
+	var operationWrapper backoff.Operation
 	if success == nil {
-		return backoff.RetryNotify(operation, b, notify)
-	}
-	retries := 0
-	operationWrapper := func() error {
-		err := operation()
-		if err != nil {
-			retries++
-		} else if retries > 0 {
-			success(retries)
+		operationWrapper = operation
+	} else {
+		retries := 0
+		operationWrapper = func() error {
+			err := operation()
+			if err != nil {
+				retries++
+			} else if retries > 0 {
+				success(retries)
+			}
+			return err
 		}
-		return err
 	}
-	return backoff.RetryNotify(operationWrapper, b, notify)
+	err := backoff.RetryNotify(operationWrapper, b, notify)
+
+	if err != nil && notify != nil {
+		// log final error
+		notify(err, -1)
+	}
+	return err
 }
 
 var fastRetries = false
diff --git a/internal/backend/retry/backend_retry_test.go b/internal/backend/retry/backend_retry_test.go
index a515b0b7d..de86e6cf6 100644
--- a/internal/backend/retry/backend_retry_test.go
+++ b/internal/backend/retry/backend_retry_test.go
@@ -497,3 +497,24 @@ func TestNotifyWithSuccessIsCalled(t *testing.T) {
 		t.Fatalf("Success should have been called only once, but was called %d times instead", successCalled)
 	}
 }
+
+func TestNotifyWithSuccessFinalError(t *testing.T) {
+	operation := func() error {
+		return errors.New("expected error in test")
+	}
+
+	notifyCalled := 0
+	notify := func(error, time.Duration) {
+		notifyCalled++
+	}
+
+	successCalled := 0
+	success := func(retries int) {
+		successCalled++
+	}
+
+	err := retryNotifyErrorWithSuccess(operation, backoff.WithMaxRetries(&backoff.ZeroBackOff{}, 5), notify, success)
+	test.Assert(t, err.Error() == "expected error in test", "wrong error message %v", err)
+	test.Equals(t, 6, notifyCalled, "notify should have been called 6 times")
+	test.Equals(t, 0, successCalled, "success should not have been called")
+}
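As a standalone illustration of the convention this patch introduces (not part
of the series; the operation name below is made up), a notify callback can tell
retry announcements apart from the final failure purely by the sign of the
duration:

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    // report mirrors the callback in cmd/restic/global.go above: a
    // non-negative delay announces a retry, a negative delay marks the
    // final, unretried failure.
    func report(msg string, err error, d time.Duration) {
    	if d >= 0 {
    		fmt.Printf("%v returned error, retrying after %v: %v\n", msg, d, err)
    	} else {
    		fmt.Printf("%v failed: %v\n", msg, err)
    	}
    }

    func main() {
    	err := errors.New("connection reset")
    	report("Load(<data/1234>)", err, 500*time.Millisecond) // retry announcement
    	report("Load(<data/1234>)", err, -1)                   // final failure
    }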
called") +} From a60ee9b764fe25e1f923e4faeab7f1e3bfb9f057 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Mon, 29 Apr 2024 21:12:21 +0200 Subject: [PATCH 2/7] retry: limit retries based on elapsed time not count Depending on how long an operation takes to fail, the total retry duration can currently vary between 1.5 and 15 minutes. In particular for temporarily interrupted network connections, the former timeout is too short. Thus always use a limit of 15 minutes. --- cmd/restic/global.go | 2 +- internal/backend/retry/backend_retry.go | 24 ++++++++++++-------- internal/backend/retry/backend_retry_test.go | 14 +++++++----- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/cmd/restic/global.go b/cmd/restic/global.go index d0facc674..c954a4270 100644 --- a/cmd/restic/global.go +++ b/cmd/restic/global.go @@ -425,7 +425,7 @@ func OpenRepository(ctx context.Context, opts GlobalOptions) (*repository.Reposi success := func(msg string, retries int) { Warnf("%v operation successful after %d retries\n", msg, retries) } - be = retry.New(be, 10, report, success) + be = retry.New(be, 15*time.Minute, report, success) // wrap backend if a test specified a hook if opts.backendTestHook != nil { diff --git a/internal/backend/retry/backend_retry.go b/internal/backend/retry/backend_retry.go index c6815413c..fb2e6cf98 100644 --- a/internal/backend/retry/backend_retry.go +++ b/internal/backend/retry/backend_retry.go @@ -18,9 +18,9 @@ import ( // backoff. type Backend struct { backend.Backend - MaxTries int - Report func(string, error, time.Duration) - Success func(string, int) + MaxElapsedTime time.Duration + Report func(string, error, time.Duration) + Success func(string, int) failedLoads sync.Map } @@ -32,12 +32,12 @@ var _ backend.Backend = &Backend{} // backoff. report is called with a description and the error, if one occurred. 
From 512cd6ef07ee7ee78b36149dd78cba80e6a1aac4 Mon Sep 17 00:00:00 2001
From: Michael Eischer
Date: Mon, 29 Apr 2024 21:16:24 +0200
Subject: [PATCH 3/7] retry: ensure that there's always at least one retry

Previously, if an operation failed after 15 minutes, then it would never
be retried. This made large backend requests more unreliable than
smaller ones.
---
 internal/backend/retry/backend_retry.go      | 26 +++++++++++++++++-
 internal/backend/retry/backend_retry_test.go | 40 ++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/internal/backend/retry/backend_retry.go b/internal/backend/retry/backend_retry.go
index fb2e6cf98..e40cce122 100644
--- a/internal/backend/retry/backend_retry.go
+++ b/internal/backend/retry/backend_retry.go
@@ -68,6 +68,30 @@ func retryNotifyErrorWithSuccess(operation backoff.Operation, b backoff.BackOff,
 		notify(err, -1)
 	}
 	return err
 }
 
+func withRetryAtLeastOnce(delegate *backoff.ExponentialBackOff) *retryAtLeastOnce {
+	return &retryAtLeastOnce{delegate: delegate}
+}
+
+type retryAtLeastOnce struct {
+	delegate *backoff.ExponentialBackOff
+	numTries uint64
+}
+
+func (b *retryAtLeastOnce) NextBackOff() time.Duration {
+	delay := b.delegate.NextBackOff()
+
+	b.numTries++
+	if b.numTries == 1 && b.delegate.Stop == delay {
+		return b.delegate.InitialInterval
+	}
+	return delay
+}
+
+func (b *retryAtLeastOnce) Reset() {
+	b.numTries = 0
+	b.delegate.Reset()
+}
+
 var fastRetries = false
@@ -103,7 +127,7 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 			}
 			return err
 		},
-		backoff.WithContext(bo, ctx),
+		backoff.WithContext(withRetryAtLeastOnce(bo), ctx),
 		func(err error, d time.Duration) {
 			if be.Report != nil {
 				be.Report(msg, err, d)
diff --git a/internal/backend/retry/backend_retry_test.go b/internal/backend/retry/backend_retry_test.go
index ce0b99637..cd0c4d48b 100644
--- a/internal/backend/retry/backend_retry_test.go
+++ b/internal/backend/retry/backend_retry_test.go
@@ -520,3 +520,43 @@ func TestNotifyWithSuccessFinalError(t *testing.T) {
 	test.Equals(t, 6, notifyCalled, "notify should have been called 6 times")
 	test.Equals(t, 0, successCalled, "success should not have been called")
 }
+
+type testClock struct {
+	Time time.Time
+}
+
+func (c *testClock) Now() time.Time {
+	return c.Time
+}
+
+func TestRetryAtLeastOnce(t *testing.T) {
+	expBackOff := backoff.NewExponentialBackOff()
+	expBackOff.InitialInterval = 500 * time.Millisecond
+	expBackOff.RandomizationFactor = 0
+	expBackOff.MaxElapsedTime = 5 * time.Second
+	expBackOff.Multiplier = 2 // guarantee numerical stability
+	clock := &testClock{Time: time.Now()}
+	expBackOff.Clock = clock
+	expBackOff.Reset()
+
+	retry := withRetryAtLeastOnce(expBackOff)
+
+	// expire backoff
+	clock.Time = clock.Time.Add(10 * time.Second)
+	delay := retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval, delay, "must retry at least once")
+
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.Stop, delay, "must not retry more than once")
+
+	// test reset behavior
+	retry.Reset()
+	test.Equals(t, uint64(0), retry.numTries, "numTries should be reset to 0")
+
+	// Verify that after reset, NextBackOff returns the initial interval again
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval, delay, "retries must work after reset")
+
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval*time.Duration(expBackOff.Multiplier), delay, "retries must work after reset")
+}
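A self-contained sketch of the new guarantee (retryAtLeastOnce restated from
the diff above; the printed values assume the library defaults of a 500ms
InitialInterval and Stop == -1ns):

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/cenkalti/backoff/v4"
    )

    // retryAtLeastOnce converts the delegate's immediate Stop on the very
    // first NextBackOff call into one InitialInterval delay, so at least
    // one retry always happens.
    type retryAtLeastOnce struct {
    	delegate *backoff.ExponentialBackOff
    	numTries uint64
    }

    func (b *retryAtLeastOnce) NextBackOff() time.Duration {
    	delay := b.delegate.NextBackOff()
    	b.numTries++
    	if b.numTries == 1 && b.delegate.Stop == delay {
    		return b.delegate.InitialInterval
    	}
    	return delay
    }

    func (b *retryAtLeastOnce) Reset() {
    	b.numTries = 0
    	b.delegate.Reset()
    }

    func main() {
    	bo := backoff.NewExponentialBackOff()
    	bo.MaxElapsedTime = time.Nanosecond // pretend the first attempt used the whole budget
    	retry := &retryAtLeastOnce{delegate: bo}

    	time.Sleep(time.Millisecond)     // let the budget expire
    	fmt.Println(retry.NextBackOff()) // 500ms: the guaranteed retry
    	fmt.Println(retry.NextBackOff()) // -1ns (backoff.Stop): no further retries
    }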
From 98709a437250b9da0d15bf5d579c71deac295a5f Mon Sep 17 00:00:00 2001
From: Michael Eischer
Date: Mon, 29 Apr 2024 21:19:15 +0200
Subject: [PATCH 4/7] retry: reduce total number of retries

Retries in restic try to solve two main problems:
- retry a temporarily failed operation
- tolerate temporary network interruptions

The first problem only requires a few retries, whereas the second one
benefits primarily from spreading the requests over a longer duration.

Increasing the default multiplier and the initial interval works for
both cases. The first few retries happen within a few seconds, while
later retries quickly reach the maximum interval of one minute. This
ensures that the total number of retries issued by restic remains at
around 21 for a 15-minute period.

As the concurrency in restic is bounded, the spread-out retries
drastically reduce the number of requests sent to a failing backend.
This helps to prevent overloading the backend.
---
 internal/backend/retry/backend_retry.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/internal/backend/retry/backend_retry.go b/internal/backend/retry/backend_retry.go
index e40cce122..7f2b4f745 100644
--- a/internal/backend/retry/backend_retry.go
+++ b/internal/backend/retry/backend_retry.go
@@ -108,6 +108,9 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 	bo := backoff.NewExponentialBackOff()
 	bo.MaxElapsedTime = be.MaxElapsedTime
 
+	bo.InitialInterval = 1 * time.Second
+	bo.Multiplier = 2
+
 	if fastRetries {
 		// speed up integration tests
 		bo.InitialInterval = 1 * time.Millisecond
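The "around 21 retries" figure can be checked with a back-of-the-envelope
calculation. The sketch below ignores randomization and the time each failing
attempt itself consumes, so the real count varies a little around this value:

    package main

    import "fmt"

    // Delays grow as 1s, 2s, 4s, ... (multiplier 2) and are capped at
    // the one-minute maximum interval mentioned above.
    func main() {
    	delay, elapsed, retries := 1.0, 0.0, 0
    	for elapsed < 15*60 {
    		elapsed += delay
    		retries++
    		delay *= 2
    		if delay > 60 {
    			delay = 60
    		}
    	}
    	fmt.Println(retries) // prints 20, in line with the estimate above
    }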
From b1266867d22fc6a424f1c3d0e31777bdfdd53de4 Mon Sep 17 00:00:00 2001
From: Michael Eischer
Date: Mon, 29 Apr 2024 21:43:28 +0200
Subject: [PATCH 5/7] repository: wait at most 1 minute for lock removal if
 context is canceled

The toplevel context in restic is only canceled if the user interrupts
a restic operation. If the network connection has failed, this can
require waiting the full retry duration of 15 minutes, which is a bad
user experience for interactive usage. Thus, limit the delay to one
minute in this case.
---
 internal/repository/lock.go  |  2 +-
 internal/restic/lock.go      | 45 ++++++++++++++++++++++++++++------
 internal/restic/lock_test.go | 24 +++++++++----------
 3 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/internal/repository/lock.go b/internal/repository/lock.go
index 7035e3c59..fd46066d1 100644
--- a/internal/repository/lock.go
+++ b/internal/repository/lock.go
@@ -132,7 +132,7 @@ func (l *locker) refreshLocks(ctx context.Context, backend backend.Backend, lock
 
 	// remove the lock from the repo
 	debug.Log("unlocking repository with lock %v", lock)
-	if err := lock.Unlock(); err != nil {
+	if err := lock.Unlock(ctx); err != nil {
 		debug.Log("error while unlocking: %v", err)
 		logger("error while unlocking: %v", err)
 	}
diff --git a/internal/restic/lock.go b/internal/restic/lock.go
index 1e393c7ed..49c7cedf2 100644
--- a/internal/restic/lock.go
+++ b/internal/restic/lock.go
@@ -17,6 +17,10 @@ import (
 	"github.com/restic/restic/internal/debug"
 )
 
+// UnlockCancelDelay bounds how long lock cleanup operations will wait
+// if the passed-in context was canceled.
+const UnlockCancelDelay time.Duration = 1 * time.Minute
+
 // Lock represents a process locking the repository for an operation.
 //
 // There are two types of locks: exclusive and non-exclusive. There may be many
@@ -136,7 +140,7 @@ func newLock(ctx context.Context, repo Unpacked, excl bool) (*Lock, error) {
 	time.Sleep(waitBeforeLockCheck)
 
 	if err = lock.checkForOtherLocks(ctx); err != nil {
-		_ = lock.Unlock()
+		_ = lock.Unlock(ctx)
 		return nil, err
 	}
 
@@ -220,12 +224,15 @@ func (l *Lock) createLock(ctx context.Context) (ID, error) {
 }
 
 // Unlock removes the lock from the repository.
-func (l *Lock) Unlock() error {
+func (l *Lock) Unlock(ctx context.Context) error {
 	if l == nil || l.lockID == nil {
 		return nil
 	}
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *l.lockID)
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
+	return l.repo.RemoveUnpacked(ctx, LockFile, *l.lockID)
 }
 
 var StaleLockTimeout = 30 * time.Minute
@@ -266,6 +273,23 @@ func (l *Lock) Stale() bool {
 	return false
 }
 
+func delayedCancelContext(parentCtx context.Context, delay time.Duration) (context.Context, context.CancelFunc) {
+	ctx, cancel := context.WithCancel(context.Background())
+
+	go func() {
+		select {
+		case <-parentCtx.Done():
+		case <-ctx.Done():
+			return
+		}
+
+		time.Sleep(delay)
+		cancel()
+	}()
+
+	return ctx, cancel
+}
+
 // Refresh refreshes the lock by creating a new file in the backend with a new
 // timestamp. Afterwards the old lock is removed.
 func (l *Lock) Refresh(ctx context.Context) error {
@@ -285,7 +309,10 @@ func (l *Lock) Refresh(ctx context.Context) error {
 	oldLockID := l.lockID
 	l.lockID = &id
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *oldLockID)
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
+	return l.repo.RemoveUnpacked(ctx, LockFile, *oldLockID)
 }
 
 // RefreshStaleLock is an extended variant of Refresh that can also refresh stale lock files.
@@ -312,15 +339,19 @@ func (l *Lock) RefreshStaleLock(ctx context.Context) error {
 	time.Sleep(waitBeforeLockCheck)
 
 	exists, err = l.checkExistence(ctx)
+
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
 	if err != nil {
 		// cleanup replacement lock
-		_ = l.repo.RemoveUnpacked(context.TODO(), LockFile, id)
+		_ = l.repo.RemoveUnpacked(ctx, LockFile, id)
 		return err
 	}
 
 	if !exists {
 		// cleanup replacement lock
-		_ = l.repo.RemoveUnpacked(context.TODO(), LockFile, id)
+		_ = l.repo.RemoveUnpacked(ctx, LockFile, id)
 		return ErrRemovedLock
 	}
 
@@ -331,7 +362,7 @@ func (l *Lock) RefreshStaleLock(ctx context.Context) error {
 	oldLockID := l.lockID
 	l.lockID = &id
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *oldLockID)
+	return l.repo.RemoveUnpacked(ctx, LockFile, *oldLockID)
 }
 
 func (l *Lock) checkExistence(ctx context.Context) (bool, error) {
diff --git a/internal/restic/lock_test.go b/internal/restic/lock_test.go
index b96b11e35..606ed210d 100644
--- a/internal/restic/lock_test.go
+++ b/internal/restic/lock_test.go
@@ -22,7 +22,7 @@ func TestLock(t *testing.T) {
 	lock, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 }
 
 func TestDoubleUnlock(t *testing.T) {
@@ -32,9 +32,9 @@ func TestDoubleUnlock(t *testing.T) {
 	lock, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 
-	err = lock.Unlock()
+	err = lock.Unlock(context.TODO())
 	rtest.Assert(t, err != nil,
 		"double unlock didn't return an error, got %v", err)
 }
@@ -49,8 +49,8 @@ func TestMultipleLock(t *testing.T) {
 	lock2, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock1.Unlock())
-	rtest.OK(t, lock2.Unlock())
+	rtest.OK(t, lock1.Unlock(context.TODO()))
+	rtest.OK(t, lock2.Unlock(context.TODO()))
 }
 
 type failLockLoadingBackend struct {
@@ -75,7 +75,7 @@ func TestMultipleLockFailure(t *testing.T) {
 	_, err = restic.NewLock(context.TODO(), repo)
 	rtest.Assert(t, err != nil, "unreadable lock file did not result in an error")
 
-	rtest.OK(t, lock1.Unlock())
+	rtest.OK(t, lock1.Unlock(context.TODO()))
 }
 
 func TestLockExclusive(t *testing.T) {
@@ -83,7 +83,7 @@ func TestLockExclusive(t *testing.T) {
 	elock, err := restic.NewExclusiveLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func TestLockOnExclusiveLockedRepo(t *testing.T) {
@@ -99,8 +99,8 @@ func TestLockOnExclusiveLockedRepo(t *testing.T) {
 	rtest.Assert(t, restic.IsAlreadyLocked(err),
 		"create normal lock with exclusively locked repo didn't return the correct error")
 
-	rtest.OK(t, lock.Unlock())
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func TestExclusiveLockOnLockedRepo(t *testing.T) {
@@ -116,8 +116,8 @@ func TestExclusiveLockOnLockedRepo(t *testing.T) {
 	rtest.Assert(t, restic.IsAlreadyLocked(err),
 		"create normal lock with exclusively locked repo didn't return the correct error")
 
-	rtest.OK(t, lock.Unlock())
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func createFakeLock(repo restic.SaverUnpacked, t time.Time, pid int) (restic.ID, error) {
@@ -296,7 +296,7 @@ func testLockRefresh(t *testing.T, refresh func(lock *restic.Lock) error) {
 	rtest.OK(t, err)
 	rtest.Assert(t, lock2.Time.After(time0), "expected a later timestamp after lock refresh")
 
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 }
 
 func TestLockRefresh(t *testing.T) {
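The core of this patch is delayedCancelContext. This standalone sketch (helper
restated from the diff above) demonstrates the bounded grace period that lock
cleanup gets after a simulated interrupt:

    package main

    import (
    	"context"
    	"fmt"
    	"time"
    )

    // delayedCancelContext returns a context that is canceled `delay`
    // after the parent, so cleanup requests get a bounded grace period.
    func delayedCancelContext(parentCtx context.Context, delay time.Duration) (context.Context, context.CancelFunc) {
    	ctx, cancel := context.WithCancel(context.Background())
    	go func() {
    		select {
    		case <-parentCtx.Done():
    		case <-ctx.Done():
    			return
    		}
    		time.Sleep(delay)
    		cancel()
    	}()
    	return ctx, cancel
    }

    func main() {
    	parent, interrupt := context.WithCancel(context.Background())
    	ctx, cancel := delayedCancelContext(parent, 100*time.Millisecond)
    	defer cancel()

    	interrupt() // simulate the user pressing Ctrl-C
    	start := time.Now()
    	<-ctx.Done() // lock removal may still run during this window
    	fmt.Printf("cleanup context lived %v past the interrupt\n",
    		time.Since(start).Round(10*time.Millisecond))
    }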
From 723247c8e50f92dde4389564956d784ae6760460 Mon Sep 17 00:00:00 2001
From: Michael Eischer
Date: Mon, 20 May 2024 20:37:28 +0200
Subject: [PATCH 6/7] add changelog for longer retries

---
 changelog/unreleased/issue-4627 | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/changelog/unreleased/issue-4627 b/changelog/unreleased/issue-4627
index 95c9d4bc4..6f6a00098 100644
--- a/changelog/unreleased/issue-4627
+++ b/changelog/unreleased/issue-4627
@@ -9,7 +9,9 @@ downloading are now forcibly interrupted. This ensures that stuck requests are
 retried after a short timeout.
 
 Attempts to access a missing file or a truncated file will no longer be retried.
-This avoids unnecessary retries in those cases.
+This avoids unnecessary retries in those cases. All other backend requests are
+retried for up to 15 minutes. This ensures that temporarily interrupted network
+connections can be tolerated.
 
 If a download yields a corrupt file or blob, then the download will be retried once.
@@ -26,3 +28,4 @@ https://github.com/restic/restic/issues/4515
 https://github.com/restic/restic/issues/1523
 https://github.com/restic/restic/pull/4520
 https://github.com/restic/restic/pull/4800
+https://github.com/restic/restic/pull/4784
From e4a48085aee84712c23f8357445f6c6c9ded68a1 Mon Sep 17 00:00:00 2001
From: Michael Eischer
Date: Fri, 24 May 2024 20:16:58 +0200
Subject: [PATCH 7/7] backend/retry: feature flag new retry behavior

---
 internal/backend/retry/backend_retry.go | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/internal/backend/retry/backend_retry.go b/internal/backend/retry/backend_retry.go
index 7f2b4f745..d5134d433 100644
--- a/internal/backend/retry/backend_retry.go
+++ b/internal/backend/retry/backend_retry.go
@@ -108,9 +108,10 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 	bo := backoff.NewExponentialBackOff()
 	bo.MaxElapsedTime = be.MaxElapsedTime
 
-	bo.InitialInterval = 1 * time.Second
-	bo.Multiplier = 2
-
+	if feature.Flag.Enabled(feature.BackendErrorRedesign) {
+		bo.InitialInterval = 1 * time.Second
+		bo.Multiplier = 2
+	}
 	if fastRetries {
 		// speed up integration tests
 		bo.InitialInterval = 1 * time.Millisecond
@@ -120,6 +121,12 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 		}
 	}
 
+	var b backoff.BackOff = withRetryAtLeastOnce(bo)
+	if !feature.Flag.Enabled(feature.BackendErrorRedesign) {
+		// deprecated behavior
+		b = backoff.WithMaxRetries(b, 10)
+	}
+
 	err := retryNotifyErrorWithSuccess(
 		func() error {
 			err := f()
@@ -130,7 +137,7 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 			}
 			return err
 		},
-		backoff.WithContext(withRetryAtLeastOnce(bo), ctx),
+		backoff.WithContext(b, ctx),
 		func(err error, d time.Duration) {
 			if be.Report != nil {
 				be.Report(msg, err, d)