This commit is contained in:
Michael Eischer 2024-05-05 00:31:10 +02:00 committed by GitHub
commit 422c289a88
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 329 additions and 2 deletions

View File

@ -4,5 +4,13 @@ Restic now downloads pack files in large chunks instead of using a streaming
download. This prevents failures due to interrupted streams. The `restore`
command now also retries downloading individual blobs that cannot be retrieved.
HTTP requests that become stuck while uploading or download for more than two
minutes. This ensures that stuck requests are retried after a short timeout.
These new request timeouts can temporarily be disabled by setting the
environment variable `RESTIC_FEATURES=http-timeouts=false`. Note that this
feature flag will be removed in the next minor restic version.
https://github.com/restic/restic/issues/4627
https://github.com/restic/restic/issues/4193
https://github.com/restic/restic/pull/4605
https://github.com/restic/restic/pull/4792

View File

@ -13,6 +13,8 @@ import (
"github.com/peterbourgon/unixtransport"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/feature"
"golang.org/x/net/http2"
)
// TransportOptions collects various options which can be set for an HTTP based
@ -74,7 +76,6 @@ func Transport(opts TransportOptions) (http.RoundTripper, error) {
KeepAlive: 30 * time.Second,
DualStack: true,
}).DialContext,
ForceAttemptHTTP2: true,
MaxIdleConns: 100,
MaxIdleConnsPerHost: 100,
IdleConnTimeout: 90 * time.Second,
@ -83,6 +84,15 @@ func Transport(opts TransportOptions) (http.RoundTripper, error) {
TLSClientConfig: &tls.Config{},
}
// ensure that http2 connections are closed if they are broken
h2, err := http2.ConfigureTransports(tr)
if err != nil {
panic(err)
}
h2.WriteByteTimeout = 120 * time.Second
h2.ReadIdleTimeout = 60 * time.Second
h2.PingTimeout = 60 * time.Second
unixtransport.Register(tr)
if opts.InsecureTLS {
@ -119,6 +129,11 @@ func Transport(opts TransportOptions) (http.RoundTripper, error) {
tr.TLSClientConfig.RootCAs = pool
}
rt := http.RoundTripper(tr)
if feature.Flag.Enabled(feature.HTTPTimeouts) {
rt = newWatchdogRoundtriper(rt, 120*time.Second, 128*1024)
}
// wrap in the debug round tripper (if active)
return debug.RoundTripper(tr), nil
return debug.RoundTripper(rt), nil
}

View File

@ -0,0 +1,102 @@
package backend
import (
"context"
"io"
"net/http"
"time"
)
// watchdogRoundtriper cancels an http request if an upload or download did not make progress
// within timeout. The time between fully sending the request and receiving an response is also
// limited by this timeout. This ensures that stuck requests are cancel after some time.
//
// The roundtriper makes the assumption that the upload and download happen continuously. In particular,
// the caller must not make long pauses between individual read requests from the response body.
type watchdogRoundtriper struct {
rt http.RoundTripper
timeout time.Duration
chunkSize int
}
var _ http.RoundTripper = &watchdogRoundtriper{}
func newWatchdogRoundtriper(rt http.RoundTripper, timeout time.Duration, chunkSize int) *watchdogRoundtriper {
return &watchdogRoundtriper{
rt: rt,
timeout: timeout,
chunkSize: chunkSize,
}
}
func (w *watchdogRoundtriper) RoundTrip(req *http.Request) (*http.Response, error) {
timer := time.NewTimer(w.timeout)
ctx, cancel := context.WithCancel(req.Context())
// cancel context if timer expires
go func() {
defer timer.Stop()
select {
case <-timer.C:
cancel()
case <-ctx.Done():
}
}()
kick := func() {
timer.Reset(w.timeout)
}
req = req.Clone(ctx)
if req.Body != nil {
// kick watchdog timer as long as uploading makes progress
req.Body = newWatchdogReadCloser(req.Body, w.chunkSize, kick, nil)
}
resp, err := w.rt.RoundTrip(req)
if err != nil {
return nil, err
}
// kick watchdog timer as long as downloading makes progress
// cancel context to stop goroutine once response body is closed
resp.Body = newWatchdogReadCloser(resp.Body, w.chunkSize, kick, cancel)
return resp, nil
}
func newWatchdogReadCloser(rc io.ReadCloser, chunkSize int, kick func(), close func()) *watchdogReadCloser {
return &watchdogReadCloser{
rc: rc,
chunkSize: chunkSize,
kick: kick,
close: close,
}
}
type watchdogReadCloser struct {
rc io.ReadCloser
chunkSize int
kick func()
close func()
}
var _ io.ReadCloser = &watchdogReadCloser{}
func (w *watchdogReadCloser) Read(p []byte) (n int, err error) {
w.kick()
if len(p) > w.chunkSize {
p = p[:w.chunkSize]
}
n, err = w.rc.Read(p)
w.kick()
return n, err
}
func (w *watchdogReadCloser) Close() error {
if w.close != nil {
w.close()
}
return w.rc.Close()
}

View File

@ -0,0 +1,200 @@
package backend
import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"net/http/httptest"
"testing"
"time"
rtest "github.com/restic/restic/internal/test"
)
func TestRead(t *testing.T) {
data := []byte("abcdef")
var ctr int
kick := func() {
ctr++
}
var closed bool
onClose := func() {
closed = true
}
wd := newWatchdogReadCloser(io.NopCloser(bytes.NewReader(data)), 1, kick, onClose)
out, err := io.ReadAll(wd)
rtest.OK(t, err)
rtest.Equals(t, data, out, "data mismatch")
// the EOF read also triggers the kick function
rtest.Equals(t, len(data)*2+2, ctr, "unexpected number of kick calls")
rtest.Equals(t, false, closed, "close function called too early")
rtest.OK(t, wd.Close())
rtest.Equals(t, true, closed, "close function not called")
}
func TestRoundtrip(t *testing.T) {
t.Parallel()
// at the higher delay values, it takes longer to transmit the
for _, delay := range []int{0, 1, 10, 20} {
t.Run(fmt.Sprintf("%v", delay), func(t *testing.T) {
msg := []byte("ping-pong-data")
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
data, err := io.ReadAll(r.Body)
if err != nil {
w.WriteHeader(500)
return
}
w.WriteHeader(200)
// slowly send the reply
for len(data) >= 2 {
_, _ = w.Write(data[:2])
w.(http.Flusher).Flush()
data = data[2:]
time.Sleep(time.Duration(delay) * time.Millisecond)
}
_, _ = w.Write(data)
}))
defer srv.Close()
rt := newWatchdogRoundtriper(http.DefaultTransport, 50*time.Millisecond, 2)
req, err := http.NewRequestWithContext(context.TODO(), "GET", srv.URL, io.NopCloser(newSlowReader(bytes.NewReader(msg), time.Duration(delay)*time.Millisecond)))
rtest.OK(t, err)
resp, err := rt.RoundTrip(req)
rtest.OK(t, err)
rtest.Equals(t, 200, resp.StatusCode, "unexpected status code")
response, err := io.ReadAll(resp.Body)
rtest.OK(t, err)
rtest.Equals(t, msg, response, "unexpected response")
rtest.OK(t, resp.Body.Close())
})
}
}
func TestCanceledRoundtrip(t *testing.T) {
rt := newWatchdogRoundtriper(http.DefaultTransport, time.Second, 2)
ctx, cancel := context.WithCancel(context.Background())
cancel()
req, err := http.NewRequestWithContext(ctx, "GET", "http://some.random.url.dfdgsfg", nil)
rtest.OK(t, err)
resp, err := rt.RoundTrip(req)
rtest.Equals(t, context.Canceled, err)
// make linter happy
if resp != nil {
rtest.OK(t, resp.Body.Close())
}
}
type slowReader struct {
data io.Reader
delay time.Duration
}
func newSlowReader(data io.Reader, delay time.Duration) *slowReader {
return &slowReader{
data: data,
delay: delay,
}
}
func (s *slowReader) Read(p []byte) (n int, err error) {
time.Sleep(s.delay)
return s.data.Read(p)
}
func TestUploadTimeout(t *testing.T) {
t.Parallel()
msg := []byte("ping")
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, err := io.ReadAll(r.Body)
if err != nil {
w.WriteHeader(500)
return
}
t.Error("upload should have been canceled")
}))
defer srv.Close()
rt := newWatchdogRoundtriper(http.DefaultTransport, 10*time.Millisecond, 1024)
req, err := http.NewRequestWithContext(context.TODO(), "GET", srv.URL, io.NopCloser(newSlowReader(bytes.NewReader(msg), 100*time.Millisecond)))
rtest.OK(t, err)
resp, err := rt.RoundTrip(req)
rtest.Equals(t, context.Canceled, err)
// make linter happy
if resp != nil {
rtest.OK(t, resp.Body.Close())
}
}
func TestProcessingTimeout(t *testing.T) {
t.Parallel()
msg := []byte("ping")
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, err := io.ReadAll(r.Body)
if err != nil {
w.WriteHeader(500)
return
}
time.Sleep(100 * time.Millisecond)
w.WriteHeader(200)
}))
defer srv.Close()
rt := newWatchdogRoundtriper(http.DefaultTransport, 10*time.Millisecond, 1024)
req, err := http.NewRequestWithContext(context.TODO(), "GET", srv.URL, io.NopCloser(bytes.NewReader(msg)))
rtest.OK(t, err)
resp, err := rt.RoundTrip(req)
rtest.Equals(t, context.Canceled, err)
// make linter happy
if resp != nil {
rtest.OK(t, resp.Body.Close())
}
}
func TestDownloadTimeout(t *testing.T) {
t.Parallel()
msg := []byte("ping")
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
data, err := io.ReadAll(r.Body)
if err != nil {
w.WriteHeader(500)
return
}
w.WriteHeader(200)
_, _ = w.Write(data[:2])
w.(http.Flusher).Flush()
data = data[2:]
time.Sleep(100 * time.Millisecond)
_, _ = w.Write(data)
}))
defer srv.Close()
rt := newWatchdogRoundtriper(http.DefaultTransport, 10*time.Millisecond, 1024)
req, err := http.NewRequestWithContext(context.TODO(), "GET", srv.URL, io.NopCloser(bytes.NewReader(msg)))
rtest.OK(t, err)
resp, err := rt.RoundTrip(req)
rtest.OK(t, err)
rtest.Equals(t, 200, resp.StatusCode, "unexpected status code")
_, err = io.ReadAll(resp.Body)
rtest.Equals(t, context.Canceled, err, "response download not canceled")
rtest.OK(t, resp.Body.Close())
}

View File

@ -8,6 +8,7 @@ const (
DeprecateLegacyIndex FlagName = "deprecate-legacy-index"
DeprecateS3LegacyLayout FlagName = "deprecate-s3-legacy-layout"
DeviceIDForHardlinks FlagName = "device-id-for-hardlinks"
HTTPTimeouts FlagName = "http-timeouts"
)
func init() {
@ -15,5 +16,6 @@ func init() {
DeprecateLegacyIndex: {Type: Beta, Description: "disable support for index format used by restic 0.1.0. Use `restic repair index` to update the index if necessary."},
DeprecateS3LegacyLayout: {Type: Beta, Description: "disable support for S3 legacy layout used up to restic 0.7.0. Use `RESTIC_FEATURES=deprecate-s3-legacy-layout=false restic migrate s3_layout` to migrate your S3 repository if necessary."},
DeviceIDForHardlinks: {Type: Alpha, Description: "store deviceID only for hardlinks to reduce metadata changes for example when using btrfs subvolumes. Will be removed in a future restic version after repository format 3 is available"},
HTTPTimeouts: {Type: Beta, Description: "enforce timeouts for stuck HTTP requests."},
})
}