From d8f0e7cbd1587498371b80dabda429cefef92fe1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zlatko=20=C4=8Calu=C5=A1i=C4=87?=
Date: Wed, 9 Nov 2016 22:37:20 +0100
Subject: [PATCH] Fix REST backend HTTP keepalive

This is subtle.

A combination of a fast client disk (read: SSD), lots of files, and a
fast network connection to restic-server would suddenly start producing
lots of "dial tcp: connect: cannot assign requested address" errors
during the backup stage. Further inspection revealed that the client
machine was plagued with TCP sockets in the TIME_WAIT state. When the
ephemeral port range was finally exhausted, no more sockets could be
opened, so restic would freak out.

To understand the magnitude of this problem: with ~18k ephemeral ports
and a default timeout of 60 seconds, more than 300 HTTP connections per
second were being created and torn down. Yeah, restic-server is that
fast. :)

As it turns out, this behavior was the product of two subtle issues:

1) The body of the HTTP response wasn't read completely by io.ReadFull()
   at the end of the Load() function. This deactivated HTTP keepalive,
   so already-open connections were not reused but closed instead, and
   new ones were opened for every request. Adding
   io.Copy(ioutil.Discard, resp.Body) before resp.Body.Close() remedies
   this.

2) Even with the above fix, having MaxIdleConnsPerHost at its default
   value of 2 somehow wasn't enough to stop the reconnecting. It is hard
   to understand why this would be so detrimental; it could even be some
   subtle Go runtime bug. Anyhow, setting this value to match the
   connection limit, as set by the connLimit global variable, finally
   nails this ugly bug.

I also fixed several other places where the response body wasn't read in
full (or at all). For example, json.NewDecoder() is also known not to
read the whole response body.

Unfortunately, this is not over yet. :( The check command fires up to 40
simultaneous connections to the restic-server. Then, once again,
MaxIdleConnsPerHost is too low to support keepalive, and sockets in the
TIME_WAIT state pile up. But, as this kind of concurrency absolutely
kills the poor disk on the server side, that is a completely different
bug.
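For illustration only (not part of the patch below), here is a minimal,
standalone sketch of the two fixes in isolation. The connLimit value and
the server URL are made up for the example; restic wires up its own
connLimit global and backend URL:

    package main

    import (
    	"io"
    	"io/ioutil"
    	"log"
    	"net/http"
    )

    // Assumed stand-in for restic's connLimit global.
    const connLimit = 40

    func main() {
    	// Fix 2: let the Transport keep as many idle (keepalive)
    	// connections per host as we ever open concurrently, instead of
    	// the default of 2.
    	tr := &http.Transport{MaxIdleConnsPerHost: connLimit}
    	client := &http.Client{Transport: tr}

    	// Hypothetical restic-server endpoint, purely for illustration.
    	resp, err := client.Get("http://localhost:8000/config")
    	if err != nil {
    		log.Fatal(err)
    	}

    	// Fix 1: drain the body completely before closing it. If the body
    	// is not read to EOF, the Transport cannot reuse the connection;
    	// it is closed and a new socket is opened for the next request,
    	// leaving the old one in TIME_WAIT.
    	io.Copy(ioutil.Discard, resp.Body)
    	resp.Body.Close()
    }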
---
 src/restic/backend/rest/rest.go | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/restic/backend/rest/rest.go b/src/restic/backend/rest/rest.go
index ce1d25db9..f01854fd6 100644
--- a/src/restic/backend/rest/rest.go
+++ b/src/restic/backend/rest/rest.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
+	"io/ioutil"
 	"net/http"
 	"net/url"
 	"path"
@@ -59,7 +60,7 @@ func Open(cfg Config) (restic.Backend, error) {
 	for i := 0; i < connLimit; i++ {
 		connChan <- struct{}{}
 	}
-	tr := &http.Transport{}
+	tr := &http.Transport{MaxIdleConnsPerHost: connLimit}
 	client := http.Client{Transport: tr}
 
 	return &restBackend{url: cfg.URL, connChan: connChan, client: client}, nil
@@ -102,6 +103,7 @@ func (b *restBackend) Load(h restic.Handle, p []byte, off int64) (n int, err err
 
 	if resp != nil {
 		defer func() {
+			io.Copy(ioutil.Discard, resp.Body)
 			e := resp.Body.Close()
 
 			if err == nil {
@@ -132,6 +134,7 @@ func (b *restBackend) Save(h restic.Handle, p []byte) (err error) {
 
 	if resp != nil {
 		defer func() {
+			io.Copy(ioutil.Discard, resp.Body)
 			e := resp.Body.Close()
 
 			if err == nil {
@@ -164,6 +167,7 @@ func (b *restBackend) Stat(h restic.Handle) (restic.FileInfo, error) {
 		return restic.FileInfo{}, errors.Wrap(err, "client.Head")
 	}
 
+	io.Copy(ioutil.Discard, resp.Body)
 	if err = resp.Body.Close(); err != nil {
 		return restic.FileInfo{}, errors.Wrap(err, "Close")
 	}
@@ -216,6 +220,7 @@ func (b *restBackend) Remove(t restic.FileType, name string) error {
 		return errors.New("blob not removed")
 	}
 
+	io.Copy(ioutil.Discard, resp.Body)
 	return resp.Body.Close()
 }
 
@@ -235,7 +240,14 @@ func (b *restBackend) List(t restic.FileType, done <-chan struct{}) <-chan strin
 	b.connChan <- struct{}{}
 
 	if resp != nil {
-		defer resp.Body.Close()
+		defer func() {
+			io.Copy(ioutil.Discard, resp.Body)
+			e := resp.Body.Close()
+
+			if err == nil {
+				err = errors.Wrap(e, "Close")
+			}
+		}()
 	}
 
 	if err != nil {