Merge pull request #2328 from MichaelEischer/no-repeated-checks

Fix duplicate tree checks within `restic check`
MichaelEischer 2020-07-22 22:08:02 +02:00 committed by GitHub
commit 34181b13a2
5 changed files with 302 additions and 56 deletions

View File

@@ -0,0 +1,8 @@
Enhancement: Improve speed of check command

We've improved the check command to traverse trees only once, regardless of
whether they are contained in multiple snapshots. The check command is now much
faster for repositories with a large number of snapshots.

https://github.com/restic/restic/pull/2328
https://github.com/restic/restic/issues/2284
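
As an illustration of the change described above (not code from this commit), the sketch below shows the core deduplication idea: mark each tree as referenced before it is loaded, so a tree shared by several snapshots is decoded and checked only once. The types and the walk function are simplified assumptions, not restic's actual API.

```go
// Sketch: skip trees that were already visited, so shared subtrees are
// checked only once even when many snapshots reference them.
package main

import "fmt"

type ID string

type Tree struct {
	ID       ID
	Subtrees []*Tree
}

// checkTrees walks all snapshot root trees, marking each tree ID as
// referenced before descending into it, so duplicates are filtered out.
func checkTrees(roots []*Tree) (checked int) {
	referenced := map[ID]bool{}
	backlog := append([]*Tree{}, roots...)
	for len(backlog) > 0 {
		// take the most recently added tree (depth-first order)
		tree := backlog[len(backlog)-1]
		backlog = backlog[:len(backlog)-1]
		if referenced[tree.ID] {
			continue // already checked via another snapshot
		}
		referenced[tree.ID] = true
		checked++
		backlog = append(backlog, tree.Subtrees...)
	}
	return checked
}

func main() {
	shared := &Tree{ID: "shared"}
	roots := []*Tree{
		{ID: "snap1", Subtrees: []*Tree{shared}},
		{ID: "snap2", Subtrees: []*Tree{shared}},
	}
	fmt.Println(checkTrees(roots)) // prints 3, not 4: "shared" is checked once
}
```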

View File

@@ -259,7 +259,7 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
     if opts.CheckUnused {
         for _, id := range chkr.UnusedBlobs() {
-            Verbosef("unused blob %v\n", id.Str())
+            Verbosef("unused blob %v\n", id)
             errorsFound = true
         }
     }

View File

@@ -22,29 +22,33 @@ import (
 // repository (e.g. missing blobs), and needs a valid Repository to work on.
 type Checker struct {
     packs    restic.IDSet
-    blobs    restic.IDSet
     blobRefs struct {
         sync.Mutex
-        M map[restic.ID]uint // see flags below
+        M map[restic.BlobHandle]blobStatus
     }
-    indexes map[restic.ID]*repository.Index
 
     masterIndex *repository.MasterIndex
 
     repo restic.Repository
 }
 
+type blobStatus uint8
+
+const (
+    blobStatusExists blobStatus = 1 << iota
+    blobStatusReferenced
+)
+
 // New returns a new checker which runs on repo.
 func New(repo restic.Repository) *Checker {
     c := &Checker{
         packs:       restic.NewIDSet(),
-        blobs:       restic.NewIDSet(),
         masterIndex: repository.NewMasterIndex(),
-        indexes:     make(map[restic.ID]*repository.Index),
         repo:        repo,
     }
 
-    c.blobRefs.M = make(map[restic.ID]uint)
+    c.blobRefs.M = make(map[restic.BlobHandle]blobStatus)
 
     return c
 }
@@ -152,15 +156,14 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
             continue
         }
 
-        c.indexes[res.ID] = res.Index
         c.masterIndex.Insert(res.Index)
 
         debug.Log("process blobs")
         cnt := 0
         for blob := range res.Index.Each(ctx) {
             c.packs.Insert(blob.PackID)
-            c.blobs.Insert(blob.ID)
-            c.blobRefs.M[blob.ID] = 0
+            h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
+            c.blobRefs.M[h] = blobStatusExists
             cnt++
 
             if _, ok := packToIndex[blob.PackID]; !ok {
@@ -447,20 +450,6 @@ func (c *Checker) checkTreeWorker(ctx context.Context, in <-chan treeJob, out ch
             return
         }
 
-        id := job.ID
-        alreadyChecked := false
-        c.blobRefs.Lock()
-        if c.blobRefs.M[id] > 0 {
-            alreadyChecked = true
-        }
-        c.blobRefs.M[id]++
-        debug.Log("tree %v refcount %d", job.ID, c.blobRefs.M[id])
-        c.blobRefs.Unlock()
-
-        if alreadyChecked {
-            continue
-        }
-
         debug.Log("check tree %v (tree %v, err %v)", job.ID, job.Tree, job.error)
 
         var errs []error
@ -485,7 +474,7 @@ func (c *Checker) checkTreeWorker(ctx context.Context, in <-chan treeJob, out ch
} }
} }
func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- restic.ID, in <-chan treeJob, out chan<- treeJob) { func (c *Checker) filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- restic.ID, in <-chan treeJob, out chan<- treeJob) {
defer func() { defer func() {
debug.Log("closing output channels") debug.Log("closing output channels")
close(loaderChan) close(loaderChan)
@@ -506,8 +495,21 @@ func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- rest
     for {
         if loadCh == nil && len(backlog) > 0 {
+            // process last added ids first, that is traverse the tree in depth-first order
+            ln := len(backlog) - 1
+            nextTreeID, backlog = backlog[ln], backlog[:ln]
+
+            // use a separate flag for processed trees to ensure that check still processes trees
+            // even when a file references a tree blob
+            c.blobRefs.Lock()
+            h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob}
+            status := c.blobRefs.M[h]
+            c.blobRefs.Unlock()
+            if (status & blobStatusReferenced) != 0 {
+                continue
+            }
+
             loadCh = loaderChan
-            nextTreeID, backlog = backlog[0], backlog[1:]
         }
 
         if loadCh == nil && outCh == nil && outstandingLoadTreeJobs == 0 {
@@ -522,6 +524,10 @@ func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- rest
         case loadCh <- nextTreeID:
             outstandingLoadTreeJobs++
             loadCh = nil
+            c.blobRefs.Lock()
+            h := restic.BlobHandle{ID: nextTreeID, Type: restic.TreeBlob}
+            c.blobRefs.M[h] |= blobStatusReferenced
+            c.blobRefs.Unlock()
 
         case j, ok := <-inCh:
             if !ok {
@@ -535,16 +541,18 @@ func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- rest
             debug.Log("input job tree %v", j.ID)
 
-            var err error
-
             if j.error != nil {
                 debug.Log("received job with error: %v (tree %v, ID %v)", j.error, j.Tree, j.ID)
             } else if j.Tree == nil {
                 debug.Log("received job with nil tree pointer: %v (ID %v)", j.error, j.ID)
-                err = errors.New("tree is nil and error is nil")
+                // send a new job with the new error instead of the old one
+                j = treeJob{ID: j.ID, error: errors.New("tree is nil and error is nil")}
             } else {
-                debug.Log("subtrees for tree %v: %v", j.ID, j.Tree.Subtrees())
-                for _, id := range j.Tree.Subtrees() {
+                subtrees := j.Tree.Subtrees()
+                debug.Log("subtrees for tree %v: %v", j.ID, subtrees)
+                // iterate backwards over subtree to compensate backwards traversal order of nextTreeID selection
+                for i := len(subtrees) - 1; i >= 0; i-- {
+                    id := subtrees[i]
                     if id.IsNull() {
                         // We do not need to raise this error here, it is
                         // checked when the tree is checked. Just make sure
@@ -556,11 +564,6 @@ func filterTrees(ctx context.Context, backlog restic.IDs, loaderChan chan<- rest
                 }
             }
 
-            if err != nil {
-                // send a new job with the new error instead of the old one
-                j = treeJob{ID: j.ID, error: err}
-            }
-
             job = j
             outCh = out
             inCh = nil
@@ -601,7 +604,7 @@ func (c *Checker) Structure(ctx context.Context, errChan chan<- error) {
         go c.checkTreeWorker(ctx, treeJobChan2, errChan, &wg)
     }
 
-    filterTrees(ctx, trees, treeIDChan, treeJobChan1, treeJobChan2)
+    c.filterTrees(ctx, trees, treeIDChan, treeJobChan1, treeJobChan2)
 
     wg.Wait()
 }
@@ -656,28 +659,27 @@ func (c *Checker) checkTree(id restic.ID, tree *restic.Tree) (errs []error) {
     for _, blobID := range blobs {
         c.blobRefs.Lock()
-        c.blobRefs.M[blobID]++
-        debug.Log("blob %v refcount %d", blobID, c.blobRefs.M[blobID])
-        c.blobRefs.Unlock()
-
-        if !c.blobs.Has(blobID) {
+        h := restic.BlobHandle{ID: blobID, Type: restic.DataBlob}
+        if (c.blobRefs.M[h] & blobStatusExists) == 0 {
             debug.Log("tree %v references blob %v which isn't contained in index", id, blobID)
             errs = append(errs, Error{TreeID: id, BlobID: blobID, Err: errors.New("not found in index")})
         }
+        c.blobRefs.M[h] |= blobStatusReferenced
+        debug.Log("blob %v is referenced", blobID)
+        c.blobRefs.Unlock()
     }
 
     return errs
 }
 
 // UnusedBlobs returns all blobs that have never been referenced.
-func (c *Checker) UnusedBlobs() (blobs restic.IDs) {
+func (c *Checker) UnusedBlobs() (blobs restic.BlobHandles) {
     c.blobRefs.Lock()
     defer c.blobRefs.Unlock()
 
-    debug.Log("checking %d blobs", len(c.blobs))
-    for id := range c.blobs {
-        if c.blobRefs.M[id] == 0 {
+    debug.Log("checking %d blobs", len(c.blobRefs.M))
+    for id, flags := range c.blobRefs.M {
+        if (flags & blobStatusReferenced) == 0 {
             debug.Log("blob %v not referenced", id)
             blobs = append(blobs, id)
         }

View File

@@ -8,10 +8,14 @@ import (
     "os"
     "path/filepath"
     "sort"
+    "strconv"
+    "sync"
     "testing"
+    "time"
 
     "github.com/restic/restic/internal/archiver"
     "github.com/restic/restic/internal/checker"
+    "github.com/restic/restic/internal/errors"
     "github.com/restic/restic/internal/repository"
     "github.com/restic/restic/internal/restic"
     "github.com/restic/restic/internal/test"
@@ -153,13 +157,13 @@ func TestUnreferencedBlobs(t *testing.T) {
     }
 
     test.OK(t, repo.Backend().Remove(context.TODO(), snapshotHandle))
 
-    unusedBlobsBySnapshot := restic.IDs{
-        restic.TestParseID("58c748bbe2929fdf30c73262bd8313fe828f8925b05d1d4a87fe109082acb849"),
-        restic.TestParseID("988a272ab9768182abfd1fe7d7a7b68967825f0b861d3b36156795832c772235"),
-        restic.TestParseID("c01952de4d91da1b1b80bc6e06eaa4ec21523f4853b69dc8231708b9b7ec62d8"),
-        restic.TestParseID("bec3a53d7dc737f9a9bee68b107ec9e8ad722019f649b34d474b9982c3a3fec7"),
-        restic.TestParseID("2a6f01e5e92d8343c4c6b78b51c5a4dc9c39d42c04e26088c7614b13d8d0559d"),
-        restic.TestParseID("18b51b327df9391732ba7aaf841a4885f350d8a557b2da8352c9acf8898e3f10"),
+    unusedBlobsBySnapshot := restic.BlobHandles{
+        restic.TestParseHandle("58c748bbe2929fdf30c73262bd8313fe828f8925b05d1d4a87fe109082acb849", restic.DataBlob),
+        restic.TestParseHandle("988a272ab9768182abfd1fe7d7a7b68967825f0b861d3b36156795832c772235", restic.DataBlob),
+        restic.TestParseHandle("c01952de4d91da1b1b80bc6e06eaa4ec21523f4853b69dc8231708b9b7ec62d8", restic.TreeBlob),
+        restic.TestParseHandle("bec3a53d7dc737f9a9bee68b107ec9e8ad722019f649b34d474b9982c3a3fec7", restic.TreeBlob),
+        restic.TestParseHandle("2a6f01e5e92d8343c4c6b78b51c5a4dc9c39d42c04e26088c7614b13d8d0559d", restic.TreeBlob),
+        restic.TestParseHandle("18b51b327df9391732ba7aaf841a4885f350d8a557b2da8352c9acf8898e3f10", restic.DataBlob),
     }
     sort.Sort(unusedBlobsBySnapshot)
@@ -363,13 +367,38 @@ func TestCheckerModifiedData(t *testing.T) {
     }
 }
 
-func BenchmarkChecker(t *testing.B) {
+// loadTreesOnceRepository allows each tree to be loaded only once
+type loadTreesOnceRepository struct {
+    restic.Repository
+    loadedTrees   restic.IDSet
+    mutex         sync.Mutex
+    DuplicateTree bool
+}
+
+func (r *loadTreesOnceRepository) LoadTree(ctx context.Context, id restic.ID) (*restic.Tree, error) {
+    r.mutex.Lock()
+    defer r.mutex.Unlock()
+
+    if r.loadedTrees.Has(id) {
+        // additionally store error to ensure that it cannot be swallowed
+        r.DuplicateTree = true
+        return nil, errors.Errorf("trying to load tree with id %v twice", id)
+    }
+    r.loadedTrees.Insert(id)
+    return r.Repository.LoadTree(ctx, id)
+}
+
+func TestCheckerNoDuplicateTreeDecodes(t *testing.T) {
     repodir, cleanup := test.Env(t, checkerTestData)
     defer cleanup()
 
     repo := repository.TestOpenLocal(t, repodir)
+    checkRepo := &loadTreesOnceRepository{
+        Repository:  repo,
+        loadedTrees: restic.NewIDSet(),
+    }
 
-    chkr := checker.New(repo)
+    chkr := checker.New(checkRepo)
     hints, errs := chkr.LoadIndex(context.TODO())
     if len(errs) > 0 {
         t.Fatalf("expected no errors, got %v: %v", len(errs), errs)
@@ -379,6 +408,198 @@ func BenchmarkChecker(t *testing.B) {
         t.Errorf("expected no hints, got %v: %v", len(hints), hints)
     }
 
+    test.OKs(t, checkPacks(chkr))
+    test.OKs(t, checkStruct(chkr))
+
+    test.Assert(t, !checkRepo.DuplicateTree, "detected duplicate tree loading")
+}
+
+// delayRepository delays read of a specific handle.
+type delayRepository struct {
+    restic.Repository
+    DelayTree      restic.ID
+    UnblockChannel chan struct{}
+    Unblocker      sync.Once
+}
+
+func (r *delayRepository) LoadTree(ctx context.Context, id restic.ID) (*restic.Tree, error) {
+    if id == r.DelayTree {
+        <-r.UnblockChannel
+    }
+    return r.Repository.LoadTree(ctx, id)
+}
+
+func (r *delayRepository) LookupBlobSize(id restic.ID, t restic.BlobType) (uint, bool) {
+    if id == r.DelayTree && t == restic.DataBlob {
+        r.Unblock()
+    }
+    return r.Repository.LookupBlobSize(id, t)
+}
+
+func (r *delayRepository) Unblock() {
+    r.Unblocker.Do(func() {
+        close(r.UnblockChannel)
+    })
+}
+
+func TestCheckerBlobTypeConfusion(t *testing.T) {
+    ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+    defer cancel()
+
+    repo, cleanup := repository.TestRepository(t)
+    defer cleanup()
+
+    damagedNode := &restic.Node{
+        Name:    "damaged",
+        Type:    "file",
+        Mode:    0644,
+        Size:    42,
+        Content: restic.IDs{restic.TestParseID("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef")},
+    }
+    damagedTree := &restic.Tree{
+        Nodes: []*restic.Node{damagedNode},
+    }
+
+    id, err := repo.SaveTree(ctx, damagedTree)
+    test.OK(t, repo.Flush(ctx))
+    test.OK(t, err)
+
+    buf, err := repo.LoadBlob(ctx, restic.TreeBlob, id, nil)
+    test.OK(t, err)
+
+    _, _, err = repo.SaveBlob(ctx, restic.DataBlob, buf, id, false)
+    test.OK(t, err)
+
+    malNode := &restic.Node{
+        Name:    "aaaaa",
+        Type:    "file",
+        Mode:    0644,
+        Size:    uint64(len(buf)),
+        Content: restic.IDs{id},
+    }
+    dirNode := &restic.Node{
+        Name:    "bbbbb",
+        Type:    "dir",
+        Mode:    0755,
+        Subtree: &id,
+    }
+
+    rootTree := &restic.Tree{
+        Nodes: []*restic.Node{malNode, dirNode},
+    }
+
+    rootId, err := repo.SaveTree(ctx, rootTree)
+    test.OK(t, err)
+
+    test.OK(t, repo.Flush(ctx))
+    test.OK(t, repo.SaveIndex(ctx))
+
+    snapshot, err := restic.NewSnapshot([]string{"/damaged"}, []string{"test"}, "foo", time.Now())
+    test.OK(t, err)
+
+    snapshot.Tree = &rootId
+
+    snapId, err := repo.SaveJSONUnpacked(ctx, restic.SnapshotFile, snapshot)
+    test.OK(t, err)
+
+    t.Logf("saved snapshot %v", snapId.Str())
+
+    delayRepo := &delayRepository{
+        Repository:     repo,
+        DelayTree:      id,
+        UnblockChannel: make(chan struct{}),
+    }
+
+    chkr := checker.New(delayRepo)
+
+    go func() {
+        <-ctx.Done()
+        delayRepo.Unblock()
+    }()
+
+    hints, errs := chkr.LoadIndex(ctx)
+    if len(errs) > 0 {
+        t.Fatalf("expected no errors, got %v: %v", len(errs), errs)
+    }
+
+    if len(hints) > 0 {
+        t.Errorf("expected no hints, got %v: %v", len(hints), hints)
+    }
+
+    errFound := false
+
+    for _, err := range checkStruct(chkr) {
+        t.Logf("struct error: %v", err)
+        errFound = true
+    }
+
+    test.OK(t, ctx.Err())
+
+    if !errFound {
+        t.Fatal("no error found, checker is broken")
+    }
+}
+
+func loadBenchRepository(t *testing.B) (*checker.Checker, restic.Repository, func()) {
+    repodir, cleanup := test.Env(t, checkerTestData)
+
+    repo := repository.TestOpenLocal(t, repodir)
+
+    chkr := checker.New(repo)
+    hints, errs := chkr.LoadIndex(context.TODO())
+    if len(errs) > 0 {
+        defer cleanup()
+        t.Fatalf("expected no errors, got %v: %v", len(errs), errs)
+    }
+
+    if len(hints) > 0 {
+        t.Errorf("expected no hints, got %v: %v", len(hints), hints)
+    }
+    return chkr, repo, cleanup
+}
+
+func BenchmarkChecker(t *testing.B) {
+    chkr, _, cleanup := loadBenchRepository(t)
+    defer cleanup()
+
+    t.ResetTimer()
+
+    for i := 0; i < t.N; i++ {
+        test.OKs(t, checkPacks(chkr))
+        test.OKs(t, checkStruct(chkr))
+        test.OKs(t, checkData(chkr))
+    }
+}
+
+func benchmarkSnapshotScaling(t *testing.B, newSnapshots int) {
+    chkr, repo, cleanup := loadBenchRepository(t)
+    defer cleanup()
+
+    snID, err := restic.FindSnapshot(repo, "51d249d2")
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    var sn2 restic.Snapshot
+    err = repo.LoadJSONUnpacked(context.TODO(), restic.SnapshotFile, snID, &sn2)
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    treeID := sn2.Tree
+
+    for i := 0; i < newSnapshots; i++ {
+        sn, err := restic.NewSnapshot([]string{"test" + strconv.Itoa(i)}, nil, "", time.Now())
+        if err != nil {
+            t.Fatal(err)
+        }
+        sn.Tree = treeID
+        _, err = repo.SaveJSONUnpacked(context.TODO(), restic.SnapshotFile, sn)
+        if err != nil {
+            t.Fatal(err)
+        }
+    }
+
     t.ResetTimer()
 
     for i := 0; i < t.N; i++ {
@@ -387,3 +608,13 @@ func BenchmarkChecker(t *testing.B) {
         test.OKs(t, checkData(chkr))
     }
 }
+
+func BenchmarkCheckerSnapshotScaling(b *testing.B) {
+    counts := []int{50, 100, 200}
+    for _, count := range counts {
+        count := count
+        b.Run(strconv.Itoa(count), func(b *testing.B) {
+            benchmarkSnapshotScaling(b, count)
+        })
+    }
+}

View File

@@ -200,3 +200,8 @@ func TestParseID(s string) ID {
     return id
 }
+
+// TestParseHandle parses s as a ID, panics if that fails and creates a BlobHandle with t.
+func TestParseHandle(s string, t BlobType) BlobHandle {
+    return BlobHandle{ID: TestParseID(s), Type: t}
+}