From 184103647ad5e8ce2cf87d1efa5985cf4d4b2560 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 1 Feb 2020 21:09:52 +0100 Subject: [PATCH 1/5] FindUsedBlobs: merge seen into blobs BlobSet The seen BlobSet always contained a subset of the entries in blobs. Thus use blobs instead and avoid the memory overhead of the second set. Suggested-by: Alexander Weiss --- cmd/restic/cmd_prune.go | 3 +-- cmd/restic/cmd_stats.go | 9 ++++----- internal/restic/find.go | 11 ++++------- internal/restic/find_test.go | 5 ++--- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 6a3af8332..7d374b1b8 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -189,14 +189,13 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error { Verbosef("find data that is still in use for %d snapshots\n", stats.snapshots) usedBlobs := restic.NewBlobSet() - seenBlobs := restic.NewBlobSet() bar = newProgressMax(!gopts.Quiet, uint64(len(snapshots)), "snapshots") bar.Start() for _, sn := range snapshots { debug.Log("process snapshot %v", sn.ID()) - err = restic.FindUsedBlobs(ctx, repo, *sn.Tree, usedBlobs, seenBlobs) + err = restic.FindUsedBlobs(ctx, repo, *sn.Tree, usedBlobs) if err != nil { if repo.Backend().IsNotExist(err) { return errors.Fatal("unable to load a tree from the repo: " + err.Error()) diff --git a/cmd/restic/cmd_stats.go b/cmd/restic/cmd_stats.go index 7228bf6b0..a779447b4 100644 --- a/cmd/restic/cmd_stats.go +++ b/cmd/restic/cmd_stats.go @@ -93,7 +93,6 @@ func runStats(gopts GlobalOptions, args []string) error { uniqueInodes: make(map[uint64]struct{}), fileBlobs: make(map[string]restic.IDSet), blobs: restic.NewBlobSet(), - blobsSeen: restic.NewBlobSet(), } if snapshotIDString != "" { @@ -183,7 +182,7 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest if countMode == countModeRawData { // count just the sizes of unique blobs; we don't need to walk the tree // ourselves in this case, since a nifty function does it for us - return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen) + return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs) } err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), statsWalkTree(repo, stats)) @@ -318,9 +317,9 @@ type statsContainer struct { // blobs that have been seen as a part of the file fileBlobs map[string]restic.IDSet - // blobs and blobsSeen are used to count individual - // unique blobs, independent of references to files - blobs, blobsSeen restic.BlobSet + // blobs is used to count individual unique blobs, + // independent of references to files + blobs restic.BlobSet } // fileID is a 256-bit hash that distinguishes unique files. diff --git a/internal/restic/find.go b/internal/restic/find.go index 4b118abb0..09654f938 100644 --- a/internal/restic/find.go +++ b/internal/restic/find.go @@ -3,9 +3,8 @@ package restic import "context" // FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data -// blobs) to the set blobs. The tree blobs in the `seen` BlobSet will not be visited -// again. -func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet, seen BlobSet) error { +// blobs) to the set blobs. Already seen tree blobs will not be visited again. +func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet) error { blobs.Insert(BlobHandle{ID: treeID, Type: TreeBlob}) tree, err := repo.LoadTree(ctx, treeID) @@ -22,13 +21,11 @@ func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSe case "dir": subtreeID := *node.Subtree h := BlobHandle{ID: subtreeID, Type: TreeBlob} - if seen.Has(h) { + if blobs.Has(h) { continue } - seen.Insert(h) - - err := FindUsedBlobs(ctx, repo, subtreeID, blobs, seen) + err := FindUsedBlobs(ctx, repo, subtreeID, blobs) if err != nil { return err } diff --git a/internal/restic/find_test.go b/internal/restic/find_test.go index d3620b472..7e52dd681 100644 --- a/internal/restic/find_test.go +++ b/internal/restic/find_test.go @@ -93,7 +93,7 @@ func TestFindUsedBlobs(t *testing.T) { for i, sn := range snapshots { usedBlobs := restic.NewBlobSet() - err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, usedBlobs, restic.NewBlobSet()) + err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, usedBlobs) if err != nil { t.Errorf("FindUsedBlobs returned error: %v", err) continue @@ -127,9 +127,8 @@ func BenchmarkFindUsedBlobs(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - seen := restic.NewBlobSet() blobs := restic.NewBlobSet() - err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, blobs, seen) + err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, blobs) if err != nil { b.Error(err) } From 9ea1a78bd4c57f0db465342f05496828a7784509 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 22 Feb 2020 21:17:39 +0100 Subject: [PATCH 2/5] FindUsedBlobs: Check for seen blobs before loading trees The only effective change in behavior is that that toplevel nodes can also be skipped. --- internal/restic/find.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/internal/restic/find.go b/internal/restic/find.go index 09654f938..40ed2dac8 100644 --- a/internal/restic/find.go +++ b/internal/restic/find.go @@ -5,7 +5,11 @@ import "context" // FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data // blobs) to the set blobs. Already seen tree blobs will not be visited again. func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet) error { - blobs.Insert(BlobHandle{ID: treeID, Type: TreeBlob}) + h := BlobHandle{ID: treeID, Type: TreeBlob} + if blobs.Has(h) { + return nil + } + blobs.Insert(h) tree, err := repo.LoadTree(ctx, treeID) if err != nil { @@ -19,13 +23,7 @@ func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSe blobs.Insert(BlobHandle{ID: blob, Type: DataBlob}) } case "dir": - subtreeID := *node.Subtree - h := BlobHandle{ID: subtreeID, Type: TreeBlob} - if blobs.Has(h) { - continue - } - - err := FindUsedBlobs(ctx, repo, subtreeID, blobs) + err := FindUsedBlobs(ctx, repo, *node.Subtree, blobs) if err != nil { return err } From af66a62c04b4e926796c60db969073b741f0e8f7 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 22 Feb 2020 21:21:09 +0100 Subject: [PATCH 3/5] FindUsedBlobs: Test that seen blobs are skipped This also copies the TreeLoader interface from internal/walker to allow stubbing the repository in the call to `FindUsedBlobs`. --- internal/restic/find.go | 7 ++++++- internal/restic/find_test.go | 26 ++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/internal/restic/find.go b/internal/restic/find.go index 40ed2dac8..b5bef0720 100644 --- a/internal/restic/find.go +++ b/internal/restic/find.go @@ -2,9 +2,14 @@ package restic import "context" +// TreeLoader loads a tree from a repository. +type TreeLoader interface { + LoadTree(context.Context, ID) (*Tree, error) +} + // FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data // blobs) to the set blobs. Already seen tree blobs will not be visited again. -func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet) error { +func FindUsedBlobs(ctx context.Context, repo TreeLoader, treeID ID, blobs BlobSet) error { h := BlobHandle{ID: treeID, Type: TreeBlob} if blobs.Has(h) { return nil diff --git a/internal/restic/find_test.go b/internal/restic/find_test.go index 7e52dd681..635421d8b 100644 --- a/internal/restic/find_test.go +++ b/internal/restic/find_test.go @@ -12,6 +12,7 @@ import ( "testing" "time" + "github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/restic" ) @@ -118,6 +119,31 @@ func TestFindUsedBlobs(t *testing.T) { } } +type ForbiddenRepo struct{} + +func (r ForbiddenRepo) LoadTree(ctx context.Context, id restic.ID) (*restic.Tree, error) { + return nil, errors.New("should not be called") +} + +func TestFindUsedBlobsSkipsSeenBlobs(t *testing.T) { + repo, cleanup := repository.TestRepository(t) + defer cleanup() + + snapshot := restic.TestCreateSnapshot(t, repo, findTestTime, findTestDepth, 0) + t.Logf("snapshot %v saved, tree %v", snapshot.ID().Str(), snapshot.Tree.Str()) + + usedBlobs := restic.NewBlobSet() + err := restic.FindUsedBlobs(context.TODO(), repo, *snapshot.Tree, usedBlobs) + if err != nil { + t.Fatalf("FindUsedBlobs returned error: %v", err) + } + + err = restic.FindUsedBlobs(context.TODO(), ForbiddenRepo{}, *snapshot.Tree, usedBlobs) + if err != nil { + t.Fatalf("FindUsedBlobs returned error: %v", err) + } +} + func BenchmarkFindUsedBlobs(b *testing.B) { repo, cleanup := repository.TestRepository(b) defer cleanup() From a178e5628ec3aa9bbf42800ade0d67753ade26d4 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 22 Feb 2020 21:21:36 +0100 Subject: [PATCH 4/5] Remove duplicate TreeLoader interface --- internal/walker/walker.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/internal/walker/walker.go b/internal/walker/walker.go index 7d6db3abe..1794e8a71 100644 --- a/internal/walker/walker.go +++ b/internal/walker/walker.go @@ -10,11 +10,6 @@ import ( "github.com/restic/restic/internal/restic" ) -// TreeLoader loads a tree from a repository. -type TreeLoader interface { - LoadTree(context.Context, restic.ID) (*restic.Tree, error) -} - // SkipNode is returned by WalkFunc when a dir node should not be walked. var SkipNode = errors.New("skip this node") @@ -38,7 +33,7 @@ type WalkFunc func(parentTreeID restic.ID, path string, node *restic.Node, nodeE // Walk calls walkFn recursively for each node in root. If walkFn returns an // error, it is passed up the call stack. The trees in ignoreTrees are not // walked. If walkFn ignores trees, these are added to the set. -func Walk(ctx context.Context, repo TreeLoader, root restic.ID, ignoreTrees restic.IDSet, walkFn WalkFunc) error { +func Walk(ctx context.Context, repo restic.TreeLoader, root restic.ID, ignoreTrees restic.IDSet, walkFn WalkFunc) error { tree, err := repo.LoadTree(ctx, root) _, err = walkFn(root, "/", nil, err) @@ -60,7 +55,7 @@ func Walk(ctx context.Context, repo TreeLoader, root restic.ID, ignoreTrees rest // walk recursively traverses the tree, ignoring subtrees when the ID of the // subtree is in ignoreTrees. If err is nil and ignore is true, the subtree ID // will be added to ignoreTrees by walk. -func walk(ctx context.Context, repo TreeLoader, prefix string, parentTreeID restic.ID, tree *restic.Tree, ignoreTrees restic.IDSet, walkFn WalkFunc) (ignore bool, err error) { +func walk(ctx context.Context, repo restic.TreeLoader, prefix string, parentTreeID restic.ID, tree *restic.Tree, ignoreTrees restic.IDSet, walkFn WalkFunc) (ignore bool, err error) { var allNodesIgnored = true if len(tree.Nodes) == 0 { From 2d7ab9115f89b762dfef064d4eb6b0cb81ae45db Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Thu, 5 Mar 2020 23:24:57 +0100 Subject: [PATCH 5/5] Add changelog entry --- changelog/unreleased/pull-2599 | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 changelog/unreleased/pull-2599 diff --git a/changelog/unreleased/pull-2599 b/changelog/unreleased/pull-2599 new file mode 100644 index 000000000..d28955a5c --- /dev/null +++ b/changelog/unreleased/pull-2599 @@ -0,0 +1,6 @@ +Enhancement: Slightly reduce memory usage of prune and stats commands + +The prune and the stats command kept directory identifiers in memory twice +while searching for used blobs. + +https://github.com/restic/restic/pull/2599