FindUsedBlobs: merge seen into blobs BlobSet

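The seen BlobSet always contained a subset of the entries in blobs: every
subtree handle added to seen was immediately passed to the recursive
FindUsedBlobs call, which inserts that same handle into blobs.
Thus use blobs for the already-visited check as well and avoid the memory
overhead of the second set.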

Suggested-by: Alexander Weiss <alex@weissfam.de>
This commit is contained in:
Michael Eischer 2020-02-01 21:09:52 +01:00
parent 48f97f3567
commit 184103647a
4 changed files with 11 additions and 17 deletions

View File

@ -189,14 +189,13 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
Verbosef("find data that is still in use for %d snapshots\n", stats.snapshots) Verbosef("find data that is still in use for %d snapshots\n", stats.snapshots)
usedBlobs := restic.NewBlobSet() usedBlobs := restic.NewBlobSet()
seenBlobs := restic.NewBlobSet()
bar = newProgressMax(!gopts.Quiet, uint64(len(snapshots)), "snapshots") bar = newProgressMax(!gopts.Quiet, uint64(len(snapshots)), "snapshots")
bar.Start() bar.Start()
for _, sn := range snapshots { for _, sn := range snapshots {
debug.Log("process snapshot %v", sn.ID()) debug.Log("process snapshot %v", sn.ID())
err = restic.FindUsedBlobs(ctx, repo, *sn.Tree, usedBlobs, seenBlobs) err = restic.FindUsedBlobs(ctx, repo, *sn.Tree, usedBlobs)
if err != nil { if err != nil {
if repo.Backend().IsNotExist(err) { if repo.Backend().IsNotExist(err) {
return errors.Fatal("unable to load a tree from the repo: " + err.Error()) return errors.Fatal("unable to load a tree from the repo: " + err.Error())

View File

@ -93,7 +93,6 @@ func runStats(gopts GlobalOptions, args []string) error {
uniqueInodes: make(map[uint64]struct{}), uniqueInodes: make(map[uint64]struct{}),
fileBlobs: make(map[string]restic.IDSet), fileBlobs: make(map[string]restic.IDSet),
blobs: restic.NewBlobSet(), blobs: restic.NewBlobSet(),
blobsSeen: restic.NewBlobSet(),
} }
if snapshotIDString != "" { if snapshotIDString != "" {
@ -183,7 +182,7 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest
if countMode == countModeRawData { if countMode == countModeRawData {
// count just the sizes of unique blobs; we don't need to walk the tree // count just the sizes of unique blobs; we don't need to walk the tree
// ourselves in this case, since a nifty function does it for us // ourselves in this case, since a nifty function does it for us
return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen) return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs)
} }
err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), statsWalkTree(repo, stats)) err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), statsWalkTree(repo, stats))
@ -318,9 +317,9 @@ type statsContainer struct {
// blobs that have been seen as a part of the file // blobs that have been seen as a part of the file
fileBlobs map[string]restic.IDSet fileBlobs map[string]restic.IDSet
// blobs and blobsSeen are used to count individual // blobs is used to count individual unique blobs,
// unique blobs, independent of references to files // independent of references to files
blobs, blobsSeen restic.BlobSet blobs restic.BlobSet
} }
// fileID is a 256-bit hash that distinguishes unique files. // fileID is a 256-bit hash that distinguishes unique files.

View File

@ -3,9 +3,8 @@ package restic
import "context" import "context"
// FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data // FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data
// blobs) to the set blobs. The tree blobs in the `seen` BlobSet will not be visited // blobs) to the set blobs. Already seen tree blobs will not be visited again.
// again. func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet) error {
func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet, seen BlobSet) error {
blobs.Insert(BlobHandle{ID: treeID, Type: TreeBlob}) blobs.Insert(BlobHandle{ID: treeID, Type: TreeBlob})
tree, err := repo.LoadTree(ctx, treeID) tree, err := repo.LoadTree(ctx, treeID)
@ -22,13 +21,11 @@ func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSe
case "dir": case "dir":
subtreeID := *node.Subtree subtreeID := *node.Subtree
h := BlobHandle{ID: subtreeID, Type: TreeBlob} h := BlobHandle{ID: subtreeID, Type: TreeBlob}
if seen.Has(h) { if blobs.Has(h) {
continue continue
} }
seen.Insert(h) err := FindUsedBlobs(ctx, repo, subtreeID, blobs)
err := FindUsedBlobs(ctx, repo, subtreeID, blobs, seen)
if err != nil { if err != nil {
return err return err
} }

View File

@ -93,7 +93,7 @@ func TestFindUsedBlobs(t *testing.T) {
for i, sn := range snapshots { for i, sn := range snapshots {
usedBlobs := restic.NewBlobSet() usedBlobs := restic.NewBlobSet()
err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, usedBlobs, restic.NewBlobSet()) err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, usedBlobs)
if err != nil { if err != nil {
t.Errorf("FindUsedBlobs returned error: %v", err) t.Errorf("FindUsedBlobs returned error: %v", err)
continue continue
@ -127,9 +127,8 @@ func BenchmarkFindUsedBlobs(b *testing.B) {
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
seen := restic.NewBlobSet()
blobs := restic.NewBlobSet() blobs := restic.NewBlobSet()
err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, blobs, seen) err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, blobs)
if err != nil { if err != nil {
b.Error(err) b.Error(err)
} }