From 05651d6d4fcb23a23fccb0ca68665e3e456c5056 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 22 Oct 2022 19:10:33 +0200 Subject: [PATCH] prune: Correctly count used/duplicate blobs for partially compressed repos Counting the first occurrence of a duplicate blob as used and counting all others as duplicates, independent of which instance of the blob is kept, is only accurate if all copies of the blob have the same size. This is no longer the case for a repository containing both compressed and uncompressed blobs. Thus, for duplicated blobs, first count all instances as duplicates and then subtract the actually used instance later on. --- changelog/unreleased/issue-3918 | 12 ++++++++++++ cmd/restic/cmd_prune.go | 28 +++++++++++++++------------- 2 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 changelog/unreleased/issue-3918 diff --git a/changelog/unreleased/issue-3918 b/changelog/unreleased/issue-3918 new file mode 100644 index 000000000..2cc60bf0d --- /dev/null +++ b/changelog/unreleased/issue-3918 @@ -0,0 +1,12 @@ +Bugfix: Correct prune statistics for partially compressed repositories + +In a partially compressed repository, one data blob can exist both in an +uncompressed and a compressed version. This caused the prune statistics to +become inaccurate and for example report a too high value for the unused size: + +> unused size after prune: 16777215.991 TiB + +This has been fixed. 
+ +https://github.com/restic/restic/issues/3918 +https://github.com/restic/restic/pull/3980 diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index 7918b2294..198e269a7 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -306,7 +306,6 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re // Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist idx.Each(ctx, func(blob restic.PackedBlob) { bh := blob.BlobHandle - size := uint64(blob.Length) count, ok := usedBlobs[bh] if ok { if count < math.MaxUint8 { @@ -316,19 +315,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re count++ } - if count == 1 { - stats.size.used += size - stats.blobs.used++ - } else { - // duplicate if counted more than once - stats.size.duplicate += size - stats.blobs.duplicate++ - } - usedBlobs[bh] = count - } else { - stats.size.unused += size - stats.blobs.unused++ } }) @@ -382,12 +369,22 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re // mark as unused for now, we will later on select one copy ip.unusedSize += size ip.unusedBlobs++ + + // count as duplicate, will later on change one copy to be counted as used + stats.size.duplicate += size + stats.blobs.duplicate++ case dupCount == 1: // used blob, not duplicate ip.usedSize += size ip.usedBlobs++ + + stats.size.used += size + stats.blobs.used++ default: // unused blob ip.unusedSize += size ip.unusedBlobs++ + + stats.size.unused += size + stats.blobs.unused++ } if !blob.IsCompressed() { ip.uncompressed = true @@ -420,6 +417,11 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re ip.usedBlobs++ ip.unusedSize -= size ip.unusedBlobs-- + // same for the global statistics + stats.size.used += size + stats.blobs.used++ + stats.size.duplicate -= size + stats.blobs.duplicate-- // let other occurences remain marked as unused usedBlobs[bh] = 1 default: