Implement four counting modes

This commit is contained in:
Matthew Holt 2018-04-22 15:27:33 -06:00 committed by Alexander Neumann
parent 925b542eb0
commit a7b95d716a
1 changed files with 215 additions and 36 deletions

View File

@ -6,6 +6,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"os" "os"
"path/filepath"
"github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/restic"
"github.com/spf13/cobra" "github.com/spf13/cobra"
@ -15,9 +16,10 @@ var cmdStats = &cobra.Command{
Use: "stats", Use: "stats",
Short: "Scan the repository and show basic statistics", Short: "Scan the repository and show basic statistics",
Long: ` Long: `
The "stats" command walks all snapshots in a repository and accumulates The "stats" command walks one or all snapshots in a repository and
statistics about the data stored therein. It reports on the number of accumulates statistics about the data stored therein. It reports on
unique files and their sizes. the number of unique files and their sizes, according to one of
the counting modes as given by a flag.
`, `,
DisableAutoGenTag: true, DisableAutoGenTag: true,
RunE: func(cmd *cobra.Command, args []string) error { RunE: func(cmd *cobra.Command, args []string) error {
@ -25,11 +27,25 @@ unique files and their sizes.
}, },
} }
// NOTE(review): countModeFlag is not read or written anywhere in the visible
// code; the counting mode is selected through the countMode* booleans
// instead. Confirm whether this variable is still needed.
var countModeFlag []string
func init() { func init() {
cmdRoot.AddCommand(cmdStats) cmdRoot.AddCommand(cmdStats)
f := cmdStats.Flags()
f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)")
f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents")
f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename")
f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them")
f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname")
} }
func runStats(gopts GlobalOptions, args []string) error { func runStats(gopts GlobalOptions, args []string) error {
err := verifyStatsInput(gopts, args)
if err != nil {
return err
}
ctx, cancel := context.WithCancel(gopts.ctx) ctx, cancel := context.WithCancel(gopts.ctx)
defer cancel() defer cancel()
@ -50,27 +66,62 @@ func runStats(gopts GlobalOptions, args []string) error {
} }
} }
// create a container for the stats, and other state // create a container for the stats (and other needed state)
// needed while walking the trees stats := &statsContainer{
stats := &statsContainer{uniqueFiles: make(map[fileID]struct{}), idSet: make(restic.IDSet)} uniqueFiles: make(map[fileID]struct{}),
idSet: make(restic.IDSet),
fileBlobs: make(map[string]restic.IDSet),
blobs: restic.NewBlobSet(),
blobsSeen: restic.NewBlobSet(),
}
// iterate every snapshot in the repo if snapshotIDString != "" {
err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error { // scan just a single snapshot
snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
var sID restic.ID
if snapshotIDString == "latest" {
sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost)
if err != nil {
Exitf(1, "latest snapshot for criteria not found: %v", err)
}
} else {
sID, err = restic.FindSnapshot(repo, snapshotIDString)
if err != nil {
return err
}
}
snapshot, err := restic.LoadSnapshot(ctx, repo, sID)
if err != nil { if err != nil {
return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err) return err
}
if snapshot.Tree == nil {
return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
} }
err = walkTree(ctx, repo, *snapshot.Tree, stats) err = statsWalkSnapshot(ctx, snapshot, repo, stats)
if err != nil { } else {
return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) // iterate every snapshot in the repo
} err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
if err != nil {
return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
}
return statsWalkSnapshot(ctx, snapshot, repo, stats)
})
}
if err != nil {
return err
}
return nil if countModeRawData {
}) // the blob handles have been collected, but not yet counted
for blobHandle := range stats.blobs {
blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)
if !found {
return fmt.Errorf("blob %v not found", blobHandle)
}
stats.TotalSize += uint64(blobSize)
stats.TotalBlobCount++
}
}
if gopts.JSON { if gopts.JSON {
err = json.NewEncoder(os.Stdout).Encode(stats) err = json.NewEncoder(os.Stdout).Encode(stats)
@ -80,12 +131,37 @@ func runStats(gopts GlobalOptions, args []string) error {
return nil return nil
} }
Printf(" Cumulative Original Size: %-5s\n", formatBytes(stats.TotalOriginalSize)) if stats.TotalBlobCount > 0 {
Printf(" Total Original File Count: %d\n", stats.TotalCount) Printf(" Total Blob Count: %d\n", stats.TotalBlobCount)
}
if stats.TotalFileCount > 0 {
Printf(" Total File Count: %d\n", stats.TotalFileCount)
}
Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize))
return nil return nil
} }
func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer) error { func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats *statsContainer) error {
if snapshot.Tree == nil {
return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
}
if countModeRawData {
// count just the sizes of unique blobs; we don't need to walk the tree
// ourselves in this case, since a nifty function does it for us
return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)
}
err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator))
if err != nil {
return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
}
return nil
}
func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error {
// don't visit a tree we've already walked
if stats.idSet.Has(treeID) { if stats.idSet.Has(treeID) {
return nil return nil
} }
@ -97,20 +173,59 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta
} }
for _, node := range tree.Nodes { for _, node := range tree.Nodes {
// only count this file if we haven't visited it before if countModeUniqueFilesByContent || countModeBlobsPerFile {
fid := makeFileID(node) // only count this file if we haven't visited it before
if _, ok := stats.uniqueFiles[fid]; !ok { fid := makeFileIDByContents(node)
// mark the file as visited if _, ok := stats.uniqueFiles[fid]; !ok {
stats.uniqueFiles[fid] = struct{}{} // mark the file as visited
stats.uniqueFiles[fid] = struct{}{}
// update our stats to account for this node if countModeUniqueFilesByContent {
stats.TotalOriginalSize += node.Size // simply count the size of each unique file (unique by contents only)
stats.TotalCount++ stats.TotalSize += node.Size
stats.TotalFileCount++
}
if countModeBlobsPerFile {
// count the size of each unique blob reference, which is
// by unique file (unique by contents and file path)
for _, blobID := range node.Content {
// ensure we have this file (by path) in our map; in this
// mode, a file is unique by both contents and path
if _, ok := stats.fileBlobs[fpath]; !ok {
stats.fileBlobs[fpath] = restic.NewIDSet()
stats.TotalFileCount++
}
if _, ok := stats.fileBlobs[fpath][blobID]; !ok {
// TODO: Is the blob type always 'data' in this case?
blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
if !found {
return fmt.Errorf("blob %s not found for tree %s", blobID, treeID)
}
// count the blob's size, then add this blob by this
// file (path) so we don't double-count it
stats.TotalSize += uint64(blobSize)
stats.fileBlobs[fpath].Insert(blobID)
// this mode also counts total unique blob _references_ per file
stats.TotalBlobCount++
}
}
}
}
}
if countModeRestoreSize {
// as this is a file in the snapshot, we can simply count its
// size without worrying about uniqueness, since duplicate files
// will still be restored
stats.TotalSize += node.Size
stats.TotalFileCount++
} }
// visit subtrees (i.e. directory contents) // visit subtrees (i.e. directory contents)
if node.Subtree != nil { if node.Subtree != nil {
err = walkTree(ctx, repo, *node.Subtree, stats) err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name))
if err != nil { if err != nil {
return err return err
} }
@ -120,7 +235,9 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta
return nil return nil
} }
func makeFileID(node *restic.Node) fileID { // makeFileIDByContents returns a hash of the blob IDs of the
// node's Content in sequence.
func makeFileIDByContents(node *restic.Node) fileID {
var bb []byte var bb []byte
for _, c := range node.Content { for _, c := range node.Content {
bb = append(bb, []byte(c[:])...) bb = append(bb, []byte(c[:])...)
@ -128,14 +245,76 @@ func makeFileID(node *restic.Node) fileID {
return sha256.Sum256(bb) return sha256.Sum256(bb)
} }
// verifyStatsInput validates the counting-mode flags and the positional
// arguments of the stats command, and records the outcome in package state.
//
// At most one of the countMode* flags may be set; if none is set,
// countModeRestoreSize is switched on as the default. At most one snapshot
// may be named in args; if one is given it is stored in snapshotIDString.
// An error is returned when either limit is exceeded. gopts is accepted but
// not consulted here.
func verifyStatsInput(gopts GlobalOptions, args []string) error {
	// tally how many of the mutually exclusive counting modes were requested
	selected := 0
	for _, enabled := range []bool{
		countModeRestoreSize,
		countModeUniqueFilesByContent,
		countModeBlobsPerFile,
		countModeRawData,
	} {
		if enabled {
			selected++
		}
	}

	switch {
	case selected > 1:
		return fmt.Errorf("only one counting mode may be used")
	case selected == 0:
		// nothing was chosen, so fall back to the restore-size mode
		countModeRestoreSize = true
	}

	switch len(args) {
	case 0:
		// no snapshot named; snapshotIDString is left as it was
	case 1:
		// remember which snapshot to scan
		snapshotIDString = args[0]
	default:
		return fmt.Errorf("only one snapshot may be specified")
	}

	return nil
}
// statsContainer holds information during a walk of a repository // statsContainer holds information during a walk of a repository
// to collect information about it, as well as state needed // to collect information about it, as well as state needed
// for a successful and efficient walk. // for a successful and efficient walk.
type statsContainer struct { type statsContainer struct {
TotalCount uint64 `json:"total_count"` TotalSize uint64 `json:"total_size"`
TotalOriginalSize uint64 `json:"total_original_size"` TotalFileCount uint64 `json:"total_file_count"`
idSet restic.IDSet TotalBlobCount uint64 `json:"total_blob_count,omitempty"`
uniqueFiles map[fileID]struct{}
// idSet marks visited trees, to avoid repeated walks
idSet restic.IDSet
// uniqueFiles marks visited files according to their
// contents (hashed sequence of content blob IDs)
uniqueFiles map[fileID]struct{}
// fileBlobs maps a file name (path) to the set of
// blobs that have been seen as a part of the file
fileBlobs map[string]restic.IDSet
// blobs and blobsSeen are used to count individual
// unique blobs, independent of references to files
blobs, blobsSeen restic.BlobSet
} }
// fileID is a 256-bit hash that distinguishes unique files.
type fileID [32]byte type fileID [32]byte
// Counting-mode switches for the stats command. Each one is bound to a
// command-line flag in init; verifyStatsInput rejects more than one being
// set and enables countModeRestoreSize when none is.
var (
	countModeRestoreSize          bool // --count-restore-size
	countModeUniqueFilesByContent bool // --count-files-by-contents
	countModeBlobsPerFile         bool // --count-blobs-per-file
	countModeRawData              bool // --count-raw-data
)

// Snapshot selection for the stats command.
var (
	// snapshotIDString is the snapshot to scan, as given by the user
	// as the positional argument; empty means every snapshot is walked
	snapshotIDString string

	// snapshotByHost is the hostname (--host) used to filter the
	// latest snapshot, if given by the user
	snapshotByHost string
)