Consolidate mode flags; use new Walk function

This commit is contained in:
Matthew Holt 2018-06-23 12:01:13 -06:00 committed by Alexander Neumann
parent 930602a444
commit daca9d6815
1 changed files with 78 additions and 103 deletions

View File

@ -9,17 +9,18 @@ import (
"path/filepath" "path/filepath"
"github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/walker"
"github.com/spf13/cobra" "github.com/spf13/cobra"
) )
var cmdStats = &cobra.Command{ var cmdStats = &cobra.Command{
Use: "stats", Use: "stats [flags] [snapshot-ID]",
Short: "Scan the repository and show basic statistics", Short: "Scan the repository and show basic statistics",
Long: ` Long: `
The "stats" command walks one or all snapshots in a repository and The "stats" command walks one or all snapshots in a repository and
accumulates statistics about the data stored therein. It reports on accumulates statistics about the data stored therein. It reports on
the number of unique files and their sizes, according to one of the number of unique files and their sizes, according to one of
the counting modes as given by a flag. the counting modes as given by the --mode flag.
`, `,
DisableAutoGenTag: true, DisableAutoGenTag: true,
RunE: func(cmd *cobra.Command, args []string) error { RunE: func(cmd *cobra.Command, args []string) error {
@ -27,16 +28,10 @@ the counting modes as given by a flag.
}, },
} }
var countModeFlag []string
func init() { func init() {
cmdRoot.AddCommand(cmdStats) cmdRoot.AddCommand(cmdStats)
f := cmdStats.Flags() f := cmdStats.Flags()
f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)") f.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-content, blobs-per-file, or raw-data")
f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents")
f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename")
f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them")
f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname") f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname")
} }
@ -69,7 +64,6 @@ func runStats(gopts GlobalOptions, args []string) error {
// create a container for the stats (and other needed state) // create a container for the stats (and other needed state)
stats := &statsContainer{ stats := &statsContainer{
uniqueFiles: make(map[fileID]struct{}), uniqueFiles: make(map[fileID]struct{}),
idSet: make(restic.IDSet),
fileBlobs: make(map[string]restic.IDSet), fileBlobs: make(map[string]restic.IDSet),
blobs: restic.NewBlobSet(), blobs: restic.NewBlobSet(),
blobsSeen: restic.NewBlobSet(), blobsSeen: restic.NewBlobSet(),
@ -111,7 +105,7 @@ func runStats(gopts GlobalOptions, args []string) error {
return err return err
} }
if countModeRawData { if countMode == countModeRawData {
// the blob handles have been collected, but not yet counted // the blob handles have been collected, but not yet counted
for blobHandle := range stats.blobs { for blobHandle := range stats.blobs {
blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type) blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)
@ -147,93 +141,81 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest
return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str()) return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
} }
if countModeRawData { if countMode == countModeRawData {
// count just the sizes of unique blobs; we don't need to walk the tree // count just the sizes of unique blobs; we don't need to walk the tree
// ourselves in this case, since a nifty function does it for us // ourselves in this case, since a nifty function does it for us
return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen) return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)
} }
err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator)) err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), func(path string, node *restic.Node, nodeErr error) (bool, error) {
return statsWalkTree(path, node, nodeErr, repo, stats)
})
if err != nil { if err != nil {
return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
} }
return nil return nil
} }
func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error { func statsWalkTree(npath string, node *restic.Node, nodeErr error, repo restic.Repository, stats *statsContainer) (ignore bool, err error) {
// don't visit a tree we've already walked if nodeErr != nil {
if stats.idSet.Has(treeID) { return true, nodeErr
return nil
} }
stats.idSet.Insert(treeID) if node == nil {
return true, nil
tree, err := repo.LoadTree(ctx, treeID)
if err != nil {
return fmt.Errorf("loading tree: %v", err)
} }
for _, node := range tree.Nodes { if countMode == countModeUniqueFilesByContent || countMode == countModeBlobsPerFile {
if countModeUniqueFilesByContent || countModeBlobsPerFile { // only count this file if we haven't visited it before
// only count this file if we haven't visited it before fid := makeFileIDByContents(node)
fid := makeFileIDByContents(node) if _, ok := stats.uniqueFiles[fid]; !ok {
if _, ok := stats.uniqueFiles[fid]; !ok { // mark the file as visited
// mark the file as visited stats.uniqueFiles[fid] = struct{}{}
stats.uniqueFiles[fid] = struct{}{}
if countModeUniqueFilesByContent { if countMode == countModeUniqueFilesByContent {
// simply count the size of each unique file (unique by contents only) // simply count the size of each unique file (unique by contents only)
stats.TotalSize += node.Size stats.TotalSize += node.Size
stats.TotalFileCount++ stats.TotalFileCount++
} }
if countModeBlobsPerFile { if countMode == countModeBlobsPerFile {
// count the size of each unique blob reference, which is // count the size of each unique blob reference, which is
// by unique file (unique by contents and file path) // by unique file (unique by contents and file path)
for _, blobID := range node.Content { for _, blobID := range node.Content {
// ensure we have this file (by path) in our map; in this // ensure we have this file (by path) in our map; in this
// mode, a file is unique by both contents and path // mode, a file is unique by both contents and path
nodePath := filepath.Join(fpath, node.Name) nodePath := filepath.Join(npath, node.Name)
if _, ok := stats.fileBlobs[nodePath]; !ok { if _, ok := stats.fileBlobs[nodePath]; !ok {
stats.fileBlobs[nodePath] = restic.NewIDSet() stats.fileBlobs[nodePath] = restic.NewIDSet()
stats.TotalFileCount++ stats.TotalFileCount++
}
if _, ok := stats.fileBlobs[nodePath][blobID]; !ok {
// is always a data blob since we're accessing it via a file's Content array
blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
if !found {
return true, fmt.Errorf("blob %s not found for tree %s", blobID, *node.Subtree)
} }
if _, ok := stats.fileBlobs[nodePath][blobID]; !ok {
// is always a data blob since we're accessing it via a file's Content array
blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
if !found {
return fmt.Errorf("blob %s not found for tree %s", blobID, treeID)
}
// count the blob's size, then add this blob by this // count the blob's size, then add this blob by this
// file (path) so we don't double-count it // file (path) so we don't double-count it
stats.TotalSize += uint64(blobSize) stats.TotalSize += uint64(blobSize)
stats.fileBlobs[nodePath].Insert(blobID) stats.fileBlobs[nodePath].Insert(blobID)
// this mode also counts total unique blob _references_ per file // this mode also counts total unique blob _references_ per file
stats.TotalBlobCount++ stats.TotalBlobCount++
}
} }
} }
} }
} }
if countModeRestoreSize {
// as this is a file in the snapshot, we can simply count its
// size without worrying about uniqueness, since duplicate files
// will still be restored
stats.TotalSize += node.Size
stats.TotalFileCount++
}
// visit subtrees (i.e. directory contents)
if node.Subtree != nil {
err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name))
if err != nil {
return err
}
}
} }
return nil if countMode == countModeRestoreSize {
// as this is a file in the snapshot, we can simply count its
// size without worrying about uniqueness, since duplicate files
// will still be restored
stats.TotalSize += node.Size
stats.TotalFileCount++
}
return true, nil
} }
// makeFileIDByContents returns a hash of the blob IDs of the // makeFileIDByContents returns a hash of the blob IDs of the
@ -247,35 +229,26 @@ func makeFileIDByContents(node *restic.Node) fileID {
} }
func verifyStatsInput(gopts GlobalOptions, args []string) error { func verifyStatsInput(gopts GlobalOptions, args []string) error {
// ensure only one counting mode was specified, for clarity // require a recognized counting mode
var countModes int switch countMode {
if countModeRestoreSize { case countModeRestoreSize:
countModes++ case countModeUniqueFilesByContent:
case countModeBlobsPerFile:
case countModeRawData:
default:
return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", countMode)
} }
if countModeUniqueFilesByContent {
countModes++ // ensure at most one snapshot was specified
}
if countModeBlobsPerFile {
countModes++
}
if countModeRawData {
countModes++
}
if countModes > 1 {
return fmt.Errorf("only one counting mode may be used")
}
// set a default count mode if none were specified
if countModes == 0 {
countModeRestoreSize = true
}
// ensure one or none snapshots were specified
if len(args) > 1 { if len(args) > 1 {
return fmt.Errorf("only one snapshot may be specified") return fmt.Errorf("only one snapshot may be specified")
} }
// set the snapshot to scan, if one was specified
// if a snapshot was specified, mark it as the one to scan
if len(args) == 1 { if len(args) == 1 {
snapshotIDString = args[0] snapshotIDString = args[0]
} }
return nil return nil
} }
@ -287,9 +260,6 @@ type statsContainer struct {
TotalFileCount uint64 `json:"total_file_count"` TotalFileCount uint64 `json:"total_file_count"`
TotalBlobCount uint64 `json:"total_blob_count,omitempty"` TotalBlobCount uint64 `json:"total_blob_count,omitempty"`
// idSet marks visited trees, to avoid repeated walks
idSet restic.IDSet
// uniqueFiles marks visited files according to their // uniqueFiles marks visited files according to their
// contents (hashed sequence of content blob IDs) // contents (hashed sequence of content blob IDs)
uniqueFiles map[fileID]struct{} uniqueFiles map[fileID]struct{}
@ -307,10 +277,8 @@ type statsContainer struct {
type fileID [32]byte type fileID [32]byte
var ( var (
countModeRestoreSize bool // the mode of counting to perform
countModeUniqueFilesByContent bool countMode string
countModeBlobsPerFile bool
countModeRawData bool
// the snapshot to scan, as given by the user // the snapshot to scan, as given by the user
snapshotIDString string snapshotIDString string
@ -319,3 +287,10 @@ var (
// snapshot by, if given by user // snapshot by, if given by user
snapshotByHost string snapshotByHost string
) )
const (
countModeRestoreSize = "restore-size"
countModeUniqueFilesByContent = "files-by-content"
countModeBlobsPerFile = "blobs-per-file"
countModeRawData = "raw-data"
)