Reject files excluded by name before calling lstat to improve scan speed

Adds a SelectByName filter function to the archiver and scanner which only
requires the filename as input, and can thus be run before calling lstat on
the file. This can speed up scanning significantly if a lot of filename
excludes are used.
This commit is contained in:
Andreas Skielboe 2018-07-31 17:25:25 +02:00 committed by Alexander Neumann
parent 9b513312e2
commit b07bb3d8c3
5 changed files with 96 additions and 56 deletions

View File

@ -186,18 +186,9 @@ func (opts BackupOptions) Check(gopts GlobalOptions, args []string) error {
return nil return nil
} }
// collectRejectFuncs returns a list of all functions which may reject data // collectRejectByNameFuncs returns a list of all functions which may reject data
// from being saved in a snapshot // from being saved in a snapshot based on path only
func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectFunc, err error) { func collectRejectByNameFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectByNameFunc, err error) {
// allowed devices
if opts.ExcludeOtherFS && !opts.Stdin {
f, err := rejectByDevice(targets)
if err != nil {
return nil, err
}
fs = append(fs, f)
}
// exclude restic cache // exclude restic cache
if repo.Cache != nil { if repo.Cache != nil {
f, err := rejectResticCache(repo) f, err := rejectResticCache(repo)
@ -237,6 +228,21 @@ func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets
return fs, nil return fs, nil
} }
// collectRejectFuncs assembles the reject functions that need both the path
// and the file info of an item to decide whether it is kept out of a
// snapshot. At present the only such filter is the device check, which is
// enabled when opts.ExcludeOtherFS is set and the backup does not read from
// stdin. An error from building the device filter is returned unchanged.
func collectRejectFuncs(opts BackupOptions, repo *repository.Repository, targets []string) (fs []RejectFunc, err error) {
	// without the device restriction there is nothing to collect
	if !opts.ExcludeOtherFS || opts.Stdin {
		return fs, nil
	}

	// allowed devices
	byDevice, devErr := rejectByDevice(targets)
	if devErr != nil {
		return nil, devErr
	}

	return append(fs, byDevice), nil
}
// readExcludePatternsFromFiles reads all exclude files and returns the list of // readExcludePatternsFromFiles reads all exclude files and returns the list of
// exclude patterns. For each line, leading and trailing white space is removed // exclude patterns. For each line, leading and trailing white space is removed
// and comment lines are ignored. For each remaining pattern, environment // and comment lines are ignored. For each remaining pattern, environment
@ -393,7 +399,13 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina
return err return err
} }
// rejectFuncs collect functions that can reject items from the backup // rejectByNameFuncs collect functions that can reject items from the backup based on path only
rejectByNameFuncs, err := collectRejectByNameFuncs(opts, repo, targets)
if err != nil {
return err
}
// rejectFuncs collect functions that can reject items from the backup based on path and file info
rejectFuncs, err := collectRejectFuncs(opts, repo, targets) rejectFuncs, err := collectRejectFuncs(opts, repo, targets)
if err != nil { if err != nil {
return err return err
@ -414,6 +426,15 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina
p.V("using parent snapshot %v\n", parentSnapshotID.Str()) p.V("using parent snapshot %v\n", parentSnapshotID.Str())
} }
selectByNameFilter := func(item string) bool {
for _, reject := range rejectByNameFuncs {
if reject(item) {
return false
}
}
return true
}
selectFilter := func(item string, fi os.FileInfo) bool { selectFilter := func(item string, fi os.FileInfo) bool {
for _, reject := range rejectFuncs { for _, reject := range rejectFuncs {
if reject(item, fi) { if reject(item, fi) {
@ -436,6 +457,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina
} }
sc := archiver.NewScanner(targetFS) sc := archiver.NewScanner(targetFS)
sc.SelectByName = selectByNameFilter
sc.Select = selectFilter sc.Select = selectFilter
sc.Error = p.ScannerError sc.Error = p.ScannerError
sc.Result = p.ReportTotal sc.Result = p.ReportTotal
@ -444,6 +466,7 @@ func runBackup(opts BackupOptions, gopts GlobalOptions, term *termstatus.Termina
t.Go(func() error { return sc.Scan(t.Context(gopts.ctx), targets) }) t.Go(func() error { return sc.Scan(t.Context(gopts.ctx), targets) })
arch := archiver.New(repo, targetFS, archiver.Options{}) arch := archiver.New(repo, targetFS, archiver.Options{})
arch.SelectByName = selectByNameFilter
arch.Select = selectFilter arch.Select = selectFilter
arch.WithAtime = opts.WithAtime arch.WithAtime = opts.WithAtime
arch.Error = p.Error arch.Error = p.Error

View File

@ -60,15 +60,20 @@ func (rc *rejectionCache) Store(dir string, rejected bool) {
rc.m[dir] = rejected rc.m[dir] = rejected
} }
// RejectByNameFunc is a function that takes only the filename (path) of a
// file that would be included in the backup; it receives no os.FileInfo.
// The function returns true if the file should be excluded (rejected) from
// the backup.
type RejectByNameFunc func(path string) bool
// RejectFunc is a function that takes a filename and os.FileInfo of a // RejectFunc is a function that takes a filename and os.FileInfo of a
// file that would be included in the backup. The function returns true if it // file that would be included in the backup. The function returns true if it
// should be excluded (rejected) from the backup. // should be excluded (rejected) from the backup.
type RejectFunc func(path string, fi os.FileInfo) bool type RejectFunc func(path string, fi os.FileInfo) bool
// rejectByPattern returns a RejectFunc which rejects files that match // rejectByPattern returns a RejectByNameFunc which rejects files that match
// one of the patterns. // one of the patterns.
func rejectByPattern(patterns []string) RejectFunc { func rejectByPattern(patterns []string) RejectByNameFunc {
return func(item string, fi os.FileInfo) bool { return func(item string) bool {
matched, _, err := filter.List(patterns, item) matched, _, err := filter.List(patterns, item)
if err != nil { if err != nil {
Warnf("error for exclude pattern: %v", err) Warnf("error for exclude pattern: %v", err)
@ -83,14 +88,14 @@ func rejectByPattern(patterns []string) RejectFunc {
} }
} }
// rejectIfPresent returns a RejectFunc which itself returns whether a path // rejectIfPresent returns a RejectByNameFunc which itself returns whether a path
// should be excluded. The RejectFunc considers a file to be excluded when // should be excluded. The RejectByNameFunc considers a file to be excluded when
// it resides in a directory with an exclusion file, that is specified by // it resides in a directory with an exclusion file, that is specified by
// excludeFileSpec in the form "filename[:content]". The returned error is // excludeFileSpec in the form "filename[:content]". The returned error is
// non-nil if the filename component of excludeFileSpec is empty. If rc is // non-nil if the filename component of excludeFileSpec is empty. If rc is
// non-nil, it is going to be used in the RejectFunc to expedite the evaluation // non-nil, it is going to be used in the RejectByNameFunc to expedite the evaluation
// of a directory based on previous visits. // of a directory based on previous visits.
func rejectIfPresent(excludeFileSpec string) (RejectFunc, error) { func rejectIfPresent(excludeFileSpec string) (RejectByNameFunc, error) {
if excludeFileSpec == "" { if excludeFileSpec == "" {
return nil, errors.New("name for exclusion tagfile is empty") return nil, errors.New("name for exclusion tagfile is empty")
} }
@ -107,7 +112,7 @@ func rejectIfPresent(excludeFileSpec string) (RejectFunc, error) {
} }
debug.Log("using %q as exclusion tagfile", tf) debug.Log("using %q as exclusion tagfile", tf)
rc := &rejectionCache{} rc := &rejectionCache{}
fn := func(filename string, _ os.FileInfo) bool { fn := func(filename string) bool {
return isExcludedByFile(filename, tf, tc, rc) return isExcludedByFile(filename, tf, tc, rc)
} }
return fn, nil return fn, nil
@ -252,11 +257,11 @@ func rejectByDevice(samples []string) (RejectFunc, error) {
}, nil }, nil
} }
// rejectResticCache returns a RejectFunc that rejects the restic cache // rejectResticCache returns a RejectByNameFunc that rejects the restic cache
// directory (if set). // directory (if set).
func rejectResticCache(repo *repository.Repository) (RejectFunc, error) { func rejectResticCache(repo *repository.Repository) (RejectByNameFunc, error) {
if repo.Cache == nil { if repo.Cache == nil {
return func(string, os.FileInfo) bool { return func(string) bool {
return false return false
}, nil }, nil
} }
@ -266,7 +271,7 @@ func rejectResticCache(repo *repository.Repository) (RejectFunc, error) {
return nil, errors.New("cacheBase is empty string") return nil, errors.New("cacheBase is empty string")
} }
return func(item string, _ os.FileInfo) bool { return func(item string) bool {
if fs.HasPathPrefix(cacheBase, item) { if fs.HasPathPrefix(cacheBase, item) {
debug.Log("rejecting restic cache directory %v", item) debug.Log("rejecting restic cache directory %v", item)
return true return true

View File

@ -27,7 +27,7 @@ func TestRejectByPattern(t *testing.T) {
for _, tc := range tests { for _, tc := range tests {
t.Run("", func(t *testing.T) { t.Run("", func(t *testing.T) {
reject := rejectByPattern(patterns) reject := rejectByPattern(patterns)
res := reject(tc.filename, nil) res := reject(tc.filename)
if res != tc.reject { if res != tc.reject {
t.Fatalf("wrong result for filename %v: want %v, got %v", t.Fatalf("wrong result for filename %v: want %v, got %v",
tc.filename, tc.reject, res) tc.filename, tc.reject, res)
@ -140,8 +140,8 @@ func TestMultipleIsExcludedByFile(t *testing.T) {
if err != nil { if err != nil {
return err return err
} }
excludedByFoo := fooExclude(p, fi) excludedByFoo := fooExclude(p)
excludedByBar := barExclude(p, fi) excludedByBar := barExclude(p)
excluded := excludedByFoo || excludedByBar excluded := excludedByFoo || excludedByBar
// the log message helps debugging in case the test fails // the log message helps debugging in case the test fails
t.Logf("%q: %v || %v = %v", p, excludedByFoo, excludedByBar, excluded) t.Logf("%q: %v || %v = %v", p, excludedByFoo, excludedByBar, excluded)

View File

@ -16,6 +16,10 @@ import (
tomb "gopkg.in/tomb.v2" tomb "gopkg.in/tomb.v2"
) )
// SelectByNameFunc returns true for all items that should be included (files and
// dirs), deciding from the item's path alone. If false is returned, files are
// ignored and dirs are not even walked.
type SelectByNameFunc func(item string) bool
// SelectFunc returns true for all items that should be included (files and // SelectFunc returns true for all items that should be included (files and
// dirs). If false is returned, files are ignored and dirs are not even walked. // dirs). If false is returned, files are ignored and dirs are not even walked.
type SelectFunc func(item string, fi os.FileInfo) bool type SelectFunc func(item string, fi os.FileInfo) bool
@ -43,10 +47,11 @@ func (s *ItemStats) Add(other ItemStats) {
// Archiver saves a directory structure to the repo. // Archiver saves a directory structure to the repo.
type Archiver struct { type Archiver struct {
Repo restic.Repository Repo restic.Repository
Select SelectFunc SelectByName SelectByNameFunc
FS fs.FS Select SelectFunc
Options Options FS fs.FS
Options Options
blobSaver *BlobSaver blobSaver *BlobSaver
fileSaver *FileSaver fileSaver *FileSaver
@ -119,10 +124,11 @@ func (o Options) ApplyDefaults() Options {
// New initializes a new archiver. // New initializes a new archiver.
func New(repo restic.Repository, fs fs.FS, opts Options) *Archiver { func New(repo restic.Repository, fs fs.FS, opts Options) *Archiver {
arch := &Archiver{ arch := &Archiver{
Repo: repo, Repo: repo,
Select: func(string, os.FileInfo) bool { return true }, SelectByName: func(item string) bool { return true },
FS: fs, Select: func(item string, fi os.FileInfo) bool { return true },
Options: opts.ApplyDefaults(), FS: fs,
Options: opts.ApplyDefaults(),
CompleteItem: func(string, *restic.Node, *restic.Node, ItemStats, time.Duration) {}, CompleteItem: func(string, *restic.Node, *restic.Node, ItemStats, time.Duration) {},
StartFile: func(string) {}, StartFile: func(string) {},
@ -294,10 +300,10 @@ func (fn *FutureNode) wait(ctx context.Context) {
} }
// Save saves a target (file or directory) to the repo. If the item is // Save saves a target (file or directory) to the repo. If the item is
// excluded,this function returns a nil node and error, with excluded set to // excluded, this function returns a nil node and error, with excluded set to
// true. // true.
// //
// Errors and completion is needs to be handled by the caller. // Errors and completion needs to be handled by the caller.
// //
// snPath is the path within the current snapshot. // snPath is the path within the current snapshot.
func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous *restic.Node) (fn FutureNode, excluded bool, err error) { func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous *restic.Node) (fn FutureNode, excluded bool, err error) {
@ -316,6 +322,13 @@ func (arch *Archiver) Save(ctx context.Context, snPath, target string, previous
fn.absTarget = abstarget fn.absTarget = abstarget
// exclude files by path before running Lstat to reduce number of lstat calls
if !arch.SelectByName(abstarget) {
debug.Log("%v is excluded by path", target)
return FutureNode{}, true, nil
}
// get file info and run remaining select functions that require file information
fi, err := arch.FS.Lstat(target) fi, err := arch.FS.Lstat(target)
if !arch.Select(abstarget, fi) { if !arch.Select(abstarget, fi) {
debug.Log("%v is excluded", target) debug.Log("%v is excluded", target)

View File

@ -12,23 +12,21 @@ import (
// stats concerning the files and folders found. Select is used to decide which // stats concerning the files and folders found. Select is used to decide which
// items should be included. Error is called when an error occurs. // items should be included. Error is called when an error occurs.
type Scanner struct { type Scanner struct {
FS fs.FS FS fs.FS
Select SelectFunc SelectByName SelectByNameFunc
Error ErrorFunc Select SelectFunc
Result func(item string, s ScanStats) Error ErrorFunc
Result func(item string, s ScanStats)
} }
// NewScanner initializes a new Scanner. // NewScanner initializes a new Scanner.
func NewScanner(fs fs.FS) *Scanner { func NewScanner(fs fs.FS) *Scanner {
return &Scanner{ return &Scanner{
FS: fs, FS: fs,
Select: func(item string, fi os.FileInfo) bool { SelectByName: func(item string) bool { return true },
return true Select: func(item string, fi os.FileInfo) bool { return true },
}, Error: func(item string, fi os.FileInfo, err error) error { return err },
Error: func(item string, fi os.FileInfo, err error) error { Result: func(item string, s ScanStats) {},
return err
},
Result: func(item string, s ScanStats) {},
} }
} }
@ -70,17 +68,18 @@ func (s *Scanner) scan(ctx context.Context, stats ScanStats, target string) (Sca
return stats, ctx.Err() return stats, ctx.Err()
} }
// exclude files by path before running stat to reduce number of lstat calls
if !s.SelectByName(target) {
return stats, nil
}
// get file information
fi, err := s.FS.Lstat(target) fi, err := s.FS.Lstat(target)
if err != nil { if err != nil {
// ignore error if the target is to be excluded anyway
if !s.Select(target, nil) {
return stats, nil
}
// else return filtered error
return stats, s.Error(target, fi, err) return stats, s.Error(target, fi, err)
} }
// run remaining select functions that require file information
if !s.Select(target, fi) { if !s.Select(target, fi) {
return stats, nil return stats, nil
} }