Merge pull request #4006 from MichaelEischer/deviceID-only-for-hardlinks

archiver: only store deviceID for hardlinks
This commit is contained in:
Michael Eischer 2024-03-28 22:33:28 +01:00 committed by GitHub
commit 71b6284155
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 113 additions and 8 deletions

View File

@ -0,0 +1,16 @@
Enhancement: (alpha) Store deviceID only for hardlinks
Set `RESTIC_FEATURES=device-id-for-hardlinks` to enable this alpha feature.
The feature flag will be removed after repository format version 3 becomes
available or be replaced with a different solution.
When creating backups from a filesystem snapshot, for example created using
btrfs subvolumes, the deviceID of the filesystem changes compared to previous
snapshots. This prevented restic from deduplicating the directory metadata of
a snapshot.
When this alpha feature is enabled, then the deviceID is only stored for
hardlinks. This significantly reduces the metadata duplication for most
backups.
https://github.com/restic/restic/pull/4006

View File

@ -4,4 +4,5 @@ If files on different devices had the same inode id, then the `stats` command
did not correctly calculate the snapshot size. This has been fixed.
https://github.com/restic/restic/pull/4503
https://github.com/restic/restic/pull/4006
https://forum.restic.net/t/possible-bug-in-stats/6461/8

View File

@ -270,11 +270,14 @@ func statsWalkTree(repo restic.Loader, opts StatsOptions, stats *statsContainer,
// will still be restored
stats.TotalFileCount++
// if inodes are present, only count each inode once
// (hard links do not increase restore size)
if !hardLinkIndex.Has(node.Inode, node.DeviceID) || node.Inode == 0 {
hardLinkIndex.Add(node.Inode, node.DeviceID, struct{}{})
if node.Links == 1 || node.Type == "dir" {
stats.TotalSize += node.Size
} else {
// if hardlinks are present only count each deviceID+inode once
if !hardLinkIndex.Has(node.Inode, node.DeviceID) || node.Inode == 0 {
hardLinkIndex.Add(node.Inode, node.DeviceID, struct{}{})
stats.TotalSize += node.Size
}
}
}

View File

@ -12,6 +12,7 @@ import (
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/feature"
"github.com/restic/restic/internal/fs"
"github.com/restic/restic/internal/restic"
"golang.org/x/sync/errgroup"
@ -188,6 +189,14 @@ func (arch *Archiver) nodeFromFileInfo(snPath, filename string, fi os.FileInfo)
if !arch.WithAtime {
node.AccessTime = node.ModTime
}
if feature.Flag.Enabled(feature.DeviceIDForHardlinks) {
if node.Links == 1 || node.Type == "dir" {
// the DeviceID is only necessary for hardlinked files
// when using subvolumes or snapshots their deviceIDs tend to change which causes
// restic to upload new tree blobs
node.DeviceID = 0
}
}
// overwrite name to match that within the snapshot
node.Name = path.Base(snPath)
if err != nil {

View File

@ -19,6 +19,7 @@ import (
"github.com/restic/restic/internal/backend/mem"
"github.com/restic/restic/internal/checker"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/feature"
"github.com/restic/restic/internal/fs"
"github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic"
@ -2125,6 +2126,8 @@ const (
)
func TestMetadataChanged(t *testing.T) {
defer feature.TestSetFlag(t, feature.Flag, feature.DeviceIDForHardlinks, true)()
files := TestDir{
"testfile": TestFile{
Content: "foo bar test file",
@ -2153,6 +2156,7 @@ func TestMetadataChanged(t *testing.T) {
sn, node2 := snapshot(t, repo, fs, nil, "testfile")
// set some values so we can then compare the nodes
want.DeviceID = 0
want.Content = node2.Content
want.Path = ""
if len(want.ExtendedAttributes) == 0 {

View File

@ -6,6 +6,12 @@ package archiver
import (
"os"
"syscall"
"testing"
"github.com/restic/restic/internal/feature"
"github.com/restic/restic/internal/fs"
"github.com/restic/restic/internal/restic"
restictest "github.com/restic/restic/internal/test"
)
type wrappedFileInfo struct {
@ -39,3 +45,45 @@ func wrapFileInfo(fi os.FileInfo) os.FileInfo {
return res
}
func statAndSnapshot(t *testing.T, repo restic.Repository, name string) (*restic.Node, *restic.Node) {
fi := lstat(t, name)
want, err := restic.NodeFromFileInfo(name, fi)
restictest.OK(t, err)
_, node := snapshot(t, repo, fs.Local{}, nil, name)
return want, node
}
func TestHardlinkMetadata(t *testing.T) {
defer feature.TestSetFlag(t, feature.Flag, feature.DeviceIDForHardlinks, true)()
files := TestDir{
"testfile": TestFile{
Content: "foo bar test file",
},
"linktarget": TestFile{
Content: "test file",
},
"testlink": TestHardlink{
Target: "./linktarget",
},
"testdir": TestDir{},
}
tempdir, repo := prepareTempdirRepoSrc(t, files)
back := restictest.Chdir(t, tempdir)
defer back()
want, node := statAndSnapshot(t, repo, "testlink")
restictest.Assert(t, node.DeviceID == want.DeviceID, "device id mismatch expected %v got %v", want.DeviceID, node.DeviceID)
restictest.Assert(t, node.Links == want.Links, "link count mismatch expected %v got %v", want.Links, node.Links)
restictest.Assert(t, node.Inode == want.Inode, "inode mismatch expected %v got %v", want.Inode, node.Inode)
_, node = statAndSnapshot(t, repo, "testfile")
restictest.Assert(t, node.DeviceID == 0, "device id mismatch for testfile expected %v got %v", 0, node.DeviceID)
_, node = statAndSnapshot(t, repo, "testdir")
restictest.Assert(t, node.DeviceID == 0, "device id mismatch for testdir expected %v got %v", 0, node.DeviceID)
}

View File

@ -6,6 +6,7 @@ import (
"path"
"path/filepath"
"runtime"
"sort"
"strings"
"testing"
"time"
@ -63,11 +64,29 @@ func (s TestSymlink) String() string {
return "<Symlink>"
}
// TestHardlink describes a hardlink created for a test.
type TestHardlink struct {
Target string
}
func (s TestHardlink) String() string {
return "<Hardlink>"
}
// TestCreateFiles creates a directory structure described by dir at target,
// which must already exist. On Windows, symlinks aren't created.
func TestCreateFiles(t testing.TB, target string, dir TestDir) {
t.Helper()
for name, item := range dir {
// ensure a stable order such that it can be guaranteed that a hardlink target already exists
var names []string
for name := range dir {
names = append(names, name)
}
sort.Strings(names)
for _, name := range names {
item := dir[name]
targetPath := filepath.Join(target, name)
switch it := item.(type) {
@ -81,6 +100,11 @@ func TestCreateFiles(t testing.TB, target string, dir TestDir) {
if err != nil {
t.Fatal(err)
}
case TestHardlink:
err := fs.Link(filepath.Join(target, filepath.FromSlash(it.Target)), targetPath)
if err != nil {
t.Fatal(err)
}
case TestDir:
err := fs.Mkdir(targetPath, 0755)
if err != nil {

View File

@ -5,13 +5,13 @@ var Flag = New()
// flag names are written in kebab-case
const (
ExampleFeature FlagName = "example-feature"
DeprecateLegacyIndex FlagName = "deprecate-legacy-index"
DeviceIDForHardlinks FlagName = "device-id-for-hardlinks"
)
func init() {
Flag.SetFlags(map[FlagName]FlagDesc{
ExampleFeature: {Type: Alpha, Description: "just for testing"},
DeprecateLegacyIndex: {Type: Beta, Description: "disable support for index format used by restic 0.1.0. Use `restic repair index` to update the index if necessary."},
DeviceIDForHardlinks: {Type: Alpha, Description: "store deviceID only for hardlinks to reduce metadata changes for example when using btrfs subvolumes. Will be removed in a future restic version after repository format 3 is available"},
})
}

View File

@ -82,7 +82,7 @@ type Node struct {
User string `json:"user,omitempty"`
Group string `json:"group,omitempty"`
Inode uint64 `json:"inode,omitempty"`
DeviceID uint64 `json:"device_id,omitempty"` // device id of the file, stat.st_dev
DeviceID uint64 `json:"device_id,omitempty"` // device id of the file, stat.st_dev, only stored for hardlinks
Size uint64 `json:"size,omitempty"`
Links uint64 `json:"links,omitempty"`
LinkTarget string `json:"linktarget,omitempty"`