From 7419844885b0468df53e97cebfadf218277c60d3 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Fri, 12 Jun 2020 12:57:23 +0200 Subject: [PATCH 1/5] add changelog, benchmark, memory calculation --- changelog/unreleased/pull-2781 | 6 ++++++ internal/repository/index.go | 22 ++++++++++++++++++++++ internal/repository/index_test.go | 22 ++++++++++++++-------- 3 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 changelog/unreleased/pull-2781 diff --git a/changelog/unreleased/pull-2781 b/changelog/unreleased/pull-2781 new file mode 100644 index 000000000..9535fc101 --- /dev/null +++ b/changelog/unreleased/pull-2781 @@ -0,0 +1,6 @@ +Enhancement: Reduce memory consumption of in-memory index + +We've improved how the index is stored in memory. +This change reduces memory usage for large repositories by about 30-40%. + +https://github.com/restic/restic/pull/2781 diff --git a/internal/repository/index.go b/internal/repository/index.go index 70e7b7bf7..d6038e2e1 100644 --- a/internal/repository/index.go +++ b/internal/repository/index.go @@ -13,6 +13,28 @@ import ( "github.com/restic/restic/internal/debug" ) +// In large repositories, millions of blobs are stored in the repository +// and restic needs to store an index entry for each blob in memory for +// most operations. +// Hence the index data structure defined here is one of the main contributions +// to the total memory requirements of restic. +// +// We use a map to store each index entry. +// The key of the map is a BlobHandle +// The entries of the maps are slices which contain the actual index entries. +// +// To compute the needed amount of memory, we need some assumptions. +// Maps need an overhead of allocated but not needed elements. 
+// For computations, we assume an overhead of 50% and use OF=1.5 (overhead factor) +// +// We have the following sizes: +// key: 32 + 1 = 33 bytes +// slice: 24 bytes (pointer, len and cap) +// indexEntry: 32 + 8 + 8 = 48 bytes +// +// To save N index entries, we therefore need: +// N * OF * (33 + 24) bytes + N * 48 bytes = N * 134 bytes + // Index holds a lookup table for id -> pack. type Index struct { m sync.Mutex diff --git a/internal/repository/index_test.go b/internal/repository/index_test.go index e1f2829bd..c96101906 100644 --- a/internal/repository/index_test.go +++ b/internal/repository/index_test.go @@ -398,18 +398,16 @@ func createRandomIndex(rng *rand.Rand) (idx *repository.Index, lookupID restic.I // create index with 200k pack files for i := 0; i < 200000; i++ { packID := NewRandomTestID(rng) + var blobs []restic.Blob offset := 0 for offset < maxPackSize { size := 2000 + rand.Intn(4*1024*1024) id := NewRandomTestID(rng) - idx.Store(restic.PackedBlob{ - PackID: packID, - Blob: restic.Blob{ - Type: restic.DataBlob, - ID: id, - Length: uint(size), - Offset: uint(offset), - }, + blobs = append(blobs, restic.Blob{ + Type: restic.DataBlob, + ID: id, + Length: uint(size), + Offset: uint(offset), }) offset += size @@ -418,6 +416,7 @@ func createRandomIndex(rng *rand.Rand) (idx *repository.Index, lookupID restic.I lookupID = id } } + idx.StorePack(packID, blobs) } return idx, lookupID @@ -444,6 +443,13 @@ func BenchmarkIndexHasKnown(b *testing.B) { } } +func BenchmarkIndexAlloc(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + createRandomIndex(rand.New(rand.NewSource(0))) + } +} + func TestIndexHas(t *testing.T) { type testEntry struct { id restic.ID From d92e2c5769bb2cb5eefef2dbb9d8761da977f5a4 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 8 Feb 2020 20:51:50 +0100 Subject: [PATCH 2/5] simplify index code --- internal/repository/index.go | 45 ++++++++++------------------- internal/repository/master_index.go | 13 --------- 2 
files changed, 16 insertions(+), 42 deletions(-) diff --git a/internal/repository/index.go b/internal/repository/index.go index d6038e2e1..cf3f1e560 100644 --- a/internal/repository/index.go +++ b/internal/repository/index.go @@ -140,6 +140,19 @@ func (idx *Index) StorePack(id restic.ID, blobs []restic.Blob) { } } +// ListPack returns a list of blobs contained in a pack. +func indexEntryToPackedBlob(h restic.BlobHandle, entry indexEntry) restic.PackedBlob { + return restic.PackedBlob{ + Blob: restic.Blob{ + ID: h.ID, + Type: h.Type, + Length: entry.length, + Offset: entry.offset, + }, + PackID: entry.packID, + } +} + // Lookup queries the index for the blob ID and returns a restic.PackedBlob. func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.PackedBlob, found bool) { idx.m.Lock() @@ -151,17 +164,7 @@ func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.Pack blobs = make([]restic.PackedBlob, 0, len(packs)) for _, p := range packs { - blob := restic.PackedBlob{ - Blob: restic.Blob{ - Type: tpe, - Length: p.length, - ID: id, - Offset: p.offset, - }, - PackID: p.packID, - } - - blobs = append(blobs, blob) + blobs = append(blobs, indexEntryToPackedBlob(h, p)) } return blobs, true @@ -178,15 +181,7 @@ func (idx *Index) ListPack(id restic.ID) (list []restic.PackedBlob) { for h, packList := range idx.pack { for _, entry := range packList { if entry.packID == id { - list = append(list, restic.PackedBlob{ - Blob: restic.Blob{ - ID: h.ID, - Type: h.Type, - Length: entry.length, - Offset: entry.offset, - }, - PackID: entry.packID, - }) + list = append(list, indexEntryToPackedBlob(h, entry)) } } } @@ -254,15 +249,7 @@ func (idx *Index) Each(ctx context.Context) <-chan restic.PackedBlob { select { case <-ctx.Done(): return - case ch <- restic.PackedBlob{ - Blob: restic.Blob{ - ID: h.ID, - Type: h.Type, - Offset: blob.offset, - Length: blob.length, - }, - PackID: blob.packID, - }: + case ch <- indexEntryToPackedBlob(h, blob): } } } 
diff --git a/internal/repository/master_index.go b/internal/repository/master_index.go index 1caa42957..24762ffb2 100644 --- a/internal/repository/master_index.go +++ b/internal/repository/master_index.go @@ -132,19 +132,6 @@ func (mi *MasterIndex) Insert(idx *Index) { mi.idx = append(mi.idx, idx) } -// Remove deletes an index from the MasterIndex. -func (mi *MasterIndex) Remove(index *Index) { - mi.idxMutex.Lock() - defer mi.idxMutex.Unlock() - - for i, idx := range mi.idx { - if idx == index { - mi.idx = append(mi.idx[:i], mi.idx[i+1:]...) - return - } - } -} - // Store remembers the id and pack in the index. func (mi *MasterIndex) StorePack(id restic.ID, blobs []restic.Blob) { mi.idxMutex.Lock() From cf979e2b819e676aaff1c3beec7250b00a41a3ca Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Fri, 12 Jun 2020 08:17:14 +0200 Subject: [PATCH 3/5] make offset and length uint32 --- internal/repository/index.go | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/internal/repository/index.go b/internal/repository/index.go index cf3f1e560..00cc92797 100644 --- a/internal/repository/index.go +++ b/internal/repository/index.go @@ -30,10 +30,10 @@ import ( // We have the following sizes: // key: 32 + 1 = 33 bytes // slice: 24 bytes (pointer, len and cap) -// indexEntry: 32 + 8 + 8 = 48 bytes +// indexEntry: 32 + 4 + 4 = 40 bytes // // To save N index entries, we therefore need: -// N * OF * (33 + 24) bytes + N * 48 bytes = N * 134 bytes +// N * OF * (33 + 24) bytes + N * 40 bytes = N * 126 bytes // Index holds a lookup table for id -> pack. type Index struct { @@ -49,8 +49,8 @@ type Index struct { type indexEntry struct { packID restic.ID - offset uint - length uint + offset uint32 + length uint32 } // NewIndex returns a new index. @@ -61,11 +61,17 @@ func NewIndex() *Index { } } +const maxuint32 = 1<<32 - 1 + func (idx *Index) store(blob restic.PackedBlob) { + // assert that offset and length fit into uint32! 
+ if blob.Offset > maxuint32 || blob.Length > maxuint32 { + panic("offset or length does not fit in uint32. You have packs > 4GB!") + } newEntry := indexEntry{ packID: blob.PackID, - offset: blob.Offset, - length: blob.Length, + offset: uint32(blob.Offset), + length: uint32(blob.Length), } h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} idx.pack[h] = append(idx.pack[h], newEntry) @@ -146,8 +152,8 @@ func indexEntryToPackedBlob(h restic.BlobHandle, entry indexEntry) restic.Packed Blob: restic.Blob{ ID: h.ID, Type: h.Type, - Length: entry.length, - Offset: entry.offset, + Length: uint(entry.length), + Offset: uint(entry.offset), }, PackID: entry.packID, } @@ -336,8 +342,8 @@ func (idx *Index) generatePackList() ([]*packJSON, error) { p.Blobs = append(p.Blobs, blobJSON{ ID: h.ID, Type: h.Type, - Offset: blob.offset, - Length: blob.length, + Offset: uint(blob.offset), + Length: uint(blob.length), }) } } From ce4a2f4ca61c921a77c56fb52ac9c189efc43360 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Fri, 12 Jun 2020 08:25:49 +0200 Subject: [PATCH 4/5] save packIDs and duplicates separately A side remark to the definition of Index.blob: Another possibility would have been to use: blob map[restic.BlobHandle]*indexEntry This would have led to the following sizes: key: 32 + 1 = 33 bytes value: 8 bytes indexEntry: 8 + 4 + 4 = 16 bytes each packID: 32 bytes To save N index entries, we would therefore have needed: N * OF * (33 + 8) bytes + N * 16 + N * 32 bytes / BP = N * 82 bytes More precisely, using a pointer instead of a direct entry is the better memory choice if: OF * 8 bytes + entrysize < OF * entrysize <=> entrysize > 8 bytes * OF/(OF-1) Under the assumption of OF=1.5, this means using pointers would have been the better choice if sizeof(indexEntry) > 24 bytes. 
--- internal/repository/index.go | 176 +++++++++++++++++++++-------------- 1 file changed, 106 insertions(+), 70 deletions(-) diff --git a/internal/repository/index.go b/internal/repository/index.go index 00cc92797..194feeb74 100644 --- a/internal/repository/index.go +++ b/internal/repository/index.go @@ -19,27 +19,40 @@ import ( // Hence the index data structure defined here is one of the main contributions // to the total memory requirements of restic. // -// We use a map to store each index entry. +// We use two maps to store each index entry. +// The first map stores the first entry of a blobtype/blobID // The key of the map is a BlobHandle -// The entries of the maps are slices which contain the actual index entries. +// The entries are the actual index entries. +// In the second map we store duplicate index entries, i.e. entries with same +// blobtype/blobID +// In the index entries, we need to reference the packID. As one pack may +// contain many blobs the packIDs are saved in a separate array and only the index +// within this array is saved in the indexEntry // // To compute the needed amount of memory, we need some assumptions. // Maps need an overhead of allocated but not needed elements. // For computations, we assume an overhead of 50% and use OF=1.5 (overhead factor) +// As duplicates are only present in edge cases and are also removed by prune runs, +// we assume that there are no significant duplicates and omit them in the calculations. 
+// Moreover we assume on average a minimum of 8 blobs per pack; BP=8 +// (Note that for large files there should be 3 blobs per pack as the average chunk +// size is 1.5 MB and the minimum pack size is 4 MB) // // We have the following sizes: // key: 32 + 1 = 33 bytes -// slice: 24 bytes (pointer, len and cap) -// indexEntry: 32 + 4 + 4 = 40 bytes +// indexEntry: 8 + 4 + 4 = 16 bytes +// each packID: 32 bytes // // To save N index entries, we therefore need: -// N * OF * (33 + 24) bytes + N * 40 bytes = N * 126 bytes +// N * OF * (33 + 16) bytes + N * 32 bytes / BP = N * 78 bytes -// Index holds a lookup table for id -> pack. +// Index holds lookup tables for id -> pack. type Index struct { - m sync.Mutex - pack map[restic.BlobHandle][]indexEntry - treePacks restic.IDs + m sync.Mutex + blob map[restic.BlobHandle]indexEntry + duplicates map[restic.BlobHandle][]indexEntry + packs restic.IDs + treePacks restic.IDs final bool // set to true for all indexes read from the backend ("finalized") id restic.ID // set to the ID of the index when it's finalized @@ -48,33 +61,59 @@ type Index struct { } type indexEntry struct { - packID restic.ID - offset uint32 - length uint32 + // only save the index into packs; i.e. packs[packIndex] yields the packID + packIndex int + offset uint32 + length uint32 } // NewIndex returns a new index. func NewIndex() *Index { return &Index{ - pack: make(map[restic.BlobHandle][]indexEntry), - created: time.Now(), + blob: make(map[restic.BlobHandle]indexEntry), + duplicates: make(map[restic.BlobHandle][]indexEntry), + created: time.Now(), } } +// withDuplicates returns the list of all entries for the given blob handle +func (idx *Index) withDuplicates(h restic.BlobHandle, entry indexEntry) []indexEntry { + entries, ok := idx.duplicates[h] + if ok { + all := make([]indexEntry, len(entries)+1) + all[0] = entry + copy(all[1:], entries) + return all + } + + return []indexEntry{entry} +} + +// addToPacks saves the given pack ID and returns its index. 
+// This procedure allows the use of pack IDs which can be easily garbage collected afterwards. +func (idx *Index) addToPacks(id restic.ID) int { + idx.packs = append(idx.packs, id) + return len(idx.packs) - 1 +} + const maxuint32 = 1<<32 - 1 -func (idx *Index) store(blob restic.PackedBlob) { +func (idx *Index) store(packIndex int, blob restic.Blob) { // assert that offset and length fit into uint32! if blob.Offset > maxuint32 || blob.Length > maxuint32 { panic("offset or length does not fit in uint32. You have packs > 4GB!") } newEntry := indexEntry{ - packID: blob.PackID, - offset: uint32(blob.Offset), - length: uint32(blob.Length), + packIndex: packIndex, + offset: uint32(blob.Offset), + length: uint32(blob.Length), } h := restic.BlobHandle{ID: blob.ID, Type: blob.Type} - idx.pack[h] = append(idx.pack[h], newEntry) + if _, ok := idx.blob[h]; ok { + idx.duplicates[h] = append(idx.duplicates[h], newEntry) + } else { + idx.blob[h] = newEntry + } } // Final returns true iff the index is already written to the repository, it is @@ -98,7 +137,7 @@ var IndexFull = func(idx *Index) bool { debug.Log("checking whether index %p is full", idx) - blobs := len(idx.pack) + blobs := len(idx.blob) age := time.Now().Sub(idx.created) switch { @@ -126,7 +165,7 @@ func (idx *Index) Store(blob restic.PackedBlob) { debug.Log("%v", blob) - idx.store(blob) + idx.store(idx.addToPacks(blob.PackID), blob.Blob) } // StorePack remembers the ids of all blobs of a given pack @@ -140,14 +179,15 @@ func (idx *Index) StorePack(id restic.ID, blobs []restic.Blob) { } debug.Log("%v", blobs) + packIndex := idx.addToPacks(id) for _, blob := range blobs { - idx.store(restic.PackedBlob{Blob: blob, PackID: id}) + idx.store(packIndex, blob) } } // ListPack returns a list of blobs contained in a pack. 
-func indexEntryToPackedBlob(h restic.BlobHandle, entry indexEntry) restic.PackedBlob { +func (idx *Index) indexEntryToPackedBlob(h restic.BlobHandle, entry indexEntry) restic.PackedBlob { return restic.PackedBlob{ Blob: restic.Blob{ ID: h.ID, @@ -155,7 +195,7 @@ func indexEntryToPackedBlob(h restic.BlobHandle, entry indexEntry) restic.Packed Length: uint(entry.length), Offset: uint(entry.offset), }, - PackID: entry.packID, + PackID: idx.packs[entry.packIndex], } } @@ -166,11 +206,13 @@ func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.Pack h := restic.BlobHandle{ID: id, Type: tpe} - if packs, ok := idx.pack[h]; ok { - blobs = make([]restic.PackedBlob, 0, len(packs)) + blob, ok := idx.blob[h] + if ok { + blobList := idx.withDuplicates(h, blob) + blobs = make([]restic.PackedBlob, 0, len(blobList)) - for _, p := range packs { - blobs = append(blobs, indexEntryToPackedBlob(h, p)) + for _, p := range blobList { + blobs = append(blobs, idx.indexEntryToPackedBlob(h, p)) } return blobs, true @@ -184,10 +226,10 @@ func (idx *Index) ListPack(id restic.ID) (list []restic.PackedBlob) { idx.m.Lock() defer idx.m.Unlock() - for h, packList := range idx.pack { - for _, entry := range packList { - if entry.packID == id { - list = append(list, indexEntryToPackedBlob(h, entry)) + for h, entry := range idx.blob { + for _, blob := range idx.withDuplicates(h, entry) { + if idx.packs[blob.packIndex] == id { + list = append(list, idx.indexEntryToPackedBlob(h, blob)) } } } @@ -202,7 +244,7 @@ func (idx *Index) Has(id restic.ID, tpe restic.BlobType) bool { h := restic.BlobHandle{ID: id, Type: tpe} - _, ok := idx.pack[h] + _, ok := idx.blob[h] return ok } @@ -250,12 +292,12 @@ func (idx *Index) Each(ctx context.Context) <-chan restic.PackedBlob { close(ch) }() - for h, packs := range idx.pack { - for _, blob := range packs { + for h, entry := range idx.blob { + for _, blob := range idx.withDuplicates(h, entry) { select { case <-ctx.Done(): return - case ch <- 
indexEntryToPackedBlob(h, blob): + case ch <- idx.indexEntryToPackedBlob(h, blob): } } } @@ -270,10 +312,8 @@ func (idx *Index) Packs() restic.IDSet { defer idx.m.Unlock() packs := restic.NewIDSet() - for _, list := range idx.pack { - for _, entry := range list { - packs.Insert(entry.packID) - } + for _, packID := range idx.packs { + packs.Insert(packID) } return packs @@ -285,12 +325,17 @@ func (idx *Index) Count(t restic.BlobType) (n uint) { idx.m.Lock() defer idx.m.Unlock() - for h, list := range idx.pack { + for h := range idx.blob { if h.Type != t { continue } - - n += uint(len(list)) + n++ + } + for h, dups := range idx.duplicates { + if h.Type != t { + continue + } + n += uint(len(dups)) } return @@ -313,25 +358,20 @@ func (idx *Index) generatePackList() ([]*packJSON, error) { list := []*packJSON{} packs := make(map[restic.ID]*packJSON) - for h, packedBlobs := range idx.pack { - for _, blob := range packedBlobs { - if blob.packID.IsNull() { + for h, entry := range idx.blob { + for _, blob := range idx.withDuplicates(h, entry) { + packID := idx.packs[blob.packIndex] + if packID.IsNull() { panic("null pack id") } debug.Log("handle blob %v", h) - if blob.packID.IsNull() { - debug.Log("blob %v has no packID! 
(offset %v, length %v)", - h, blob.offset, blob.length) - return nil, errors.Errorf("unable to serialize index: pack for blob %v hasn't been written yet", h) - } - // see if pack is already in map - p, ok := packs[blob.packID] + p, ok := packs[packID] if !ok { // else create new pack - p = &packJSON{ID: blob.packID} + p = &packJSON{ID: packID} // and append it to the list and map list = append(list, p) @@ -495,16 +535,14 @@ func DecodeIndex(buf []byte) (idx *Index, err error) { idx = NewIndex() for _, pack := range idxJSON.Packs { var data, tree bool + packID := idx.addToPacks(pack.ID) for _, blob := range pack.Blobs { - idx.store(restic.PackedBlob{ - Blob: restic.Blob{ - Type: blob.Type, - ID: blob.ID, - Offset: blob.Offset, - Length: blob.Length, - }, - PackID: pack.ID, + idx.store(packID, restic.Blob{ + Type: blob.Type, + ID: blob.ID, + Offset: blob.Offset, + Length: blob.Length, }) switch blob.Type { @@ -540,16 +578,14 @@ func DecodeOldIndex(buf []byte) (idx *Index, err error) { idx = NewIndex() for _, pack := range list { var data, tree bool + packID := idx.addToPacks(pack.ID) for _, blob := range pack.Blobs { - idx.store(restic.PackedBlob{ - Blob: restic.Blob{ - Type: blob.Type, - ID: blob.ID, - Offset: blob.Offset, - Length: blob.Length, - }, - PackID: pack.ID, + idx.store(packID, restic.Blob{ + Type: blob.Type, + ID: blob.ID, + Offset: blob.Offset, + Length: blob.Length, }) switch blob.Type { From 1361341c58e4a0b5fb1b0107504deaa05b71ddd8 Mon Sep 17 00:00:00 2001 From: Alexander Weiss Date: Sat, 13 Jun 2020 12:43:03 +0200 Subject: [PATCH 5/5] don't save duplicate packIDs when using internal/repository/Index.Store --- internal/repository/index.go | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/internal/repository/index.go b/internal/repository/index.go index 194feeb74..39163c666 100644 --- a/internal/repository/index.go +++ b/internal/repository/index.go @@ -53,6 +53,8 @@ type Index struct { duplicates 
map[restic.BlobHandle][]indexEntry packs restic.IDs treePacks restic.IDs + // only used by Store; StorePack does not check for already saved packIDs + packIDToIndex map[restic.ID]int final bool // set to true for all indexes read from the backend ("finalized") id restic.ID // set to the ID of the index when it's finalized @@ -70,9 +72,10 @@ type indexEntry struct { // NewIndex returns a new index. func NewIndex() *Index { return &Index{ - blob: make(map[restic.BlobHandle]indexEntry), - duplicates: make(map[restic.BlobHandle][]indexEntry), - created: time.Now(), + blob: make(map[restic.BlobHandle]indexEntry), + duplicates: make(map[restic.BlobHandle][]indexEntry), + packIDToIndex: make(map[restic.ID]int), + created: time.Now(), } } @@ -165,7 +168,14 @@ func (idx *Index) Store(blob restic.PackedBlob) { debug.Log("%v", blob) - idx.store(idx.addToPacks(blob.PackID), blob.Blob) + // get packIndex and save if new packID + packIndex, ok := idx.packIDToIndex[blob.PackID] + if !ok { + packIndex = idx.addToPacks(blob.PackID) + idx.packIDToIndex[blob.PackID] = packIndex + } + + idx.store(packIndex, blob.Blob) } // StorePack remembers the ids of all blobs of a given pack @@ -431,6 +441,8 @@ func (idx *Index) Finalize() { defer idx.m.Unlock() idx.final = true + // clear packIDToIndex as no more elements will be added + idx.packIDToIndex = nil } // ID returns the ID of the index, if available. If the index is not yet