Merge pull request #1748 from restic/detect-bom

Respect Encoding and Byte Order Mark when reading text files
This commit is contained in:
Alexander Neumann 2018-05-01 21:25:46 +02:00
commit 01f9662614
7 changed files with 175 additions and 23 deletions

4
Gopkg.lock generated
View File

@ -219,7 +219,7 @@
[[projects]] [[projects]]
name = "golang.org/x/text" name = "golang.org/x/text"
packages = ["collate","collate/build","internal/colltab","internal/gen","internal/tag","internal/triegen","internal/ucd","language","secure/bidirule","transform","unicode/bidi","unicode/cldr","unicode/norm","unicode/rangetable"] packages = ["collate","collate/build","encoding","encoding/internal","encoding/internal/identifier","encoding/unicode","internal/colltab","internal/gen","internal/tag","internal/triegen","internal/ucd","internal/utf8internal","language","runes","secure/bidirule","transform","unicode/bidi","unicode/cldr","unicode/norm","unicode/rangetable"]
revision = "f21a4dfb5e38f5895301dc265a8def02365cc3d0" revision = "f21a4dfb5e38f5895301dc265a8def02365cc3d0"
version = "v0.3.0" version = "v0.3.0"
@ -250,6 +250,6 @@
[solve-meta] [solve-meta]
analyzer-name = "dep" analyzer-name = "dep"
analyzer-version = 1 analyzer-version = 1
inputs-digest = "44a8f2ed127a6eaa38c1449b97d298fc703c961617bd93565b89bcc6c9a41483" inputs-digest = "a5de339cba7570216b212439b90e1e6c384c94be8342fe7755b7cb66aa0a3440"
solver-name = "gps-cdcl" solver-name = "gps-cdcl"
solver-version = 1 solver-version = 1

View File

@ -0,0 +1,12 @@
Enhancement: Support UTF-16 encoding and process Byte Order Mark
On Windows, text editors commonly leave a Byte Order Mark at the beginning of
the file to define which encoding is used (oftentimes UTF-16). We've added code
to support processing the BOMs in text files, like the exclude files, the
password file and the file passed via `--files-from`. This does not apply to
any file being saved in a backup; those are not touched and are archived as
they are.
https://github.com/restic/restic/issues/1433
https://github.com/restic/restic/issues/1738
https://github.com/restic/restic/pull/1748

View File

@ -2,8 +2,9 @@ package main
import ( import (
"bufio" "bufio"
"bytes"
"context" "context"
"io" "io/ioutil"
"os" "os"
"strconv" "strconv"
"strings" "strings"
@ -18,6 +19,7 @@ import (
"github.com/restic/restic/internal/fs" "github.com/restic/restic/internal/fs"
"github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/textfile"
"github.com/restic/restic/internal/ui" "github.com/restic/restic/internal/ui"
"github.com/restic/restic/internal/ui/termstatus" "github.com/restic/restic/internal/ui/termstatus"
) )
@ -127,19 +129,24 @@ func readLinesFromFile(filename string) ([]string, error) {
return nil, nil return nil, nil
} }
var r io.Reader = os.Stdin var (
if filename != "-" { data []byte
f, err := os.Open(filename) err error
if err != nil { )
return nil, err
} if filename == "-" {
defer f.Close() data, err = ioutil.ReadAll(os.Stdin)
r = f } else {
data, err = textfile.Read(filename)
}
if err != nil {
return nil, err
} }
var lines []string var lines []string
scanner := bufio.NewScanner(r) scanner := bufio.NewScanner(bytes.NewReader(data))
for scanner.Scan() { for scanner.Scan() {
line := strings.TrimSpace(scanner.Text()) line := strings.TrimSpace(scanner.Text())
// ignore empty lines // ignore empty lines
@ -232,18 +239,12 @@ func readExcludePatternsFromFiles(excludeFiles []string) []string {
var excludes []string var excludes []string
for _, filename := range excludeFiles { for _, filename := range excludeFiles {
err := func() (err error) { err := func() (err error) {
file, err := fs.Open(filename) data, err := textfile.Read(filename)
if err != nil { if err != nil {
return err return err
} }
defer func() {
// return pre-close error if there was one
if errClose := file.Close(); err == nil {
err = errClose
}
}()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(bytes.NewReader(data))
for scanner.Scan() { for scanner.Scan() {
line := strings.TrimSpace(scanner.Text()) line := strings.TrimSpace(scanner.Text())

View File

@ -4,7 +4,6 @@ import (
"context" "context"
"fmt" "fmt"
"io" "io"
"io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
@ -30,6 +29,7 @@ import (
"github.com/restic/restic/internal/options" "github.com/restic/restic/internal/options"
"github.com/restic/restic/internal/repository" "github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/textfile"
"github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/errors"
@ -235,8 +235,8 @@ func Exitf(exitcode int, format string, args ...interface{}) {
// resolvePassword determines the password to be used for opening the repository. // resolvePassword determines the password to be used for opening the repository.
func resolvePassword(opts GlobalOptions, env string) (string, error) { func resolvePassword(opts GlobalOptions, env string) (string, error) {
if opts.PasswordFile != "" { if opts.PasswordFile != "" {
s, err := ioutil.ReadFile(opts.PasswordFile) s, err := textfile.Read(opts.PasswordFile)
if os.IsNotExist(err) { if os.IsNotExist(errors.Cause(err)) {
return "", errors.Fatalf("%s does not exist", opts.PasswordFile) return "", errors.Fatalf("%s does not exist", opts.PasswordFile)
} }
return strings.TrimSpace(string(s)), errors.Wrap(err, "Readfile") return strings.TrimSpace(string(s)), errors.Wrap(err, "Readfile")

View File

@ -1,6 +1,7 @@
package fs package fs
import ( import (
"io/ioutil"
"os" "os"
"testing" "testing"
@ -41,3 +42,22 @@ func TestChdir(t testing.TB, dest string) (back func()) {
} }
} }
} }
// TestTempFile creates a temporary file whose name starts with prefix and
// returns it together with a cleanup function. Calling the cleanup function
// closes the file and removes it; the test is aborted via t.Fatal if the
// file cannot be created or removed.
func TestTempFile(t testing.TB, prefix string) (File, func()) {
	tmp, err := ioutil.TempFile("", prefix)
	if err != nil {
		t.Fatal(err)
	}

	return tmp, func() {
		// the result of Close is deliberately ignored, only a failed
		// removal is reported
		_ = tmp.Close()

		if rmErr := Remove(tmp.Name()); rmErr != nil {
			t.Fatal(rmErr)
		}
	}
}

43
internal/textfile/read.go Normal file
View File

@ -0,0 +1,43 @@
// Package textfile allows reading files that contain text. It automatically
// detects and converts several encodings and removes Byte Order Marks (BOM).
package textfile
import (
"bytes"
"io/ioutil"
"golang.org/x/text/encoding/unicode"
)
// Byte Order Marks (BOMs) recognised at the start of a text file.
var (
	// bomUTF8 is the UTF-8 encoded BOM.
	bomUTF8 = []byte("\xef\xbb\xbf")
	// bomUTF16BigEndian is the BOM of big endian UTF-16 text.
	bomUTF16BigEndian = []byte("\xfe\xff")
	// bomUTF16LittleEndian is the BOM of little endian UTF-16 text.
	bomUTF16LittleEndian = []byte("\xff\xfe")
)
// Decode returns data as UTF-8 text without a leading byte order mark.
//
// Input starting with the UTF-8 BOM is returned with the BOM cut off. Input
// starting with a UTF-16 BOM (either byte order) is run through a UTF-16
// decoder. Anything else is assumed to be UTF-8 already and is returned
// unchanged.
func Decode(data []byte) ([]byte, error) {
	switch {
	case bytes.HasPrefix(data, bomUTF8):
		// already UTF-8, only the marker has to go
		return data[len(bomUTF8):], nil

	case bytes.HasPrefix(data, bomUTF16BigEndian) || bytes.HasPrefix(data, bomUTF16LittleEndian):
		// UseBOM makes the decoder pick the endianness from the BOM
		// found in the data
		decoder := unicode.UTF16(unicode.BigEndian, unicode.UseBOM).NewDecoder()
		return decoder.Bytes(data)

	default:
		// no encoding specified, treat the data as UTF-8
		return data, nil
	}
}
// Read loads the complete file named filename and passes its contents
// through Decode, so the result is UTF-8 without a leading BOM. An error
// from reading the file is returned as-is.
func Read(filename string) ([]byte, error) {
	raw, err := ioutil.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	return Decode(raw)
}

View File

@ -0,0 +1,76 @@
package textfile
import (
"bytes"
"encoding/hex"
"testing"
"github.com/restic/restic/internal/fs"
)
// writeTempfile creates a temporary file, writes data to it and closes it
// again. It returns the file (whose Name() tells where the data was stored)
// and the cleanup function obtained from fs.TestTempFile. Any error while
// writing or closing aborts the test.
func writeTempfile(t testing.TB, data []byte) (fs.File, func()) {
	tmp, remove := fs.TestTempFile(t, "restic-test-textfile-read-")

	if _, err := tmp.Write(data); err != nil {
		t.Fatal(err)
	}

	if err := tmp.Close(); err != nil {
		t.Fatal(err)
	}

	return tmp, remove
}
// dec converts the hex string s into the bytes it encodes. It panics when s
// is not valid hex, which is acceptable for literals in test tables.
func dec(s string) []byte {
	decoded, decodeErr := hex.DecodeString(s)
	if decodeErr != nil {
		panic(decodeErr)
	}

	return decoded
}
// TestRead stores each sample in a temporary file and checks what Read
// returns for it. The samples cover plain text without a BOM, text with a
// UTF-8 BOM and UTF-16 text in both byte orders.
func TestRead(t *testing.T) {
	cases := []struct {
		data []byte
		want []byte // nil: Read is expected to return data unchanged
	}{
		{data: []byte("foo bar baz")},
		{data: []byte("Ööbär")},
		{
			data: []byte("\xef\xbb\xbffööbär"),
			want: []byte("fööbär"),
		},
		{
			data: dec("feff006600f600f6006200e40072"),
			want: []byte("fööbär"),
		},
		{
			data: dec("fffe6600f600f6006200e4007200"),
			want: []byte("fööbär"),
		},
	}

	for _, tc := range cases {
		t.Run("", func(t *testing.T) {
			expected := tc.want
			if expected == nil {
				expected = tc.data
			}

			f, cleanup := writeTempfile(t, tc.data)
			defer cleanup()

			got, err := Read(f.Name())
			if err != nil {
				t.Fatal(err)
			}

			if !bytes.Equal(expected, got) {
				t.Errorf("invalid data returned, want:\n %q\ngot:\n %q", expected, got)
			}
		})
	}
}