From 286535320029f669a2b4b96723a2a1ba313214a1 Mon Sep 17 00:00:00 2001 From: Vincent Batts Date: Wed, 23 Sep 2015 13:30:00 -0400 Subject: [PATCH 1/4] common: add a UTF-8 check helper --- tar/common/utf8.go | 21 +++++++++++++++++++++ tar/common/utf8_test.go | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 tar/common/utf8.go create mode 100644 tar/common/utf8_test.go diff --git a/tar/common/utf8.go b/tar/common/utf8.go new file mode 100644 index 0000000..ffb1646 --- /dev/null +++ b/tar/common/utf8.go @@ -0,0 +1,21 @@ +package common + +// IsValidUtf8String checks for in valid UTF-8 characters +func IsValidUtf8String(s string) bool { + for _, r := range s { + if int(r) == 0xfffd { + return false + } + } + return true +} + +// IsValidUtf8Btyes checks for in valid UTF-8 characters +func IsValidUtf8Btyes(b []byte) bool { + for _, r := range string(b) { + if int(r) == 0xfffd { + return false + } + } + return true +} diff --git a/tar/common/utf8_test.go b/tar/common/utf8_test.go new file mode 100644 index 0000000..e546f55 --- /dev/null +++ b/tar/common/utf8_test.go @@ -0,0 +1,34 @@ +package common + +import "testing" + +func TestStringValidation(t *testing.T) { + cases := []struct { + value string + result bool + }{ + {"aä\uFFFD本☺", false}, + {"aä本☺", true}, + } + + for _, c := range cases { + if got := IsValidUtf8String(c.value); got != c.result { + t.Errorf("string %q - expected %v, got %v", c.value, c.result, got) + } + } +} +func TestBytesValidation(t *testing.T) { + cases := []struct { + value []byte + result bool + }{ + {[]byte{0xE4}, false}, + {[]byte("aä本☺"), true}, + } + + for _, c := range cases { + if got := IsValidUtf8Btyes(c.value); got != c.result { + t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got) + } + } +} From 39d06b9dc4eaf75c34407e8fd8c161d54e4c6b4d Mon Sep 17 00:00:00 2001 From: Vincent Batts Date: Wed, 23 Sep 2015 15:13:54 -0400 Subject: [PATCH 2/4] tar/common: get index of first invalid utf-8 char --- tar/common/utf8.go | 19 ++++++++++--------- tar/common/utf8_test.go | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/tar/common/utf8.go b/tar/common/utf8.go index ffb1646..568e929 100644 --- a/tar/common/utf8.go +++ b/tar/common/utf8.go @@ -2,20 +2,21 @@ package common // IsValidUtf8String checks for in valid UTF-8 characters func IsValidUtf8String(s string) bool { - for _, r := range s { - if int(r) == 0xfffd { - return false - } - } - return true + return InvalidUtf8Index([]byte(s)) == -1 } // IsValidUtf8Btyes checks for in valid UTF-8 characters func IsValidUtf8Btyes(b []byte) bool { - for _, r := range string(b) { + return InvalidUtf8Index(b) == -1 +} + +// InvalidUtf8Index returns the offset of the first invalid UTF-8 character. +// Default is to return -1 for a wholly valid sequence. +func InvalidUtf8Index(b []byte) int { + for i, r := range string(b) { if int(r) == 0xfffd { - return false + return i } } - return true + return -1 } diff --git a/tar/common/utf8_test.go b/tar/common/utf8_test.go index e546f55..3cf81df 100644 --- a/tar/common/utf8_test.go +++ b/tar/common/utf8_test.go @@ -6,27 +6,36 @@ func TestStringValidation(t *testing.T) { cases := []struct { value string result bool + offset int }{ - {"aä\uFFFD本☺", false}, - {"aä本☺", true}, + {"aä\uFFFD本☺", false, 3}, + {"aä本☺", true, -1}, } for _, c := range cases { + if i := InvalidUtf8Index([]byte(c.value)); i != c.offset { + t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i) + } if got := IsValidUtf8String(c.value); got != c.result { t.Errorf("string %q - expected %v, got %v", c.value, c.result, got) } } } + func TestBytesValidation(t *testing.T) { cases := []struct { value []byte result bool + offset int }{ - {[]byte{0xE4}, false}, - {[]byte("aä本☺"), true}, + {[]byte{0xE4}, false, 0}, + {[]byte("aä本☺"), true, -1}, } for _, c := range cases { + if i := InvalidUtf8Index(c.value); i != c.offset { + t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i) + } if got := IsValidUtf8Btyes(c.value); got != c.result { t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got) } From 032efafc29636d38ea45b9a57fe0bad7dd90d124 Mon Sep 17 00:00:00 2001 From: Vincent Batts Date: Wed, 23 Sep 2015 15:20:09 -0400 Subject: [PATCH 3/4] tar/storage: work with raw (invalid utf8) names When the entry name is not UTF-8, for example ISO-8859-1, then store the raw bytes. To accommodate this, we will have getters and setters for the entry's name now. Since this most heavily affects the json marshalling, we'll double check the sanity of the name before storing it in the JSONPacker. --- tar/storage/entry.go | 43 +++++++++++++++++++++++++++++++++++++++ tar/storage/entry_test.go | 35 ++++++++++++++++++++++++++++--- tar/storage/packer.go | 14 +++++++++++-- 3 files changed, 87 insertions(+), 5 deletions(-) diff --git a/tar/storage/entry.go b/tar/storage/entry.go index 38fe7ba..a152ac2 100644 --- a/tar/storage/entry.go +++ b/tar/storage/entry.go @@ -1,5 +1,11 @@ package storage +import ( + "fmt" + + "github.com/vbatts/tar-split/tar/common" +) + // Entries is for sorting by Position type Entries []Entry @@ -33,7 +39,44 @@ const ( type Entry struct { Type Type `json:"type"` Name string `json:"name,omitempty"` + NameRaw []byte `json:"name_raw,omitempty"` Size int64 `json:"size,omitempty"` Payload []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here; Position int `json:"position"` } + +// SetName will check name for valid UTF-8 string, and set the appropriate +// field. See https://github.com/vbatts/tar-split/issues/17 +func (e *Entry) SetName(name string) { + if common.IsValidUtf8String(name) { + e.Name = name + } else { + e.NameRaw = []byte(name) + } +} + +// SetNameBytes will check name for valid UTF-8 string, and set the appropriate +// field +func (e *Entry) SetNameBytes(name []byte) { + if !common.IsValidUtf8Btyes(name) { + e.NameRaw = name + } else { + e.Name = string(name) + } +} + +// GetName returns the string for the entry's name, regardless of the field stored in +func (e *Entry) GetName() string { + if len(e.NameRaw) > 0 { + return fmt.Sprintf("%s", e.NameRaw) + } + return e.Name +} + +// GetNameBytes returns the bytes for the entry's name, regardless of the field stored in +func (e *Entry) GetNameBytes() []byte { + if len(e.NameRaw) > 0 { + return e.NameRaw + } + return []byte(e.Name) +} diff --git a/tar/storage/entry_test.go b/tar/storage/entry_test.go index c797bca..90d103e 100644 --- a/tar/storage/entry_test.go +++ b/tar/storage/entry_test.go @@ -39,10 +39,10 @@ func TestEntries(t *testing.T) { func TestFile(t *testing.T) { f := Entry{ Type: FileType, - Name: "./hello.txt", Size: 100, Position: 2, } + f.SetName("./hello.txt") buf, err := json.Marshal(f) if err != nil { @@ -54,8 +54,37 @@ func TestFile(t *testing.T) { t.Fatal(err) } - if f.Name != f1.Name { - t.Errorf("expected Name %q, got %q", f.Name, f1.Name) + if f.GetName() != f1.GetName() { + t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName()) + } + if f.Size != f1.Size { + t.Errorf("expected Size %q, got %q", f.Size, f1.Size) + } + if f.Position != f1.Position { + t.Errorf("expected Position %q, got %q", f.Position, f1.Position) + } +} + +func TestFileRaw(t *testing.T) { + f := Entry{ + Type: FileType, + Size: 100, + Position: 2, + } + f.SetNameBytes([]byte{0x2E, 0x2F, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0xE4, 0x2E, 0x74, 0x78, 0x74}) + + buf, err := json.Marshal(f) + if err != nil { + t.Fatal(err) + } + + f1 := Entry{} + if err = json.Unmarshal(buf, &f1); err != nil { + t.Fatal(err) + } + + if f.GetName() != f1.GetName() { + t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName()) } if f.Size != f1.Size { t.Errorf("expected Size %q, got %q", f.Size, f1.Size) diff --git a/tar/storage/packer.go b/tar/storage/packer.go index a02a19a..1ea8208 100644 --- a/tar/storage/packer.go +++ b/tar/storage/packer.go @@ -6,6 +6,8 @@ import ( "errors" "io" "path/filepath" + + "github.com/vbatts/tar-split/tar/common" ) // ErrDuplicatePath occurs when a tar archive has more than one entry for the @@ -61,7 +63,7 @@ func (jup *jsonUnpacker) Next() (*Entry, error) { // check for dup name if e.Type == FileType { - cName := filepath.Clean(e.Name) + cName := filepath.Clean(e.GetName()) if _, ok := jup.seen[cName]; ok { return nil, ErrDuplicatePath } @@ -93,9 +95,17 @@ type jsonPacker struct { type seenNames map[string]struct{} func (jp *jsonPacker) AddEntry(e Entry) (int, error) { + // if Name is not valid utf8, switch it to raw first. + if e.Name != "" { + if !common.IsValidUtf8String(e.Name) { + e.NameRaw = []byte(e.Name) + e.Name = "" + } + } + // check early for dup name if e.Type == FileType { - cName := filepath.Clean(e.Name) + cName := filepath.Clean(e.GetName()) if _, ok := jp.seen[cName]; ok { return -1, ErrDuplicatePath } From cde639172fb276d8fbc3e0bbee73791315e30f04 Mon Sep 17 00:00:00 2001 From: Vincent Batts Date: Wed, 23 Sep 2015 15:24:15 -0400 Subject: [PATCH 4/4] tar/asm: work with non-utf8 entry names --- tar/asm/assemble.go | 4 +-- tar/asm/assemble_test.go | 60 +++++++++++++++++++++++++++---- tar/asm/disassemble.go | 11 +++--- tar/asm/testdata/iso-8859.tar.gz | Bin 0 -> 187 bytes 4 files changed, 63 insertions(+), 12 deletions(-) create mode 100644 tar/asm/testdata/iso-8859.tar.gz diff --git a/tar/asm/assemble.go b/tar/asm/assemble.go index 74317cb..83d6426 100644 --- a/tar/asm/assemble.go +++ b/tar/asm/assemble.go @@ -39,7 +39,7 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose if entry.Size == 0 { continue } - fh, err := fg.Get(entry.Name) + fh, err := fg.Get(entry.GetName()) if err != nil { pw.CloseWithError(err) return @@ -56,7 +56,7 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose // but since it's coming through the PipeReader, the context of // _which_ file would be lost... fh.Close() - pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.Name)) + pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.GetName())) return } fh.Close() diff --git a/tar/asm/assemble_test.go b/tar/asm/assemble_test.go index da515f2..e7609c0 100644 --- a/tar/asm/assemble_test.go +++ b/tar/asm/assemble_test.go @@ -11,9 +11,38 @@ import ( "os" "testing" + "github.com/vbatts/tar-split/archive/tar" + "github.com/vbatts/tar-split/tar/common" "github.com/vbatts/tar-split/tar/storage" ) +func TestISO8859(t *testing.T) { + fh, err := os.Open("./testdata/iso-8859.tar.gz") + if err != nil { + t.Fatal(err) + } + defer fh.Close() + gzRdr, err := gzip.NewReader(fh) + if err != nil { + t.Fatal(err) + } + defer gzRdr.Close() + tr := tar.NewReader(gzRdr) + for { + hdr, err := tr.Next() + if err != nil { + if err != io.EOF { + t.Error(err) + } + break + } + fmt.Println(hdr.Name) + if !common.IsValidUtf8String(hdr.Name) { + fmt.Println([]byte(hdr.Name)) + } + } +} + var entries = []struct { Entry storage.Entry Body []byte @@ -36,6 +65,15 @@ var entries = []struct { }, Body: []byte("café con leche, por favor"), }, + { + Entry: storage.Entry{ + Type: storage.FileType, + NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, // this is invalid UTF-8. Just checking the round trip. + Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187}, + Size: 26, + }, + Body: []byte("café con leche, por favor"), + }, } var entriesMangled = []struct { Entry storage.Entry @@ -61,6 +99,15 @@ var entriesMangled = []struct { // san not con Body: []byte("café sans leche, por favor"), }, + { + Entry: storage.Entry{ + Type: storage.FileType, + NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, + Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187}, + Size: 26, + }, + Body: []byte("café con leche, por favor"), + }, } func TestTarStreamMangledGetterPutter(t *testing.T) { @@ -69,19 +116,19 @@ func TestTarStreamMangledGetterPutter(t *testing.T) { // first lets prep a GetPutter and Packer for i := range entries { if entries[i].Entry.Type == storage.FileType { - j, csum, err := fgp.Put(entries[i].Entry.Name, bytes.NewBuffer(entries[i].Body)) + j, csum, err := fgp.Put(entries[i].Entry.GetName(), bytes.NewBuffer(entries[i].Body)) if err != nil { t.Error(err) } if j != entries[i].Entry.Size { t.Errorf("size %q: expected %d; got %d", - entries[i].Entry.Name, + entries[i].Entry.GetName(), entries[i].Entry.Size, j) } if !bytes.Equal(csum, entries[i].Entry.Payload) { t.Errorf("checksum %q: expected %v; got %v", - entries[i].Entry.Name, + entries[i].Entry.GetName(), entries[i].Entry.Payload, csum) } @@ -90,7 +137,7 @@ func TestTarStreamMangledGetterPutter(t *testing.T) { for _, e := range entriesMangled { if e.Entry.Type == storage.FileType { - rdr, err := fgp.Get(e.Entry.Name) + rdr, err := fgp.Get(e.Entry.GetName()) if err != nil { t.Error(err) } @@ -105,7 +152,7 @@ func TestTarStreamMangledGetterPutter(t *testing.T) { if bytes.Equal(csum, e.Entry.Payload) { t.Errorf("wrote %d bytes. checksum for %q should not have matched! %v", i, - e.Entry.Name, + e.Entry.GetName(), csum) } } @@ -121,6 +168,7 @@ func TestTarStream(t *testing.T) { {"./testdata/t.tar.gz", "1eb237ff69bca6e22789ecb05b45d35ca307adbd", 10240}, {"./testdata/longlink.tar.gz", "d9f6babe107b7247953dff6b5b5ae31a3a880add", 20480}, {"./testdata/fatlonglink.tar.gz", "8537f03f89aeef537382f8b0bb065d93e03b0be8", 26234880}, + {"./testdata/iso-8859.tar.gz", "ddafa51cb03c74ec117ab366ee2240d13bba1ec3", 10240}, } for _, tc := range testCases { @@ -163,7 +211,7 @@ func TestTarStream(t *testing.T) { t.Fatalf("checksum of tar: expected %s; got %x", tc.expectedSHA1Sum, h0.Sum(nil)) } - t.Logf("%s", w.String()) // if we fail, then show the packed info + //t.Logf("%s", w.String()) // if we fail, then show the packed info // If we've made it this far, then we'll turn it around and create a tar // stream from the packed metadata and buffered file contents. diff --git a/tar/asm/disassemble.go b/tar/asm/disassemble.go index 7986890..54ef23a 100644 --- a/tar/asm/disassemble.go +++ b/tar/asm/disassemble.go @@ -92,13 +92,16 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io } } - // File entries added, regardless of size - _, err = p.AddEntry(storage.Entry{ + entry := storage.Entry{ Type: storage.FileType, - Name: hdr.Name, Size: hdr.Size, Payload: csum, - }) + } + // For proper marshalling of non-utf8 characters + entry.SetName(hdr.Name) + + // File entries added, regardless of size + _, err = p.AddEntry(entry) if err != nil { pW.CloseWithError(err) return diff --git a/tar/asm/testdata/iso-8859.tar.gz b/tar/asm/testdata/iso-8859.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..3e87f30a45f5dbf742a51c5c8252688452aeb2d0 GIT binary patch literal 187 zcmb2|=HU3ek133aIkPxl*TTZoQm-Vjh~e!eN3KH#0uC3~*t+To%(z?)O8{fJKT(T_prp5-2ZE8zMWtG k&#}fJ?8SbI-rs7jz2?@jhmJ8IgNS~1mB~df7&I6d0D{m~djJ3c literal 0 HcmV?d00001