diff --git a/tar/asm/assemble.go b/tar/asm/assemble.go index 74317cb..83d6426 100644 --- a/tar/asm/assemble.go +++ b/tar/asm/assemble.go @@ -39,7 +39,7 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose if entry.Size == 0 { continue } - fh, err := fg.Get(entry.Name) + fh, err := fg.Get(entry.GetName()) if err != nil { pw.CloseWithError(err) return @@ -56,7 +56,7 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose // but since it's coming through the PipeReader, the context of // _which_ file would be lost... fh.Close() - pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.Name)) + pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.GetName())) return } fh.Close() diff --git a/tar/asm/assemble_test.go b/tar/asm/assemble_test.go index da515f2..e7609c0 100644 --- a/tar/asm/assemble_test.go +++ b/tar/asm/assemble_test.go @@ -11,9 +11,38 @@ import ( "os" "testing" + "github.com/vbatts/tar-split/archive/tar" + "github.com/vbatts/tar-split/tar/common" "github.com/vbatts/tar-split/tar/storage" ) +func TestISO8859(t *testing.T) { + fh, err := os.Open("./testdata/iso-8859.tar.gz") + if err != nil { + t.Fatal(err) + } + defer fh.Close() + gzRdr, err := gzip.NewReader(fh) + if err != nil { + t.Fatal(err) + } + defer gzRdr.Close() + tr := tar.NewReader(gzRdr) + for { + hdr, err := tr.Next() + if err != nil { + if err != io.EOF { + t.Error(err) + } + break + } + fmt.Println(hdr.Name) + if !common.IsValidUtf8String(hdr.Name) { + fmt.Println([]byte(hdr.Name)) + } + } +} + var entries = []struct { Entry storage.Entry Body []byte @@ -36,6 +65,15 @@ var entries = []struct { }, Body: []byte("café con leche, por favor"), }, + { + Entry: storage.Entry{ + Type: storage.FileType, + NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, // this is invalid UTF-8. Just checking the round trip. + Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187}, + Size: 26, + }, + Body: []byte("café con leche, por favor"), + }, } var entriesMangled = []struct { Entry storage.Entry @@ -61,6 +99,15 @@ var entriesMangled = []struct { // san not con Body: []byte("café sans leche, por favor"), }, + { + Entry: storage.Entry{ + Type: storage.FileType, + NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, + Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187}, + Size: 26, + }, + Body: []byte("café con leche, por favor"), + }, } func TestTarStreamMangledGetterPutter(t *testing.T) { @@ -69,19 +116,19 @@ func TestTarStreamMangledGetterPutter(t *testing.T) { // first lets prep a GetPutter and Packer for i := range entries { if entries[i].Entry.Type == storage.FileType { - j, csum, err := fgp.Put(entries[i].Entry.Name, bytes.NewBuffer(entries[i].Body)) + j, csum, err := fgp.Put(entries[i].Entry.GetName(), bytes.NewBuffer(entries[i].Body)) if err != nil { t.Error(err) } if j != entries[i].Entry.Size { t.Errorf("size %q: expected %d; got %d", - entries[i].Entry.Name, + entries[i].Entry.GetName(), entries[i].Entry.Size, j) } if !bytes.Equal(csum, entries[i].Entry.Payload) { t.Errorf("checksum %q: expected %v; got %v", - entries[i].Entry.Name, + entries[i].Entry.GetName(), entries[i].Entry.Payload, csum) } @@ -90,7 +137,7 @@ func TestTarStreamMangledGetterPutter(t *testing.T) { for _, e := range entriesMangled { if e.Entry.Type == storage.FileType { - rdr, err := fgp.Get(e.Entry.Name) + rdr, err := fgp.Get(e.Entry.GetName()) if err != nil { t.Error(err) } @@ -105,7 +152,7 @@ func TestTarStreamMangledGetterPutter(t *testing.T) { if bytes.Equal(csum, e.Entry.Payload) { t.Errorf("wrote %d bytes. checksum for %q should not have matched! %v", i, - e.Entry.Name, + e.Entry.GetName(), csum) } } @@ -121,6 +168,7 @@ func TestTarStream(t *testing.T) { {"./testdata/t.tar.gz", "1eb237ff69bca6e22789ecb05b45d35ca307adbd", 10240}, {"./testdata/longlink.tar.gz", "d9f6babe107b7247953dff6b5b5ae31a3a880add", 20480}, {"./testdata/fatlonglink.tar.gz", "8537f03f89aeef537382f8b0bb065d93e03b0be8", 26234880}, + {"./testdata/iso-8859.tar.gz", "ddafa51cb03c74ec117ab366ee2240d13bba1ec3", 10240}, } for _, tc := range testCases { @@ -163,7 +211,7 @@ func TestTarStream(t *testing.T) { t.Fatalf("checksum of tar: expected %s; got %x", tc.expectedSHA1Sum, h0.Sum(nil)) } - t.Logf("%s", w.String()) // if we fail, then show the packed info + //t.Logf("%s", w.String()) // if we fail, then show the packed info // If we've made it this far, then we'll turn it around and create a tar // stream from the packed metadata and buffered file contents. diff --git a/tar/asm/disassemble.go b/tar/asm/disassemble.go index 7986890..54ef23a 100644 --- a/tar/asm/disassemble.go +++ b/tar/asm/disassemble.go @@ -92,13 +92,16 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io } } - // File entries added, regardless of size - _, err = p.AddEntry(storage.Entry{ + entry := storage.Entry{ Type: storage.FileType, - Name: hdr.Name, Size: hdr.Size, Payload: csum, - }) + } + // For proper marshalling of non-utf8 characters + entry.SetName(hdr.Name) + + // File entries added, regardless of size + _, err = p.AddEntry(entry) if err != nil { pW.CloseWithError(err) return diff --git a/tar/asm/testdata/iso-8859.tar.gz b/tar/asm/testdata/iso-8859.tar.gz new file mode 100644 index 0000000..3e87f30 Binary files /dev/null and b/tar/asm/testdata/iso-8859.tar.gz differ diff --git a/tar/common/utf8.go b/tar/common/utf8.go new file mode 100644 index 0000000..568e929 --- /dev/null +++ b/tar/common/utf8.go @@ -0,0 +1,22 @@ +package common + +// IsValidUtf8String checks for in valid UTF-8 characters +func IsValidUtf8String(s string) bool { + return InvalidUtf8Index([]byte(s)) == -1 +} + +// IsValidUtf8Btyes checks for in valid UTF-8 characters +func IsValidUtf8Btyes(b []byte) bool { + return InvalidUtf8Index(b) == -1 +} + +// InvalidUtf8Index returns the offset of the first invalid UTF-8 character. +// Default is to return -1 for a wholly valid sequence. +func InvalidUtf8Index(b []byte) int { + for i, r := range string(b) { + if int(r) == 0xfffd { + return i + } + } + return -1 +} diff --git a/tar/common/utf8_test.go b/tar/common/utf8_test.go new file mode 100644 index 0000000..3cf81df --- /dev/null +++ b/tar/common/utf8_test.go @@ -0,0 +1,43 @@ +package common + +import "testing" + +func TestStringValidation(t *testing.T) { + cases := []struct { + value string + result bool + offset int + }{ + {"aä\uFFFD本☺", false, 3}, + {"aä本☺", true, -1}, + } + + for _, c := range cases { + if i := InvalidUtf8Index([]byte(c.value)); i != c.offset { + t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i) + } + if got := IsValidUtf8String(c.value); got != c.result { + t.Errorf("string %q - expected %v, got %v", c.value, c.result, got) + } + } +} + +func TestBytesValidation(t *testing.T) { + cases := []struct { + value []byte + result bool + offset int + }{ + {[]byte{0xE4}, false, 0}, + {[]byte("aä本☺"), true, -1}, + } + + for _, c := range cases { + if i := InvalidUtf8Index(c.value); i != c.offset { + t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i) + } + if got := IsValidUtf8Btyes(c.value); got != c.result { + t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got) + } + } +} diff --git a/tar/storage/entry.go b/tar/storage/entry.go index 38fe7ba..a152ac2 100644 --- a/tar/storage/entry.go +++ b/tar/storage/entry.go @@ -1,5 +1,11 @@ package storage +import ( + "fmt" + + "github.com/vbatts/tar-split/tar/common" +) + // Entries is for sorting by Position type Entries []Entry @@ -33,7 +39,44 @@ const ( type Entry struct { Type Type `json:"type"` Name string `json:"name,omitempty"` + NameRaw []byte `json:"name_raw,omitempty"` Size int64 `json:"size,omitempty"` Payload []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here; Position int `json:"position"` } + +// SetName will check name for valid UTF-8 string, and set the appropriate +// field. See https://github.com/vbatts/tar-split/issues/17 +func (e *Entry) SetName(name string) { + if common.IsValidUtf8String(name) { + e.Name = name + } else { + e.NameRaw = []byte(name) + } +} + +// SetNameBytes will check name for valid UTF-8 string, and set the appropriate +// field +func (e *Entry) SetNameBytes(name []byte) { + if !common.IsValidUtf8Btyes(name) { + e.NameRaw = name + } else { + e.Name = string(name) + } +} + +// GetName returns the string for the entry's name, regardless of the field stored in +func (e *Entry) GetName() string { + if len(e.NameRaw) > 0 { + return fmt.Sprintf("%s", e.NameRaw) + } + return e.Name +} + +// GetNameBytes returns the bytes for the entry's name, regardless of the field stored in +func (e *Entry) GetNameBytes() []byte { + if len(e.NameRaw) > 0 { + return e.NameRaw + } + return []byte(e.Name) +} diff --git a/tar/storage/entry_test.go b/tar/storage/entry_test.go index c797bca..90d103e 100644 --- a/tar/storage/entry_test.go +++ b/tar/storage/entry_test.go @@ -39,10 +39,10 @@ func TestEntries(t *testing.T) { func TestFile(t *testing.T) { f := Entry{ Type: FileType, - Name: "./hello.txt", Size: 100, Position: 2, } + f.SetName("./hello.txt") buf, err := json.Marshal(f) if err != nil { @@ -54,8 +54,37 @@ func TestFile(t *testing.T) { t.Fatal(err) } - if f.Name != f1.Name { - t.Errorf("expected Name %q, got %q", f.Name, f1.Name) + if f.GetName() != f1.GetName() { + t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName()) + } + if f.Size != f1.Size { + t.Errorf("expected Size %q, got %q", f.Size, f1.Size) + } + if f.Position != f1.Position { + t.Errorf("expected Position %q, got %q", f.Position, f1.Position) + } +} + +func TestFileRaw(t *testing.T) { + f := Entry{ + Type: FileType, + Size: 100, + Position: 2, + } + f.SetNameBytes([]byte{0x2E, 0x2F, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0xE4, 0x2E, 0x74, 0x78, 0x74}) + + buf, err := json.Marshal(f) + if err != nil { + t.Fatal(err) + } + + f1 := Entry{} + if err = json.Unmarshal(buf, &f1); err != nil { + t.Fatal(err) + } + + if f.GetName() != f1.GetName() { + t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName()) } if f.Size != f1.Size { t.Errorf("expected Size %q, got %q", f.Size, f1.Size) diff --git a/tar/storage/packer.go b/tar/storage/packer.go index a02a19a..1ea8208 100644 --- a/tar/storage/packer.go +++ b/tar/storage/packer.go @@ -6,6 +6,8 @@ import ( "errors" "io" "path/filepath" + + "github.com/vbatts/tar-split/tar/common" ) // ErrDuplicatePath occurs when a tar archive has more than one entry for the @@ -61,7 +63,7 @@ func (jup *jsonUnpacker) Next() (*Entry, error) { // check for dup name if e.Type == FileType { - cName := filepath.Clean(e.Name) + cName := filepath.Clean(e.GetName()) if _, ok := jup.seen[cName]; ok { return nil, ErrDuplicatePath } @@ -93,9 +95,17 @@ type jsonPacker struct { type seenNames map[string]struct{} func (jp *jsonPacker) AddEntry(e Entry) (int, error) { + // if Name is not valid utf8, switch it to raw first. + if e.Name != "" { + if !common.IsValidUtf8String(e.Name) { + e.NameRaw = []byte(e.Name) + e.Name = "" + } + } + // check early for dup name if e.Type == FileType { - cName := filepath.Clean(e.Name) + cName := filepath.Clean(e.GetName()) if _, ok := jp.seen[cName]; ok { return -1, ErrDuplicatePath }