Merge pull request #18 from vbatts/iso-8859-1

Iso 8859 1
2015-09-23 15:47:23 -04:00 · 2015-09-23 15:47:23 -04:00 · 7f56c08c48
commit 7f56c08c48
parent 7384cf1827 cde639172f
9 changed files with 215 additions and 17 deletions
--- a/tar/asm/assemble.go
+++ b/tar/asm/assemble.go
@ -39,7 +39,7 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose
 				if entry.Size == 0 {
 					continue
 				}
-				fh, err := fg.Get(entry.Name)
+				fh, err := fg.Get(entry.GetName())
 				if err != nil {
 					pw.CloseWithError(err)
 					return
@ -56,7 +56,7 @@ func NewOutputTarStream(fg storage.FileGetter, up storage.Unpacker) io.ReadClose
 					// but since it's coming through the PipeReader, the context of
 					// _which_ file would be lost...
 					fh.Close()
-					pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.Name))
+					pw.CloseWithError(fmt.Errorf("file integrity checksum failed for %q", entry.GetName()))
 					return
 				}
 				fh.Close()
--- a/tar/asm/assemble_test.go
+++ b/tar/asm/assemble_test.go
@ -11,9 +11,38 @@ import (
 	"os"
 	"testing"

+	"github.com/vbatts/tar-split/archive/tar"
+	"github.com/vbatts/tar-split/tar/common"
 	"github.com/vbatts/tar-split/tar/storage"
 )

+func TestISO8859(t *testing.T) {
+	fh, err := os.Open("./testdata/iso-8859.tar.gz")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer fh.Close()
+	gzRdr, err := gzip.NewReader(fh)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer gzRdr.Close()
+	tr := tar.NewReader(gzRdr)
+	for {
+		hdr, err := tr.Next()
+		if err != nil {
+			if err != io.EOF {
+				t.Error(err)
+			}
+			break
+		}
+		fmt.Println(hdr.Name)
+		if !common.IsValidUtf8String(hdr.Name) {
+			fmt.Println([]byte(hdr.Name))
+		}
+	}
+}
+
 var entries = []struct {
 	Entry storage.Entry
 	Body  []byte
@ -36,6 +65,15 @@ var entries = []struct {
 		},
 		Body: []byte("café con leche, por favor"),
 	},
+	{
+		Entry: storage.Entry{
+			Type:    storage.FileType,
+			NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4}, // this is invalid UTF-8. Just checking the round trip.
+			Payload: []byte{126, 72, 89, 239, 230, 252, 160, 187},
+			Size:    26,
+		},
+		Body: []byte("café con leche, por favor"),
+	},
 }
 var entriesMangled = []struct {
 	Entry storage.Entry
@ -61,6 +99,15 @@ var entriesMangled = []struct {
 		// san not con
 		Body: []byte("café sans leche, por favor"),
 	},
+	{
+		Entry: storage.Entry{
+			Type:    storage.FileType,
+			NameRaw: []byte{0x66, 0x69, 0x6c, 0x65, 0x2d, 0xe4},
+			Payload: []byte{127, 72, 89, 239, 230, 252, 160, 187},
+			Size:    26,
+		},
+		Body: []byte("café con leche, por favor"),
+	},
 }

 func TestTarStreamMangledGetterPutter(t *testing.T) {
@ -69,19 +116,19 @@ func TestTarStreamMangledGetterPutter(t *testing.T) {
 	// first lets prep a GetPutter and Packer
 	for i := range entries {
 		if entries[i].Entry.Type == storage.FileType {
-			j, csum, err := fgp.Put(entries[i].Entry.Name, bytes.NewBuffer(entries[i].Body))
+			j, csum, err := fgp.Put(entries[i].Entry.GetName(), bytes.NewBuffer(entries[i].Body))
 			if err != nil {
 				t.Error(err)
 			}
 			if j != entries[i].Entry.Size {
 				t.Errorf("size %q: expected %d; got %d",
-					entries[i].Entry.Name,
+					entries[i].Entry.GetName(),
 					entries[i].Entry.Size,
 					j)
 			}
 			if !bytes.Equal(csum, entries[i].Entry.Payload) {
 				t.Errorf("checksum %q: expected %v; got %v",
-					entries[i].Entry.Name,
+					entries[i].Entry.GetName(),
 					entries[i].Entry.Payload,
 					csum)
 			}
@ -90,7 +137,7 @@ func TestTarStreamMangledGetterPutter(t *testing.T) {

 	for _, e := range entriesMangled {
 		if e.Entry.Type == storage.FileType {
-			rdr, err := fgp.Get(e.Entry.Name)
+			rdr, err := fgp.Get(e.Entry.GetName())
 			if err != nil {
 				t.Error(err)
 			}
@ -105,7 +152,7 @@ func TestTarStreamMangledGetterPutter(t *testing.T) {
 			if bytes.Equal(csum, e.Entry.Payload) {
 				t.Errorf("wrote %d bytes. checksum for %q should not have matched! %v",
 					i,
-					e.Entry.Name,
+					e.Entry.GetName(),
 					csum)
 			}
 		}
@ -121,6 +168,7 @@ func TestTarStream(t *testing.T) {
 		{"./testdata/t.tar.gz", "1eb237ff69bca6e22789ecb05b45d35ca307adbd", 10240},
 		{"./testdata/longlink.tar.gz", "d9f6babe107b7247953dff6b5b5ae31a3a880add", 20480},
 		{"./testdata/fatlonglink.tar.gz", "8537f03f89aeef537382f8b0bb065d93e03b0be8", 26234880},
+		{"./testdata/iso-8859.tar.gz", "ddafa51cb03c74ec117ab366ee2240d13bba1ec3", 10240},
 	}

 	for _, tc := range testCases {
@ -163,7 +211,7 @@ func TestTarStream(t *testing.T) {
 			t.Fatalf("checksum of tar: expected %s; got %x", tc.expectedSHA1Sum, h0.Sum(nil))
 		}

-		t.Logf("%s", w.String()) // if we fail, then show the packed info
+		//t.Logf("%s", w.String()) // if we fail, then show the packed info

 		// If we've made it this far, then we'll turn it around and create a tar
 		// stream from the packed metadata and buffered file contents.
--- a/tar/asm/disassemble.go
+++ b/tar/asm/disassemble.go
@ -92,13 +92,16 @@ func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io
 				}
 			}

-			// File entries added, regardless of size
-			_, err = p.AddEntry(storage.Entry{
+			entry := storage.Entry{
 				Type:    storage.FileType,
-				Name:    hdr.Name,
 				Size:    hdr.Size,
 				Payload: csum,
-			})
+			}
+			// For proper marshalling of non-utf8 characters
+			entry.SetName(hdr.Name)
+
+			// File entries added, regardless of size
+			_, err = p.AddEntry(entry)
 			if err != nil {
 				pW.CloseWithError(err)
 				return
--- a/tar/asm/testdata/iso-8859.tar.gz
+++ b/tar/asm/testdata/iso-8859.tar.gz
--- a/tar/common/utf8.go
+++ b/tar/common/utf8.go
@ -0,0 +1,22 @@
+package common
+
+// IsValidUtf8String checks for in valid UTF-8 characters
+func IsValidUtf8String(s string) bool {
+	return InvalidUtf8Index([]byte(s)) == -1
+}
+
+// IsValidUtf8Btyes checks for in valid UTF-8 characters
+func IsValidUtf8Btyes(b []byte) bool {
+	return InvalidUtf8Index(b) == -1
+}
+
+// InvalidUtf8Index returns the offset of the first invalid UTF-8 character.
+// Default is to return -1 for a wholly valid sequence.
+func InvalidUtf8Index(b []byte) int {
+	for i, r := range string(b) {
+		if int(r) == 0xfffd {
+			return i
+		}
+	}
+	return -1
+}
--- a/tar/common/utf8_test.go
+++ b/tar/common/utf8_test.go
@ -0,0 +1,43 @@
+package common
+
+import "testing"
+
+func TestStringValidation(t *testing.T) {
+	cases := []struct {
+		value  string
+		result bool
+		offset int
+	}{
+		{"aä\uFFFD本☺", false, 3},
+		{"aä本☺", true, -1},
+	}
+
+	for _, c := range cases {
+		if i := InvalidUtf8Index([]byte(c.value)); i != c.offset {
+			t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i)
+		}
+		if got := IsValidUtf8String(c.value); got != c.result {
+			t.Errorf("string %q - expected %v, got %v", c.value, c.result, got)
+		}
+	}
+}
+
+func TestBytesValidation(t *testing.T) {
+	cases := []struct {
+		value  []byte
+		result bool
+		offset int
+	}{
+		{[]byte{0xE4}, false, 0},
+		{[]byte("aä本☺"), true, -1},
+	}
+
+	for _, c := range cases {
+		if i := InvalidUtf8Index(c.value); i != c.offset {
+			t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i)
+		}
+		if got := IsValidUtf8Btyes(c.value); got != c.result {
+			t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got)
+		}
+	}
+}
--- a/tar/storage/entry.go
+++ b/tar/storage/entry.go
@ -1,5 +1,11 @@
 package storage

+import (
+	"fmt"
+
+	"github.com/vbatts/tar-split/tar/common"
+)
+
 // Entries is for sorting by Position
 type Entries []Entry

@ -33,7 +39,44 @@ const (
 type Entry struct {
 	Type     Type   `json:"type"`
 	Name     string `json:"name,omitempty"`
+	NameRaw  []byte `json:"name_raw,omitempty"`
 	Size     int64  `json:"size,omitempty"`
 	Payload  []byte `json:"payload"` // SegmentType stores payload here; FileType stores crc64 checksum here;
 	Position int    `json:"position"`
 }
+
+// SetName will check name for valid UTF-8 string, and set the appropriate
+// field. See https://github.com/vbatts/tar-split/issues/17
+func (e *Entry) SetName(name string) {
+	if common.IsValidUtf8String(name) {
+		e.Name = name
+	} else {
+		e.NameRaw = []byte(name)
+	}
+}
+
+// SetNameBytes will check name for valid UTF-8 string, and set the appropriate
+// field
+func (e *Entry) SetNameBytes(name []byte) {
+	if !common.IsValidUtf8Btyes(name) {
+		e.NameRaw = name
+	} else {
+		e.Name = string(name)
+	}
+}
+
+// GetName returns the string for the entry's name, regardless of the field stored in
+func (e *Entry) GetName() string {
+	if len(e.NameRaw) > 0 {
+		return fmt.Sprintf("%s", e.NameRaw)
+	}
+	return e.Name
+}
+
+// GetNameBytes returns the bytes for the entry's name, regardless of the field stored in
+func (e *Entry) GetNameBytes() []byte {
+	if len(e.NameRaw) > 0 {
+		return e.NameRaw
+	}
+	return []byte(e.Name)
+}
--- a/tar/storage/entry_test.go
+++ b/tar/storage/entry_test.go
@ -39,10 +39,10 @@ func TestEntries(t *testing.T) {
 func TestFile(t *testing.T) {
 	f := Entry{
 		Type:     FileType,
-		Name:     "./hello.txt",
 		Size:     100,
 		Position: 2,
 	}
+	f.SetName("./hello.txt")

 	buf, err := json.Marshal(f)
 	if err != nil {
@ -54,8 +54,37 @@ func TestFile(t *testing.T) {
 		t.Fatal(err)
 	}

-	if f.Name != f1.Name {
-		t.Errorf("expected Name %q, got %q", f.Name, f1.Name)
+	if f.GetName() != f1.GetName() {
+		t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName())
+	}
+	if f.Size != f1.Size {
+		t.Errorf("expected Size %q, got %q", f.Size, f1.Size)
+	}
+	if f.Position != f1.Position {
+		t.Errorf("expected Position %q, got %q", f.Position, f1.Position)
+	}
+}
+
+func TestFileRaw(t *testing.T) {
+	f := Entry{
+		Type:     FileType,
+		Size:     100,
+		Position: 2,
+	}
+	f.SetNameBytes([]byte{0x2E, 0x2F, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0xE4, 0x2E, 0x74, 0x78, 0x74})
+
+	buf, err := json.Marshal(f)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	f1 := Entry{}
+	if err = json.Unmarshal(buf, &f1); err != nil {
+		t.Fatal(err)
+	}
+
+	if f.GetName() != f1.GetName() {
+		t.Errorf("expected Name %q, got %q", f.GetName(), f1.GetName())
 	}
 	if f.Size != f1.Size {
 		t.Errorf("expected Size %q, got %q", f.Size, f1.Size)
--- a/tar/storage/packer.go
+++ b/tar/storage/packer.go
@ -6,6 +6,8 @@ import (
 	"errors"
 	"io"
 	"path/filepath"
+
+	"github.com/vbatts/tar-split/tar/common"
 )

 // ErrDuplicatePath occurs when a tar archive has more than one entry for the
@ -61,7 +63,7 @@ func (jup *jsonUnpacker) Next() (*Entry, error) {

 	// check for dup name
 	if e.Type == FileType {
-		cName := filepath.Clean(e.Name)
+		cName := filepath.Clean(e.GetName())
 		if _, ok := jup.seen[cName]; ok {
 			return nil, ErrDuplicatePath
 		}
@ -93,9 +95,17 @@ type jsonPacker struct {
 type seenNames map[string]struct{}

 func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
+	// if Name is not valid utf8, switch it to raw first.
+	if e.Name != "" {
+		if !common.IsValidUtf8String(e.Name) {
+			e.NameRaw = []byte(e.Name)
+			e.Name = ""
+		}
+	}
+
 	// check early for dup name
 	if e.Type == FileType {
-		cName := filepath.Clean(e.Name)
+		cName := filepath.Clean(e.GetName())
 		if _, ok := jp.seen[cName]; ok {
 			return -1, ErrDuplicatePath
 		}