From 7e38cefd4bf1a3ee9fbd1f8ee72dafb55889a5b6 Mon Sep 17 00:00:00 2001 From: Vincent Batts Date: Fri, 25 Sep 2015 14:33:24 -0400 Subject: [PATCH 1/2] common: remove in favor of stdlib `unicode/utf8` --- tar/asm/assemble_test.go | 4 ++-- tar/common/utf8.go | 22 -------------------- tar/common/utf8_test.go | 43 ---------------------------------------- tar/storage/entry.go | 10 +++++----- tar/storage/packer.go | 5 ++--- 5 files changed, 9 insertions(+), 75 deletions(-) delete mode 100644 tar/common/utf8.go delete mode 100644 tar/common/utf8_test.go diff --git a/tar/asm/assemble_test.go b/tar/asm/assemble_test.go index e7609c0..29b7a17 100644 --- a/tar/asm/assemble_test.go +++ b/tar/asm/assemble_test.go @@ -10,9 +10,9 @@ import ( "io/ioutil" "os" "testing" + "unicode/utf8" "github.com/vbatts/tar-split/archive/tar" - "github.com/vbatts/tar-split/tar/common" "github.com/vbatts/tar-split/tar/storage" ) @@ -37,7 +37,7 @@ func TestISO8859(t *testing.T) { break } fmt.Println(hdr.Name) - if !common.IsValidUtf8String(hdr.Name) { + if !utf8.ValidString(hdr.Name) { fmt.Println([]byte(hdr.Name)) } } diff --git a/tar/common/utf8.go b/tar/common/utf8.go deleted file mode 100644 index 568e929..0000000 --- a/tar/common/utf8.go +++ /dev/null @@ -1,22 +0,0 @@ -package common - -// IsValidUtf8String checks for in valid UTF-8 characters -func IsValidUtf8String(s string) bool { - return InvalidUtf8Index([]byte(s)) == -1 -} - -// IsValidUtf8Btyes checks for in valid UTF-8 characters -func IsValidUtf8Btyes(b []byte) bool { - return InvalidUtf8Index(b) == -1 -} - -// InvalidUtf8Index returns the offset of the first invalid UTF-8 character. -// Default is to return -1 for a wholly valid sequence. -func InvalidUtf8Index(b []byte) int { - for i, r := range string(b) { - if int(r) == 0xfffd { - return i - } - } - return -1 -} diff --git a/tar/common/utf8_test.go b/tar/common/utf8_test.go deleted file mode 100644 index 3cf81df..0000000 --- a/tar/common/utf8_test.go +++ /dev/null @@ -1,43 +0,0 @@ -package common - -import "testing" - -func TestStringValidation(t *testing.T) { - cases := []struct { - value string - result bool - offset int - }{ - {"aä\uFFFD本☺", false, 3}, - {"aä本☺", true, -1}, - } - - for _, c := range cases { - if i := InvalidUtf8Index([]byte(c.value)); i != c.offset { - t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i) - } - if got := IsValidUtf8String(c.value); got != c.result { - t.Errorf("string %q - expected %v, got %v", c.value, c.result, got) - } - } -} - -func TestBytesValidation(t *testing.T) { - cases := []struct { - value []byte - result bool - offset int - }{ - {[]byte{0xE4}, false, 0}, - {[]byte("aä本☺"), true, -1}, - } - - for _, c := range cases { - if i := InvalidUtf8Index(c.value); i != c.offset { - t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i) - } - if got := IsValidUtf8Btyes(c.value); got != c.result { - t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got) - } - } -} diff --git a/tar/storage/entry.go b/tar/storage/entry.go index b61758e..c91e7ea 100644 --- a/tar/storage/entry.go +++ b/tar/storage/entry.go @@ -1,6 +1,6 @@ package storage -import "github.com/vbatts/tar-split/tar/common" +import "unicode/utf8" // Entries is for sorting by Position type Entries []Entry @@ -44,7 +44,7 @@ type Entry struct { // SetName will check name for valid UTF-8 string, and set the appropriate // field. See https://github.com/vbatts/tar-split/issues/17 func (e *Entry) SetName(name string) { - if common.IsValidUtf8String(name) { + if utf8.ValidString(name) { e.Name = name } else { e.NameRaw = []byte(name) @@ -54,10 +54,10 @@ func (e *Entry) SetName(name string) { // SetNameBytes will check name for valid UTF-8 string, and set the appropriate // field func (e *Entry) SetNameBytes(name []byte) { - if !common.IsValidUtf8Btyes(name) { - e.NameRaw = name - } else { + if utf8.Valid(name) { e.Name = string(name) + } else { + e.NameRaw = name } } diff --git a/tar/storage/packer.go b/tar/storage/packer.go index 1ea8208..0c9d99b 100644 --- a/tar/storage/packer.go +++ b/tar/storage/packer.go @@ -6,8 +6,7 @@ import ( "errors" "io" "path/filepath" - - "github.com/vbatts/tar-split/tar/common" + "unicode/utf8" ) // ErrDuplicatePath occurs when a tar archive has more than one entry for the @@ -97,7 +96,7 @@ type seenNames map[string]struct{} func (jp *jsonPacker) AddEntry(e Entry) (int, error) { // if Name is not valid utf8, switch it to raw first. if e.Name != "" { - if !common.IsValidUtf8String(e.Name) { + if !utf8.ValidString(e.Name) { e.NameRaw = []byte(e.Name) e.Name = "" } From 10250c25e0cb4b64f89280d0dde72feff25ef7ab Mon Sep 17 00:00:00 2001 From: Vincent Batts Date: Fri, 25 Sep 2015 14:35:12 -0400 Subject: [PATCH 2/2] tar/asm: remove useless test The iso-8859-1 archive is already tested round trip, and this test did not do anything really. --- tar/asm/assemble_test.go | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/tar/asm/assemble_test.go b/tar/asm/assemble_test.go index 29b7a17..3d0c99c 100644 --- a/tar/asm/assemble_test.go +++ b/tar/asm/assemble_test.go @@ -10,39 +10,10 @@ import ( "io/ioutil" "os" "testing" - "unicode/utf8" - "github.com/vbatts/tar-split/archive/tar" "github.com/vbatts/tar-split/tar/storage" ) -func TestISO8859(t *testing.T) { - fh, err := os.Open("./testdata/iso-8859.tar.gz") - if err != nil { - t.Fatal(err) - } - defer fh.Close() - gzRdr, err := gzip.NewReader(fh) - if err != nil { - t.Fatal(err) - } - defer gzRdr.Close() - tr := tar.NewReader(gzRdr) - for { - hdr, err := tr.Next() - if err != nil { - if err != io.EOF { - t.Error(err) - } - break - } - fmt.Println(hdr.Name) - if !utf8.ValidString(hdr.Name) { - fmt.Println([]byte(hdr.Name)) - } - } -} - var entries = []struct { Entry storage.Entry Body []byte