diff --git a/tar/asm/assemble_test.go b/tar/asm/assemble_test.go index e7609c0..29b7a17 100644 --- a/tar/asm/assemble_test.go +++ b/tar/asm/assemble_test.go @@ -10,9 +10,9 @@ import ( "io/ioutil" "os" "testing" + "unicode/utf8" "github.com/vbatts/tar-split/archive/tar" - "github.com/vbatts/tar-split/tar/common" "github.com/vbatts/tar-split/tar/storage" ) @@ -37,7 +37,7 @@ func TestISO8859(t *testing.T) { break } fmt.Println(hdr.Name) - if !common.IsValidUtf8String(hdr.Name) { + if !utf8.ValidString(hdr.Name) { fmt.Println([]byte(hdr.Name)) } } diff --git a/tar/common/utf8.go b/tar/common/utf8.go deleted file mode 100644 index 568e929..0000000 --- a/tar/common/utf8.go +++ /dev/null @@ -1,22 +0,0 @@ -package common - -// IsValidUtf8String checks for in valid UTF-8 characters -func IsValidUtf8String(s string) bool { - return InvalidUtf8Index([]byte(s)) == -1 -} - -// IsValidUtf8Btyes checks for in valid UTF-8 characters -func IsValidUtf8Btyes(b []byte) bool { - return InvalidUtf8Index(b) == -1 -} - -// InvalidUtf8Index returns the offset of the first invalid UTF-8 character. -// Default is to return -1 for a wholly valid sequence. -func InvalidUtf8Index(b []byte) int { - for i, r := range string(b) { - if int(r) == 0xfffd { - return i - } - } - return -1 -} diff --git a/tar/common/utf8_test.go b/tar/common/utf8_test.go deleted file mode 100644 index 3cf81df..0000000 --- a/tar/common/utf8_test.go +++ /dev/null @@ -1,43 +0,0 @@ -package common - -import "testing" - -func TestStringValidation(t *testing.T) { - cases := []struct { - value string - result bool - offset int - }{ - {"aä\uFFFD本☺", false, 3}, - {"aä本☺", true, -1}, - } - - for _, c := range cases { - if i := InvalidUtf8Index([]byte(c.value)); i != c.offset { - t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i) - } - if got := IsValidUtf8String(c.value); got != c.result { - t.Errorf("string %q - expected %v, got %v", c.value, c.result, got) - } - } -} - -func TestBytesValidation(t *testing.T) { - cases := []struct { - value []byte - result bool - offset int - }{ - {[]byte{0xE4}, false, 0}, - {[]byte("aä本☺"), true, -1}, - } - - for _, c := range cases { - if i := InvalidUtf8Index(c.value); i != c.offset { - t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i) - } - if got := IsValidUtf8Btyes(c.value); got != c.result { - t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got) - } - } -} diff --git a/tar/storage/entry.go b/tar/storage/entry.go index b61758e..c91e7ea 100644 --- a/tar/storage/entry.go +++ b/tar/storage/entry.go @@ -1,6 +1,6 @@ package storage -import "github.com/vbatts/tar-split/tar/common" +import "unicode/utf8" // Entries is for sorting by Position type Entries []Entry @@ -44,7 +44,7 @@ type Entry struct { // SetName will check name for valid UTF-8 string, and set the appropriate // field. See https://github.com/vbatts/tar-split/issues/17 func (e *Entry) SetName(name string) { - if common.IsValidUtf8String(name) { + if utf8.ValidString(name) { e.Name = name } else { e.NameRaw = []byte(name) @@ -54,10 +54,10 @@ func (e *Entry) SetName(name string) { // SetNameBytes will check name for valid UTF-8 string, and set the appropriate // field func (e *Entry) SetNameBytes(name []byte) { - if !common.IsValidUtf8Btyes(name) { - e.NameRaw = name - } else { + if utf8.Valid(name) { e.Name = string(name) + } else { + e.NameRaw = name } } diff --git a/tar/storage/packer.go b/tar/storage/packer.go index 1ea8208..0c9d99b 100644 --- a/tar/storage/packer.go +++ b/tar/storage/packer.go @@ -6,8 +6,7 @@ import ( "errors" "io" "path/filepath" - - "github.com/vbatts/tar-split/tar/common" + "unicode/utf8" ) // ErrDuplicatePath occurs when a tar archive has more than one entry for the @@ -97,7 +96,7 @@ type seenNames map[string]struct{} func (jp *jsonPacker) AddEntry(e Entry) (int, error) { // if Name is not valid utf8, switch it to raw first. if e.Name != "" { - if !common.IsValidUtf8String(e.Name) { + if !utf8.ValidString(e.Name) { e.NameRaw = []byte(e.Name) e.Name = "" }