Merge pull request #20 from vbatts/unicode-utf8

Remove the `tar/common` package in favor of the stdlib `unicode/utf8`
2015-09-25 14:38:49 -04:00 · 2015-09-25 14:38:49 -04:00 · c955161e57
commit c955161e57
parent 7ef16e6f67 10250c25e0
5 changed files with 7 additions and 102 deletions
--- a/tar/asm/assemble_test.go
+++ b/tar/asm/assemble_test.go
@ -11,38 +11,9 @@ import (
 	"os"
 	"testing"

-	"github.com/vbatts/tar-split/archive/tar"
-	"github.com/vbatts/tar-split/tar/common"
 	"github.com/vbatts/tar-split/tar/storage"
 )

-func TestISO8859(t *testing.T) {
-	fh, err := os.Open("./testdata/iso-8859.tar.gz")
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer fh.Close()
-	gzRdr, err := gzip.NewReader(fh)
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer gzRdr.Close()
-	tr := tar.NewReader(gzRdr)
-	for {
-		hdr, err := tr.Next()
-		if err != nil {
-			if err != io.EOF {
-				t.Error(err)
-			}
-			break
-		}
-		fmt.Println(hdr.Name)
-		if !common.IsValidUtf8String(hdr.Name) {
-			fmt.Println([]byte(hdr.Name))
-		}
-	}
-}
-
 var entries = []struct {
 	Entry storage.Entry
 	Body  []byte
--- a/tar/common/utf8.go
+++ b/tar/common/utf8.go
@ -1,22 +0,0 @@
-package common
-
-// IsValidUtf8String checks for invalid UTF-8 characters
-func IsValidUtf8String(s string) bool {
-	return InvalidUtf8Index([]byte(s)) == -1
-}
-
-// IsValidUtf8Btyes checks for invalid UTF-8 characters
-func IsValidUtf8Btyes(b []byte) bool {
-	return InvalidUtf8Index(b) == -1
-}
-
-// InvalidUtf8Index returns the offset of the first invalid UTF-8 character.
-// Default is to return -1 for a wholly valid sequence.
-func InvalidUtf8Index(b []byte) int {
-	for i, r := range string(b) {
-		if int(r) == 0xfffd {
-			return i
-		}
-	}
-	return -1
-}
--- a/tar/common/utf8_test.go
+++ b/tar/common/utf8_test.go
@ -1,43 +0,0 @@
-package common
-
-import "testing"
-
-func TestStringValidation(t *testing.T) {
-	cases := []struct {
-		value  string
-		result bool
-		offset int
-	}{
-		{"aä\uFFFD本☺", false, 3},
-		{"aä本☺", true, -1},
-	}
-
-	for _, c := range cases {
-		if i := InvalidUtf8Index([]byte(c.value)); i != c.offset {
-			t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i)
-		}
-		if got := IsValidUtf8String(c.value); got != c.result {
-			t.Errorf("string %q - expected %v, got %v", c.value, c.result, got)
-		}
-	}
-}
-
-func TestBytesValidation(t *testing.T) {
-	cases := []struct {
-		value  []byte
-		result bool
-		offset int
-	}{
-		{[]byte{0xE4}, false, 0},
-		{[]byte("aä本☺"), true, -1},
-	}
-
-	for _, c := range cases {
-		if i := InvalidUtf8Index(c.value); i != c.offset {
-			t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i)
-		}
-		if got := IsValidUtf8Btyes(c.value); got != c.result {
-			t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got)
-		}
-	}
-}
--- a/tar/storage/entry.go
+++ b/tar/storage/entry.go
@ -1,6 +1,6 @@
 package storage

-import "github.com/vbatts/tar-split/tar/common"
+import "unicode/utf8"

 // Entries is for sorting by Position
 type Entries []Entry
@ -44,7 +44,7 @@ type Entry struct {
 // SetName will check name for valid UTF-8 string, and set the appropriate
 // field. See https://github.com/vbatts/tar-split/issues/17
 func (e *Entry) SetName(name string) {
-	if common.IsValidUtf8String(name) {
+	if utf8.ValidString(name) {
 		e.Name = name
 	} else {
 		e.NameRaw = []byte(name)
@ -54,10 +54,10 @@ func (e *Entry) SetName(name string) {
 // SetNameBytes will check name for valid UTF-8 string, and set the appropriate
 // field
 func (e *Entry) SetNameBytes(name []byte) {
-	if !common.IsValidUtf8Btyes(name) {
-		e.NameRaw = name
-	} else {
+	if utf8.Valid(name) {
 		e.Name = string(name)
+	} else {
+		e.NameRaw = name
 	}
 }

--- a/tar/storage/packer.go
+++ b/tar/storage/packer.go
@ -6,8 +6,7 @@ import (
 	"errors"
 	"io"
 	"path/filepath"
-
-	"github.com/vbatts/tar-split/tar/common"
+	"unicode/utf8"
 )

 // ErrDuplicatePath occurs when a tar archive has more than one entry for the
@ -97,7 +96,7 @@ type seenNames map[string]struct{}
 func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
 	// if Name is not valid utf8, switch it to raw first.
 	if e.Name != "" {
-		if !common.IsValidUtf8String(e.Name) {
+		if !utf8.ValidString(e.Name) {
 			e.NameRaw = []byte(e.Name)
 			e.Name = ""
 		}