1
0
Fork 0

Merge pull request #20 from vbatts/unicode-utf8

remove common in favor of stdlib `unicode/utf8`
This commit is contained in:
Vincent Batts 2015-09-25 14:38:49 -04:00
commit c955161e57
5 changed files with 7 additions and 102 deletions

View File

@ -11,38 +11,9 @@ import (
"os"
"testing"
"github.com/vbatts/tar-split/archive/tar"
"github.com/vbatts/tar-split/tar/common"
"github.com/vbatts/tar-split/tar/storage"
)
// TestISO8859 iterates over every header in ./testdata/iso-8859.tar.gz and
// prints each entry name. When a name is rejected by
// common.IsValidUtf8String, its raw bytes are printed as well so the
// offending encoding can be inspected.
func TestISO8859(t *testing.T) {
	archive, err := os.Open("./testdata/iso-8859.tar.gz")
	if err != nil {
		t.Fatal(err)
	}
	defer archive.Close()

	gz, err := gzip.NewReader(archive)
	if err != nil {
		t.Fatal(err)
	}
	defer gz.Close()

	reader := tar.NewReader(gz)
	for {
		hdr, nextErr := tr(reader)
		if nextErr == io.EOF {
			// Normal end of archive.
			break
		}
		if nextErr != nil {
			// Any other error is reported, and iteration stops.
			t.Error(nextErr)
			break
		}

		name := hdr.Name
		fmt.Println(name)
		if common.IsValidUtf8String(name) {
			continue
		}
		fmt.Println([]byte(name))
	}
}

// tr advances the tar reader to the next entry.
func tr(r *tar.Reader) (*tar.Header, error) {
	return r.Next()
}
var entries = []struct {
Entry storage.Entry
Body []byte

View File

@ -1,22 +0,0 @@
package common
// IsValidUtf8String reports whether s is free of invalid UTF-8, i.e. whether
// InvalidUtf8Index finds no offending offset in its bytes.
func IsValidUtf8String(s string) bool {
	if InvalidUtf8Index([]byte(s)) != -1 {
		return false
	}
	return true
}
// IsValidUtf8Btyes reports whether b is free of invalid UTF-8, i.e. whether
// InvalidUtf8Index finds no offending offset in it.
func IsValidUtf8Btyes(b []byte) bool {
	if InvalidUtf8Index(b) != -1 {
		return false
	}
	return true
}
// InvalidUtf8Index returns the byte offset of the first rune in b that
// decodes to U+FFFD (the Unicode replacement character), or -1 when there is
// no such rune.
//
// Ranging over a string yields U+FFFD for bytes that are not valid UTF-8, so
// this locates the first invalid sequence. A correctly encoded U+FFFD in the
// input is reported in the same way, because only the decoded rune value is
// inspected.
func InvalidUtf8Index(b []byte) int {
	const replacement rune = 0xfffd

	text := string(b)
	for offset, r := range text {
		if r != replacement {
			continue
		}
		return offset
	}
	return -1
}

View File

@ -1,43 +0,0 @@
package common
import "testing"
// TestStringValidation checks InvalidUtf8Index and IsValidUtf8String against
// string inputs: one containing U+FFFD (expected to be flagged at byte offset
// 3) and one consisting only of ordinary multi-byte characters.
func TestStringValidation(t *testing.T) {
	type stringCase struct {
		value  string
		result bool
		offset int
	}
	table := []stringCase{
		{value: "aä\uFFFD本☺", result: false, offset: 3},
		{value: "aä本☺", result: true, offset: -1},
	}

	for idx := range table {
		tc := table[idx]

		gotOffset := InvalidUtf8Index([]byte(tc.value))
		if gotOffset != tc.offset {
			t.Errorf("string %q - offset expected %d, got %d", tc.value, tc.offset, gotOffset)
		}

		gotValid := IsValidUtf8String(tc.value)
		if gotValid != tc.result {
			t.Errorf("string %q - expected %v, got %v", tc.value, tc.result, gotValid)
		}
	}
}
// TestBytesValidation checks InvalidUtf8Index and IsValidUtf8Btyes against
// byte-slice inputs: a lone 0xE4 byte (expected to be flagged at offset 0)
// and the bytes of a string of ordinary multi-byte characters.
func TestBytesValidation(t *testing.T) {
	type bytesCase struct {
		value  []byte
		result bool
		offset int
	}
	table := []bytesCase{
		{value: []byte{0xE4}, result: false, offset: 0},
		{value: []byte("aä本☺"), result: true, offset: -1},
	}

	for idx := range table {
		tc := table[idx]

		gotOffset := InvalidUtf8Index(tc.value)
		if gotOffset != tc.offset {
			t.Errorf("bytes %q - offset expected %d, got %d", tc.value, tc.offset, gotOffset)
		}

		gotValid := IsValidUtf8Btyes(tc.value)
		if gotValid != tc.result {
			t.Errorf("bytes %q - expected %v, got %v", tc.value, tc.result, gotValid)
		}
	}
}

View File

@ -1,6 +1,6 @@
package storage
import "github.com/vbatts/tar-split/tar/common"
import "unicode/utf8"
// Entries is for sorting by Position
type Entries []Entry
@ -44,7 +44,7 @@ type Entry struct {
// SetName checks whether name is valid UTF-8 and stores it in Name if so, or
// in NameRaw otherwise. See https://github.com/vbatts/tar-split/issues/17
func (e *Entry) SetName(name string) {
if common.IsValidUtf8String(name) {
if utf8.ValidString(name) {
e.Name = name
} else {
e.NameRaw = []byte(name)
@ -54,10 +54,10 @@ func (e *Entry) SetName(name string) {
// SetNameBytes checks whether name is valid UTF-8 and stores it in Name if so,
// or in NameRaw otherwise.
func (e *Entry) SetNameBytes(name []byte) {
if !common.IsValidUtf8Btyes(name) {
e.NameRaw = name
} else {
if utf8.Valid(name) {
e.Name = string(name)
} else {
e.NameRaw = name
}
}

View File

@ -6,8 +6,7 @@ import (
"errors"
"io"
"path/filepath"
"github.com/vbatts/tar-split/tar/common"
"unicode/utf8"
)
// ErrDuplicatePath occurs when a tar archive has more than one entry for the
@ -97,7 +96,7 @@ type seenNames map[string]struct{}
func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
// if Name is not valid utf8, switch it to raw first.
if e.Name != "" {
if !common.IsValidUtf8String(e.Name) {
if !utf8.ValidString(e.Name) {
e.NameRaw = []byte(e.Name)
e.Name = ""
}