1
0
Fork 0
forked from mirrors/tar-split

common: remove in favor of stdlib unicode/utf8

This commit is contained in:
Vincent Batts 2015-09-25 14:33:24 -04:00
parent 7ef16e6f67
commit 7e38cefd4b
5 changed files with 9 additions and 75 deletions

View file

@ -10,9 +10,9 @@ import (
"io/ioutil" "io/ioutil"
"os" "os"
"testing" "testing"
"unicode/utf8"
"github.com/vbatts/tar-split/archive/tar" "github.com/vbatts/tar-split/archive/tar"
"github.com/vbatts/tar-split/tar/common"
"github.com/vbatts/tar-split/tar/storage" "github.com/vbatts/tar-split/tar/storage"
) )
@ -37,7 +37,7 @@ func TestISO8859(t *testing.T) {
break break
} }
fmt.Println(hdr.Name) fmt.Println(hdr.Name)
if !common.IsValidUtf8String(hdr.Name) { if !utf8.ValidString(hdr.Name) {
fmt.Println([]byte(hdr.Name)) fmt.Println([]byte(hdr.Name))
} }
} }

View file

@ -1,22 +0,0 @@
package common
// IsValidUtf8String reports whether s is free of invalid UTF-8 as judged by
// InvalidUtf8Index: it returns true only when that helper finds no offending
// offset (-1) in the bytes of s.
func IsValidUtf8String(s string) bool {
	firstBad := InvalidUtf8Index([]byte(s))
	if firstBad != -1 {
		return false
	}
	return true
}
// IsValidUtf8Btyes reports whether b is free of invalid UTF-8 as judged by
// InvalidUtf8Index: it returns true only when that helper finds no offending
// offset (-1) in b.
func IsValidUtf8Btyes(b []byte) bool {
	firstBad := InvalidUtf8Index(b)
	if firstBad != -1 {
		return false
	}
	return true
}
// InvalidUtf8Index returns the byte offset of the first position in b that
// decodes to the Unicode replacement character U+FFFD, or -1 when there is
// no such position.
//
// Ranging over a string yields U+FFFD for malformed UTF-8, which is how
// invalid input is detected here. Note that a correctly encoded literal
// U+FFFD in the input is reported as well, since only the decoded rune value
// is compared.
func InvalidUtf8Index(b []byte) int {
	const replacement = 0xfffd
	text := string(b)
	for offset, char := range text {
		if char != replacement {
			continue
		}
		return offset
	}
	return -1
}

View file

@ -1,43 +0,0 @@
package common
import "testing"
// TestStringValidation feeds string inputs to InvalidUtf8Index and
// IsValidUtf8String and compares the reported offset and validity against
// the expectations in the table.
func TestStringValidation(t *testing.T) {
	type testCase struct {
		value  string
		result bool
		offset int
	}
	table := []testCase{
		// U+FFFD follows "a" (1 byte) and "ä" (2 bytes), hence offset 3.
		{value: "aä\uFFFD本☺", result: false, offset: 3},
		{value: "aä本☺", result: true, offset: -1},
	}
	for idx := range table {
		tc := table[idx]

		gotOffset := InvalidUtf8Index([]byte(tc.value))
		if gotOffset != tc.offset {
			t.Errorf("string %q - offset expected %d, got %d", tc.value, tc.offset, gotOffset)
		}

		gotValid := IsValidUtf8String(tc.value)
		if gotValid != tc.result {
			t.Errorf("string %q - expected %v, got %v", tc.value, tc.result, gotValid)
		}
	}
}
// TestBytesValidation feeds byte-slice inputs to InvalidUtf8Index and
// IsValidUtf8Btyes and compares the reported offset and validity against
// the expectations in the table.
func TestBytesValidation(t *testing.T) {
	type testCase struct {
		value  []byte
		result bool
		offset int
	}
	table := []testCase{
		// A lone 0xE4 byte is expected to be flagged at offset 0.
		{value: []byte{0xE4}, result: false, offset: 0},
		{value: []byte("aä本☺"), result: true, offset: -1},
	}
	for idx := range table {
		tc := table[idx]

		gotOffset := InvalidUtf8Index(tc.value)
		if gotOffset != tc.offset {
			t.Errorf("bytes %q - offset expected %d, got %d", tc.value, tc.offset, gotOffset)
		}

		gotValid := IsValidUtf8Btyes(tc.value)
		if gotValid != tc.result {
			t.Errorf("bytes %q - expected %v, got %v", tc.value, tc.result, gotValid)
		}
	}
}

View file

@ -1,6 +1,6 @@
package storage package storage
import "github.com/vbatts/tar-split/tar/common" import "unicode/utf8"
// Entries is for sorting by Position // Entries is for sorting by Position
type Entries []Entry type Entries []Entry
@ -44,7 +44,7 @@ type Entry struct {
// SetName will check name for valid UTF-8 string, and set the appropriate // SetName will check name for valid UTF-8 string, and set the appropriate
// field. See https://github.com/vbatts/tar-split/issues/17 // field. See https://github.com/vbatts/tar-split/issues/17
func (e *Entry) SetName(name string) { func (e *Entry) SetName(name string) {
if common.IsValidUtf8String(name) { if utf8.ValidString(name) {
e.Name = name e.Name = name
} else { } else {
e.NameRaw = []byte(name) e.NameRaw = []byte(name)
@ -54,10 +54,10 @@ func (e *Entry) SetName(name string) {
// SetNameBytes will check name for valid UTF-8 string, and set the appropriate // SetNameBytes will check name for valid UTF-8 string, and set the appropriate
// field // field
func (e *Entry) SetNameBytes(name []byte) { func (e *Entry) SetNameBytes(name []byte) {
if !common.IsValidUtf8Btyes(name) { if utf8.Valid(name) {
e.NameRaw = name
} else {
e.Name = string(name) e.Name = string(name)
} else {
e.NameRaw = name
} }
} }

View file

@ -6,8 +6,7 @@ import (
"errors" "errors"
"io" "io"
"path/filepath" "path/filepath"
"unicode/utf8"
"github.com/vbatts/tar-split/tar/common"
) )
// ErrDuplicatePath occurs when a tar archive has more than one entry for the // ErrDuplicatePath occurs when a tar archive has more than one entry for the
@ -97,7 +96,7 @@ type seenNames map[string]struct{}
func (jp *jsonPacker) AddEntry(e Entry) (int, error) { func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
// if Name is not valid utf8, switch it to raw first. // if Name is not valid utf8, switch it to raw first.
if e.Name != "" { if e.Name != "" {
if !common.IsValidUtf8String(e.Name) { if !utf8.ValidString(e.Name) {
e.NameRaw = []byte(e.Name) e.NameRaw = []byte(e.Name)
e.Name = "" e.Name = ""
} }