mirror of
https://github.com/vbatts/tar-split.git
synced 2024-12-18 11:36:30 +00:00
Merge pull request #20 from vbatts/unicode-utf8
remove common in favor of stdlib `unicode/utf8`
This commit is contained in:
commit
c955161e57
5 changed files with 7 additions and 102 deletions
|
@ -11,38 +11,9 @@ import (
|
|||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/vbatts/tar-split/archive/tar"
|
||||
"github.com/vbatts/tar-split/tar/common"
|
||||
"github.com/vbatts/tar-split/tar/storage"
|
||||
)
|
||||
|
||||
func TestISO8859(t *testing.T) {
|
||||
fh, err := os.Open("./testdata/iso-8859.tar.gz")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer fh.Close()
|
||||
gzRdr, err := gzip.NewReader(fh)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer gzRdr.Close()
|
||||
tr := tar.NewReader(gzRdr)
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if err != nil {
|
||||
if err != io.EOF {
|
||||
t.Error(err)
|
||||
}
|
||||
break
|
||||
}
|
||||
fmt.Println(hdr.Name)
|
||||
if !common.IsValidUtf8String(hdr.Name) {
|
||||
fmt.Println([]byte(hdr.Name))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var entries = []struct {
|
||||
Entry storage.Entry
|
||||
Body []byte
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
package common
|
||||
|
||||
// IsValidUtf8String reports whether s is free of invalid UTF-8 characters
|
||||
func IsValidUtf8String(s string) bool {
|
||||
return InvalidUtf8Index([]byte(s)) == -1
|
||||
}
|
||||
|
||||
// IsValidUtf8Btyes reports whether b is free of invalid UTF-8 characters
|
||||
func IsValidUtf8Btyes(b []byte) bool {
|
||||
return InvalidUtf8Index(b) == -1
|
||||
}
|
||||
|
||||
// InvalidUtf8Index returns the offset of the first invalid UTF-8 character.
|
||||
// Default is to return -1 for a wholly valid sequence.
|
||||
func InvalidUtf8Index(b []byte) int {
|
||||
for i, r := range string(b) {
|
||||
if int(r) == 0xfffd {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
package common
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestStringValidation(t *testing.T) {
|
||||
cases := []struct {
|
||||
value string
|
||||
result bool
|
||||
offset int
|
||||
}{
|
||||
{"aä\uFFFD本☺", false, 3},
|
||||
{"aä本☺", true, -1},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
if i := InvalidUtf8Index([]byte(c.value)); i != c.offset {
|
||||
t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i)
|
||||
}
|
||||
if got := IsValidUtf8String(c.value); got != c.result {
|
||||
t.Errorf("string %q - expected %v, got %v", c.value, c.result, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytesValidation(t *testing.T) {
|
||||
cases := []struct {
|
||||
value []byte
|
||||
result bool
|
||||
offset int
|
||||
}{
|
||||
{[]byte{0xE4}, false, 0},
|
||||
{[]byte("aä本☺"), true, -1},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
if i := InvalidUtf8Index(c.value); i != c.offset {
|
||||
t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i)
|
||||
}
|
||||
if got := IsValidUtf8Btyes(c.value); got != c.result {
|
||||
t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
package storage
|
||||
|
||||
import "github.com/vbatts/tar-split/tar/common"
|
||||
import "unicode/utf8"
|
||||
|
||||
// Entries is for sorting by Position
|
||||
type Entries []Entry
|
||||
|
@ -44,7 +44,7 @@ type Entry struct {
|
|||
// SetName will check name for valid UTF-8 string, and set the appropriate
|
||||
// field. See https://github.com/vbatts/tar-split/issues/17
|
||||
func (e *Entry) SetName(name string) {
|
||||
if common.IsValidUtf8String(name) {
|
||||
if utf8.ValidString(name) {
|
||||
e.Name = name
|
||||
} else {
|
||||
e.NameRaw = []byte(name)
|
||||
|
@ -54,10 +54,10 @@ func (e *Entry) SetName(name string) {
|
|||
// SetNameBytes will check name for valid UTF-8 string, and set the appropriate
|
||||
// field
|
||||
func (e *Entry) SetNameBytes(name []byte) {
|
||||
if !common.IsValidUtf8Btyes(name) {
|
||||
e.NameRaw = name
|
||||
} else {
|
||||
if utf8.Valid(name) {
|
||||
e.Name = string(name)
|
||||
} else {
|
||||
e.NameRaw = name
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -6,8 +6,7 @@ import (
|
|||
"errors"
|
||||
"io"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/vbatts/tar-split/tar/common"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// ErrDuplicatePath occurs when a tar archive has more than one entry for the
|
||||
|
@ -97,7 +96,7 @@ type seenNames map[string]struct{}
|
|||
func (jp *jsonPacker) AddEntry(e Entry) (int, error) {
|
||||
// if Name is not valid utf8, switch it to raw first.
|
||||
if e.Name != "" {
|
||||
if !common.IsValidUtf8String(e.Name) {
|
||||
if !utf8.ValidString(e.Name) {
|
||||
e.NameRaw = []byte(e.Name)
|
||||
e.Name = ""
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue