1
0
Fork 1
mirror of https://github.com/vbatts/tar-split.git synced 2025-10-25 00:10:56 +00:00

tar/common: get index of first invalid utf-8 char

This commit is contained in:
Vincent Batts 2015-09-23 15:13:54 -04:00
parent 2865353200
commit 39d06b9dc4
2 changed files with 23 additions and 13 deletions

View file

@ -2,20 +2,21 @@ package common
// IsValidUtf8String checks for invalid UTF-8 characters // IsValidUtf8String checks for invalid UTF-8 characters
func IsValidUtf8String(s string) bool { func IsValidUtf8String(s string) bool {
for _, r := range s { return InvalidUtf8Index([]byte(s)) == -1
if int(r) == 0xfffd {
return false
}
}
return true
} }
// IsValidUtf8Btyes checks for invalid UTF-8 characters // IsValidUtf8Btyes checks for invalid UTF-8 characters
func IsValidUtf8Btyes(b []byte) bool { func IsValidUtf8Btyes(b []byte) bool {
for _, r := range string(b) { return InvalidUtf8Index(b) == -1
}
// InvalidUtf8Index returns the offset of the first invalid UTF-8 character.
// Default is to return -1 for a wholly valid sequence.
func InvalidUtf8Index(b []byte) int {
for i, r := range string(b) {
if int(r) == 0xfffd { if int(r) == 0xfffd {
return false return i
} }
} }
return true return -1
} }

View file

@ -6,27 +6,36 @@ func TestStringValidation(t *testing.T) {
cases := []struct { cases := []struct {
value string value string
result bool result bool
offset int
}{ }{
{"aä\uFFFD本☺", false}, {"aä\uFFFD本☺", false, 3},
{"aä本☺", true}, {"aä本☺", true, -1},
} }
for _, c := range cases { for _, c := range cases {
if i := InvalidUtf8Index([]byte(c.value)); i != c.offset {
t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i)
}
if got := IsValidUtf8String(c.value); got != c.result { if got := IsValidUtf8String(c.value); got != c.result {
t.Errorf("string %q - expected %v, got %v", c.value, c.result, got) t.Errorf("string %q - expected %v, got %v", c.value, c.result, got)
} }
} }
} }
func TestBytesValidation(t *testing.T) { func TestBytesValidation(t *testing.T) {
cases := []struct { cases := []struct {
value []byte value []byte
result bool result bool
offset int
}{ }{
{[]byte{0xE4}, false}, {[]byte{0xE4}, false, 0},
{[]byte("aä本☺"), true}, {[]byte("aä本☺"), true, -1},
} }
for _, c := range cases { for _, c := range cases {
if i := InvalidUtf8Index(c.value); i != c.offset {
t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i)
}
if got := IsValidUtf8Btyes(c.value); got != c.result { if got := IsValidUtf8Btyes(c.value); got != c.result {
t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got) t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got)
} }