forked from mirrors/tar-split
tar/common: get index of first invalid utf-8 char
This commit is contained in:
parent
2865353200
commit
39d06b9dc4
2 changed files with 23 additions and 13 deletions
|
@ -2,20 +2,21 @@ package common
|
|||
|
||||
// IsValidUtf8String reports whether s is free of invalid UTF-8.
//
// Ranging over a string yields U+FFFD for every byte sequence that is not
// valid UTF-8, so the scan stops at the first U+FFFD it sees. A correctly
// encoded U+FFFD already present in s is therefore also reported as invalid.
func IsValidUtf8String(s string) bool {
	// Range over s directly: delegating to InvalidUtf8Index would convert
	// s to []byte and back to string just to run this same loop.
	for _, r := range s {
		if r == 0xfffd {
			return false
		}
	}
	return true
}
|
||||
|
||||
// IsValidUtf8Btyes checks for in valid UTF-8 characters
|
||||
func IsValidUtf8Btyes(b []byte) bool {
|
||||
for _, r := range string(b) {
|
||||
if int(r) == 0xfffd {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
return InvalidUtf8Index(b) == -1
|
||||
}
|
||||
|
||||
// InvalidUtf8Index returns the byte offset in b of the first rune that
// decodes to U+FFFD, or -1 when there is none.
//
// Ranging over a string yields U+FFFD for each byte sequence that is not
// valid UTF-8, so the offset points at the first malformed byte. A correctly
// encoded U+FFFD in b is matched as well.
func InvalidUtf8Index(b []byte) int {
	const replacement = '\uFFFD'

	text := string(b)
	for offset, ch := range text {
		if ch != replacement {
			continue
		}
		return offset
	}
	return -1
}
|
||||
|
|
|
@ -6,27 +6,36 @@ func TestStringValidation(t *testing.T) {
|
|||
cases := []struct {
|
||||
value string
|
||||
result bool
|
||||
offset int
|
||||
}{
|
||||
{"aä\uFFFD本☺", false},
|
||||
{"aä本☺", true},
|
||||
{"aä\uFFFD本☺", false, 3},
|
||||
{"aä本☺", true, -1},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
if i := InvalidUtf8Index([]byte(c.value)); i != c.offset {
|
||||
t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i)
|
||||
}
|
||||
if got := IsValidUtf8String(c.value); got != c.result {
|
||||
t.Errorf("string %q - expected %v, got %v", c.value, c.result, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytesValidation(t *testing.T) {
|
||||
cases := []struct {
|
||||
value []byte
|
||||
result bool
|
||||
offset int
|
||||
}{
|
||||
{[]byte{0xE4}, false},
|
||||
{[]byte("aä本☺"), true},
|
||||
{[]byte{0xE4}, false, 0},
|
||||
{[]byte("aä本☺"), true, -1},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
if i := InvalidUtf8Index(c.value); i != c.offset {
|
||||
t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i)
|
||||
}
|
||||
if got := IsValidUtf8Btyes(c.value); got != c.result {
|
||||
t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got)
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue