forked from mirrors/tar-split
tar/common: get index of first invalid utf-8 char
This commit is contained in:
parent
2865353200
commit
39d06b9dc4
2 changed files with 23 additions and 13 deletions
|
@ -2,20 +2,21 @@ package common
|
||||||
|
|
||||||
// IsValidUtf8String checks for invalid UTF-8 characters
|
// IsValidUtf8String checks for invalid UTF-8 characters
|
||||||
func IsValidUtf8String(s string) bool {
|
func IsValidUtf8String(s string) bool {
|
||||||
for _, r := range s {
|
return InvalidUtf8Index([]byte(s)) == -1
|
||||||
if int(r) == 0xfffd {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsValidUtf8Btyes checks for invalid UTF-8 characters
|
// IsValidUtf8Btyes checks for invalid UTF-8 characters
|
||||||
func IsValidUtf8Btyes(b []byte) bool {
|
func IsValidUtf8Btyes(b []byte) bool {
|
||||||
for _, r := range string(b) {
|
return InvalidUtf8Index(b) == -1
|
||||||
if int(r) == 0xfffd {
|
}
|
||||||
return false
|
|
||||||
}
|
// InvalidUtf8Index returns the offset of the first invalid UTF-8 character.
|
||||||
}
|
// Default is to return -1 for a wholly valid sequence.
|
||||||
return true
|
func InvalidUtf8Index(b []byte) int {
|
||||||
|
for i, r := range string(b) {
|
||||||
|
if int(r) == 0xfffd {
|
||||||
|
return i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,27 +6,36 @@ func TestStringValidation(t *testing.T) {
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
value string
|
value string
|
||||||
result bool
|
result bool
|
||||||
|
offset int
|
||||||
}{
|
}{
|
||||||
{"aä\uFFFD本☺", false},
|
{"aä\uFFFD本☺", false, 3},
|
||||||
{"aä本☺", true},
|
{"aä本☺", true, -1},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
|
if i := InvalidUtf8Index([]byte(c.value)); i != c.offset {
|
||||||
|
t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i)
|
||||||
|
}
|
||||||
if got := IsValidUtf8String(c.value); got != c.result {
|
if got := IsValidUtf8String(c.value); got != c.result {
|
||||||
t.Errorf("string %q - expected %v, got %v", c.value, c.result, got)
|
t.Errorf("string %q - expected %v, got %v", c.value, c.result, got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestBytesValidation(t *testing.T) {
|
func TestBytesValidation(t *testing.T) {
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
value []byte
|
value []byte
|
||||||
result bool
|
result bool
|
||||||
|
offset int
|
||||||
}{
|
}{
|
||||||
{[]byte{0xE4}, false},
|
{[]byte{0xE4}, false, 0},
|
||||||
{[]byte("aä本☺"), true},
|
{[]byte("aä本☺"), true, -1},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
|
if i := InvalidUtf8Index(c.value); i != c.offset {
|
||||||
|
t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i)
|
||||||
|
}
|
||||||
if got := IsValidUtf8Btyes(c.value); got != c.result {
|
if got := IsValidUtf8Btyes(c.value); got != c.result {
|
||||||
t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got)
|
t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue