diff --git a/tar/common/utf8.go b/tar/common/utf8.go index ffb1646..568e929 100644 --- a/tar/common/utf8.go +++ b/tar/common/utf8.go @@ -2,20 +2,21 @@ package common // IsValidUtf8String checks for in valid UTF-8 characters func IsValidUtf8String(s string) bool { - for _, r := range s { - if int(r) == 0xfffd { - return false - } - } - return true + return InvalidUtf8Index([]byte(s)) == -1 } // IsValidUtf8Btyes checks for in valid UTF-8 characters func IsValidUtf8Btyes(b []byte) bool { - for _, r := range string(b) { + return InvalidUtf8Index(b) == -1 +} + +// InvalidUtf8Index returns the offset of the first invalid UTF-8 character. +// Default is to return -1 for a wholly valid sequence. +func InvalidUtf8Index(b []byte) int { + for i, r := range string(b) { if int(r) == 0xfffd { - return false + return i } } - return true + return -1 } diff --git a/tar/common/utf8_test.go b/tar/common/utf8_test.go index e546f55..3cf81df 100644 --- a/tar/common/utf8_test.go +++ b/tar/common/utf8_test.go @@ -6,27 +6,36 @@ func TestStringValidation(t *testing.T) { cases := []struct { value string result bool + offset int }{ - {"aä\uFFFD本☺", false}, - {"aä本☺", true}, + {"aä\uFFFD本☺", false, 3}, + {"aä本☺", true, -1}, } for _, c := range cases { + if i := InvalidUtf8Index([]byte(c.value)); i != c.offset { + t.Errorf("string %q - offset expected %d, got %d", c.value, c.offset, i) + } if got := IsValidUtf8String(c.value); got != c.result { t.Errorf("string %q - expected %v, got %v", c.value, c.result, got) } } } + func TestBytesValidation(t *testing.T) { cases := []struct { value []byte result bool + offset int }{ - {[]byte{0xE4}, false}, - {[]byte("aä本☺"), true}, + {[]byte{0xE4}, false, 0}, + {[]byte("aä本☺"), true, -1}, } for _, c := range cases { + if i := InvalidUtf8Index(c.value); i != c.offset { + t.Errorf("bytes %q - offset expected %d, got %d", c.value, c.offset, i) + } if got := IsValidUtf8Btyes(c.value); got != c.result { t.Errorf("bytes %q - expected %v, got %v", c.value, c.result, got) }