unvis_go: leave unicode unchanged with Unvis()

Because the original code for vis() was ported to Go using the []byte{}
notion, this causes issues for multi-byte runes (which were not
correctly treated -- and caused loss of information).

Fix this by dealing with []rune instead, which better conveys the
concept at hand. In addition, add tests to ensure that this does not
happen again.

Though, we _really_ should move this code into a library which has a
better test suite -- and the parser itself should be reimplemented to be
less ... 80s.

Fixes: #118
Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
Aleksa Sarai 2017-02-10 23:08:00 +11:00
parent 0185fe9b62
commit f6dd726b66
No known key found for this signature in database
GPG key ID: 9E18AA267DDB8DB4
2 changed files with 74 additions and 21 deletions

View file

@ -5,11 +5,11 @@ package mtree
import "unicode" import "unicode"
func unvis(src string) (string, error) { func unvis(src string) (string, error) {
dst := &[]byte{} dst := []rune{}
var s state var s state
for i, r := range src { for i, r := range src {
again: again:
err := unvisRune(dst, r, &s, 0) err := unvisRune(&dst, r, &s, 0)
switch err { switch err {
case unvisValid: case unvisValid:
break break
@ -23,13 +23,18 @@ func unvis(src string) (string, error) {
return "", err return "", err
} }
if i == len(src)-1 { if i == len(src)-1 {
unvisRune(dst, r, &s, unvisEnd) unvisRune(&dst, r, &s, unvisEnd)
} }
} }
return string(*dst), nil
}
func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error { str := ""
for _, ch := range dst {
str += string(ch)
}
return str, nil
}
func unvisRune(dst *[]rune, r rune, s *state, flags VisFlag) error {
if (flags & unvisEnd) != 0 { if (flags & unvisEnd) != 0 {
if *s == stateOctal2 || *s == stateOctal3 { if *s == stateOctal2 || *s == stateOctal3 {
*s = stateGround *s = stateGround
@ -51,14 +56,14 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error {
*s = stateStart | stateHTTP *s = stateStart | stateHTTP
return unvisNone return unvisNone
} }
*dst = append(*dst, byte(r)) *dst = append(*dst, r)
return unvisValid return unvisValid
case stateStart: case stateStart:
if *s&stateHTTP != 0 && ishex(unicode.ToLower(r)) { if *s&stateHTTP != 0 && ishex(unicode.ToLower(r)) {
if unicode.IsNumber(r) { if unicode.IsNumber(r) {
*dst = append(*dst, byte(r-'0')) *dst = append(*dst, r-'0')
} else { } else {
*dst = append(*dst, byte(unicode.ToLower(r)-'a')) *dst = append(*dst, unicode.ToLower(r)-'a')
} }
*s = stateHex2 *s = stateHex2
return unvisNone return unvisNone
@ -66,7 +71,7 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error {
switch r { switch r {
case '\\': case '\\':
*s = stateGround *s = stateGround
*dst = append(*dst, byte(r)) *dst = append(*dst, r)
return unvisValid return unvisValid
case '0': case '0':
fallthrough fallthrough
@ -84,11 +89,11 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error {
fallthrough fallthrough
case '7': case '7':
*s = stateOctal2 *s = stateOctal2
*dst = append(*dst, byte(r-'0')) *dst = append(*dst, r-'0')
return unvisNone return unvisNone
case 'M': case 'M':
*s = stateMeta *s = stateMeta
*dst = append(*dst, 0200) *dst = append(*dst, rune(0200))
return unvisNone return unvisNone
case '^': case '^':
*s = stateCtrl *s = stateCtrl
@ -153,14 +158,14 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error {
case stateMeta1: case stateMeta1:
*s = stateGround *s = stateGround
dp := *dst dp := *dst
dp[len(dp)-1] |= byte(r) dp[len(dp)-1] |= r
return unvisValid return unvisValid
case stateCtrl: case stateCtrl:
dp := *dst dp := *dst
if r == '?' { if r == '?' {
dp[len(dp)-1] |= 0177 dp[len(dp)-1] |= rune(0177)
} else { } else {
dp[len(dp)-1] |= byte(r & 037) dp[len(dp)-1] |= r & 037
} }
*s = stateGround *s = stateGround
return unvisValid return unvisValid
@ -169,9 +174,9 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error {
dp := *dst dp := *dst
if len(dp) > 0 { if len(dp) > 0 {
last := dp[len(dp)-1] last := dp[len(dp)-1]
dp[len(dp)-1] = (last << 3) + byte(r-'0') dp[len(dp)-1] = (last << 3) + (r - '0')
} else { } else {
dp = append(dp, byte((0<<3)+(r-'0'))) dp = append(dp, (0<<3)+(r-'0'))
} }
*s = stateOctal3 *s = stateOctal3
return unvisNone return unvisNone
@ -184,24 +189,24 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error {
dp := *dst dp := *dst
if len(dp) > 0 { if len(dp) > 0 {
last := dp[len(dp)-1] last := dp[len(dp)-1]
dp[len(dp)-1] = (last << 3) + byte(r-'0') dp[len(dp)-1] = (last << 3) + (r - '0')
} else { } else {
dp = append(dp, (0<<3)+byte(r-'0')) dp = append(dp, (0<<3)+(r-'0'))
} }
return unvisValid return unvisValid
} }
return unvisValidPush return unvisValidPush
case stateHex2: case stateHex2:
if ishex(unicode.ToLower(r)) { if ishex(unicode.ToLower(r)) {
last := byte(0) last := rune(0)
dp := *dst dp := *dst
if len(dp) > 0 { if len(dp) > 0 {
last = dp[len(dp)-1] last = dp[len(dp)-1]
} }
if unicode.IsNumber(r) { if unicode.IsNumber(r) {
dp = append(dp, (last<<4)+byte(r-'0')) dp = append(dp, (last<<4)+(r-'0'))
} else { } else {
dp = append(dp, (last<<4)+byte(unicode.ToLower(r)-'a'+10)) dp = append(dp, (last<<4)+(unicode.ToLower(r)-'a'+10))
} }
} }
*s = stateGround *s = stateGround

View file

@ -43,3 +43,51 @@ func TestUnvisHelpers(t *testing.T) {
} }
} }
} }
// TestUnvisUnicode feeds Unvis a set of strings, several of which contain
// multi-byte UTF-8 characters, and expects each one to come back unchanged.
func TestUnvisUnicode(t *testing.T) {
	inputs := []string{
		"",
		"this.is.a.normal_string",
		"AC_Raíz_Certicámara_S.A..pem",
		"NetLock_Arany_=Class_Gold=_Főtanúsítvány.pem",
		"TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem",
	}

	for _, input := range inputs {
		output, err := Unvis(input)
		switch {
		case err != nil:
			// A decoding failure is reported once; the comparison is skipped.
			t.Errorf("unexpected error doing unvis(%q): %s", input, err)
		case output != input:
			t.Errorf("expected %q to be unchanged, got %q", input, output)
		}
	}
}
// TestVisUnvis round-trips each sample through Vis (using DefaultVisFlags)
// and then Unvis, reporting an error whenever either step fails or the
// decoded result differs from the original sample.
func TestVisUnvis(t *testing.T) {
	samples := []string{
		"",
		"this.is.a.normal_string",
		"AC_Raíz_Certicámara_S.A..pem",
		"NetLock_Arany_=Class_Gold=_Főtanúsítvány.pem",
		"TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem",
		"hello world [ this string needs=enco ding! ]",
		"even \n more encoding necessary\a\a ",
		"\024 <-- some more weird characters --> 你好,世界",
	}

	for _, original := range samples {
		encoded, err := Vis(original, DefaultVisFlags)
		if err != nil {
			t.Errorf("unexpected error doing vis(%q): %s", original, err)
			continue
		}

		decoded, err := Unvis(encoded)
		if err != nil {
			t.Errorf("unexpected error doing unvis(%q): %s", encoded, err)
			continue
		}

		if decoded == original {
			// Round trip preserved the sample; nothing to report.
			continue
		}
		t.Errorf("roundtrip failed: unvis(vis(%q) = %q) = %q", original, encoded, decoded)
	}
}