From f6dd726b66e7e06071132dc1e35af029ebf4fbe6 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 10 Feb 2017 23:08:00 +1100 Subject: [PATCH] unvis_go: leave unicode unchanged with Unvis() Because the original code for vis() was ported to Go using the []byte{} notion, this causes issues for multi-rune bytes (which were not correctly treated -- and caused loss of information). Fix this by dealing with []rune instead, which better conveys the concept at hand. In addition, add tests to ensure that this does not happen again. Though, we _really_ should move this code into a library which has a better test suite -- and the parser itself should be reimplemented to be less ... 80s. Fixes: #118 Signed-off-by: Aleksa Sarai --- unvis_go.go | 47 ++++++++++++++++++++++++++--------------------- unvis_go_test.go | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/unvis_go.go b/unvis_go.go index 6287e48..0f2a848 100644 --- a/unvis_go.go +++ b/unvis_go.go @@ -5,11 +5,11 @@ package mtree import "unicode" func unvis(src string) (string, error) { - dst := &[]byte{} + dst := []rune{} var s state for i, r := range src { again: - err := unvisRune(dst, r, &s, 0) + err := unvisRune(&dst, r, &s, 0) switch err { case unvisValid: break @@ -23,13 +23,18 @@ func unvis(src string) (string, error) { return "", err } if i == len(src)-1 { - unvisRune(dst, r, &s, unvisEnd) + unvisRune(&dst, r, &s, unvisEnd) } } - return string(*dst), nil + + str := "" + for _, ch := range dst { + str += string(ch) + } + return str, nil } -func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error { +func unvisRune(dst *[]rune, r rune, s *state, flags VisFlag) error { if (flags & unvisEnd) != 0 { if *s == stateOctal2 || *s == stateOctal3 { *s = stateGround @@ -51,14 +56,14 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error { *s = stateStart | stateHTTP return unvisNone } - *dst = append(*dst, byte(r)) + *dst = append(*dst, r) return unvisValid case stateStart: if *s&stateHTTP != 0 && ishex(unicode.ToLower(r)) { if unicode.IsNumber(r) { - *dst = append(*dst, byte(r-'0')) + *dst = append(*dst, r-'0') } else { - *dst = append(*dst, byte(unicode.ToLower(r)-'a')) + *dst = append(*dst, unicode.ToLower(r)-'a') } *s = stateHex2 return unvisNone @@ -66,7 +71,7 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error { switch r { case '\\': *s = stateGround - *dst = append(*dst, byte(r)) + *dst = append(*dst, r) return unvisValid case '0': fallthrough @@ -84,11 +89,11 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error { fallthrough case '7': *s = stateOctal2 - *dst = append(*dst, byte(r-'0')) + *dst = append(*dst, r-'0') return unvisNone case 'M': *s = stateMeta - *dst = append(*dst, 0200) + *dst = append(*dst, rune(0200)) return unvisNone case '^': *s = stateCtrl @@ -153,14 +158,14 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error { case stateMeta1: *s = stateGround dp := *dst - dp[len(dp)-1] |= byte(r) + dp[len(dp)-1] |= r return unvisValid case stateCtrl: dp := *dst if r == '?' { - dp[len(dp)-1] |= 0177 + dp[len(dp)-1] |= rune(0177) } else { - dp[len(dp)-1] |= byte(r & 037) + dp[len(dp)-1] |= r & 037 } *s = stateGround return unvisValid @@ -169,9 +174,9 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error { dp := *dst if len(dp) > 0 { last := dp[len(dp)-1] - dp[len(dp)-1] = (last << 3) + byte(r-'0') + dp[len(dp)-1] = (last << 3) + (r - '0') } else { - dp = append(dp, byte((0<<3)+(r-'0'))) + dp = append(dp, (0<<3)+(r-'0')) } *s = stateOctal3 return unvisNone @@ -184,24 +189,24 @@ func unvisRune(dst *[]byte, r rune, s *state, flags VisFlag) error { dp := *dst if len(dp) > 0 { last := dp[len(dp)-1] - dp[len(dp)-1] = (last << 3) + byte(r-'0') + dp[len(dp)-1] = (last << 3) + (r - '0') } else { - dp = append(dp, (0<<3)+byte(r-'0')) + dp = append(dp, (0<<3)+(r-'0')) } return unvisValid } return unvisValidPush case stateHex2: if ishex(unicode.ToLower(r)) { - last := byte(0) + last := rune(0) dp := *dst if len(dp) > 0 { last = dp[len(dp)-1] } if unicode.IsNumber(r) { - dp = append(dp, (last<<4)+byte(r-'0')) + dp = append(dp, (last<<4)+(r-'0')) } else { - dp = append(dp, (last<<4)+byte(unicode.ToLower(r)-'a'+10)) + dp = append(dp, (last<<4)+(unicode.ToLower(r)-'a'+10)) } } *s = stateGround diff --git a/unvis_go_test.go b/unvis_go_test.go index fc61881..0289330 100644 --- a/unvis_go_test.go +++ b/unvis_go_test.go @@ -43,3 +43,51 @@ func TestUnvisHelpers(t *testing.T) { } } } + +func TestUnvisUnicode(t *testing.T) { + // Ensure that unicode strings are not messed up by Unvis. + for _, test := range []string{ + "", + "this.is.a.normal_string", + "AC_Raíz_Certicámara_S.A..pem", + "NetLock_Arany_=Class_Gold=_Főtanúsítvány.pem", + "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem", + } { + got, err := Unvis(test) + if err != nil { + t.Errorf("unexpected error doing unvis(%q): %s", test, err) + continue + } + if got != test { + t.Errorf("expected %q to be unchanged, got %q", test, got) + } + } +} + +func TestVisUnvis(t *testing.T) { + // Round-trip testing. + for _, test := range []string{ + "", + "this.is.a.normal_string", + "AC_Raíz_Certicámara_S.A..pem", + "NetLock_Arany_=Class_Gold=_Főtanúsítvány.pem", + "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem", + "hello world [ this string needs=enco ding! ]", + "even \n more encoding necessary\a\a ", + "\024 <-- some more weird characters --> 你好,世界", + } { + enc, err := Vis(test, DefaultVisFlags) + if err != nil { + t.Errorf("unexpected error doing vis(%q): %s", test, err) + continue + } + dec, err := Unvis(enc) + if err != nil { + t.Errorf("unexpected error doing unvis(%q): %s", enc, err) + continue + } + if dec != test { + t.Errorf("roundtrip failed: unvis(vis(%q) = %q) = %q", test, enc, dec) + } + } +}