diff --git a/govis_test.go b/govis_test.go index d236f85..244ae23 100644 --- a/govis_test.go +++ b/govis_test.go @@ -18,6 +18,7 @@ package govis import ( + "bytes" "testing" ) @@ -34,6 +35,7 @@ func TestVisUnvis(t *testing.T) { "hello world [ this string needs=enco ding! ]", "even \n more encoding necessary\a\a ", "\024 <-- some more weird characters --> 你好,世界", + "\\xff\\n double encoding is also great fun \\x", } { enc, err := Vis(test, DefaultVisFlags) if err != nil { @@ -50,3 +52,40 @@ func TestVisUnvis(t *testing.T) { } } } + +func TestByteStrings(t *testing.T) { + // It's important to make sure that we don't mess around with the layout of + // bytes when doing a round-trip. Otherwise we risk outputting visually + // identical but bit-stream non-identical strings (causing much confusion + // when trying to access such files). + + for _, test := range [][]byte{ + []byte("This is a man in business suit levitating: \U0001f574"), + {0x7f, 0x17, 0x01, 0x33}, + // TODO: Test arbitrary byte streams like the one below. Currently this + // fails because Vis() is messing around with it (converting it + // to a rune and spacing it out). + //{'\xef', '\xae', 'h', '\077', 'k'}, + } { + testString := string(test) + enc, err := Vis(testString, DefaultVisFlags) + if err != nil { + t.Errorf("unexpected error doing vis(%q): %s", test, err) + continue + } + dec, err := Unvis(enc, DefaultVisFlags) + if err != nil { + t.Errorf("unexpected error doing unvis(%q): %s", enc, err) + continue + } + decBytes := []byte(dec) + + if dec != testString { + t.Errorf("roundtrip failed [string comparison]: unvis(vis(%q) = %q) = %q", test, enc, dec) + } + if !bytes.Equal(decBytes, test) { + t.Errorf("roundtrip failed [byte comparison]: unvis(vis(%q) = %q) = %q", test, enc, dec) + } + } + +} diff --git a/unvis.go b/unvis.go index 0a3dc44..9d59b9e 100644 --- a/unvis.go +++ b/unvis.go @@ -20,7 +20,7 @@ package govis import ( "fmt" "strconv" - "unicode/utf8" + "unicode" ) // unvisParser stores the current state of the token parser. @@ -38,7 +38,7 @@ func (p *unvisParser) Next() { // Peek gets the current token. func (p *unvisParser) Peek() (rune, error) { if p.idx >= len(p.tokens) { - return utf8.RuneError, fmt.Errorf("tried to read past end of token list") + return unicode.ReplacementChar, fmt.Errorf("tried to read past end of token list") } return p.tokens[p.idx], nil } @@ -68,19 +68,26 @@ func newParser(input string, flag VisFlag) *unvisParser { // ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f" // ::= [0-7] ([0-7] ([0-7])?)? -func unvisPlainRune(p *unvisParser) (string, error) { +func unvisPlainRune(p *unvisParser) ([]byte, error) { ch, err := p.Peek() if err != nil { - return "", fmt.Errorf("plain rune: %s", ch) + return nil, fmt.Errorf("plain rune: %s", ch) } p.Next() - return string(ch), nil + + // XXX: Maybe we should not be converting to runes and then back to strings + // here. Are we sure that the byte-for-byte representation is the + // same? If the bytes change, then using these strings for paths will + // break... + + str := string(ch) + return []byte(str), nil } -func unvisEscapeCStyle(p *unvisParser) (string, error) { +func unvisEscapeCStyle(p *unvisParser) ([]byte, error) { ch, err := p.Peek() if err != nil { - return "", fmt.Errorf("escape hex: %s", err) + return nil, fmt.Errorf("escape hex: %s", err) } output := "" @@ -109,85 +116,62 @@ func unvisEscapeCStyle(p *unvisParser) (string, error) { // Hidden marker. default: // XXX: We should probably allow falling through and return "\" here... - return "", fmt.Errorf("escape cstyle: unknown escape character: %q", ch) + return nil, fmt.Errorf("escape cstyle: unknown escape character: %q", ch) } p.Next() - return output, nil + return []byte(output), nil } -func unvisEscapeHex(p *unvisParser) (string, error) { - var output rune +func unvisEscapeDigits(p *unvisParser, base int, force bool) ([]byte, error) { + var code int - for i := 0; i < 2; i++ { + for i := int(0xFF); i > 0; i /= base { ch, err := p.Peek() if err != nil { - return "", fmt.Errorf("escape hex: %s", err) + if !force && i != 0xFF { + break + } + return nil, fmt.Errorf("escape base %d: %s", base, err) } - digit, err := strconv.ParseInt(string(ch), 16, 32) + digit, err := strconv.ParseInt(string(ch), base, 8) if err != nil { - return "", fmt.Errorf("escape hex: parse int: %s", err) + if !force && i != 0xFF { + break + } + return nil, fmt.Errorf("escape base %d: could not parse digit: %s", base, err) } - output = (output << 4) | rune(digit) + code = (code * base) + int(digit) p.Next() } - // TODO: We need to handle runes properly to output byte strings again. In - // particular, if rune has 0xf0 set then we know that we're currently - // decoding a messed up string. - return string(output), nil -} - -func unvisEscapeOctal(p *unvisParser) (string, error) { - var output rune - var err error - - for i := 0; i < 3; i++ { - ch, err := p.Peek() - if err != nil { - if i == 0 { - err = fmt.Errorf("escape octal[first]: %s", err) - } - break - } - - digit, err := strconv.ParseInt(string(ch), 8, 32) - if err != nil { - if i == 0 { - err = fmt.Errorf("escape octal[first]: parse int: %s", err) - } - break - } - - output = (output << 3) | rune(digit) - p.Next() + if code > unicode.MaxLatin1 { + return nil, fmt.Errorf("escape base %d: code %q outside latin-1 encoding", base, code) } - // TODO: We need to handle runes properly to output byte strings again. In - // particular, if rune has 0xf0 set then we know that we're currently - // decoding a messed up string. - return string(output), err + char := byte(code & 0xFF) + return []byte{char}, nil } -func unvisEscapeSequence(p *unvisParser) (string, error) { +func unvisEscapeSequence(p *unvisParser) ([]byte, error) { ch, err := p.Peek() if err != nil { - return "", fmt.Errorf("escape sequence: %s", err) + return nil, fmt.Errorf("escape sequence: %s", err) } switch ch { case '\\': p.Next() - return "\\", nil + return []byte("\\"), nil case '0', '1', '2', '3', '4', '5', '6', '7': - return unvisEscapeOctal(p) + return unvisEscapeDigits(p, 8, false) case 'x': p.Next() - return unvisEscapeHex(p) + return unvisEscapeDigits(p, 16, true) case 'M': // TODO @@ -198,13 +182,13 @@ func unvisEscapeSequence(p *unvisParser) (string, error) { return unvisEscapeCStyle(p) } - return "", fmt.Errorf("escape sequence: unsupported sequence: %q", ch) + return nil, fmt.Errorf("escape sequence: unsupported sequence: %q", ch) } -func unvisRune(p *unvisParser) (string, error) { +func unvisRune(p *unvisParser) ([]byte, error) { ch, err := p.Peek() if err != nil { - return "", fmt.Errorf("rune: %s", err) + return nil, fmt.Errorf("rune: %s", err) } switch ch { @@ -216,7 +200,7 @@ func unvisRune(p *unvisParser) (string, error) { // % HEX HEX only applies to HTTPStyle encodings. if p.flag&VisHTTPStyle == VisHTTPStyle { p.Next() - return unvisEscapeHex(p) + return unvisEscapeDigits(p, 16, true) } fallthrough @@ -226,15 +210,15 @@ func unvisRune(p *unvisParser) (string, error) { } func unvis(p *unvisParser) (string, error) { - output := "" + var output []byte for !p.End() { ch, err := unvisRune(p) if err != nil { return "", fmt.Errorf("input: %s", err) } - output += ch + output = append(output, ch...) } - return output, nil + return string(output), nil } // Unvis takes a string formatted with the given Vis flags (though only the diff --git a/unvis_test.go b/unvis_test.go index 0b6ca04..66d60eb 100644 --- a/unvis_test.go +++ b/unvis_test.go @@ -21,6 +21,20 @@ import ( "testing" ) +func TestUnvisError(t *testing.T) { + for _, test := range []string{ + // Octal escape codes allow you to specify invalid byte values. + "\\777", + "\\420\\322\\455", + "\\652\\233", + } { + got, err := Unvis(test, DefaultVisFlags) + if err == nil { + t.Errorf("expected unvis(%q) to give an error, got %q", test, got) + } + } +} + func TestUnvisOctalEscape(t *testing.T) { for _, test := range []struct { input string @@ -34,6 +48,11 @@ func TestUnvisOctalEscape(t *testing.T) { {"\\170YET\\01another test\\1\\\\82", "\170YET\001another test\001\\82"}, {"\\177MORE tests\\09a", "\177MORE tests\x009a"}, {"\\\\710more\\1215testing", "\\710more\1215testing"}, + // Make sure that decoding unicode works properly, when it's been encoded as single bytes. + {"\\360\\237\\225\\264", "\U0001f574"}, + {"T\\303\\234B\\304\\260TAK_UEKAE_K\\303\\266k_Sertifika_Hizmet_Sa\\304\\237lay\\304\\261c\\304\\261s\\304\\261_-_S\\303\\274r\\303\\274m_3.pem", "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem"}, + // Some invalid characters... + {"\\377\\2\\225\\264", "\xff\x02\x95\xb4"}, } { got, err := Unvis(test.input, DefaultVisFlags) if err != nil { @@ -57,8 +76,11 @@ func TestUnvisHexEscape(t *testing.T) { {"this is a test\\x13\\x52\\x6f", "this is a test\x13\x52\x6f"}, {"\\x170YET\\x01a\\x22nother test\\x11", "\x170YET\x01a\x22nother test\x11"}, {"\\\\x007more\\\\x215testing", "\\x007more\\x215testing"}, - // Make sure that decoding unicode works properly. + // Make sure that decoding unicode works properly, when it's been encoded as single bytes. {"\\xf0\\x9f\\x95\\xb4", "\U0001f574"}, + {"T\\xc3\\x9cB\\xc4\\xb0TAK_UEKAE_K\\xc3\\xb6k_Sertifika_Hizmet_Sa\\xc4\\x9flay\\xc4\\xb1c\\xc4\\xb1s\\xc4\\xb1_-_S\\xc3\\xbcr\\xc3\\xbcm_3.pem", "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem"}, + // Some invalid characters... + {"\\xff\\x02\\x95\\xb4", "\xff\x02\x95\xb4"}, } { got, err := Unvis(test.input, DefaultVisFlags) if err != nil {