unvis: implement proper '\xff' and '\377' escape handling

In particular, previously such escape handling would break because we would attempt to encode characters >0x7f as runes -- which would then result in escapes that want to encode multi-byte characters breaking. There's still some work necessary in Vis() to make it act sanely when it comes to arbitrary bit streams. Not to mention that we need to figure out what we actually want to do there... Signed-off-by: Aleksa Sarai <asarai@suse.de>
2017-02-12 23:12:46 +11:00 · 2017-02-12 23:12:46 +11:00 · 35708696fe
commit 35708696fe
parent 3b18d38388
3 changed files with 107 additions and 62 deletions
--- a/govis_test.go
+++ b/govis_test.go
@ -18,6 +18,7 @@
 package govis

 import (
+	"bytes"
 	"testing"
 )

@ -34,6 +35,7 @@ func TestVisUnvis(t *testing.T) {
 		"hello world [ this string needs=enco ding! ]",
 		"even \n more encoding necessary\a\a ",
 		"\024 <-- some more weird characters --> 你好，世界",
+		"\\xff\\n double encoding is also great fun \\x",
 	} {
 		enc, err := Vis(test, DefaultVisFlags)
 		if err != nil {
@ -50,3 +52,40 @@ func TestVisUnvis(t *testing.T) {
 		}
 	}
 }
+
+func TestByteStrings(t *testing.T) {
+	// It's important to make sure that we don't mess around with the layout of
+	// bytes when doing a round-trip. Otherwise we risk outputting visually
+	// identical but bit-stream non-identical strings (causing much confusion
+	// when trying to access such files).
+
+	for _, test := range [][]byte{
+		[]byte("This is a man in business suit levitating: \U0001f574"),
+		{0x7f, 0x17, 0x01, 0x33},
+		// TODO: Test arbitrary byte streams like the one below. Currently this
+		//       fails because Vis() is messing around with it (converting it
+		//       to a rune and spacing it out).
+		//{'\xef', '\xae', 'h', '\077', 'k'},
+	} {
+		testString := string(test)
+		enc, err := Vis(testString, DefaultVisFlags)
+		if err != nil {
+			t.Errorf("unexpected error doing vis(%q): %s", test, err)
+			continue
+		}
+		dec, err := Unvis(enc, DefaultVisFlags)
+		if err != nil {
+			t.Errorf("unexpected error doing unvis(%q): %s", enc, err)
+			continue
+		}
+		decBytes := []byte(dec)
+
+		if dec != testString {
+			t.Errorf("roundtrip failed [string comparison]: unvis(vis(%q) = %q) = %q", test, enc, dec)
+		}
+		if !bytes.Equal(decBytes, test) {
+			t.Errorf("roundtrip failed [byte comparison]: unvis(vis(%q) = %q) = %q", test, enc, dec)
+		}
+	}
+
+}
--- a/unvis.go
+++ b/unvis.go
@ -20,7 +20,7 @@ package govis
 import (
 	"fmt"
 	"strconv"
-	"unicode/utf8"
+	"unicode"
 )

 // unvisParser stores the current state of the token parser.
@ -38,7 +38,7 @@ func (p *unvisParser) Next() {
 // Peek gets the current token.
 func (p *unvisParser) Peek() (rune, error) {
 	if p.idx >= len(p.tokens) {
-		return utf8.RuneError, fmt.Errorf("tried to read past end of token list")
+		return unicode.ReplacementChar, fmt.Errorf("tried to read past end of token list")
 	}
 	return p.tokens[p.idx], nil
 }
@ -68,19 +68,26 @@ func newParser(input string, flag VisFlag) *unvisParser {
 // <escape-cstyle>   ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f"
 // <escape-octal>    ::= [0-7] ([0-7] ([0-7])?)?

-func unvisPlainRune(p *unvisParser) (string, error) {
+func unvisPlainRune(p *unvisParser) ([]byte, error) {
 	ch, err := p.Peek()
 	if err != nil {
-		return "", fmt.Errorf("plain rune: %s", ch)
+		return nil, fmt.Errorf("plain rune: %s", ch)
 	}
 	p.Next()
-	return string(ch), nil
+
+	// XXX: Maybe we should not be converting to runes and then back to strings
+	//      here. Are we sure that the byte-for-byte representation is the
+	//      same? If the bytes change, then using these strings for paths will
+	//      break...
+
+	str := string(ch)
+	return []byte(str), nil
 }

-func unvisEscapeCStyle(p *unvisParser) (string, error) {
+func unvisEscapeCStyle(p *unvisParser) ([]byte, error) {
 	ch, err := p.Peek()
 	if err != nil {
-		return "", fmt.Errorf("escape hex: %s", err)
+		return nil, fmt.Errorf("escape hex: %s", err)
 	}

 	output := ""
@ -109,85 +116,62 @@ func unvisEscapeCStyle(p *unvisParser) (string, error) {
 		// Hidden marker.
 	default:
 		// XXX: We should probably allow falling through and return "\" here...
-		return "", fmt.Errorf("escape cstyle: unknown escape character: %q", ch)
+		return nil, fmt.Errorf("escape cstyle: unknown escape character: %q", ch)
 	}

 	p.Next()
-	return output, nil
+	return []byte(output), nil
 }

-func unvisEscapeHex(p *unvisParser) (string, error) {
-	var output rune
+func unvisEscapeDigits(p *unvisParser, base int, force bool) ([]byte, error) {
+	var code int

-	for i := 0; i < 2; i++ {
+	for i := int(0xFF); i > 0; i /= base {
 		ch, err := p.Peek()
 		if err != nil {
-			return "", fmt.Errorf("escape hex: %s", err)
+			if !force && i != 0xFF {
+				break
+			}
+			return nil, fmt.Errorf("escape base %d: %s", base, err)
 		}

-		digit, err := strconv.ParseInt(string(ch), 16, 32)
+		digit, err := strconv.ParseInt(string(ch), base, 8)
 		if err != nil {
-			return "", fmt.Errorf("escape hex: parse int: %s", err)
+			if !force && i != 0xFF {
+				break
+			}
+			return nil, fmt.Errorf("escape base %d: could not parse digit: %s", base, err)
 		}

-		output = (output << 4) | rune(digit)
+		code = (code * base) + int(digit)
 		p.Next()
 	}

-	// TODO: We need to handle runes properly to output byte strings again. In
-	//       particular, if rune has 0xf0 set then we know that we're currently
-	//       decoding a messed up string.
-	return string(output), nil
-}
-
-func unvisEscapeOctal(p *unvisParser) (string, error) {
-	var output rune
-	var err error
-
-	for i := 0; i < 3; i++ {
-		ch, err := p.Peek()
-		if err != nil {
-			if i == 0 {
-				err = fmt.Errorf("escape octal[first]: %s", err)
-			}
-			break
-		}
-
-		digit, err := strconv.ParseInt(string(ch), 8, 32)
-		if err != nil {
-			if i == 0 {
-				err = fmt.Errorf("escape octal[first]: parse int: %s", err)
-			}
-			break
-		}
-
-		output = (output << 3) | rune(digit)
-		p.Next()
+	if code > unicode.MaxLatin1 {
+		return nil, fmt.Errorf("escape base %d: code %q outside latin-1 encoding", base, code)
 	}

-	// TODO: We need to handle runes properly to output byte strings again. In
-	//       particular, if rune has 0xf0 set then we know that we're currently
-	//       decoding a messed up string.
-	return string(output), err
+	char := byte(code & 0xFF)
+	return []byte{char}, nil
 }

-func unvisEscapeSequence(p *unvisParser) (string, error) {
+func unvisEscapeSequence(p *unvisParser) ([]byte, error) {
 	ch, err := p.Peek()
 	if err != nil {
-		return "", fmt.Errorf("escape sequence: %s", err)
+		return nil, fmt.Errorf("escape sequence: %s", err)
 	}

 	switch ch {
 	case '\\':
 		p.Next()
-		return "\\", nil
+		return []byte("\\"), nil

 	case '0', '1', '2', '3', '4', '5', '6', '7':
-		return unvisEscapeOctal(p)
+		return unvisEscapeDigits(p, 8, false)

 	case 'x':
 		p.Next()
-		return unvisEscapeHex(p)
+		return unvisEscapeDigits(p, 16, true)

 	case 'M':
 		// TODO
@ -198,13 +182,13 @@ func unvisEscapeSequence(p *unvisParser) (string, error) {
 		return unvisEscapeCStyle(p)
 	}

-	return "", fmt.Errorf("escape sequence: unsupported sequence: %q", ch)
+	return nil, fmt.Errorf("escape sequence: unsupported sequence: %q", ch)
 }

-func unvisRune(p *unvisParser) (string, error) {
+func unvisRune(p *unvisParser) ([]byte, error) {
 	ch, err := p.Peek()
 	if err != nil {
-		return "", fmt.Errorf("rune: %s", err)
+		return nil, fmt.Errorf("rune: %s", err)
 	}

 	switch ch {
@ -216,7 +200,7 @@ func unvisRune(p *unvisParser) (string, error) {
 		// % HEX HEX only applies to HTTPStyle encodings.
 		if p.flag&VisHTTPStyle == VisHTTPStyle {
 			p.Next()
-			return unvisEscapeHex(p)
+			return unvisEscapeDigits(p, 16, true)
 		}
 		fallthrough

@ -226,15 +210,15 @@ func unvisRune(p *unvisParser) (string, error) {
 }

 func unvis(p *unvisParser) (string, error) {
-	output := ""
+	var output []byte
 	for !p.End() {
 		ch, err := unvisRune(p)
 		if err != nil {
 			return "", fmt.Errorf("input: %s", err)
 		}
-		output += ch
+		output = append(output, ch...)
 	}
-	return output, nil
+	return string(output), nil
 }

 // Unvis takes a string formatted with the given Vis flags (though only the
--- a/unvis_test.go
+++ b/unvis_test.go
@ -21,6 +21,20 @@ import (
 	"testing"
 )

+func TestUnvisError(t *testing.T) {
+	for _, test := range []string{
+		// Octal escape codes allow you to specify invalid byte values.
+		"\\777",
+		"\\420\\322\\455",
+		"\\652\\233",
+	} {
+		got, err := Unvis(test, DefaultVisFlags)
+		if err == nil {
+			t.Errorf("expected unvis(%q) to give an error, got %q", test, got)
+		}
+	}
+}
+
 func TestUnvisOctalEscape(t *testing.T) {
 	for _, test := range []struct {
 		input    string
@ -34,6 +48,11 @@ func TestUnvisOctalEscape(t *testing.T) {
 		{"\\170YET\\01another test\\1\\\\82", "\170YET\001another test\001\\82"},
 		{"\\177MORE tests\\09a", "\177MORE tests\x009a"},
 		{"\\\\710more\\1215testing", "\\710more\1215testing"},
+		// Make sure that decoding unicode works properly, when it's been encoded as single bytes.
+		{"\\360\\237\\225\\264", "\U0001f574"},
+		{"T\\303\\234B\\304\\260TAK_UEKAE_K\\303\\266k_Sertifika_Hizmet_Sa\\304\\237lay\\304\\261c\\304\\261s\\304\\261_-_S\\303\\274r\\303\\274m_3.pem", "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem"},
+		// Some invalid characters...
+		{"\\377\\2\\225\\264", "\xff\x02\x95\xb4"},
 	} {
 		got, err := Unvis(test.input, DefaultVisFlags)
 		if err != nil {
@ -57,8 +76,11 @@ func TestUnvisHexEscape(t *testing.T) {
 		{"this is a test\\x13\\x52\\x6f", "this is a test\x13\x52\x6f"},
 		{"\\x170YET\\x01a\\x22nother test\\x11", "\x170YET\x01a\x22nother test\x11"},
 		{"\\\\x007more\\\\x215testing", "\\x007more\\x215testing"},
-		// Make sure that decoding unicode works properly.
+		// Make sure that decoding unicode works properly, when it's been encoded as single bytes.
 		{"\\xf0\\x9f\\x95\\xb4", "\U0001f574"},
+		{"T\\xc3\\x9cB\\xc4\\xb0TAK_UEKAE_K\\xc3\\xb6k_Sertifika_Hizmet_Sa\\xc4\\x9flay\\xc4\\xb1c\\xc4\\xb1s\\xc4\\xb1_-_S\\xc3\\xbcr\\xc3\\xbcm_3.pem", "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem"},
+		// Some invalid characters...
+		{"\\xff\\x02\\x95\\xb4", "\xff\x02\x95\xb4"},
 	} {
 		got, err := Unvis(test.input, DefaultVisFlags)
 		if err != nil {