unvis: implement proper '\xff' and '\377' escape handling

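Previously, such escapes were decoded incorrectly: every escaped value
was converted to a rune and then to a string, so values >0x7f were
re-encoded as multi-byte UTF-8 sequences. As a result, a multi-byte
character written as a series of single-byte escapes was corrupted
rather than being reassembled byte-for-byte.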

There's still some work necessary in Vis() to make it act sanely when it
comes to arbitrary bit streams. Not to mention that we need to figure
out what we actually want to do there...

Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
Aleksa Sarai 2017-02-12 23:12:46 +11:00
parent 3b18d38388
commit 35708696fe
No known key found for this signature in database
GPG key ID: 9E18AA267DDB8DB4
3 changed files with 107 additions and 62 deletions

View file

@ -18,6 +18,7 @@
package govis package govis
import ( import (
"bytes"
"testing" "testing"
) )
@ -34,6 +35,7 @@ func TestVisUnvis(t *testing.T) {
"hello world [ this string needs=enco ding! ]", "hello world [ this string needs=enco ding! ]",
"even \n more encoding necessary\a\a ", "even \n more encoding necessary\a\a ",
"\024 <-- some more weird characters --> 你好,世界", "\024 <-- some more weird characters --> 你好,世界",
"\\xff\\n double encoding is also great fun \\x",
} { } {
enc, err := Vis(test, DefaultVisFlags) enc, err := Vis(test, DefaultVisFlags)
if err != nil { if err != nil {
@ -50,3 +52,40 @@ func TestVisUnvis(t *testing.T) {
} }
} }
} }
// TestByteStrings checks that a Vis/Unvis round-trip reproduces the input
// exactly, both as a string and as a raw byte sequence.
func TestByteStrings(t *testing.T) {
	// A round-trip must never change the byte layout of its input. If it
	// did, we could emit strings that look the same but differ bit-for-bit,
	// which is very confusing when those strings are used to access files.
	cases := [][]byte{
		[]byte("This is a man in business suit levitating: \U0001f574"),
		{0x7f, 0x17, 0x01, 0x33},
		// TODO: Test arbitrary byte streams like the one below. Currently this
		//       fails because Vis() is messing around with it (converting it
		//       to a rune and spacing it out).
		//{'\xef', '\xae', 'h', '\077', 'k'},
	}

	for _, want := range cases {
		input := string(want)

		encoded, err := Vis(input, DefaultVisFlags)
		if err != nil {
			t.Errorf("unexpected error doing vis(%q): %s", want, err)
			continue
		}

		decoded, err := Unvis(encoded, DefaultVisFlags)
		if err != nil {
			t.Errorf("unexpected error doing unvis(%q): %s", encoded, err)
			continue
		}

		// Compare as strings first, then as raw bytes; both must match.
		if decoded != input {
			t.Errorf("roundtrip failed [string comparison]: unvis(vis(%q) = %q) = %q", want, encoded, decoded)
		}
		if got := []byte(decoded); !bytes.Equal(got, want) {
			t.Errorf("roundtrip failed [byte comparison]: unvis(vis(%q) = %q) = %q", want, encoded, decoded)
		}
	}
}

108
unvis.go
View file

@ -20,7 +20,7 @@ package govis
import ( import (
"fmt" "fmt"
"strconv" "strconv"
"unicode/utf8" "unicode"
) )
// unvisParser stores the current state of the token parser. // unvisParser stores the current state of the token parser.
@ -38,7 +38,7 @@ func (p *unvisParser) Next() {
// Peek gets the current token. // Peek gets the current token.
func (p *unvisParser) Peek() (rune, error) { func (p *unvisParser) Peek() (rune, error) {
if p.idx >= len(p.tokens) { if p.idx >= len(p.tokens) {
return utf8.RuneError, fmt.Errorf("tried to read past end of token list") return unicode.ReplacementChar, fmt.Errorf("tried to read past end of token list")
} }
return p.tokens[p.idx], nil return p.tokens[p.idx], nil
} }
@ -68,19 +68,26 @@ func newParser(input string, flag VisFlag) *unvisParser {
// <escape-cstyle> ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f" // <escape-cstyle> ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f"
// <escape-octal> ::= [0-7] ([0-7] ([0-7])?)? // <escape-octal> ::= [0-7] ([0-7] ([0-7])?)?
func unvisPlainRune(p *unvisParser) (string, error) { func unvisPlainRune(p *unvisParser) ([]byte, error) {
ch, err := p.Peek() ch, err := p.Peek()
if err != nil { if err != nil {
return "", fmt.Errorf("plain rune: %s", ch) return nil, fmt.Errorf("plain rune: %s", ch)
} }
p.Next() p.Next()
return string(ch), nil
// XXX: Maybe we should not be converting to runes and then back to strings
// here. Are we sure that the byte-for-byte representation is the
// same? If the bytes change, then using these strings for paths will
// break...
str := string(ch)
return []byte(str), nil
} }
func unvisEscapeCStyle(p *unvisParser) (string, error) { func unvisEscapeCStyle(p *unvisParser) ([]byte, error) {
ch, err := p.Peek() ch, err := p.Peek()
if err != nil { if err != nil {
return "", fmt.Errorf("escape hex: %s", err) return nil, fmt.Errorf("escape hex: %s", err)
} }
output := "" output := ""
@ -109,85 +116,62 @@ func unvisEscapeCStyle(p *unvisParser) (string, error) {
// Hidden marker. // Hidden marker.
default: default:
// XXX: We should probably allow falling through and return "\" here... // XXX: We should probably allow falling through and return "\" here...
return "", fmt.Errorf("escape cstyle: unknown escape character: %q", ch) return nil, fmt.Errorf("escape cstyle: unknown escape character: %q", ch)
} }
p.Next() p.Next()
return output, nil return []byte(output), nil
} }
func unvisEscapeHex(p *unvisParser) (string, error) { func unvisEscapeDigits(p *unvisParser, base int, force bool) ([]byte, error) {
var output rune var code int
for i := 0; i < 2; i++ { for i := int(0xFF); i > 0; i /= base {
ch, err := p.Peek() ch, err := p.Peek()
if err != nil { if err != nil {
return "", fmt.Errorf("escape hex: %s", err) if !force && i != 0xFF {
}
digit, err := strconv.ParseInt(string(ch), 16, 32)
if err != nil {
return "", fmt.Errorf("escape hex: parse int: %s", err)
}
output = (output << 4) | rune(digit)
p.Next()
}
// TODO: We need to handle runes properly to output byte strings again. In
// particular, if rune has 0xf0 set then we know that we're currently
// decoding a messed up string.
return string(output), nil
}
func unvisEscapeOctal(p *unvisParser) (string, error) {
var output rune
var err error
for i := 0; i < 3; i++ {
ch, err := p.Peek()
if err != nil {
if i == 0 {
err = fmt.Errorf("escape octal[first]: %s", err)
}
break break
} }
return nil, fmt.Errorf("escape base %d: %s", base, err)
digit, err := strconv.ParseInt(string(ch), 8, 32)
if err != nil {
if i == 0 {
err = fmt.Errorf("escape octal[first]: parse int: %s", err)
} }
digit, err := strconv.ParseInt(string(ch), base, 8)
if err != nil {
if !force && i != 0xFF {
break break
} }
return nil, fmt.Errorf("escape base %d: could not parse digit: %s", base, err)
}
output = (output << 3) | rune(digit) code = (code * base) + int(digit)
p.Next() p.Next()
} }
// TODO: We need to handle runes properly to output byte strings again. In if code > unicode.MaxLatin1 {
// particular, if rune has 0xf0 set then we know that we're currently return nil, fmt.Errorf("escape base %d: code %q outside latin-1 encoding", base, code)
// decoding a messed up string. }
return string(output), err
char := byte(code & 0xFF)
return []byte{char}, nil
} }
func unvisEscapeSequence(p *unvisParser) (string, error) { func unvisEscapeSequence(p *unvisParser) ([]byte, error) {
ch, err := p.Peek() ch, err := p.Peek()
if err != nil { if err != nil {
return "", fmt.Errorf("escape sequence: %s", err) return nil, fmt.Errorf("escape sequence: %s", err)
} }
switch ch { switch ch {
case '\\': case '\\':
p.Next() p.Next()
return "\\", nil return []byte("\\"), nil
case '0', '1', '2', '3', '4', '5', '6', '7': case '0', '1', '2', '3', '4', '5', '6', '7':
return unvisEscapeOctal(p) return unvisEscapeDigits(p, 8, false)
case 'x': case 'x':
p.Next() p.Next()
return unvisEscapeHex(p) return unvisEscapeDigits(p, 16, true)
case 'M': case 'M':
// TODO // TODO
@ -198,13 +182,13 @@ func unvisEscapeSequence(p *unvisParser) (string, error) {
return unvisEscapeCStyle(p) return unvisEscapeCStyle(p)
} }
return "", fmt.Errorf("escape sequence: unsupported sequence: %q", ch) return nil, fmt.Errorf("escape sequence: unsupported sequence: %q", ch)
} }
func unvisRune(p *unvisParser) (string, error) { func unvisRune(p *unvisParser) ([]byte, error) {
ch, err := p.Peek() ch, err := p.Peek()
if err != nil { if err != nil {
return "", fmt.Errorf("rune: %s", err) return nil, fmt.Errorf("rune: %s", err)
} }
switch ch { switch ch {
@ -216,7 +200,7 @@ func unvisRune(p *unvisParser) (string, error) {
// % HEX HEX only applies to HTTPStyle encodings. // % HEX HEX only applies to HTTPStyle encodings.
if p.flag&VisHTTPStyle == VisHTTPStyle { if p.flag&VisHTTPStyle == VisHTTPStyle {
p.Next() p.Next()
return unvisEscapeHex(p) return unvisEscapeDigits(p, 16, true)
} }
fallthrough fallthrough
@ -226,15 +210,15 @@ func unvisRune(p *unvisParser) (string, error) {
} }
func unvis(p *unvisParser) (string, error) { func unvis(p *unvisParser) (string, error) {
output := "" var output []byte
for !p.End() { for !p.End() {
ch, err := unvisRune(p) ch, err := unvisRune(p)
if err != nil { if err != nil {
return "", fmt.Errorf("input: %s", err) return "", fmt.Errorf("input: %s", err)
} }
output += ch output = append(output, ch...)
} }
return output, nil return string(output), nil
} }
// Unvis takes a string formatted with the given Vis flags (though only the // Unvis takes a string formatted with the given Vis flags (though only the

View file

@ -21,6 +21,20 @@ import (
"testing" "testing"
) )
// TestUnvisError checks that Unvis reports an error for inputs containing
// octal escapes whose value does not fit in a single byte.
func TestUnvisError(t *testing.T) {
	// Three octal digits can express values up to 0777, so an octal escape
	// is able to name a value that is not a valid byte.
	invalid := []string{
		"\\777",
		"\\420\\322\\455",
		"\\652\\233",
	}

	for _, input := range invalid {
		if got, err := Unvis(input, DefaultVisFlags); err == nil {
			t.Errorf("expected unvis(%q) to give an error, got %q", input, got)
		}
	}
}
func TestUnvisOctalEscape(t *testing.T) { func TestUnvisOctalEscape(t *testing.T) {
for _, test := range []struct { for _, test := range []struct {
input string input string
@ -34,6 +48,11 @@ func TestUnvisOctalEscape(t *testing.T) {
{"\\170YET\\01another test\\1\\\\82", "\170YET\001another test\001\\82"}, {"\\170YET\\01another test\\1\\\\82", "\170YET\001another test\001\\82"},
{"\\177MORE tests\\09a", "\177MORE tests\x009a"}, {"\\177MORE tests\\09a", "\177MORE tests\x009a"},
{"\\\\710more\\1215testing", "\\710more\1215testing"}, {"\\\\710more\\1215testing", "\\710more\1215testing"},
// Make sure that decoding unicode works properly, when it's been encoded as single bytes.
{"\\360\\237\\225\\264", "\U0001f574"},
{"T\\303\\234B\\304\\260TAK_UEKAE_K\\303\\266k_Sertifika_Hizmet_Sa\\304\\237lay\\304\\261c\\304\\261s\\304\\261_-_S\\303\\274r\\303\\274m_3.pem", "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem"},
// Some invalid characters...
{"\\377\\2\\225\\264", "\xff\x02\x95\xb4"},
} { } {
got, err := Unvis(test.input, DefaultVisFlags) got, err := Unvis(test.input, DefaultVisFlags)
if err != nil { if err != nil {
@ -57,8 +76,11 @@ func TestUnvisHexEscape(t *testing.T) {
{"this is a test\\x13\\x52\\x6f", "this is a test\x13\x52\x6f"}, {"this is a test\\x13\\x52\\x6f", "this is a test\x13\x52\x6f"},
{"\\x170YET\\x01a\\x22nother test\\x11", "\x170YET\x01a\x22nother test\x11"}, {"\\x170YET\\x01a\\x22nother test\\x11", "\x170YET\x01a\x22nother test\x11"},
{"\\\\x007more\\\\x215testing", "\\x007more\\x215testing"}, {"\\\\x007more\\\\x215testing", "\\x007more\\x215testing"},
// Make sure that decoding unicode works properly. // Make sure that decoding unicode works properly, when it's been encoded as single bytes.
{"\\xf0\\x9f\\x95\\xb4", "\U0001f574"}, {"\\xf0\\x9f\\x95\\xb4", "\U0001f574"},
{"T\\xc3\\x9cB\\xc4\\xb0TAK_UEKAE_K\\xc3\\xb6k_Sertifika_Hizmet_Sa\\xc4\\x9flay\\xc4\\xb1c\\xc4\\xb1s\\xc4\\xb1_-_S\\xc3\\xbcr\\xc3\\xbcm_3.pem", "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem"},
// Some invalid characters...
{"\\xff\\x02\\x95\\xb4", "\xff\x02\x95\xb4"},
} { } {
got, err := Unvis(test.input, DefaultVisFlags) got, err := Unvis(test.input, DefaultVisFlags)
if err != nil { if err != nil {