vis: rewrite to use byte slices

This results in all multi-byte characters being encoded in a way that
naive unvis(3) implementations will not bork up. In addition, it
ensures that the output of Vis will always be ASCII *only*.

Also test far more cases in *_test.go when it comes to different flags,
and do far more tests to ensure that the output of Vis() makes sense.
These outputs come directly from vis(3) and so are useful regression
tests to ensure that the handling of Vis() is identical to the original.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
Aleksa Sarai 2017-02-14 03:52:52 +11:00
parent c2a9f1a56d
commit 4045484afb
No known key found for this signature in database
GPG key ID: 9E18AA267DDB8DB4
3 changed files with 171 additions and 82 deletions

View file

@ -32,6 +32,7 @@ const (
VisNoSlash // VIS_NOSLASH: Inhibit printing '\'. VisNoSlash // VIS_NOSLASH: Inhibit printing '\'.
VisHTTPStyle // VIS_HTTPSTYLE: HTTP-style escape %xx. VisHTTPStyle // VIS_HTTPSTYLE: HTTP-style escape %xx.
VisGlob // VIS_GLOB: Encode glob(3) magics. VisGlob // VIS_GLOB: Encode glob(3) magics.
visMask VisFlag = (1 << iota) - 1 // Mask of all flags.
VisWhite VisFlag = (VisSpace | VisTab | VisNewline) VisWhite VisFlag = (VisSpace | VisTab | VisNewline)
) )

133
vis.go
View file

@ -20,7 +20,6 @@ package govis
import ( import (
"fmt" "fmt"
"unicode" "unicode"
"unicode/utf8"
) )
func isunsafe(ch rune) bool { func isunsafe(ch rune) bool {
@ -31,7 +30,15 @@ func isglob(ch rune) bool {
return ch == '*' || ch == '?' || ch == '[' || ch == '#' return ch == '*' || ch == '?' || ch == '[' || ch == '#'
} }
// ishttp is defined by RFC 1808.
func ishttp(ch rune) bool { func ishttp(ch rune) bool {
// RFC1808 does not really consider characters outside of ASCII, so just to
// be safe always treat characters outside the ASCII character set as "not
// HTTP".
if ch > unicode.MaxASCII {
return false
}
return unicode.IsDigit(ch) || unicode.IsLetter(ch) || return unicode.IsDigit(ch) || unicode.IsLetter(ch) ||
// Safe characters. // Safe characters.
ch == '$' || ch == '-' || ch == '_' || ch == '.' || ch == '+' || ch == '$' || ch == '-' || ch == '_' || ch == '.' || ch == '+' ||
@ -40,54 +47,56 @@ func ishttp(ch rune) bool {
ch == ')' || ch == ',' ch == ')' || ch == ','
} }
func mapRuneBytes(ch rune, fn func(byte) string) string { func isgraph(ch rune) bool {
bytes := make([]byte, utf8.RuneLen(ch)) return unicode.IsGraphic(ch) && !unicode.IsSpace(ch) && ch <= unicode.MaxASCII
n := utf8.EncodeRune(bytes, ch)
mapped := ""
for i := 0; i < n; i++ {
mapped += fn(bytes[i])
}
return mapped
} }
// vis converts a single rune into its encoding, ensuring that it is "safe" // vis converts a single *byte* into its encoding. While Go supports the
// (for some definition of safe). Note that some visual characters (such as // concept of runes (and thus native utf-8 parsing), in order to make sure that
// accented characters or similar things) can be made up of several runes -- in // the bit-stream will be completely maintained through an Unvis(Vis(...))
// order to maintain my sanity Vis() makes no attempt to handle such cases // round-trip. The downside is that Vis() will never output unicode -- but on
// specially. // the plus side this is actually a benefit on the encoding side (it will
func vis(ch rune, flag VisFlag) (string, error) { // always work with the simple unvis(3) implementation). It also means that we
// XXX: Currently we are just allowing regular multi-byte characters such // don't have to worry about different multi-byte encodings.
// as accents and so on to be passed through without encoding. Is this func vis(b byte, flag VisFlag) (string, error) {
// really the best idea? In order to maintain compatibility with // Treat the single-byte character as a rune.
// vis(3) such that an older unvis(3) will do the right thing maybe we ch := rune(b)
// should only output 7-bit ASCII? I'm not sure.
// XXX: This is quite a horrible thing to support.
if flag&VisHTTPStyle == VisHTTPStyle { if flag&VisHTTPStyle == VisHTTPStyle {
// This is described in RFC 1808.
if !ishttp(ch) { if !ishttp(ch) {
return mapRuneBytes(ch, func(b byte) string { return "%" + fmt.Sprintf("%.2X", ch), nil
return fmt.Sprintf("%.2X", b)
}), nil
} }
} }
// Handle all "ordinary" characters which don't need to be encoded. // Figure out if the character doesn't need to be encoded. Effectively, we
if !(flag&VisGlob == VisGlob && isglob(ch)) && // encode most "normal" (graphical) characters as themselves unless we have
((unicode.IsGraphic(ch) && !unicode.IsSpace(ch)) || // been specifically asked not to. Note though that we *ALWAYS* encode
(flag&VisSpace == 0 && ch == ' ') || // everything outside ASCII.
(flag&VisTab == 0 && ch == '\t') || // TODO: Switch this to much more logical code.
(flag&VisNewline == 0 && ch == '\n') ||
(flag&VisSafe == VisSafe && isunsafe(ch))) { if ch > unicode.MaxASCII {
enc := string(ch) /* ... */
} else if flag&VisGlob == VisGlob && isglob(ch) {
/* ... */
} else if isgraph(ch) ||
(flag&VisSpace != VisSpace && ch == ' ') ||
(flag&VisTab != VisTab && ch == '\t') ||
(flag&VisNewline != VisNewline && ch == '\n') ||
(flag&VisSafe != 0 && isunsafe(ch)) {
encoded := string(ch)
if ch == '\\' && flag&VisNoSlash == 0 { if ch == '\\' && flag&VisNoSlash == 0 {
enc += "\\" encoded += "\\"
} }
return enc, nil return encoded, nil
} }
// Try to use C-style escapes first.
if flag&VisCStyle == VisCStyle { if flag&VisCStyle == VisCStyle {
switch ch { switch ch {
case ' ':
return "\\s", nil
case '\n': case '\n':
return "\\n", nil return "\\n", nil
case '\r': case '\r':
@ -102,55 +111,61 @@ func vis(ch rune, flag VisFlag) (string, error) {
return "\\t", nil return "\\t", nil
case '\f': case '\f':
return "\\f", nil return "\\f", nil
case 0: case '\x00':
// TODO: Handle isoctal properly. // Output octal just to be safe.
return "\\000", nil return "\\000", nil
} }
} }
// TODO: ch & 0177 is not implemented... // For graphical characters we generate octal output (and also if it's
if flag&VisOctal == VisOctal || unicode.IsGraphic(ch) { // being forced by the caller's flags). Also spaces should always be
return mapRuneBytes(ch, func(b byte) string { // encoded as octal.
return fmt.Sprintf("\\%.3o", b) if flag&VisOctal == VisOctal || isgraph(ch) || ch&0x7f == ' ' {
}), nil // Always output three-character octal just to be safe.
return fmt.Sprintf("\\%.3o", ch), nil
} }
return mapRuneBytes(ch, func(b byte) string { // Now we have to output meta or ctrl escapes. As far as I can tell, this
enc := "" // is not actually defined by any standard -- so this logic is basically
// copied from the original vis(3) implementation. Hopefully nobody
// actually relies on this (octal and hex are better).
encoded := ""
if flag&VisNoSlash == 0 { if flag&VisNoSlash == 0 {
enc += "\\" encoded += "\\"
} }
// This logic is stolen from cvis, I don't understand any of it. // Meta characters have 0x80 set, but are otherwise identical to control
if b&0200 != 0 { // characters.
b &= 0177 if b&0x80 != 0 {
enc += "M" b &= 0x7f
encoded += "M"
} }
if unicode.IsControl(rune(b)) { if unicode.IsControl(rune(b)) {
enc += "^" encoded += "^"
if b == 0177 { if b == 0x7f {
enc += "?" encoded += "?"
} else { } else {
enc += string(b + '@') encoded += fmt.Sprintf("%c", b+'@')
} }
} else { } else {
enc += fmt.Sprintf("-%s", b) encoded += fmt.Sprintf("-%c", b)
} }
return enc return encoded, nil
}), nil
} }
// Vis encodes the provided string to a BSD-compatible encoding using BSD's // Vis encodes the provided string to a BSD-compatible encoding using BSD's
// vis() flags. However, it will correctly handle multi-byte encoding (which is // vis() flags. However, it will correctly handle multi-byte encoding (which is
// not done properly by BSD's vis implementation). // not done properly by BSD's vis implementation).
func Vis(src string, flag VisFlag) (string, error) { func Vis(src string, flag VisFlag) (string, error) {
if !utf8.ValidString(src) { if flag&visMask != flag {
return "", fmt.Errorf("vis: input string is invalid utf8 literal") return "", fmt.Errorf("vis: flag %q contains unknown or unsupported flags", flag)
} }
output := "" output := ""
for _, ch := range src { for _, ch := range []byte(src) {
encodedCh, err := vis(ch, flag) encodedCh, err := vis(ch, flag)
if err != nil { if err != nil {
return "", err return "", err

View file

@ -22,18 +22,90 @@ import (
) )
func TestVisUnchanged(t *testing.T) { func TestVisUnchanged(t *testing.T) {
for _, test := range []string{ for _, test := range []struct {
"helloworld", input string
"THIS_IS_A_TEST1234", flag VisFlag
"SomeEncodingsAreCool",
"AC_Raíz_Certicámara_S.A..pem",
}{ }{
enc, err := Vis(test, DefaultVisFlags) {"", DefaultVisFlags},
{"helloworld", DefaultVisFlags},
{"THIS_IS_A_TEST1234", DefaultVisFlags},
{"SomeEncodingsAreCool", DefaultVisFlags},
{"spaces are totally safe", DefaultVisFlags &^ VisSpace},
{"tabs\tare\talso\tsafe!!", DefaultVisFlags &^ VisTab},
{"just\a\atrustme\r\b\b!!", DefaultVisFlags | VisSafe},
} {
enc, err := Vis(test.input, test.flag)
if err != nil { if err != nil {
t.Errorf("unexpected error with %q: %s", test, err) t.Errorf("unexpected error with %q: %s", test, err)
} }
if enc != test { if enc != test.input {
t.Errorf("expected encoding of %q to be unchanged, got %q", test, enc) t.Errorf("expected encoding of %q (flag=%q) to be unchanged, got %q", test.input, test.flag, enc)
}
}
}
func TestVisFlags(t *testing.T) {
for _, test := range []struct {
input string
output string
flag VisFlag
}{
// Default
{"AC_Ra\u00edz_Certic\u00e1mara_S.A..pem", "AC_Ra\\M-C\\M--z_Certic\\M-C\\M-!mara_S.A..pem", 0},
{"z^i3i$\u00d3\u008anqgh5/t\u00e5<86>\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\M-C\M^S\M-B\M^Jnqgh5/t\M-C\M-%<86>\M-B\M-2kzla\\e^lv\M-C\M^_\M-B\M^Snv\M-C\M^_\M-B\M-.a|3}\M-C\M^X\M-B\M^H\M-C\M^V\M-B\M^D`, 0},
{"@?e1xs+.R_Kjo]7s8pgRP:*nXCE4{!c", "@?e1xs+.R_Kjo]7s8pgRP:*nXCE4{!c", 0},
{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\M-C\M^F\M-C\M^F2\M-B\M-.\M-B\M-7m\M-C\M^[\M-C\M^Cr^\M-B\M-?p\M-C\M^Fu'q\M-C\M-;c2\M-C\M-0u\M-B\M-8\M-C\M^]\M-C\M-(v\M-C\M-?\M-B\M-0\M-C\M^\\M-C\M^B\M-C\M-53\M-C\M^[-k\M-C\M-2sd4\\p\M-C\M^Z\M-B\M-&\M-C\M^S\M-C\M-.a<\M-C\M-&s{\M-B\240p\M-C\M-0\M-C\M-?j\M-C\240\M-C\M-(\M-B\M-8\M-B\M-8\M-B\M-<\M-C\M-<b`, 0},
{"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\M-i\M^@\M^C"9v1)T798|o;fly jnKX\M-R\M^IBe=`, 0},
// VisOctal
{"", "", VisOctal},
{"\022", "\\022", VisOctal},
{"\n \t", "\\012\\040\t", VisNewline | VisSpace | VisOctal},
{"\x12\f\a\n\v\b \U00012312", "\\022\\014\\007\n\\013\\010 \\360\\222\\214\\222", VisOctal},
{"AC_Ra\u00edz_Certic\u00e1mara_S.A..pem", "AC_Ra\\303\\255z_Certic\\303\\241mara_S.A..pem", VisOctal},
{"z^i3i$\u00d3\u008anqgh5/t\u00e5<86>\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\303\223\302\212nqgh5/t\303\245<86>\302\262kzla\\e^lv\303\237\302\223nv\303\237\302\256a|3}\303\230\302\210\303\226\302\204`, VisOctal},
{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\303\206\303\2062\302\256\302\267m\303\233\303\203r^\302\277p\303\206u'q\303\273c2\303\260u\302\270\303\235\303\250v\303\277\302\260\303\234\303\202\303\2653\303\233-k\303\262sd4\\p\303\232\302\246\303\223\303\256a<\303\246s{\302\240p\303\260\303\277j\303\240\303\250\302\270\302\270\302\274\303\274b`, VisOctal},
{"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\351\200\203"9v1)T798|o;fly jnKX\322\211Be=`, VisOctal},
// VisCStyle
{"\x00 \f \a \n\v\b \r \t\r", "\\000 \\f \\a \n\\v\\b \\r \t\\r", VisCStyle},
{"\t \n\v\b", "\\t \n\\v\\b", VisTab | VisCStyle},
{"\n\v\t ", "\n\\v\t\\s\\s\\s", VisSpace | VisCStyle},
{"\n \n ", "\\n \\n ", VisNewline | VisCStyle},
{"z^i3i$\u00d3\u008anqgh5/t\u00e5<86>\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\M-C\M^S\M-B\M^Jnqgh5/t\M-C\M-%<86>\M-B\M-2kzla\\e^lv\M-C\M^_\M-B\M^Snv\M-C\M^_\M-B\M-.a|3}\M-C\M^X\M-B\M^H\M-C\M^V\M-B\M^D`, VisCStyle},
{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\M-C\M^F\M-C\M^F2\M-B\M-.\M-B\M-7m\M-C\M^[\M-C\M^Cr^\M-B\M-?p\M-C\M^Fu'q\M-C\M-;c2\M-C\M-0u\M-B\M-8\M-C\M^]\M-C\M-(v\M-C\M-?\M-B\M-0\M-C\M^\\M-C\M^B\M-C\M-53\M-C\M^[-k\M-C\M-2sd4\\p\M-C\M^Z\M-B\M-&\M-C\M^S\M-C\M-.a<\M-C\M-&s{\M-B\240p\M-C\M-0\M-C\M-?j\M-C\240\M-C\M-(\M-B\M-8\M-B\M-8\M-B\M-<\M-C\M-<b`, VisCStyle},
{"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\M-i\M^@\M^C"9v1)T798|o;fly\sjnKX\M-R\M^IBe=`, VisCStyle | VisSpace},
// VisSpace
{" ", `\040\040`, VisSpace},
{"\t \t", "\t\\040\t", VisSpace},
{"\\040 plenty of characters here ", `\\040\040\040\040plenty\040of\040characters\040here\040\040\040`, VisSpace},
{"Js9L\u00cd\u00b2o?4824y'$|P}FIr%mW /KL9$]~", `Js9L\M-C\M^M\M-B\M-2o?4824y'$|P}FIr%mW\040/KL9$]~`, VisWhite},
{"1\u00c6\u00abTcz+Vda?)k1%\\\"P;`po`h", `1\M-C\M^F\M-B\M-+Tcz+Vda?)k1%\\"P;` + "`po`" + `h`, VisWhite},
{"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\M-i\M^@\M^C"9v1)T798|o;fly\040jnKX\M-R\M^IBe=`, VisSpace},
// VisTab
{"\t \v", "\\^I \\^K", VisTab},
{"\t \v", "\\011 \\013", VisTab | VisOctal},
// VisNewline
{"\t\n \v\r\n", "\t\\^J \\^K\\^M\\^J", VisNewline},
{"\t\n \v\r\n", "\t\\012 \\013\\015\\012", VisNewline | VisOctal},
// VisSafe
// VisHTTPStyle
{"\x12\f\a\n\v\b \U00012312", `%12%0C%07%0A%0B%08%20%20%F0%92%8C%92`, VisHTTPStyle},
{"1\u00c6\u00abTcz+Vda?)k1%\\\"P;`po`h", `1%C3%86%C2%ABTcz+Vda%3F)k1%25%5C%22P%3B%60po%60h`, VisHTTPStyle},
{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_%C3%86%C3%862%C2%AE%C2%B7m%C3%9B%C3%83r%5E%C2%BFp%C3%86u'q%C3%BBc2%C3%B0u%C2%B8%C3%9D%C3%A8v%C3%BF%C2%B0%C3%9C%C3%82%C3%B53%C3%9B-k%C3%B2sd4%5Cp%C3%9A%C2%A6%C3%93%C3%AEa%3C%C3%A6s%7B%C2%A0p%C3%B0%C3%BFj%C3%A0%C3%A8%C2%B8%C2%B8%C2%BC%C3%BCb`, VisHTTPStyle},
{"'3Ze\u050e|\u02del\u069du-Rpct4+Z5b={@_{b", `'3Ze%D4%8E%7C%CB%9El%DA%9Du-Rpct4+Z5b%3D%7B%40_%7Bb`, VisHTTPStyle},
// VisGlob
{"cat /proc/**/status | grep '[pid]' ;; # cool code here", `cat /proc/\052\052/status | grep '\133pid]' ;; \043 cool code here`, VisGlob},
{"@?e1xs+.R_Kjo]7s8pgRP:*nXCE4{!c", `@\077e1xs+.R_Kjo]7s8pgRP:\052nXCE4{!c`, VisGlob},
{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\M-C\M^F\M-C\M^F2\M-B\M-.\M-B\M-7m\M-C\M^[\M-C\M^Cr^\M-B\M-?p\M-C\M^Fu'q\M-C\M-;c2\M-C\M-0u\M-B\M-8\M-C\M^]\M-C\M-(v\M-C\M-?\M-B\M-0\M-C\M^\\M-C\M^B\M-C\M-53\M-C\M^[-k\M-C\M-2sd4\\p\M-C\M^Z\M-B\M-&\M-C\M^S\M-C\M-.a<\M-C\M-&s{\M-B\240p\M-C\M-0\M-C\M-?j\M-C\240\M-C\M-(\M-B\M-8\M-B\M-8\M-B\M-<\M-C\M-<b`, VisGlob},
{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\303\206\303\2062\302\256\302\267m\303\233\303\203r^\302\277p\303\206u'q\303\273c2\303\260u\302\270\303\235\303\250v\303\277\302\260\303\234\303\202\303\2653\303\233-k\303\262sd4\\p\303\232\302\246\303\223\303\256a<\303\246s{\302\240p\303\260\303\277j\303\240\303\250\302\270\302\270\302\274\303\274b`, VisGlob | VisOctal},
{"'3Ze\u050e|\u02del\u069du-Rpct4+Z5b={@_{b", `'3Ze\M-T\M^N|\M-K\M^^l\M-Z\M^]u-Rpct4+Z5b={@_{b`, VisGlob},
{"'3Ze\u050e|\u02del\u069du-Rpct4+Z5b={@_{b", `'3Ze\324\216|\313\236l\332\235u-Rpct4+Z5b={@_{b`, VisGlob | VisOctal},
} {
enc, err := Vis(test.input, test.flag)
if err != nil {
t.Errorf("unexpected error with %q: %s", test, err)
}
if enc != test.output {
t.Errorf("expected vis(%q, flag=%b) = %q, got %q", test.input, test.flag, test.output, enc)
} }
} }
} }
@ -42,6 +114,7 @@ func TestVisChanged(t *testing.T) {
for _, test := range []string{ for _, test := range []string{
"hello world", "hello world",
"THIS\\IS_A_TEST1234", "THIS\\IS_A_TEST1234",
"AC_Ra\u00edz_Certic\u00e1mara_S.A..pem",
} { } {
enc, err := Vis(test, DefaultVisFlags) enc, err := Vis(test, DefaultVisFlags)
if err != nil { if err != nil {