vis: rewrite to use byte slices

This results in all multi-byte characters being encoded in a way that
naive unvis(3) implementations will not bork up. In addition, it
ensures that the output of Vis will always be ASCII *only*.

Also test far more cases in *_test.go when it comes to different flags,
and do far more tests to ensure that the output of Vis() makes sense.
These outputs come directly from vis(3) and so are useful regression
tests to ensure that the handling of Vis() is identical to the original.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
Aleksa Sarai 2017-02-14 03:52:52 +11:00
parent c2a9f1a56d
commit 4045484afb
No known key found for this signature in database
GPG Key ID: 9E18AA267DDB8DB4
3 changed files with 171 additions and 82 deletions

View File

@ -23,15 +23,16 @@ type VisFlag uint
// mtree only uses one set of flags, implementing them all is necessary in
// order to have compatibility with BSD's vis() and unvis() commands.
const (
VisOctal VisFlag = (1 << iota) // VIS_OCTAL: Use octal \ddd format.
VisCStyle // VIS_CSTYLE: Use \[nrft0..] where appropriate.
VisSpace // VIS_SP: Also encode space.
VisTab // VIS_TAB: Also encode tab.
VisNewline // VIS_NL: Also encode newline.
VisSafe // VIS_SAFE: Encode unsafe characters.
VisNoSlash // VIS_NOSLASH: Inhibit printing '\'.
VisHTTPStyle // VIS_HTTPSTYLE: HTTP-style escape %xx.
VisGlob // VIS_GLOB: Encode glob(3) magics.
VisOctal VisFlag = (1 << iota) // VIS_OCTAL: Use octal \ddd format.
VisCStyle // VIS_CSTYLE: Use \[nrft0..] where appropriate.
VisSpace // VIS_SP: Also encode space.
VisTab // VIS_TAB: Also encode tab.
VisNewline // VIS_NL: Also encode newline.
VisSafe // VIS_SAFE: Encode unsafe characters.
VisNoSlash // VIS_NOSLASH: Inhibit printing '\'.
VisHTTPStyle // VIS_HTTPSTYLE: HTTP-style escape %xx.
VisGlob // VIS_GLOB: Encode glob(3) magics.
visMask VisFlag = (1 << iota) - 1 // Mask of all flags.
VisWhite VisFlag = (VisSpace | VisTab | VisNewline)
)

145
vis.go
View File

@ -20,7 +20,6 @@ package govis
import (
"fmt"
"unicode"
"unicode/utf8"
)
func isunsafe(ch rune) bool {
@ -31,7 +30,15 @@ func isglob(ch rune) bool {
return ch == '*' || ch == '?' || ch == '[' || ch == '#'
}
// ishttp is defined by RFC 1808.
func ishttp(ch rune) bool {
// RFC1808 does not really consider characters outside of ASCII, so just to
// be safe always treat characters outside the ASCII character set as "not
// HTTP".
if ch > unicode.MaxASCII {
return false
}
return unicode.IsDigit(ch) || unicode.IsLetter(ch) ||
// Safe characters.
ch == '$' || ch == '-' || ch == '_' || ch == '.' || ch == '+' ||
@ -40,54 +47,56 @@ func ishttp(ch rune) bool {
ch == ')' || ch == ','
}
func mapRuneBytes(ch rune, fn func(byte) string) string {
bytes := make([]byte, utf8.RuneLen(ch))
n := utf8.EncodeRune(bytes, ch)
mapped := ""
for i := 0; i < n; i++ {
mapped += fn(bytes[i])
}
return mapped
// isgraph reports whether ch lies within the ASCII range, is not whitespace
// (per unicode.IsSpace) and is graphic (per unicode.IsGraphic).
func isgraph(ch rune) bool {
	// Anything beyond ASCII is never treated as graphical here.
	if ch > unicode.MaxASCII {
		return false
	}
	if unicode.IsSpace(ch) {
		return false
	}
	return unicode.IsGraphic(ch)
}
// vis converts a single rune into its encoding, ensuring that it is "safe"
// (for some definition of safe). Note that some visual characters (such as
// accented characters or similar things) can be made up of several runes -- in
// order to maintain my sanity Vis() makes no attempt to handle such cases
// specially.
func vis(ch rune, flag VisFlag) (string, error) {
// XXX: Currently we are just allowing regular multi-byte characters such
// as accents and so on to be passed through without encoding. Is this
// really the best idea? In order to maintain compatibility with
// vis(3) such that an older unvis(3) will do the right thing maybe we
// should only output 7-bit ASCII? I'm not sure.
// vis converts a single *byte* into its encoding. While Go supports the
// concept of runes (and thus native utf-8 parsing), we operate on bytes in
// order to make sure that the bit-stream will be completely maintained
// through an Unvis(Vis(...)) round-trip. The downside is that Vis() will
// never output unicode -- but on the plus side this is actually a benefit on
// the encoding side (it will always work with the simple unvis(3)
// implementation). It also means that we don't have to worry about different
// multi-byte encodings.
func vis(b byte, flag VisFlag) (string, error) {
// Treat the single-byte character as a rune.
ch := rune(b)
// XXX: This is quite a horrible thing to support.
if flag&VisHTTPStyle == VisHTTPStyle {
// This is described in RFC 1808.
if !ishttp(ch) {
return mapRuneBytes(ch, func(b byte) string {
return fmt.Sprintf("%.2X", b)
}), nil
return "%" + fmt.Sprintf("%.2X", ch), nil
}
}
// Handle all "ordinary" characters which don't need to be encoded.
if !(flag&VisGlob == VisGlob && isglob(ch)) &&
((unicode.IsGraphic(ch) && !unicode.IsSpace(ch)) ||
(flag&VisSpace == 0 && ch == ' ') ||
(flag&VisTab == 0 && ch == '\t') ||
(flag&VisNewline == 0 && ch == '\n') ||
(flag&VisSafe == VisSafe && isunsafe(ch))) {
enc := string(ch)
// Figure out if the character doesn't need to be encoded. Effectively, we
// encode most "normal" (graphical) characters as themselves unless we have
// been specifically asked not to. Note though that we *ALWAYS* encode
// everything outside ASCII.
// TODO: Switch this to much more logical code.
if ch > unicode.MaxASCII {
/* ... */
} else if flag&VisGlob == VisGlob && isglob(ch) {
/* ... */
} else if isgraph(ch) ||
(flag&VisSpace != VisSpace && ch == ' ') ||
(flag&VisTab != VisTab && ch == '\t') ||
(flag&VisNewline != VisNewline && ch == '\n') ||
(flag&VisSafe != 0 && isunsafe(ch)) {
encoded := string(ch)
if ch == '\\' && flag&VisNoSlash == 0 {
enc += "\\"
encoded += "\\"
}
return enc, nil
return encoded, nil
}
// Try to use C-style escapes first.
if flag&VisCStyle == VisCStyle {
switch ch {
case ' ':
return "\\s", nil
case '\n':
return "\\n", nil
case '\r':
@ -102,55 +111,61 @@ func vis(ch rune, flag VisFlag) (string, error) {
return "\\t", nil
case '\f':
return "\\f", nil
case 0:
// TODO: Handle isoctal properly.
case '\x00':
// Output octal just to be safe.
return "\\000", nil
}
}
// TODO: ch & 0177 is not implemented...
if flag&VisOctal == VisOctal || unicode.IsGraphic(ch) {
return mapRuneBytes(ch, func(b byte) string {
return fmt.Sprintf("\\%.3o", b)
}), nil
// For graphical characters we generate octal output (and also if it's
// being forced by the caller's flags). Also spaces should always be
// encoded as octal.
if flag&VisOctal == VisOctal || isgraph(ch) || ch&0x7f == ' ' {
// Always output three-character octal just to be safe.
return fmt.Sprintf("\\%.3o", ch), nil
}
return mapRuneBytes(ch, func(b byte) string {
enc := ""
if flag&VisNoSlash == 0 {
enc += "\\"
}
// Now we have to output meta or ctrl escapes. As far as I can tell, this
// is not actually defined by any standard -- so this logic is basically
// copied from the original vis(3) implementation. Hopefully nobody
// actually relies on this (octal and hex are better).
// This logic is stolen from cvis, I don't understand any of it.
if b&0200 != 0 {
b &= 0177
enc += "M"
}
if unicode.IsControl(rune(b)) {
enc += "^"
if b == 0177 {
enc += "?"
} else {
enc += string(b + '@')
}
encoded := ""
if flag&VisNoSlash == 0 {
encoded += "\\"
}
// Meta characters have 0x80 set, but are otherwise identical to control
// characters.
if b&0x80 != 0 {
b &= 0x7f
encoded += "M"
}
if unicode.IsControl(rune(b)) {
encoded += "^"
if b == 0x7f {
encoded += "?"
} else {
enc += fmt.Sprintf("-%s", b)
encoded += fmt.Sprintf("%c", b+'@')
}
} else {
encoded += fmt.Sprintf("-%c", b)
}
return enc
}), nil
return encoded, nil
}
// Vis encodes the provided string to a BSD-compatible encoding using BSD's
// vis() flags. However, it will correctly handle multi-byte encoding (which is
// not done properly by BSD's vis implementation).
func Vis(src string, flag VisFlag) (string, error) {
if !utf8.ValidString(src) {
return "", fmt.Errorf("vis: input string is invalid utf8 literal")
if flag&visMask != flag {
return "", fmt.Errorf("vis: flag %q contains unknown or unsupported flags", flag)
}
output := ""
for _, ch := range src {
for _, ch := range []byte(src) {
encodedCh, err := vis(ch, flag)
if err != nil {
return "", err

View File

@ -22,18 +22,90 @@ import (
)
func TestVisUnchanged(t *testing.T) {
for _, test := range []string{
"helloworld",
"THIS_IS_A_TEST1234",
"SomeEncodingsAreCool",
"AC_Raíz_Certicámara_S.A..pem",
for _, test := range []struct {
input string
flag VisFlag
}{
{"", DefaultVisFlags},
{"helloworld", DefaultVisFlags},
{"THIS_IS_A_TEST1234", DefaultVisFlags},
{"SomeEncodingsAreCool", DefaultVisFlags},
{"spaces are totally safe", DefaultVisFlags &^ VisSpace},
{"tabs\tare\talso\tsafe!!", DefaultVisFlags &^ VisTab},
{"just\a\atrustme\r\b\b!!", DefaultVisFlags | VisSafe},
} {
enc, err := Vis(test, DefaultVisFlags)
enc, err := Vis(test.input, test.flag)
if err != nil {
t.Errorf("unexpected error with %q: %s", test, err)
}
if enc != test {
t.Errorf("expected encoding of %q to be unchanged, got %q", test, enc)
if enc != test.input {
t.Errorf("expected encoding of %q (flag=%q) to be unchanged, got %q", test.input, test.flag, enc)
}
}
}
// TestVisFlags is a table-driven regression test: every entry is encoded with
// Vis() using the given flag set and the result is compared against the
// expected string.
//
// Vis() encodes its input byte-by-byte, so whitespace inside the inputs is
// significant: each space in an input corresponds to exactly one space escape
// (or one literal space) in the expected output.
func TestVisFlags(t *testing.T) {
	for _, test := range []struct {
		input  string
		output string
		flag   VisFlag
	}{
		// Default
		{"AC_Ra\u00edz_Certic\u00e1mara_S.A..pem", "AC_Ra\\M-C\\M--z_Certic\\M-C\\M-!mara_S.A..pem", 0},
		{"z^i3i$\u00d3\u008anqgh5/t\u00e5<86>\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\M-C\M^S\M-B\M^Jnqgh5/t\M-C\M-%<86>\M-B\M-2kzla\\e^lv\M-C\M^_\M-B\M^Snv\M-C\M^_\M-B\M-.a|3}\M-C\M^X\M-B\M^H\M-C\M^V\M-B\M^D`, 0},
		{"@?e1xs+.R_Kjo]7s8pgRP:*nXCE4{!c", "@?e1xs+.R_Kjo]7s8pgRP:*nXCE4{!c", 0},
		{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\M-C\M^F\M-C\M^F2\M-B\M-.\M-B\M-7m\M-C\M^[\M-C\M^Cr^\M-B\M-?p\M-C\M^Fu'q\M-C\M-;c2\M-C\M-0u\M-B\M-8\M-C\M^]\M-C\M-(v\M-C\M-?\M-B\M-0\M-C\M^\\M-C\M^B\M-C\M-53\M-C\M^[-k\M-C\M-2sd4\\p\M-C\M^Z\M-B\M-&\M-C\M^S\M-C\M-.a<\M-C\M-&s{\M-B\240p\M-C\M-0\M-C\M-?j\M-C\240\M-C\M-(\M-B\M-8\M-B\M-8\M-B\M-<\M-C\M-<b`, 0},
		{"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\M-i\M^@\M^C"9v1)T798|o;fly jnKX\M-R\M^IBe=`, 0},
		// VisOctal
		{"", "", VisOctal},
		{"\022", "\\022", VisOctal},
		{"\n \t", "\\012\\040\t", VisNewline | VisSpace | VisOctal},
		{"\x12\f\a\n\v\b \U00012312", "\\022\\014\\007\n\\013\\010 \\360\\222\\214\\222", VisOctal},
		{"AC_Ra\u00edz_Certic\u00e1mara_S.A..pem", "AC_Ra\\303\\255z_Certic\\303\\241mara_S.A..pem", VisOctal},
		{"z^i3i$\u00d3\u008anqgh5/t\u00e5<86>\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\303\223\302\212nqgh5/t\303\245<86>\302\262kzla\\e^lv\303\237\302\223nv\303\237\302\256a|3}\303\230\302\210\303\226\302\204`, VisOctal},
		{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\303\206\303\2062\302\256\302\267m\303\233\303\203r^\302\277p\303\206u'q\303\273c2\303\260u\302\270\303\235\303\250v\303\277\302\260\303\234\303\202\303\2653\303\233-k\303\262sd4\\p\303\232\302\246\303\223\303\256a<\303\246s{\302\240p\303\260\303\277j\303\240\303\250\302\270\302\270\302\274\303\274b`, VisOctal},
		{"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\351\200\203"9v1)T798|o;fly jnKX\322\211Be=`, VisOctal},
		// VisCStyle
		{"\x00 \f \a \n\v\b \r \t\r", "\\000 \\f \\a \n\\v\\b \\r \t\\r", VisCStyle},
		{"\t \n\v\b", "\\t \n\\v\\b", VisTab | VisCStyle},
		// Three input spaces, one per "\s" in the expected output.
		{"\n\v\t   ", "\n\\v\t\\s\\s\\s", VisSpace | VisCStyle},
		{"\n \n ", "\\n \\n ", VisNewline | VisCStyle},
		{"z^i3i$\u00d3\u008anqgh5/t\u00e5<86>\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\M-C\M^S\M-B\M^Jnqgh5/t\M-C\M-%<86>\M-B\M-2kzla\\e^lv\M-C\M^_\M-B\M^Snv\M-C\M^_\M-B\M-.a|3}\M-C\M^X\M-B\M^H\M-C\M^V\M-B\M^D`, VisCStyle},
		{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\M-C\M^F\M-C\M^F2\M-B\M-.\M-B\M-7m\M-C\M^[\M-C\M^Cr^\M-B\M-?p\M-C\M^Fu'q\M-C\M-;c2\M-C\M-0u\M-B\M-8\M-C\M^]\M-C\M-(v\M-C\M-?\M-B\M-0\M-C\M^\\M-C\M^B\M-C\M-53\M-C\M^[-k\M-C\M-2sd4\\p\M-C\M^Z\M-B\M-&\M-C\M^S\M-C\M-.a<\M-C\M-&s{\M-B\240p\M-C\M-0\M-C\M-?j\M-C\240\M-C\M-(\M-B\M-8\M-B\M-8\M-B\M-<\M-C\M-<b`, VisCStyle},
		{"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\M-i\M^@\M^C"9v1)T798|o;fly\sjnKX\M-R\M^IBe=`, VisCStyle | VisSpace},
		// VisSpace
		// Two input spaces, one per "\040" in the expected output.
		{"  ", `\040\040`, VisSpace},
		{"\t \t", "\t\\040\t", VisSpace},
		// Three spaces after the literal "\040" and three trailing spaces.
		{"\\040   plenty of characters here   ", `\\040\040\040\040plenty\040of\040characters\040here\040\040\040`, VisSpace},
		{"Js9L\u00cd\u00b2o?4824y'$|P}FIr%mW /KL9$]~", `Js9L\M-C\M^M\M-B\M-2o?4824y'$|P}FIr%mW\040/KL9$]~`, VisWhite},
		{"1\u00c6\u00abTcz+Vda?)k1%\\\"P;`po`h", `1\M-C\M^F\M-B\M-+Tcz+Vda?)k1%\\"P;` + "`po`" + `h`, VisWhite},
		{"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\M-i\M^@\M^C"9v1)T798|o;fly\040jnKX\M-R\M^IBe=`, VisSpace},
		// VisTab
		{"\t \v", "\\^I \\^K", VisTab},
		{"\t \v", "\\011 \\013", VisTab | VisOctal},
		// VisNewline
		{"\t\n \v\r\n", "\t\\^J \\^K\\^M\\^J", VisNewline},
		{"\t\n \v\r\n", "\t\\012 \\013\\015\\012", VisNewline | VisOctal},
		// VisSafe
		// VisHTTPStyle
		// Two input spaces, one per "%20" in the expected output.
		{"\x12\f\a\n\v\b  \U00012312", `%12%0C%07%0A%0B%08%20%20%F0%92%8C%92`, VisHTTPStyle},
		{"1\u00c6\u00abTcz+Vda?)k1%\\\"P;`po`h", `1%C3%86%C2%ABTcz+Vda%3F)k1%25%5C%22P%3B%60po%60h`, VisHTTPStyle},
		{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_%C3%86%C3%862%C2%AE%C2%B7m%C3%9B%C3%83r%5E%C2%BFp%C3%86u'q%C3%BBc2%C3%B0u%C2%B8%C3%9D%C3%A8v%C3%BF%C2%B0%C3%9C%C3%82%C3%B53%C3%9B-k%C3%B2sd4%5Cp%C3%9A%C2%A6%C3%93%C3%AEa%3C%C3%A6s%7B%C2%A0p%C3%B0%C3%BFj%C3%A0%C3%A8%C2%B8%C2%B8%C2%BC%C3%BCb`, VisHTTPStyle},
		{"'3Ze\u050e|\u02del\u069du-Rpct4+Z5b={@_{b", `'3Ze%D4%8E%7C%CB%9El%DA%9Du-Rpct4+Z5b%3D%7B%40_%7Bb`, VisHTTPStyle},
		// VisGlob
		{"cat /proc/**/status | grep '[pid]' ;; # cool code here", `cat /proc/\052\052/status | grep '\133pid]' ;; \043 cool code here`, VisGlob},
		{"@?e1xs+.R_Kjo]7s8pgRP:*nXCE4{!c", `@\077e1xs+.R_Kjo]7s8pgRP:\052nXCE4{!c`, VisGlob},
		{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\M-C\M^F\M-C\M^F2\M-B\M-.\M-B\M-7m\M-C\M^[\M-C\M^Cr^\M-B\M-?p\M-C\M^Fu'q\M-C\M-;c2\M-C\M-0u\M-B\M-8\M-C\M^]\M-C\M-(v\M-C\M-?\M-B\M-0\M-C\M^\\M-C\M^B\M-C\M-53\M-C\M^[-k\M-C\M-2sd4\\p\M-C\M^Z\M-B\M-&\M-C\M^S\M-C\M-.a<\M-C\M-&s{\M-B\240p\M-C\M-0\M-C\M-?j\M-C\240\M-C\M-(\M-B\M-8\M-B\M-8\M-B\M-<\M-C\M-<b`, VisGlob},
		{"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\303\206\303\2062\302\256\302\267m\303\233\303\203r^\302\277p\303\206u'q\303\273c2\303\260u\302\270\303\235\303\250v\303\277\302\260\303\234\303\202\303\2653\303\233-k\303\262sd4\\p\303\232\302\246\303\223\303\256a<\303\246s{\302\240p\303\260\303\277j\303\240\303\250\302\270\302\270\302\274\303\274b`, VisGlob | VisOctal},
		{"'3Ze\u050e|\u02del\u069du-Rpct4+Z5b={@_{b", `'3Ze\M-T\M^N|\M-K\M^^l\M-Z\M^]u-Rpct4+Z5b={@_{b`, VisGlob},
		{"'3Ze\u050e|\u02del\u069du-Rpct4+Z5b={@_{b", `'3Ze\324\216|\313\236l\332\235u-Rpct4+Z5b={@_{b`, VisGlob | VisOctal},
	} {
		enc, err := Vis(test.input, test.flag)
		if err != nil {
			// Report the offending input and flags rather than the whole
			// struct, and skip the comparison: the output of a failed call
			// carries no meaning.
			t.Errorf("unexpected error with %q (flag=%b): %s", test.input, test.flag, err)
			continue
		}
		if enc != test.output {
			t.Errorf("expected vis(%q, flag=%b) = %q, got %q", test.input, test.flag, test.output, enc)
		}
	}
}
@ -42,6 +114,7 @@ func TestVisChanged(t *testing.T) {
for _, test := range []string{
"hello world",
"THIS\\IS_A_TEST1234",
"AC_Ra\u00edz_Certic\u00e1mara_S.A..pem",
} {
enc, err := Vis(test, DefaultVisFlags)
if err != nil {