diff --git a/govis.go b/govis.go index 2e6b22f..1e88eb1 100644 --- a/govis.go +++ b/govis.go @@ -23,15 +23,16 @@ type VisFlag uint // mtree only uses one set of flags, implementing them all is necessary in // order to have compatibility with BSD's vis() and unvis() commands. const ( - VisOctal VisFlag = (1 << iota) // VIS_OCTAL: Use octal \ddd format. - VisCStyle // VIS_CSTYLE: Use \[nrft0..] where appropriate. - VisSpace // VIS_SP: Also encode space. - VisTab // VIS_TAB: Also encode tab. - VisNewline // VIS_NL: Also encode newline. - VisSafe // VIS_SAFE: Encode unsafe characters. - VisNoSlash // VIS_NOSLASH: Inhibit printing '\'. - VisHTTPStyle // VIS_HTTPSTYLE: HTTP-style escape %xx. - VisGlob // VIS_GLOB: Encode glob(3) magics. + VisOctal VisFlag = (1 << iota) // VIS_OCTAL: Use octal \ddd format. + VisCStyle // VIS_CSTYLE: Use \[nrft0..] where appropriate. + VisSpace // VIS_SP: Also encode space. + VisTab // VIS_TAB: Also encode tab. + VisNewline // VIS_NL: Also encode newline. + VisSafe // VIS_SAFE: Encode unsafe characters. + VisNoSlash // VIS_NOSLASH: Inhibit printing '\'. + VisHTTPStyle // VIS_HTTPSTYLE: HTTP-style escape %xx. + VisGlob // VIS_GLOB: Encode glob(3) magics. + visMask VisFlag = (1 << iota) - 1 // Mask of all flags. VisWhite VisFlag = (VisSpace | VisTab | VisNewline) ) diff --git a/vis.go b/vis.go index d599ecb..140556a 100644 --- a/vis.go +++ b/vis.go @@ -20,7 +20,6 @@ package govis import ( "fmt" "unicode" - "unicode/utf8" ) func isunsafe(ch rune) bool { @@ -31,7 +30,15 @@ func isglob(ch rune) bool { return ch == '*' || ch == '?' || ch == '[' || ch == '#' } +// ishttp is defined by RFC 1808. func ishttp(ch rune) bool { + // RFC1808 does not really consider characters outside of ASCII, so just to + // be safe always treat characters outside the ASCII character set as "not + // HTTP". + if ch > unicode.MaxASCII { + return false + } + return unicode.IsDigit(ch) || unicode.IsLetter(ch) || // Safe characters. 
ch == '$' || ch == '-' || ch == '_' || ch == '.' || ch == '+' || @@ -40,54 +47,56 @@ func ishttp(ch rune) bool { ch == ')' || ch == ',' } -func mapRuneBytes(ch rune, fn func(byte) string) string { - bytes := make([]byte, utf8.RuneLen(ch)) - n := utf8.EncodeRune(bytes, ch) - - mapped := "" - for i := 0; i < n; i++ { - mapped += fn(bytes[i]) - } - return mapped +func isgraph(ch rune) bool { + return unicode.IsGraphic(ch) && !unicode.IsSpace(ch) && ch <= unicode.MaxASCII } -// vis converts a single rune into its encoding, ensuring that it is "safe" -// (for some definition of safe). Note that some visual characters (such as -// accented characters or similar things) can be made up of several runes -- in -// order to maintain my sanity Vis() makes no attempt to handle such cases -// specially. -func vis(ch rune, flag VisFlag) (string, error) { - // XXX: Currently we are just allowing regular multi-byte characters such - // as accents and so on to be passed through without encoding. Is this - // really the best idea? In order to maintain compatibility with - // vis(3) such that an older unvis(3) will do the right thing maybe we - // should only output 7-bit ASCII? I'm not sure. +// vis converts a single *byte* into its encoding. While Go supports the +// concept of runes (and thus native utf-8 parsing), we work on raw bytes so +// that the bit-stream will be completely maintained through an Unvis(Vis(...)) +// round-trip. The downside is that Vis() will never output unicode -- but on +// the plus side this is actually a benefit on the encoding side (it will +// always work with the simple unvis(3) implementation). It also means that we +// don't have to worry about different multi-byte encodings. +func vis(b byte, flag VisFlag) (string, error) { + // Treat the single-byte character as a rune. + ch := rune(b) + // XXX: This is quite a horrible thing to support. if flag&VisHTTPStyle == VisHTTPStyle { - // This is described in RFC 1808. 
if !ishttp(ch) { - return mapRuneBytes(ch, func(b byte) string { - return fmt.Sprintf("%.2X", b) - }), nil + return "%" + fmt.Sprintf("%.2X", ch), nil } } - // Handle all "ordinary" characters which don't need to be encoded. - if !(flag&VisGlob == VisGlob && isglob(ch)) && - ((unicode.IsGraphic(ch) && !unicode.IsSpace(ch)) || - (flag&VisSpace == 0 && ch == ' ') || - (flag&VisTab == 0 && ch == '\t') || - (flag&VisNewline == 0 && ch == '\n') || - (flag&VisSafe == VisSafe && isunsafe(ch))) { - enc := string(ch) + // Figure out if the character doesn't need to be encoded. Effectively, we + // encode most "normal" (graphical) characters as themselves unless we have + // been specifically asked not to. Note though that we *ALWAYS* encode + // everything outside ASCII. + // TODO: Switch this to much more logical code. + + if ch > unicode.MaxASCII { + /* ... */ + } else if flag&VisGlob == VisGlob && isglob(ch) { + /* ... */ + } else if isgraph(ch) || + (flag&VisSpace != VisSpace && ch == ' ') || + (flag&VisTab != VisTab && ch == '\t') || + (flag&VisNewline != VisNewline && ch == '\n') || + (flag&VisSafe != 0 && isunsafe(ch)) { + + encoded := string(ch) if ch == '\\' && flag&VisNoSlash == 0 { - enc += "\\" + encoded += "\\" } - return enc, nil + return encoded, nil } + // Try to use C-style escapes first. if flag&VisCStyle == VisCStyle { switch ch { + case ' ': + return "\\s", nil case '\n': return "\\n", nil case '\r': @@ -102,55 +111,61 @@ func vis(ch rune, flag VisFlag) (string, error) { return "\\t", nil case '\f': return "\\f", nil - case 0: - // TODO: Handle isoctal properly. + case '\x00': + // Output octal just to be safe. return "\\000", nil } } - // TODO: ch & 0177 is not implemented... - if flag&VisOctal == VisOctal || unicode.IsGraphic(ch) { - return mapRuneBytes(ch, func(b byte) string { - return fmt.Sprintf("\\%.3o", b) - }), nil + // For graphical characters we generate octal output (and also if it's + // being forced by the caller's flags). 
Also spaces should always be + // encoded as octal. + if flag&VisOctal == VisOctal || isgraph(ch) || ch&0x7f == ' ' { + // Always output three-character octal just to be safe. + return fmt.Sprintf("\\%.3o", ch), nil } - return mapRuneBytes(ch, func(b byte) string { - enc := "" - if flag&VisNoSlash == 0 { - enc += "\\" - } + // Now we have to output meta or ctrl escapes. As far as I can tell, this + // is not actually defined by any standard -- so this logic is basically + // copied from the original vis(3) implementation. Hopefully nobody + // actually relies on this (octal and hex are better). - // This logic is stolen from cvis, I don't understand any of it. - if b&0200 != 0 { - b &= 0177 - enc += "M" - } - if unicode.IsControl(rune(b)) { - enc += "^" - if b == 0177 { - enc += "?" - } else { - enc += string(b + '@') - } + encoded := "" + if flag&VisNoSlash == 0 { + encoded += "\\" + } + + // Meta characters have 0x80 set, but are otherwise identical to control + // characters. + if b&0x80 != 0 { + b &= 0x7f + encoded += "M" + } + + if unicode.IsControl(rune(b)) { + encoded += "^" + if b == 0x7f { + encoded += "?" } else { - enc += fmt.Sprintf("-%s", b) + encoded += fmt.Sprintf("%c", b+'@') } + } else { + encoded += fmt.Sprintf("-%c", b) + } - return enc - }), nil + return encoded, nil } // Vis encodes the provided string to a BSD-compatible encoding using BSD's // vis() flags. However, it will correctly handle multi-byte encoding (which is // not done properly by BSD's vis implementation). 
func Vis(src string, flag VisFlag) (string, error) { - if !utf8.ValidString(src) { - return "", fmt.Errorf("vis: input string is invalid utf8 literal") + if flag&visMask != flag { + return "", fmt.Errorf("vis: flag %q contains unknown or unsupported flags", flag) } output := "" - for _, ch := range src { + for _, ch := range []byte(src) { encodedCh, err := vis(ch, flag) if err != nil { return "", err diff --git a/vis_test.go b/vis_test.go index fbecacb..e4ef083 100644 --- a/vis_test.go +++ b/vis_test.go @@ -22,18 +22,90 @@ import ( ) func TestVisUnchanged(t *testing.T) { - for _, test := range []string{ - "helloworld", - "THIS_IS_A_TEST1234", - "SomeEncodingsAreCool", - "AC_Raíz_Certicámara_S.A..pem", + for _, test := range []struct { + input string + flag VisFlag + }{ + {"", DefaultVisFlags}, + {"helloworld", DefaultVisFlags}, + {"THIS_IS_A_TEST1234", DefaultVisFlags}, + {"SomeEncodingsAreCool", DefaultVisFlags}, + {"spaces are totally safe", DefaultVisFlags &^ VisSpace}, + {"tabs\tare\talso\tsafe!!", DefaultVisFlags &^ VisTab}, + {"just\a\atrustme\r\b\b!!", DefaultVisFlags | VisSafe}, } { - enc, err := Vis(test, DefaultVisFlags) + enc, err := Vis(test.input, test.flag) if err != nil { t.Errorf("unexpected error with %q: %s", test, err) } - if enc != test { - t.Errorf("expected encoding of %q to be unchanged, got %q", test, enc) + if enc != test.input { + t.Errorf("expected encoding of %q (flag=%q) to be unchanged, got %q", test.input, test.flag, enc) + } + } +} + +func TestVisFlags(t *testing.T) { + for _, test := range []struct { + input string + output string + flag VisFlag + }{ + // Default + {"AC_Ra\u00edz_Certic\u00e1mara_S.A..pem", "AC_Ra\\M-C\\M--z_Certic\\M-C\\M-!mara_S.A..pem", 0}, + {"z^i3i$\u00d3\u008anqgh5/t\u00e5<86>\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\M-C\M^S\M-B\M^Jnqgh5/t\M-C\M-%<86>\M-B\M-2kzla\\e^lv\M-C\M^_\M-B\M^Snv\M-C\M^_\M-B\M-.a|3}\M-C\M^X\M-B\M^H\M-C\M^V\M-B\M^D`, 0}, + 
{"@?e1xs+.R_Kjo]7s8pgRP:*nXCE4{!c", "@?e1xs+.R_Kjo]7s8pgRP:*nXCE4{!c", 0}, + {"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\M-C\M^F\M-C\M^F2\M-B\M-.\M-B\M-7m\M-C\M^[\M-C\M^Cr^\M-B\M-?p\M-C\M^Fu'q\M-C\M-;c2\M-C\M-0u\M-B\M-8\M-C\M^]\M-C\M-(v\M-C\M-?\M-B\M-0\M-C\M^\\M-C\M^B\M-C\M-53\M-C\M^[-k\M-C\M-2sd4\\p\M-C\M^Z\M-B\M-&\M-C\M^S\M-C\M-.a<\M-C\M-&s{\M-B\240p\M-C\M-0\M-C\M-?j\M-C\240\M-C\M-(\M-B\M-8\M-B\M-8\M-B\M-<\M-C\M-\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\303\223\302\212nqgh5/t\303\245<86>\302\262kzla\\e^lv\303\237\302\223nv\303\237\302\256a|3}\303\230\302\210\303\226\302\204`, VisOctal}, + {"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\303\206\303\2062\302\256\302\267m\303\233\303\203r^\302\277p\303\206u'q\303\273c2\303\260u\302\270\303\235\303\250v\303\277\302\260\303\234\303\202\303\2653\303\233-k\303\262sd4\\p\303\232\302\246\303\223\303\256a<\303\246s{\302\240p\303\260\303\277j\303\240\303\250\302\270\302\270\302\274\303\274b`, VisOctal}, + {"\u9003\"9v1)T798|o;fly jnKX\u0489Be=", `\351\200\203"9v1)T798|o;fly jnKX\322\211Be=`, VisOctal}, + // VisCStyle + {"\x00 \f \a \n\v\b \r \t\r", "\\000 \\f \\a \n\\v\\b \\r \t\\r", VisCStyle}, + {"\t \n\v\b", "\\t \n\\v\\b", VisTab | VisCStyle}, + {"\n\v\t ", "\n\\v\t\\s\\s\\s", VisSpace | VisCStyle}, + {"\n \n ", "\\n \\n ", VisNewline | VisCStyle}, + {"z^i3i$\u00d3\u008anqgh5/t\u00e5<86>\u00b2kzla\\e^lv\u00df\u0093nv\u00df\u00aea|3}\u00d8\u0088\u00d6\u0084", `z^i3i$\M-C\M^S\M-B\M^Jnqgh5/t\M-C\M-%<86>\M-B\M-2kzla\\e^lv\M-C\M^_\M-B\M^Snv\M-C\M^_\M-B\M-.a|3}\M-C\M^X\M-B\M^H\M-C\M^V\M-B\M^D`, 
VisCStyle}, + {"62_\u00c6\u00c62\u00ae\u00b7m\u00db\u00c3r^\u00bfp\u00c6u'q\u00fbc2\u00f0u\u00b8\u00dd\u00e8v\u00ff\u00b0\u00dc\u00c2\u00f53\u00db-k\u00f2sd4\\p\u00da\u00a6\u00d3\u00eea<\u00e6s{\u00a0p\u00f0\u00ffj\u00e0\u00e8\u00b8\u00b8\u00bc\u00fcb", `62_\M-C\M^F\M-C\M^F2\M-B\M-.\M-B\M-7m\M-C\M^[\M-C\M^Cr^\M-B\M-?p\M-C\M^Fu'q\M-C\M-;c2\M-C\M-0u\M-B\M-8\M-C\M^]\M-C\M-(v\M-C\M-?\M-B\M-0\M-C\M^\\M-C\M^B\M-C\M-53\M-C\M^[-k\M-C\M-2sd4\\p\M-C\M^Z\M-B\M-&\M-C\M^S\M-C\M-.a<\M-C\M-&s{\M-B\240p\M-C\M-0\M-C\M-?j\M-C\240\M-C\M-(\M-B\M-8\M-B\M-8\M-B\M-<\M-C\M-