From 44391840b6581322304d03c9ac898601a6bd2c5a Mon Sep 17 00:00:00 2001
From: Aleksa Sarai
Date: Sun, 12 Feb 2017 04:07:35 +1100
Subject: [PATCH] vis: partial vis(3) port

This is a stopgap while I figure out how I should go about implementing
vis(3). It's also important to have some vis(3) implementation so I can
do integration tests on round-trips.

Signed-off-by: Aleksa Sarai
---
 vis.go      | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 vis_test.go |  76 ++++++++++++++++++++++
 2 files changed, 255 insertions(+)
 create mode 100644 vis.go
 create mode 100644 vis_test.go

diff --git a/vis.go b/vis.go
new file mode 100644
index 0000000..d599ecb
--- /dev/null
+++ b/vis.go
@@ -0,0 +1,179 @@
+/*
+ * govis: unicode aware vis(3) encoding implementation
+ * Copyright (C) 2017 SUSE LLC.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package govis
+
+import (
+	"bytes"
+	"fmt"
+	"unicode"
+	"unicode/utf8"
+)
+
+// isunsafe reports whether ch is one of the characters (backspace, bell and
+// carriage return) that are only passed through unencoded when VisSafe is set.
+func isunsafe(ch rune) bool {
+	return ch == '\b' || ch == '\007' || ch == '\r'
+}
+
+// isglob reports whether ch is a character that must be encoded when VisGlob
+// is set.
+func isglob(ch rune) bool {
+	return ch == '*' || ch == '?' || ch == '[' || ch == '#'
+}
+
+// ishttp reports whether ch may be left unencoded in VisHTTPStyle mode.
+func ishttp(ch rune) bool {
+	return unicode.IsDigit(ch) || unicode.IsLetter(ch) ||
+		// Safe characters.
+		ch == '$' || ch == '-' || ch == '_' || ch == '.' || ch == '+' ||
+		// Extra characters.
+		ch == '!' || ch == '*' || ch == '\'' || ch == '(' ||
+		ch == ')' || ch == ','
+}
+
+// mapRuneBytes applies fn to each byte of the UTF-8 encoding of ch and returns
+// the concatenation of the results.
+func mapRuneBytes(ch rune, fn func(byte) string) string {
+	// utf8.UTFMax bytes always suffice, and unlike sizing the buffer with
+	// utf8.RuneLen (which returns -1 for invalid runes) this cannot panic.
+	var buf [utf8.UTFMax]byte
+	n := utf8.EncodeRune(buf[:], ch)
+
+	mapped := ""
+	for _, b := range buf[:n] {
+		mapped += fn(b)
+	}
+	return mapped
+}
+
+// vis converts a single rune into its encoding, ensuring that it is "safe"
+// (for some definition of safe). Note that some visual characters (such as
+// accented characters or similar things) can be made up of several runes -- in
+// order to maintain my sanity Vis() makes no attempt to handle such cases
+// specially.
+func vis(ch rune, flag VisFlag) (string, error) {
+	// XXX: Currently we are just allowing regular multi-byte characters such
+	//      as accents and so on to be passed through without encoding. Is this
+	//      really the best idea? In order to maintain compatibility with
+	//      vis(3) such that an older unvis(3) will do the right thing maybe we
+	//      should only output 7-bit ASCII? I'm not sure.
+
+	if flag&VisHTTPStyle == VisHTTPStyle {
+		// This is described in RFC 1808: every byte of a character that is
+		// not URL-safe is written as '%' followed by two hex digits.
+		if !ishttp(ch) {
+			return mapRuneBytes(ch, func(b byte) string {
+				return fmt.Sprintf("%%%.2X", b)
+			}), nil
+		}
+	}
+
+	// Handle all "ordinary" characters which don't need to be encoded.
+	if !(flag&VisGlob == VisGlob && isglob(ch)) &&
+		((unicode.IsGraphic(ch) && !unicode.IsSpace(ch)) ||
+			(flag&VisSpace == 0 && ch == ' ') ||
+			(flag&VisTab == 0 && ch == '\t') ||
+			(flag&VisNewline == 0 && ch == '\n') ||
+			(flag&VisSafe == VisSafe && isunsafe(ch))) {
+		enc := string(ch)
+		if ch == '\\' && flag&VisNoSlash == 0 {
+			enc += "\\"
+		}
+		return enc, nil
+	}
+
+	if flag&VisCStyle == VisCStyle {
+		switch ch {
+		case '\n':
+			return "\\n", nil
+		case '\r':
+			return "\\r", nil
+		case '\b':
+			return "\\b", nil
+		case '\a':
+			return "\\a", nil
+		case '\v':
+			return "\\v", nil
+		case '\t':
+			return "\\t", nil
+		case '\f':
+			return "\\f", nil
+		case 0:
+			// TODO: Handle isoctal properly.
+			return "\\000", nil
+		}
+	}
+
+	// TODO: ch & 0177 is not implemented...
+	if flag&VisOctal == VisOctal || unicode.IsGraphic(ch) {
+		return mapRuneBytes(ch, func(b byte) string {
+			return fmt.Sprintf("\\%.3o", b)
+		}), nil
+	}
+
+	return mapRuneBytes(ch, func(b byte) string {
+		enc := ""
+		if flag&VisNoSlash == 0 {
+			enc += "\\"
+		}
+
+		// This logic mirrors cvis: a byte with the high bit set is prefixed
+		// with 'M' and then treated as its low 7 bits. Control bytes are
+		// written as '^' plus the byte offset by '@' (or "^?" for DEL), and
+		// anything else as '-' plus the byte itself.
+		if b&0200 != 0 {
+			b &= 0177
+			enc += "M"
+		}
+		if unicode.IsControl(rune(b)) {
+			enc += "^"
+			if b == 0177 {
+				enc += "?"
+			} else {
+				enc += string(rune(b + '@'))
+			}
+		} else {
+			// b is a plain byte here, so convert it to a one-character string;
+			// the %s verb does not accept integers.
+			enc += "-" + string(rune(b))
+		}
+
+		return enc
+	}), nil
+}
+
+// Vis encodes the provided string to a BSD-compatible encoding using BSD's
+// vis() flags. However, it will correctly handle multi-byte encoding (which is
+// not done properly by BSD's vis implementation). An error is returned if src
+// is not valid UTF-8.
+func Vis(src string, flag VisFlag) (string, error) {
+	if !utf8.ValidString(src) {
+		return "", fmt.Errorf("vis: input string is invalid utf8 literal")
+	}
+
+	var output bytes.Buffer
+	for _, ch := range src {
+		encodedCh, err := vis(ch, flag)
+		if err != nil {
+			return "", err
+		}
+		output.WriteString(encodedCh)
+	}
+
+	return output.String(), nil
+}
diff --git a/vis_test.go b/vis_test.go
new file mode 100644
index 0000000..fbecacb
--- /dev/null
+++ b/vis_test.go
@@ -0,0 +1,76 @@
+/*
+ * govis: unicode aware vis(3) encoding implementation
+ * Copyright (C) 2017 SUSE LLC.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package govis
+
+import (
+	"testing"
+)
+
+func TestVisUnchanged(t *testing.T) {
+	for _, test := range []string{
+		"helloworld",
+		"THIS_IS_A_TEST1234",
+		"SomeEncodingsAreCool",
+		"AC_Raíz_Certicámara_S.A..pem",
+	} {
+		enc, err := Vis(test, DefaultVisFlags)
+		if err != nil {
+			t.Errorf("unexpected error with %q: %s", test, err)
+		}
+		if enc != test {
+			t.Errorf("expected encoding of %q to be unchanged, got %q", test, enc)
+		}
+	}
+}
+
+func TestVisChanged(t *testing.T) {
+	for _, test := range []string{
+		"hello world",
+		"THIS\\IS_A_TEST1234",
+	} {
+		enc, err := Vis(test, DefaultVisFlags)
+		if err != nil {
+			t.Errorf("unexpected error with %q: %s", test, err)
+		}
+		if enc == test {
+			t.Errorf("expected encoding of %q to be changed", test)
+		}
+	}
+}
+
+// TestVisEscapes pins the exact output of the HTTP-style and meta encodings.
+func TestVisEscapes(t *testing.T) {
+	for _, test := range []struct {
+		input    string
+		flag     VisFlag
+		expected string
+	}{
+		// Bytes that are not URL-safe are percent-encoded.
+		{"hello world", VisHTTPStyle, "hello%20world"},
+		// U+0080 is the bytes 0xC2 0x80, neither of which is printable.
+		{"\u0080", 0, "\\M-B\\M^@"},
+	} {
+		enc, err := Vis(test.input, test.flag)
+		if err != nil {
+			t.Errorf("unexpected error with %q: %s", test.input, err)
+		}
+		if enc != test.expected {
+			t.Errorf("expected encoding of %q to be %q, got %q", test.input, test.expected, enc)
+		}
+	}
+}