// Reconstructed from flags.go and unvis.go as added by the patch
// "unvis: implement partial unvis(3) implementation"
// (commit cd1de45ba58cf1bdf556c8cb4da95176a31e2fac, Aleksa Sarai, 2017-02-12).

/*
 * govis: unicode aware vis(3) encoding implementation
 * Copyright (C) 2017 SUSE LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// VisFlag is a bit set of vis(3)/unvis(3) encoding options. Individual flags
// are combined with bitwise OR.
type VisFlag uint

// vis() has a variety of flags when deciding what encodings to use. While
// mtree only uses one set of flags, implementing them all is necessary in
// order to have compatibility with BSD's vis() and unvis() commands.
const (
	VisOctal     VisFlag = (1 << iota) // VIS_OCTAL: Use octal \ddd format.
	VisCStyle                          // VIS_CSTYLE: Use \[nrft0..] where appropriate.
	VisSpace                           // VIS_SP: Also encode space.
	VisTab                             // VIS_TAB: Also encode tab.
	VisNewline                         // VIS_NL: Also encode newline.
	VisSafe                            // VIS_SAFE: Encode unsafe characters.
	VisNoSlash                         // VIS_NOSLASH: Inhibit printing '\'.
	VisHTTPStyle                       // VIS_HTTPSTYLE: HTTP-style escape %xx.
	VisGlob                            // VIS_GLOB: Encode glob(3) magics.
	unvisEnd                           // UNVIS_END: Internal flag used to indicate end of parsing.

	// VisWhite is the union of the three whitespace-encoding flags.
	VisWhite VisFlag = (VisSpace | VisTab | VisNewline)
)

// unvisParser stores the current state of the token parser: the input split
// into runes, the index of the next unread rune, and the flags that the
// input is assumed to have been encoded with.
type unvisParser struct {
	tokens []rune
	idx    int
	flag   VisFlag
}

// Next moves the index to the next character. It does no bounds checking;
// Peek and End are what report running off the end of the input.
func (p *unvisParser) Next() {
	p.idx++
}

// Peek gets the current token without consuming it. Once every token has
// been consumed it returns utf8.RuneError together with a non-nil error.
func (p *unvisParser) Peek() (rune, error) {
	if p.idx >= len(p.tokens) {
		return utf8.RuneError, fmt.Errorf("tried to read past end of token list")
	}
	return p.tokens[p.idx], nil
}

// End returns whether all of the tokens have been consumed.
func (p *unvisParser) End() bool {
	return p.idx >= len(p.tokens)
}

// newParser creates a parser positioned at the first rune of input.
//
// The input is converted with []rune(input), so any byte sequence in input
// that is not valid UTF-8 is seen by the parser as U+FFFD.
func newParser(input string, flag VisFlag) *unvisParser {
	return &unvisParser{
		tokens: []rune(input),
		idx:    0,
		flag:   flag,
	}
}

// While a recursive descent parser is overkill for parsing simple escape
// codes, this is IMO much easier to read than the ugly 80s coroutine code used
// by the original unvis(3) parser. Here's the EBNF for an unvis sequence, as
// implemented by the functions below:
//
// <input>           ::= (<rune>)*
// <rune>            ::= ("\" <escape-sequence>) | ("%" <escape-hex>) | <plain-rune>
// <plain-rune>      ::= any rune
// <escape-sequence> ::= "\" | ("x" <escape-hex>) | ("M") | ("^") | <escape-cstyle> | <escape-octal>
// <escape-hex>      ::= [0-9a-fA-F] [0-9a-fA-F]
// <escape-cstyle>   ::= "n" | "r" | "b" | "a" | "v" | "t" | "f" | "s" | "E" | newline | "$"
// <escape-octal>    ::= [0-7] ([0-7] ([0-7])?)?
//
// The "%" form is only recognised when VisHTTPStyle is set, and the "M" and
// "^" forms are recognised but rejected as unsupported.

// unvisPlainRune consumes one rune and returns it unchanged, encoded as UTF-8.
func unvisPlainRune(p *unvisParser) (string, error) {
	ch, err := p.Peek()
	if err != nil {
		// Report the parser error itself; ch is only utf8.RuneError here.
		return "", fmt.Errorf("plain rune: %s", err)
	}
	p.Next()
	return string(ch), nil
}

// unvisEscapeCStyle decodes the single character that follows a backslash in
// a C-style escape and consumes it. The "hidden" escapes (backslash-newline
// and \$) decode to the empty string. Unknown characters are an error and are
// left unconsumed.
func unvisEscapeCStyle(p *unvisParser) (string, error) {
	ch, err := p.Peek()
	if err != nil {
		return "", fmt.Errorf("escape cstyle: %s", err)
	}

	output := ""
	switch ch {
	case 'n':
		output = "\n"
	case 'r':
		output = "\r"
	case 'b':
		output = "\b"
	case 'a':
		output = "\x07"
	case 'v':
		output = "\v"
	case 't':
		output = "\t"
	case 'f':
		output = "\f"
	case 's':
		output = " "
	case 'E':
		output = "\x1b"
	case '\n':
		// Hidden newline.
	case '$':
		// Hidden marker.
	default:
		// XXX: We should probably allow falling through and return "\" here...
		return "", fmt.Errorf("escape cstyle: unknown escape character: %q", ch)
	}

	p.Next()
	return output, nil
}

// unvisEscapeHex decodes exactly two hexadecimal digits into one byte and
// returns that byte as a one-byte string. Returning the raw byte (rather than
// the UTF-8 encoding of a code point with the same value) is what allows a
// run of escapes such as \xf0\x9f\x95\xb4 to reassemble into a multi-byte
// UTF-8 character in the final output.
func unvisEscapeHex(p *unvisParser) (string, error) {
	var value byte

	for i := 0; i < 2; i++ {
		ch, err := p.Peek()
		if err != nil {
			return "", fmt.Errorf("escape hex: %s", err)
		}

		digit, err := strconv.ParseInt(string(ch), 16, 32)
		if err != nil {
			return "", fmt.Errorf("escape hex: parse int: %s", err)
		}

		value = (value << 4) | byte(digit)
		p.Next()
	}

	return string([]byte{value}), nil
}

// unvisEscapeOctal decodes between one and three octal digits into one byte
// and returns it as a one-byte string. Parsing stops quietly at the first
// non-octal character (or at end of input) once at least one digit has been
// read; a missing first digit is an error.
//
// Three digits can spell values above 0377; those keep only their low eight
// bits, which is what a C char accumulator does.
func unvisEscapeOctal(p *unvisParser) (string, error) {
	var value byte

	for i := 0; i < 3; i++ {
		ch, err := p.Peek()
		if err != nil {
			if i == 0 {
				return "", fmt.Errorf("escape octal[first]: %s", err)
			}
			break
		}

		digit, err := strconv.ParseInt(string(ch), 8, 32)
		if err != nil {
			if i == 0 {
				return "", fmt.Errorf("escape octal[first]: parse int: %s", err)
			}
			break
		}

		value = (value << 3) | byte(digit)
		p.Next()
	}

	return string([]byte{value}), nil
}

// unvisEscapeSequence decodes whatever follows a backslash (the backslash
// itself has already been consumed by the caller) by dispatching on the next
// character.
func unvisEscapeSequence(p *unvisParser) (string, error) {
	ch, err := p.Peek()
	if err != nil {
		return "", fmt.Errorf("escape sequence: %s", err)
	}

	switch ch {
	case '\\':
		p.Next()
		return "\\", nil

	case '0', '1', '2', '3', '4', '5', '6', '7':
		// The digit is left in place; it is the first digit of the value.
		return unvisEscapeOctal(p)

	case 'x':
		p.Next()
		return unvisEscapeHex(p)

	case 'M', '^':
		// TODO: Meta (\M...) and control (\^...) sequences are not decoded yet.
		return "", fmt.Errorf("escape sequence: unsupported sequence: %q", ch)

	default:
		return unvisEscapeCStyle(p)
	}
}

// unvisRune decodes one unit of input: a backslash escape, a %XX escape (only
// when VisHTTPStyle is set in the parser flags), or a single plain rune.
func unvisRune(p *unvisParser) (string, error) {
	ch, err := p.Peek()
	if err != nil {
		return "", fmt.Errorf("rune: %s", err)
	}

	switch {
	case ch == '\\':
		p.Next()
		return unvisEscapeSequence(p)

	case ch == '%' && p.flag&VisHTTPStyle == VisHTTPStyle:
		// % HEX HEX only applies to HTTPStyle encodings.
		p.Next()
		return unvisEscapeHex(p)

	default:
		return unvisPlainRune(p)
	}
}

// unvis decodes everything that is left in the parser and returns the
// concatenated result. It stops at, and returns, the first decoding error.
func unvis(p *unvisParser) (string, error) {
	// Escapes decode to single bytes that may only form valid UTF-8 once
	// joined, so the output is assembled as bytes. Appending to one buffer
	// also avoids re-copying the whole output for every decoded unit.
	output := make([]byte, 0, len(p.tokens))
	for !p.End() {
		piece, err := unvisRune(p)
		if err != nil {
			return "", fmt.Errorf("input: %s", err)
		}
		output = append(output, piece...)
	}
	return string(output), nil
}

// Unvis takes a string formatted with the given Vis flags (though only the
// VisHTTPStyle flag is checked) and outputs the un-encoded version of the
// encoded string. An error is returned if any escape sequences in the input
// string were invalid, in which case the returned string is empty.
func Unvis(input string, flag VisFlag) (string, error) {
	// TODO: Check all of the VisFlag bits.
	// unvis only returns without error once the parser has consumed all of
	// its input, so no separate trailing-input check is needed afterwards.
	output, err := unvis(newParser(input, flag))
	if err != nil {
		return "", fmt.Errorf("unvis: %s", err)
	}
	return output, nil
}
+ */ + +package govis + +import ( + "testing" +) + +func TestUnvisOctalEscape(t *testing.T) { + for _, test := range []struct { + input string + expected string + }{ + {"", ""}, + {"\\1", "\001"}, + {"\\01\\02\\3", "\001\002\003"}, + {"\\001\\023\\32", "\001\023\032"}, + {"this is a test\\0k1\\133", "this is a test\000k1\133"}, + {"\\170YET\\01another test\\1\\\\82", "\170YET\001another test\001\\82"}, + {"\\177MORE tests\\09a", "\177MORE tests\x009a"}, + {"\\\\710more\\1215testing", "\\710more\1215testing"}, + } { + got, err := Unvis(test.input, DefaultVisFlags) + if err != nil { + t.Errorf("unexpected error doing unvis(%q): %q", test.input, err) + continue + } + if got != test.expected { + t.Errorf("expected unvis(%q) = %q, got %q", test.input, test.expected, got) + } + } +} + +func TestUnvisHexEscape(t *testing.T) { + for _, test := range []struct { + input string + expected string + }{ + {"", ""}, + {"\\x01", "\x01"}, + {"\\x01\\x02\\x7a", "\x01\x02\x7a"}, + {"this is a test\\x13\\x52\\x6f", "this is a test\x13\x52\x6f"}, + {"\\x170YET\\x01a\\x22nother test\\x11", "\x170YET\x01a\x22nother test\x11"}, + {"\\\\x007more\\\\x215testing", "\\x007more\\x215testing"}, + // Make sure that decoding unicode works properly. + {"\\xf0\\x9f\\x95\\xb4", "\U0001f574"}, + } { + got, err := Unvis(test.input, DefaultVisFlags) + if err != nil { + t.Errorf("unexpected error doing unvis(%q): %q", test.input, err) + continue + } + if got != test.expected { + t.Errorf("expected unvis(%q) = %q, got %q", test.input, test.expected, got) + } + } +} + +func TestUnvisUnicode(t *testing.T) { + // Ensure that unicode strings are not messed up by Unvis. 
+ for _, test := range []string{ + "", + "this.is.a.normal_string", + "AC_Raíz_Certicámara_S.A..pem", + "NetLock_Arany_=Class_Gold=_Főtanúsítvány.pem", + "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem", + } { + got, err := Unvis(test, DefaultVisFlags) + if err != nil { + t.Errorf("unexpected error doing unvis(%q): %s", test, err) + continue + } + if got != test { + t.Errorf("expected %q to be unchanged, got %q", test, got) + } + } +}