unvis: add partial unvis(3) implementation

Also add some unit tests -- one of which currently fails due to ongoing design discussion about how certain escape codes should be handled. Signed-off-by: Aleksa Sarai <asarai@suse.de>
2025-10-04 04:31:00 +00:00 · 2017-02-12 04:06:20 +11:00 · 2017-02-12 04:06:20 +11:00 · cd1de45ba5
commit cd1de45ba5
parent 1e8de82690
3 changed files with 385 additions and 0 deletions
--- a/flags.go
+++ b/flags.go
@ -0,0 +1,38 @@
+/*
+ * govis: unicode aware vis(3) encoding implementation
+ * Copyright (C) 2017 SUSE LLC.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package govis
+
+// VisFlag is a bit mask of vis(3)-style encoding flags. Each named flag below
+// occupies its own bit, so several flags can be combined with bitwise OR.
+type VisFlag uint
+
+// vis() has a variety of flags when deciding what encodings to use. While
+// mtree only uses one set of flags, implementing them all is necessary in
+// order to have compatibility with BSD's vis() and unvis() commands.
+const (
+	VisOctal     VisFlag = (1 << iota) // VIS_OCTAL: Use octal \ddd format.
+	VisCStyle                          // VIS_CSTYLE: Use \[nrft0..] where appropriate.
+	VisSpace                           // VIS_SP: Also encode space.
+	VisTab                             // VIS_TAB: Also encode tab.
+	VisNewline                         // VIS_NL: Also encode newline.
+	VisSafe                            // VIS_SAFE: Encode unsafe characters.
+	VisNoSlash                         // VIS_NOSLASH: Inhibit printing '\'.
+	VisHTTPStyle                       // VIS_HTTPSTYLE: HTTP-style escape %xx.
+	VisGlob                            // VIS_GLOB: Encode glob(3) magics.
+	unvisEnd                           // UNVIS_END: Internal flag used to indicate end of parsing.
+
+	// VisWhite is shorthand for the combination of VisSpace, VisTab and
+	// VisNewline.
+	VisWhite VisFlag = (VisSpace | VisTab | VisNewline)
+)
--- a/unvis.go
+++ b/unvis.go
@ -0,0 +1,255 @@
+/*
+ * govis: unicode aware vis(3) encoding implementation
+ * Copyright (C) 2017 SUSE LLC.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package govis
+
+import (
+	"fmt"
+	"strconv"
+	"unicode/utf8"
+)
+
+// unvisParser is the cursor over the input being decoded: the input split
+// into runes, the position of the next unread rune, and the flags the caller
+// asked for.
+type unvisParser struct {
+	tokens []rune  // input, one element per rune
+	idx    int     // index into tokens of the current (unread) token
+	flag   VisFlag // flags passed to newParser
+}
+
+// Next advances the cursor by one token. It performs no bounds checking.
+func (p *unvisParser) Next() {
+	p.idx++
+}
+
+// Peek returns the token under the cursor without consuming it. Once the
+// cursor has moved past the last token it returns utf8.RuneError and a
+// non-nil error instead.
+func (p *unvisParser) Peek() (rune, error) {
+	if p.End() {
+		return utf8.RuneError, fmt.Errorf("tried to read past end of token list")
+	}
+	return p.tokens[p.idx], nil
+}
+
+// End reports whether every token has been consumed.
+func (p *unvisParser) End() bool {
+	return len(p.tokens) <= p.idx
+}
+
+// newParser builds a parser positioned at the first rune of input.
+func newParser(input string, flag VisFlag) *unvisParser {
+	parser := new(unvisParser)
+	parser.tokens = []rune(input)
+	parser.flag = flag
+	return parser
+}
+
+// While a recursive descent parser is overkill for parsing simple escape
+// codes, this is IMO much easier to read than the ugly 80s coroutine code used
+// by the original unvis(3) parser. Here's the EBNF for an unvis sequence
+// (the "%" form is only recognised when VisHTTPStyle is set, and the "M" and
+// "^" sequences are recognised but not yet implemented):
+//
+// <input>           ::= (<rune>)*
+// <rune>            ::= ("\" <escape-sequence>) | ("%" <escape-hex>) | <plain-rune>
+// <plain-rune>      ::= any rune
+// <escape-sequence> ::= ("x" <escape-hex>) | ("M") | ("^") | <escape-cstyle> | <escape-octal>
+// <escape-hex>      ::= [0-9a-fA-F] [0-9a-fA-F]
+// <escape-cstyle>   ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f" | "s" | "E" | newline | "$"
+// <escape-octal>    ::= [0-7] ([0-7] ([0-7])?)?
+
+// unvisPlainRune consumes the current token and returns it unchanged as a
+// string. If no tokens remain it returns an error and does not advance the
+// parser.
+func unvisPlainRune(p *unvisParser) (string, error) {
+	ch, err := p.Peek()
+	if err != nil {
+		// Report the Peek failure itself rather than the placeholder rune,
+		// which carries no information about what went wrong.
+		return "", fmt.Errorf("plain rune: %s", err)
+	}
+	p.Next()
+	return string(ch), nil
+}
+
+// unvisEscapeCStyle decodes the single-character escape under the cursor (the
+// leading backslash has already been consumed by the caller) and advances
+// past it. The "hidden" escapes -- backslash-newline and \$ -- decode to the
+// empty string. An unrecognised character yields an error and leaves the
+// cursor where it was.
+func unvisEscapeCStyle(p *unvisParser) (string, error) {
+	ch, err := p.Peek()
+	if err != nil {
+		return "", fmt.Errorf("escape cstyle: %s", err)
+	}
+
+	output := ""
+	switch ch {
+	case 'n':
+		output = "\n"
+	case 'r':
+		output = "\r"
+	case 'b':
+		output = "\b"
+	case 'a':
+		output = "\x07"
+	case 'v':
+		output = "\v"
+	case 't':
+		output = "\t"
+	case 'f':
+		output = "\f"
+	case 's':
+		output = " "
+	case 'E':
+		output = "\x1b"
+	case '\n':
+		// Hidden newline.
+	case '$':
+		// Hidden marker.
+	default:
+		// XXX: We should probably allow falling through and return "\" here...
+		return "", fmt.Errorf("escape cstyle: unknown escape character: %q", ch)
+	}
+
+	p.Next()
+	return output, nil
+}
+
+// unvisEscapeHex decodes exactly two hexadecimal digits starting at the
+// cursor (the "\x" or "%" introducer has already been consumed) and returns
+// the single byte they describe as a one-byte string. Upper- and lower-case
+// digits are both accepted. An error is returned if the input ends early or a
+// token is not a hex digit; digits read before the failure stay consumed.
+func unvisEscapeHex(p *unvisParser) (string, error) {
+	var value byte
+
+	for i := 0; i < 2; i++ {
+		ch, err := p.Peek()
+		if err != nil {
+			return "", fmt.Errorf("escape hex: %s", err)
+		}
+
+		digit, err := strconv.ParseInt(string(ch), 16, 32)
+		if err != nil {
+			return "", fmt.Errorf("escape hex: parse int: %s", err)
+		}
+
+		value = (value << 4) | byte(digit)
+		p.Next()
+	}
+
+	// Emit the raw byte rather than converting through a rune: string(rune)
+	// would UTF-8 encode values >= 0x80 as code points, corrupting escaped
+	// multi-byte sequences such as "\xf0\x9f\x95\xb4".
+	return string([]byte{value}), nil
+}
+
+// unvisEscapeOctal decodes between one and three octal digits starting at the
+// cursor (the leading backslash has already been consumed) and returns the
+// byte they describe as a one-byte string. Decoding stops at the first token
+// that is not an octal digit, which is left unconsumed. An error is returned
+// if there is no leading octal digit at all, or if the digits describe a value
+// that does not fit in a single byte.
+func unvisEscapeOctal(p *unvisParser) (string, error) {
+	var value uint
+
+	for i := 0; i < 3; i++ {
+		ch, err := p.Peek()
+		if err != nil {
+			// Running out of input is only a problem before the first digit.
+			if i == 0 {
+				return "", fmt.Errorf("escape octal[first]: %s", err)
+			}
+			break
+		}
+
+		digit, err := strconv.ParseInt(string(ch), 8, 32)
+		if err != nil {
+			// A non-octal token merely terminates the escape, unless it is
+			// the very first one.
+			if i == 0 {
+				return "", fmt.Errorf("escape octal[first]: parse int: %s", err)
+			}
+			break
+		}
+
+		value = (value << 3) | uint(digit)
+		p.Next()
+	}
+
+	if value > 0xff {
+		return "", fmt.Errorf("escape octal: value %#o does not fit in a byte", value)
+	}
+
+	// Emit the raw byte rather than converting through a rune: string(rune)
+	// would UTF-8 encode values >= 0x80 as code points, corrupting escaped
+	// multi-byte sequences.
+	return string([]byte{byte(value)}), nil
+}
+
+// unvisEscapeSequence decodes whatever follows a backslash (which the caller
+// has already consumed) by dispatching on the token under the cursor: a
+// second backslash is a literal backslash, an octal digit starts an octal
+// escape, "x" starts a hex escape, and anything else is tried as a C-style
+// escape. The "M" and "^" forms are recognised but rejected as unsupported.
+func unvisEscapeSequence(p *unvisParser) (string, error) {
+	ch, err := p.Peek()
+	if err != nil {
+		return "", fmt.Errorf("escape sequence: %s", err)
+	}
+
+	switch {
+	case ch == '\\':
+		p.Next()
+		return "\\", nil
+
+	case '0' <= ch && ch <= '7':
+		// The digit itself is part of the escape, so it is not consumed here.
+		return unvisEscapeOctal(p)
+
+	case ch == 'x':
+		p.Next()
+		return unvisEscapeHex(p)
+
+	case ch == 'M' || ch == '^':
+		// TODO: neither form is implemented yet.
+		return "", fmt.Errorf("escape sequence: unsupported sequence: %q", ch)
+	}
+
+	return unvisEscapeCStyle(p)
+}
+
+// unvisRune decodes one unit of input starting at the cursor: a backslash
+// escape, a "%" hex escape (only when the parser's flags include
+// VisHTTPStyle), or otherwise a single plain rune.
+func unvisRune(p *unvisParser) (string, error) {
+	ch, err := p.Peek()
+	if err != nil {
+		return "", fmt.Errorf("rune: %s", err)
+	}
+
+	if ch == '\\' {
+		p.Next()
+		return unvisEscapeSequence(p)
+	}
+
+	// % HEX HEX only applies to HTTPStyle encodings; without the flag a "%" is
+	// just another plain rune.
+	if ch == '%' && p.flag&VisHTTPStyle != 0 {
+		p.Next()
+		return unvisEscapeHex(p)
+	}
+
+	return unvisPlainRune(p)
+}
+
+func unvis(p *unvisParser) (string, error) {
+	output := ""
+	for !p.End() {
+		ch, err := unvisRune(p)
+		if err != nil {
+			return "", fmt.Errorf("input: %s", err)
+		}
+		output += ch
+	}
+	return output, nil
+}
+
+// Unvis decodes a string that was encoded with the given Vis flags and
+// returns the un-encoded version. Of the flags, only VisHTTPStyle is
+// currently consulted. An error is returned if any escape sequence in the
+// input is invalid.
+func Unvis(input string, flag VisFlag) (string, error) {
+	// TODO: Check all of the VisFlag bits.
+	parser := newParser(input, flag)
+
+	decoded, err := unvis(parser)
+	switch {
+	case err != nil:
+		return "", fmt.Errorf("unvis: %s", err)
+	case !parser.End():
+		return "", fmt.Errorf("unvis: trailing characters at end of input")
+	}
+	return decoded, nil
+}
--- a/unvis_test.go
+++ b/unvis_test.go
@ -0,0 +1,92 @@
+/*
+ * govis: unicode aware vis(3) encoding implementation
+ * Copyright (C) 2017 SUSE LLC.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package govis
+
+import (
+	"testing"
+)
+
+// TestUnvisOctalEscape checks that \d, \dd and \ddd octal escapes decode to
+// the expected bytes, including when they are followed by non-octal
+// characters or mixed with escaped backslashes.
+func TestUnvisOctalEscape(t *testing.T) {
+	cases := []struct {
+		input    string
+		expected string
+	}{
+		{"", ""},
+		{"\\1", "\001"},
+		{"\\01\\02\\3", "\001\002\003"},
+		{"\\001\\023\\32", "\001\023\032"},
+		{"this is a test\\0k1\\133", "this is a test\000k1\133"},
+		{"\\170YET\\01another test\\1\\\\82", "\170YET\001another test\001\\82"},
+		{"\\177MORE tests\\09a", "\177MORE tests\x009a"},
+		{"\\\\710more\\1215testing", "\\710more\1215testing"},
+	}
+
+	for _, tc := range cases {
+		got, err := Unvis(tc.input, DefaultVisFlags)
+		if err != nil {
+			t.Errorf("unexpected error doing unvis(%q): %q", tc.input, err)
+			continue
+		}
+		if got != tc.expected {
+			t.Errorf("expected unvis(%q) = %q, got %q", tc.input, tc.expected, got)
+		}
+	}
+}
+
+// TestUnvisHexEscape checks that \xHH escapes decode to the expected bytes,
+// that escaped backslashes suppress hex decoding, and that a UTF-8 sequence
+// spelled out as consecutive hex escapes decodes to the intended character.
+func TestUnvisHexEscape(t *testing.T) {
+	cases := []struct {
+		input    string
+		expected string
+	}{
+		{"", ""},
+		{"\\x01", "\x01"},
+		{"\\x01\\x02\\x7a", "\x01\x02\x7a"},
+		{"this is a test\\x13\\x52\\x6f", "this is a test\x13\x52\x6f"},
+		{"\\x170YET\\x01a\\x22nother test\\x11", "\x170YET\x01a\x22nother test\x11"},
+		{"\\\\x007more\\\\x215testing", "\\x007more\\x215testing"},
+		// Make sure that decoding unicode works properly.
+		{"\\xf0\\x9f\\x95\\xb4", "\U0001f574"},
+	}
+
+	for _, tc := range cases {
+		got, err := Unvis(tc.input, DefaultVisFlags)
+		if err != nil {
+			t.Errorf("unexpected error doing unvis(%q): %q", tc.input, err)
+			continue
+		}
+		if got != tc.expected {
+			t.Errorf("expected unvis(%q) = %q, got %q", tc.input, tc.expected, got)
+		}
+	}
+}
+
+// TestUnvisUnicode ensures that strings containing no escape sequences,
+// including ones with non-ASCII characters, pass through Unvis unchanged.
+func TestUnvisUnicode(t *testing.T) {
+	inputs := []string{
+		"",
+		"this.is.a.normal_string",
+		"AC_Raíz_Certicámara_S.A..pem",
+		"NetLock_Arany_=Class_Gold=_Főtanúsítvány.pem",
+		"TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem",
+	}
+
+	for _, input := range inputs {
+		got, err := Unvis(input, DefaultVisFlags)
+		if err != nil {
+			t.Errorf("unexpected error doing unvis(%q): %s", input, err)
+			continue
+		}
+		if got != input {
+			t.Errorf("expected %q to be unchanged, got %q", input, got)
+		}
+	}
+}