unvis: add partial unvis(3) implementation
Also add some unit tests -- one of which currently fails due to ongoing design discussion about how certain escape codes should be handled. Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
parent
1e8de82690
commit
cd1de45ba5
3 changed files with 385 additions and 0 deletions
38
flags.go
Normal file
38
flags.go
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
/*
|
||||||
|
* govis: unicode aware vis(3) encoding implementation
|
||||||
|
* Copyright (C) 2017 SUSE LLC.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package govis
|
||||||
|
|
||||||
|
// VisFlag is a bitmask of vis(3) flags selecting which encodings are used.
type VisFlag uint

// The complete set of vis(3) flags is declared here. mtree itself only uses
// one combination of them, but all of them are needed in order to stay
// compatible with BSD's vis() and unvis() commands.
const (
	// VisOctal is VIS_OCTAL: use the octal \ddd format.
	VisOctal VisFlag = 1 << iota
	// VisCStyle is VIS_CSTYLE: use \[nrft0..] where appropriate.
	VisCStyle
	// VisSpace is VIS_SP: also encode space.
	VisSpace
	// VisTab is VIS_TAB: also encode tab.
	VisTab
	// VisNewline is VIS_NL: also encode newline.
	VisNewline
	// VisSafe is VIS_SAFE: encode unsafe characters.
	VisSafe
	// VisNoSlash is VIS_NOSLASH: inhibit printing '\'.
	VisNoSlash
	// VisHTTPStyle is VIS_HTTPSTYLE: HTTP-style escape %xx.
	VisHTTPStyle
	// VisGlob is VIS_GLOB: encode glob(3) magics.
	VisGlob
	// unvisEnd is UNVIS_END: internal flag used to indicate end of parsing.
	unvisEnd
)

// VisWhite is the union of the whitespace flags (space, tab and newline).
const VisWhite VisFlag = VisSpace | VisTab | VisNewline
|
255
unvis.go
Normal file
255
unvis.go
Normal file
|
@ -0,0 +1,255 @@
|
||||||
|
/*
|
||||||
|
* govis: unicode aware vis(3) encoding implementation
|
||||||
|
* Copyright (C) 2017 SUSE LLC.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package govis
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
// unvisParser stores the current state of the token parser.
|
||||||
|
type unvisParser struct {
|
||||||
|
tokens []rune
|
||||||
|
idx int
|
||||||
|
flag VisFlag
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next moves the index to the next character.
|
||||||
|
func (p *unvisParser) Next() {
|
||||||
|
p.idx++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Peek gets the current token.
|
||||||
|
func (p *unvisParser) Peek() (rune, error) {
|
||||||
|
if p.idx >= len(p.tokens) {
|
||||||
|
return utf8.RuneError, fmt.Errorf("tried to read past end of token list")
|
||||||
|
}
|
||||||
|
return p.tokens[p.idx], nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// End returns whether all of the tokens have been consumed.
|
||||||
|
func (p *unvisParser) End() bool {
|
||||||
|
return p.idx >= len(p.tokens)
|
||||||
|
}
|
||||||
|
|
||||||
|
func newParser(input string, flag VisFlag) *unvisParser {
|
||||||
|
return &unvisParser{
|
||||||
|
tokens: []rune(input),
|
||||||
|
idx: 0,
|
||||||
|
flag: flag,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// While a recursive descent parser is overkill for parsing simple escape
|
||||||
|
// codes, this is IMO much easier to read than the ugly 80s coroutine code used
|
||||||
|
// by the original unvis(3) parser. Here's the EBNF for an unvis sequence:
|
||||||
|
//
|
||||||
|
// <input> ::= (<rune>)*
|
||||||
|
// <rune> ::= ("\" <escape-sequence>) | ("%" <escape-hex>) | <plain-rune>
|
||||||
|
// <plain-rune> ::= any rune
|
||||||
|
// <escape-sequence> ::= ("x" <escape-hex>) | ("M") | ("^") | <escape-cstyle> | <escape-octal>
|
||||||
|
// <escape-hex> ::= [0-9a-fA-F] [0-9a-fA-F]
|
||||||
|
// <escape-cstyle> ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f" | "s" | "E" | "\n" | "$"
|
||||||
|
// <escape-octal> ::= [0-7] ([0-7] ([0-7])?)?
|
||||||
|
|
||||||
|
func unvisPlainRune(p *unvisParser) (string, error) {
|
||||||
|
ch, err := p.Peek()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("plain rune: %s", ch)
|
||||||
|
}
|
||||||
|
p.Next()
|
||||||
|
return string(ch), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func unvisEscapeCStyle(p *unvisParser) (string, error) {
|
||||||
|
ch, err := p.Peek()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("escape hex: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
output := ""
|
||||||
|
switch ch {
|
||||||
|
case 'n':
|
||||||
|
output = "\n"
|
||||||
|
case 'r':
|
||||||
|
output = "\r"
|
||||||
|
case 'b':
|
||||||
|
output = "\b"
|
||||||
|
case 'a':
|
||||||
|
output = "\x07"
|
||||||
|
case 'v':
|
||||||
|
output = "\v"
|
||||||
|
case 't':
|
||||||
|
output = "\t"
|
||||||
|
case 'f':
|
||||||
|
output = "\f"
|
||||||
|
case 's':
|
||||||
|
output = " "
|
||||||
|
case 'E':
|
||||||
|
output = "\x1b"
|
||||||
|
case '\n':
|
||||||
|
// Hidden newline.
|
||||||
|
case '$':
|
||||||
|
// Hidden marker.
|
||||||
|
default:
|
||||||
|
// XXX: We should probably allow falling through and return "\" here...
|
||||||
|
return "", fmt.Errorf("escape cstyle: unknown escape character: %q", ch)
|
||||||
|
}
|
||||||
|
|
||||||
|
p.Next()
|
||||||
|
return output, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func unvisEscapeHex(p *unvisParser) (string, error) {
|
||||||
|
var output rune
|
||||||
|
|
||||||
|
for i := 0; i < 2; i++ {
|
||||||
|
ch, err := p.Peek()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("escape hex: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
digit, err := strconv.ParseInt(string(ch), 16, 32)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("escape hex: parse int: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
output = (output << 4) | rune(digit)
|
||||||
|
p.Next()
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: We need to handle runes properly to output byte strings again. In
|
||||||
|
// particular, if rune has 0xf0 set then we know that we're currently
|
||||||
|
// decoding a messed up string.
|
||||||
|
return string(output), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func unvisEscapeOctal(p *unvisParser) (string, error) {
|
||||||
|
var output rune
|
||||||
|
var err error
|
||||||
|
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
ch, err := p.Peek()
|
||||||
|
if err != nil {
|
||||||
|
if i == 0 {
|
||||||
|
err = fmt.Errorf("escape octal[first]: %s", err)
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
digit, err := strconv.ParseInt(string(ch), 8, 32)
|
||||||
|
if err != nil {
|
||||||
|
if i == 0 {
|
||||||
|
err = fmt.Errorf("escape octal[first]: parse int: %s", err)
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
output = (output << 3) | rune(digit)
|
||||||
|
p.Next()
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: We need to handle runes properly to output byte strings again. In
|
||||||
|
// particular, if rune has 0xf0 set then we know that we're currently
|
||||||
|
// decoding a messed up string.
|
||||||
|
return string(output), err
|
||||||
|
}
|
||||||
|
|
||||||
|
func unvisEscapeSequence(p *unvisParser) (string, error) {
|
||||||
|
ch, err := p.Peek()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("escape sequence: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
switch ch {
|
||||||
|
case '\\':
|
||||||
|
p.Next()
|
||||||
|
return "\\", nil
|
||||||
|
|
||||||
|
case '0', '1', '2', '3', '4', '5', '6', '7':
|
||||||
|
return unvisEscapeOctal(p)
|
||||||
|
|
||||||
|
case 'x':
|
||||||
|
p.Next()
|
||||||
|
return unvisEscapeHex(p)
|
||||||
|
|
||||||
|
case 'M':
|
||||||
|
// TODO
|
||||||
|
case '^':
|
||||||
|
// TODO
|
||||||
|
|
||||||
|
default:
|
||||||
|
return unvisEscapeCStyle(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
return "", fmt.Errorf("escape sequence: unsupported sequence: %q", ch)
|
||||||
|
}
|
||||||
|
|
||||||
|
func unvisRune(p *unvisParser) (string, error) {
|
||||||
|
ch, err := p.Peek()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("rune: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
switch ch {
|
||||||
|
case '\\':
|
||||||
|
p.Next()
|
||||||
|
return unvisEscapeSequence(p)
|
||||||
|
|
||||||
|
case '%':
|
||||||
|
// % HEX HEX only applies to HTTPStyle encodings.
|
||||||
|
if p.flag&VisHTTPStyle == VisHTTPStyle {
|
||||||
|
p.Next()
|
||||||
|
return unvisEscapeHex(p)
|
||||||
|
}
|
||||||
|
fallthrough
|
||||||
|
|
||||||
|
default:
|
||||||
|
return unvisPlainRune(p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func unvis(p *unvisParser) (string, error) {
|
||||||
|
output := ""
|
||||||
|
for !p.End() {
|
||||||
|
ch, err := unvisRune(p)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("input: %s", err)
|
||||||
|
}
|
||||||
|
output += ch
|
||||||
|
}
|
||||||
|
return output, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unvis takes a string formatted with the given Vis flags (though only the
|
||||||
|
// VisHTTPStyle flag is checked) and output the un-encoded version of the
|
||||||
|
// encoded string. An error is returned if any escape sequences in the input
|
||||||
|
// string were invalid.
|
||||||
|
func Unvis(input string, flag VisFlag) (string, error) {
|
||||||
|
// TODO: Check all of the VisFlag bits.
|
||||||
|
p := newParser(input, flag)
|
||||||
|
output, err := unvis(p)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("unvis: %s", err)
|
||||||
|
}
|
||||||
|
if !p.End() {
|
||||||
|
return "", fmt.Errorf("unvis: trailing characters at end of input")
|
||||||
|
}
|
||||||
|
return output, nil
|
||||||
|
}
|
92
unvis_test.go
Normal file
92
unvis_test.go
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
/*
|
||||||
|
* govis: unicode aware vis(3) encoding implementation
|
||||||
|
* Copyright (C) 2017 SUSE LLC.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package govis
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestUnvisOctalEscape(t *testing.T) {
|
||||||
|
for _, test := range []struct {
|
||||||
|
input string
|
||||||
|
expected string
|
||||||
|
}{
|
||||||
|
{"", ""},
|
||||||
|
{"\\1", "\001"},
|
||||||
|
{"\\01\\02\\3", "\001\002\003"},
|
||||||
|
{"\\001\\023\\32", "\001\023\032"},
|
||||||
|
{"this is a test\\0k1\\133", "this is a test\000k1\133"},
|
||||||
|
{"\\170YET\\01another test\\1\\\\82", "\170YET\001another test\001\\82"},
|
||||||
|
{"\\177MORE tests\\09a", "\177MORE tests\x009a"},
|
||||||
|
{"\\\\710more\\1215testing", "\\710more\1215testing"},
|
||||||
|
} {
|
||||||
|
got, err := Unvis(test.input, DefaultVisFlags)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("unexpected error doing unvis(%q): %q", test.input, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if got != test.expected {
|
||||||
|
t.Errorf("expected unvis(%q) = %q, got %q", test.input, test.expected, got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUnvisHexEscape(t *testing.T) {
|
||||||
|
for _, test := range []struct {
|
||||||
|
input string
|
||||||
|
expected string
|
||||||
|
}{
|
||||||
|
{"", ""},
|
||||||
|
{"\\x01", "\x01"},
|
||||||
|
{"\\x01\\x02\\x7a", "\x01\x02\x7a"},
|
||||||
|
{"this is a test\\x13\\x52\\x6f", "this is a test\x13\x52\x6f"},
|
||||||
|
{"\\x170YET\\x01a\\x22nother test\\x11", "\x170YET\x01a\x22nother test\x11"},
|
||||||
|
{"\\\\x007more\\\\x215testing", "\\x007more\\x215testing"},
|
||||||
|
// Make sure that decoding unicode works properly.
|
||||||
|
{"\\xf0\\x9f\\x95\\xb4", "\U0001f574"},
|
||||||
|
} {
|
||||||
|
got, err := Unvis(test.input, DefaultVisFlags)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("unexpected error doing unvis(%q): %q", test.input, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if got != test.expected {
|
||||||
|
t.Errorf("expected unvis(%q) = %q, got %q", test.input, test.expected, got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUnvisUnicode(t *testing.T) {
|
||||||
|
// Ensure that unicode strings are not messed up by Unvis.
|
||||||
|
for _, test := range []string{
|
||||||
|
"",
|
||||||
|
"this.is.a.normal_string",
|
||||||
|
"AC_Raíz_Certicámara_S.A..pem",
|
||||||
|
"NetLock_Arany_=Class_Gold=_Főtanúsítvány.pem",
|
||||||
|
"TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem",
|
||||||
|
} {
|
||||||
|
got, err := Unvis(test, DefaultVisFlags)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("unexpected error doing unvis(%q): %s", test, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if got != test {
|
||||||
|
t.Errorf("expected %q to be unchanged, got %q", test, got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue