/*
 * govis: unicode aware vis(3) encoding implementation
 * Copyright (C) 2017 SUSE LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package govis

import (
	"fmt"
	"strconv"
	"unicode/utf8"
)

// unvisParser stores the current state of the token parser.
type unvisParser struct {
	// tokens is the input being decoded, split into runes.
	tokens []rune
	// idx is the position in tokens of the next rune to be read.
	idx int
	// flag holds the VisFlag bits the parser was created with.
	flag VisFlag
}

// Next moves the index to the next character. No bounds check is done here;
// reading past the end is reported by Peek and End.
func (p *unvisParser) Next() {
	p.idx++
}

// Peek gets the current token without consuming it. If every token has
// already been consumed it returns utf8.RuneError and a non-nil error.
func (p *unvisParser) Peek() (rune, error) {
	if p.idx >= len(p.tokens) {
		return utf8.RuneError, fmt.Errorf("tried to read past end of token list")
	}
	return p.tokens[p.idx], nil
}

// End returns whether all of the tokens have been consumed.
func (p *unvisParser) End() bool {
	return p.idx >= len(p.tokens)
}

// newParser returns a parser positioned at the first rune of input. The input
// is converted to a rune slice, so the parser indexes by rune rather than by
// byte.
func newParser(input string, flag VisFlag) *unvisParser {
	return &unvisParser{
		tokens: []rune(input),
		idx:    0,
		flag:   flag,
	}
}

// While a recursive descent parser is overkill for parsing simple escape
// codes, this is IMO much easier to read than the ugly 80s coroutine code used
// by the original unvis(3) parser. Here's the EBNF for an unvis sequence:
//
// <input>           ::= (<rune>)*
// <rune>            ::= ("\" <escape-sequence>) | ("%" <escape-hex>) | <plain-rune>
// <plain-rune>      ::= any rune
// <escape-sequence> ::= ("x" <escape-hex>) | ("M") | <escape-cstyle> | <escape-octal>
// <escape-hex>      ::= [0-9a-f] [0-9a-f]
// <escape-cstyle>   ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f"
// <escape-octal>    ::= [0-7] ([0-7] ([0-7])?)?

func unvisPlainRune(p *unvisParser) (string, error) { ch, err := p.Peek() if err != nil { return "", fmt.Errorf("plain rune: %s", ch) } p.Next() return string(ch), nil } func unvisEscapeCStyle(p *unvisParser) (string, error) { ch, err := p.Peek() if err != nil { return "", fmt.Errorf("escape hex: %s", err) } output := "" switch ch { case 'n': output = "\n" case 'r': output = "\r" case 'b': output = "\b" case 'a': output = "\x07" case 'v': output = "\v" case 't': output = "\t" case 'f': output = "\f" case 's': output = " " case 'E': output = "\x1b" case '\n': // Hidden newline. case '$': // Hidden marker. default: // XXX: We should probably allow falling through and return "\" here... return "", fmt.Errorf("escape cstyle: unknown escape character: %q", ch) } p.Next() return output, nil } func unvisEscapeHex(p *unvisParser) (string, error) { var output rune for i := 0; i < 2; i++ { ch, err := p.Peek() if err != nil { return "", fmt.Errorf("escape hex: %s", err) } digit, err := strconv.ParseInt(string(ch), 16, 32) if err != nil { return "", fmt.Errorf("escape hex: parse int: %s", err) } output = (output << 4) | rune(digit) p.Next() } // TODO: We need to handle runes properly to output byte strings again. In // particular, if rune has 0xf0 set then we know that we're currently // decoding a messed up string. return string(output), nil } func unvisEscapeOctal(p *unvisParser) (string, error) { var output rune var err error for i := 0; i < 3; i++ { ch, err := p.Peek() if err != nil { if i == 0 { err = fmt.Errorf("escape octal[first]: %s", err) } break } digit, err := strconv.ParseInt(string(ch), 8, 32) if err != nil { if i == 0 { err = fmt.Errorf("escape octal[first]: parse int: %s", err) } break } output = (output << 3) | rune(digit) p.Next() } // TODO: We need to handle runes properly to output byte strings again. In // particular, if rune has 0xf0 set then we know that we're currently // decoding a messed up string. 
return string(output), err } func unvisEscapeSequence(p *unvisParser) (string, error) { ch, err := p.Peek() if err != nil { return "", fmt.Errorf("escape sequence: %s", err) } switch ch { case '\\': p.Next() return "\\", nil case '0', '1', '2', '3', '4', '5', '6', '7': return unvisEscapeOctal(p) case 'x': p.Next() return unvisEscapeHex(p) case 'M': // TODO case '^': // TODO default: return unvisEscapeCStyle(p) } return "", fmt.Errorf("escape sequence: unsupported sequence: %q", ch) } func unvisRune(p *unvisParser) (string, error) { ch, err := p.Peek() if err != nil { return "", fmt.Errorf("rune: %s", err) } switch ch { case '\\': p.Next() return unvisEscapeSequence(p) case '%': // % HEX HEX only applies to HTTPStyle encodings. if p.flag&VisHTTPStyle == VisHTTPStyle { p.Next() return unvisEscapeHex(p) } fallthrough default: return unvisPlainRune(p) } } func unvis(p *unvisParser) (string, error) { output := "" for !p.End() { ch, err := unvisRune(p) if err != nil { return "", fmt.Errorf("input: %s", err) } output += ch } return output, nil } // Unvis takes a string formatted with the given Vis flags (though only the // VisHTTPStyle flag is checked) and output the un-encoded version of the // encoded string. An error is returned if any escape sequences in the input // string were invalid. func Unvis(input string, flag VisFlag) (string, error) { // TODO: Check all of the VisFlag bits. p := newParser(input, flag) output, err := unvis(p) if err != nil { return "", fmt.Errorf("unvis: %s", err) } if !p.End() { return "", fmt.Errorf("unvis: trailing characters at end of input") } return output, nil }