// SPDX-License-Identifier: Apache-2.0 /* * govis: unicode aware vis(3) encoding implementation * Copyright (C) 2017-2025 SUSE LLC. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package govis import ( "errors" "fmt" "strconv" "strings" "unicode" ) var ( errEndOfString = errors.New("unexpectedly reached end of string") errUnknownEscapeChar = errors.New("unknown escape character") errOutsideLatin1 = errors.New("outside latin-1 encoding") errParseDigit = errors.New("could not parse digit") ) // unvisParser stores the current state of the token parser. type unvisParser struct { output *strings.Builder tokens []rune idx int flags VisFlag } // Input resets the parser with a new input string. func (p *unvisParser) Input(input string) { p.output = new(strings.Builder) p.output.Grow(len(input)) // the output will be at most input-sized p.tokens = []rune(input) p.idx = 0 } // Output returns the internal [strings.Builder]. func (p *unvisParser) Output() *strings.Builder { return p.output } // Step moves the index to the next character. func (p *unvisParser) Step() { p.idx++ } // Peek gets the current token. func (p *unvisParser) Peek() (rune, error) { if p.idx >= len(p.tokens) { return unicode.ReplacementChar, errEndOfString } return p.tokens[p.idx], nil } // Next moves the index to the next character and returns said character. func (p *unvisParser) Next() (rune, error) { ch, err := p.Peek() if err == nil { p.Step() } return ch, err } // End returns whether all of the tokens have been consumed. func (p *unvisParser) End() bool { return p.idx >= len(p.tokens) } func newParser(flags VisFlag) *unvisParser { return &unvisParser{ output: nil, tokens: nil, idx: 0, flags: flags, } } // While a recursive descent parser is overkill for parsing simple escape // codes, this is IMO much easier to read than the ugly 80s coroutine code used // by the original unvis(3) parser. Here's the EBNF for an unvis sequence: // // ::= ()* // ::= ("\" ) | ("%" ) | // ::= any rune // ::= ("x" ) | ("M" ) | ("^" | // ::= ("-" ) | ("^" ) // ::= any rune // ::= "?" | any rune // ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f" // ::= [0-9a-f] [0-9a-f] // ::= [0-7] ([0-7] ([0-7])?)? func (p *unvisParser) plainRune() error { ch, err := p.Next() if err != nil { return fmt.Errorf("plain rune: %w", err) } _, err = p.output.WriteRune(ch) return err } func (p *unvisParser) escapeCStyle() error { ch, err := p.Next() if err != nil { return fmt.Errorf("escape cstyle: %w", err) } switch ch { case 'n': return p.output.WriteByte('\n') case 'r': return p.output.WriteByte('\r') case 'b': return p.output.WriteByte('\b') case 'a': return p.output.WriteByte('\x07') case 'v': return p.output.WriteByte('\v') case 't': return p.output.WriteByte('\t') case 'f': return p.output.WriteByte('\f') case 's': return p.output.WriteByte(' ') case 'E': return p.output.WriteByte('\x1b') case '\n', '$': // Hidden newline or marker. return nil } // XXX: We should probably allow falling through and return "\" here... return fmt.Errorf("escape cstyle: %w %q", errUnknownEscapeChar, ch) } func (p *unvisParser) escapeDigits(base int, force bool) error { var code int for i := int(0xFF); i > 0; i /= base { ch, err := p.Peek() if err != nil { if !force && i != 0xFF { break } return fmt.Errorf("escape base %d: %w", base, err) } digit, err := strconv.ParseInt(string(ch), base, 8) if err != nil { if !force && i != 0xFF { break } return fmt.Errorf("escape base %d: %w %q: %w", base, errParseDigit, ch, err) } code = (code * base) + int(digit) p.Step() // only consume token if we use it (length is variable) } if code > unicode.MaxLatin1 { return fmt.Errorf("escape base %d: code %+.2x %w", base, code, errOutsideLatin1) } return p.output.WriteByte(byte(code)) } func (p *unvisParser) escapeCtrl(mask byte) error { ch, err := p.Next() if err != nil { return fmt.Errorf("escape ctrl: %w", err) } if ch > unicode.MaxLatin1 { return fmt.Errorf("escape ctrl: code %q %w", ch, errOutsideLatin1) } char := byte(ch) & 0x1f if ch == '?' { char = 0x7f } return p.output.WriteByte(mask | char) } func (p *unvisParser) escapeMeta() error { ch, err := p.Next() if err != nil { return fmt.Errorf("escape meta: %w", err) } mask := byte(0x80) switch ch { case '^': // The same as "\^..." except we apply a mask. return p.escapeCtrl(mask) case '-': ch, err := p.Next() if err != nil { return fmt.Errorf("escape meta1: %w", err) } if ch > unicode.MaxLatin1 { return fmt.Errorf("escape meta1: code %q %w", ch, errOutsideLatin1) } // Add mask to character. return p.output.WriteByte(mask | byte(ch)) } return fmt.Errorf("escape meta: %w %q", errUnknownEscapeChar, ch) } func (p *unvisParser) escapeSequence() error { ch, err := p.Peek() if err != nil { return fmt.Errorf("escape sequence: %w", err) } switch ch { case '\\': p.Step() return p.output.WriteByte('\\') case '0', '1', '2', '3', '4', '5', '6', '7': return p.escapeDigits(8, false) case 'x': p.Step() return p.escapeDigits(16, true) case '^': p.Step() return p.escapeCtrl(0x00) case 'M': p.Step() return p.escapeMeta() default: return p.escapeCStyle() } } func (p *unvisParser) element() error { ch, err := p.Peek() if err != nil { return err } switch ch { case '\\': p.Step() return p.escapeSequence() case '%': // % HEX HEX only applies to HTTPStyle encodings. if p.flags&VisHTTPStyle == VisHTTPStyle { p.Step() return p.escapeDigits(16, true) } } return p.plainRune() } func (p *unvisParser) unvis(input string) (string, error) { p.Input(input) for !p.End() { if err := p.element(); err != nil { return "", err } } return p.Output().String(), nil } // Unvis takes a string formatted with the given Vis flags (though only the // VisHTTPStyle flag is checked) and output the un-encoded version of the // encoded string. An error is returned if any escape sequences in the input // string were invalid. func Unvis(input string, flags VisFlag) (string, error) { if unknown := flags &^ visMask; unknown != 0 { return "", unknownVisFlagsError{flags: flags} } p := newParser(flags) output, err := p.unvis(input) if err != nil { return "", fmt.Errorf("unvis '%s': %w", input, err) } return output, nil }