mirror of
https://github.com/vbatts/go-mtree.git
synced 2025-10-03 20:21:01 +00:00
By using a buffer, we can avoid a bunch of small allocations that the previous implementation did. Based on a few small benchmarks, the performance improvement is very stark (~3x faster for strings that don't require any escaping, and ~20% faster for multi-byte utf8 strings): goos: linux goarch: amd64 pkg: github.com/vbatts/go-mtree/pkg/govis cpu: AMD Ryzen 7 7840U w/ Radeon 780M Graphics │ before │ after │ │ sec/op │ sec/op vs base │ Unvis/NoChange-16 1501.0n ± 0% 497.7n ± 1% -66.84% (p=0.000 n=10) Unvis/Binary-16 1317.5n ± 3% 934.9n ± 9% -29.04% (p=0.000 n=10) Unvis/ASCII-16 1325.5n ± 1% 616.8n ± 1% -53.47% (p=0.000 n=10) Unvis/German-16 1884.5n ± 1% 986.9n ± 2% -47.63% (p=0.000 n=10) Unvis/Russian-16 4.636µ ± 1% 3.796µ ± 1% -18.11% (p=0.000 n=10) Unvis/Japanese-16 3.453µ ± 1% 2.867µ ± 1% -16.99% (p=0.000 n=10) geomean 2.072µ 1.206µ -41.77% Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
297 lines
7.1 KiB
Go
297 lines
7.1 KiB
Go
// SPDX-License-Identifier: Apache-2.0
|
|
/*
|
|
* govis: unicode aware vis(3) encoding implementation
|
|
* Copyright (C) 2017-2025 SUSE LLC.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package govis
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// Sentinel errors produced by the unvis parser. They are wrapped with
// additional context before being returned, so match them with errors.Is.
var (
	// errEndOfString is returned when a token is requested but the input has
	// already been fully consumed.
	errEndOfString = errors.New("unexpectedly reached end of string")

	// errUnknownEscapeChar is returned when the character following an escape
	// introducer is not one of the recognised escape characters.
	errUnknownEscapeChar = errors.New("unknown escape character")

	// errOutsideLatin1 is returned when an escape would produce a value larger
	// than unicode.MaxLatin1, which cannot be written as a single byte.
	errOutsideLatin1 = errors.New("outside latin-1 encoding")

	// errParseDigit is returned when a character inside a numeric escape is
	// not a valid digit in the requested base.
	errParseDigit = errors.New("could not parse digit")
)
|
|
|
|
// unvisParser stores the current state of the token parser.
|
|
type unvisParser struct {
|
|
output *strings.Builder
|
|
tokens []rune
|
|
idx int
|
|
flags VisFlag
|
|
}
|
|
|
|
// Input resets the parser with a new input string.
|
|
func (p *unvisParser) Input(input string) {
|
|
p.output = new(strings.Builder)
|
|
p.output.Grow(len(input)) // the output will be at most input-sized
|
|
|
|
p.tokens = []rune(input)
|
|
p.idx = 0
|
|
}
|
|
|
|
// Output returns the internal [strings.Builder].
|
|
func (p *unvisParser) Output() *strings.Builder {
|
|
return p.output
|
|
}
|
|
|
|
// Step moves the index to the next character.
|
|
func (p *unvisParser) Step() {
|
|
p.idx++
|
|
}
|
|
|
|
// Peek gets the current token.
|
|
func (p *unvisParser) Peek() (rune, error) {
|
|
if p.idx >= len(p.tokens) {
|
|
return unicode.ReplacementChar, errEndOfString
|
|
}
|
|
return p.tokens[p.idx], nil
|
|
}
|
|
|
|
// Next moves the index to the next character and returns said character.
|
|
func (p *unvisParser) Next() (rune, error) {
|
|
ch, err := p.Peek()
|
|
if err == nil {
|
|
p.Step()
|
|
}
|
|
return ch, err
|
|
}
|
|
|
|
// End returns whether all of the tokens have been consumed.
|
|
func (p *unvisParser) End() bool {
|
|
return p.idx >= len(p.tokens)
|
|
}
|
|
|
|
func newParser(flags VisFlag) *unvisParser {
|
|
return &unvisParser{
|
|
output: nil,
|
|
tokens: nil,
|
|
idx: 0,
|
|
flags: flags,
|
|
}
|
|
}
|
|
|
|
// While a recursive descent parser is overkill for parsing simple escape
|
|
// codes, this is IMO much easier to read than the ugly 80s coroutine code used
|
|
// by the original unvis(3) parser. Here's the EBNF for an unvis sequence:
|
|
//
|
|
// <input> ::= (<element>)*
|
|
// <element> ::= ("\" <escape-sequence>) | ("%" <escape-hex>) | <plain-rune>
|
|
// <plain-rune> ::= any rune
|
|
// <escape-sequence> ::= ("x" <escape-hex>) | ("M" <escape-meta>) | ("^" <escape-ctrl>) | <escape-cstyle> | <escape-octal>
|
|
// <escape-meta> ::= ("-" <escape-meta1>) | ("^" <escape-ctrl>)
|
|
// <escape-meta1> ::= any rune
|
|
// <escape-ctrl> ::= "?" | any rune
|
|
// <escape-cstyle> ::= "\" | "n" | "r" | "b" | "a" | "v" | "t" | "f" | "s" | "E" | "$" | newline
|
|
// <escape-hex> ::= [0-9a-fA-F] [0-9a-fA-F]
|
|
// <escape-octal> ::= [0-7] ([0-7] ([0-7])?)?
|
|
|
|
func (p *unvisParser) plainRune() error {
|
|
ch, err := p.Next()
|
|
if err != nil {
|
|
return fmt.Errorf("plain rune: %w", err)
|
|
}
|
|
_, err = p.output.WriteRune(ch)
|
|
return err
|
|
}
|
|
|
|
func (p *unvisParser) escapeCStyle() error {
|
|
ch, err := p.Next()
|
|
if err != nil {
|
|
return fmt.Errorf("escape cstyle: %w", err)
|
|
}
|
|
|
|
switch ch {
|
|
case 'n':
|
|
return p.output.WriteByte('\n')
|
|
case 'r':
|
|
return p.output.WriteByte('\r')
|
|
case 'b':
|
|
return p.output.WriteByte('\b')
|
|
case 'a':
|
|
return p.output.WriteByte('\x07')
|
|
case 'v':
|
|
return p.output.WriteByte('\v')
|
|
case 't':
|
|
return p.output.WriteByte('\t')
|
|
case 'f':
|
|
return p.output.WriteByte('\f')
|
|
case 's':
|
|
return p.output.WriteByte(' ')
|
|
case 'E':
|
|
return p.output.WriteByte('\x1b')
|
|
case '\n', '$':
|
|
// Hidden newline or marker.
|
|
return nil
|
|
}
|
|
// XXX: We should probably allow falling through and return "\" here...
|
|
return fmt.Errorf("escape cstyle: %w %q", errUnknownEscapeChar, ch)
|
|
}
|
|
|
|
func (p *unvisParser) escapeDigits(base int, force bool) error {
|
|
var code int
|
|
for i := int(0xFF); i > 0; i /= base {
|
|
ch, err := p.Peek()
|
|
if err != nil {
|
|
if !force && i != 0xFF {
|
|
break
|
|
}
|
|
return fmt.Errorf("escape base %d: %w", base, err)
|
|
}
|
|
|
|
digit, err := strconv.ParseInt(string(ch), base, 8)
|
|
if err != nil {
|
|
if !force && i != 0xFF {
|
|
break
|
|
}
|
|
return fmt.Errorf("escape base %d: %w %q: %w", base, errParseDigit, ch, err)
|
|
}
|
|
|
|
code = (code * base) + int(digit)
|
|
p.Step() // only consume token if we use it (length is variable)
|
|
}
|
|
if code > unicode.MaxLatin1 {
|
|
return fmt.Errorf("escape base %d: code %+.2x %w", base, code, errOutsideLatin1)
|
|
}
|
|
return p.output.WriteByte(byte(code))
|
|
}
|
|
|
|
func (p *unvisParser) escapeCtrl(mask byte) error {
|
|
ch, err := p.Next()
|
|
if err != nil {
|
|
return fmt.Errorf("escape ctrl: %w", err)
|
|
}
|
|
if ch > unicode.MaxLatin1 {
|
|
return fmt.Errorf("escape ctrl: code %q %w", ch, errOutsideLatin1)
|
|
}
|
|
char := byte(ch) & 0x1f
|
|
if ch == '?' {
|
|
char = 0x7f
|
|
}
|
|
return p.output.WriteByte(mask | char)
|
|
}
|
|
|
|
func (p *unvisParser) escapeMeta() error {
|
|
ch, err := p.Next()
|
|
if err != nil {
|
|
return fmt.Errorf("escape meta: %w", err)
|
|
}
|
|
|
|
mask := byte(0x80)
|
|
switch ch {
|
|
case '^':
|
|
// The same as "\^..." except we apply a mask.
|
|
return p.escapeCtrl(mask)
|
|
|
|
case '-':
|
|
ch, err := p.Next()
|
|
if err != nil {
|
|
return fmt.Errorf("escape meta1: %w", err)
|
|
}
|
|
if ch > unicode.MaxLatin1 {
|
|
return fmt.Errorf("escape meta1: code %q %w", ch, errOutsideLatin1)
|
|
}
|
|
// Add mask to character.
|
|
return p.output.WriteByte(mask | byte(ch))
|
|
}
|
|
|
|
return fmt.Errorf("escape meta: %w %q", errUnknownEscapeChar, ch)
|
|
}
|
|
|
|
func (p *unvisParser) escapeSequence() error {
|
|
ch, err := p.Peek()
|
|
if err != nil {
|
|
return fmt.Errorf("escape sequence: %w", err)
|
|
}
|
|
|
|
switch ch {
|
|
case '\\':
|
|
p.Step()
|
|
return p.output.WriteByte('\\')
|
|
|
|
case '0', '1', '2', '3', '4', '5', '6', '7':
|
|
return p.escapeDigits(8, false)
|
|
|
|
case 'x':
|
|
p.Step()
|
|
return p.escapeDigits(16, true)
|
|
|
|
case '^':
|
|
p.Step()
|
|
return p.escapeCtrl(0x00)
|
|
|
|
case 'M':
|
|
p.Step()
|
|
return p.escapeMeta()
|
|
|
|
default:
|
|
return p.escapeCStyle()
|
|
}
|
|
}
|
|
|
|
func (p *unvisParser) element() error {
|
|
ch, err := p.Peek()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
switch ch {
|
|
case '\\':
|
|
p.Step()
|
|
return p.escapeSequence()
|
|
|
|
case '%':
|
|
// % HEX HEX only applies to HTTPStyle encodings.
|
|
if p.flags&VisHTTPStyle == VisHTTPStyle {
|
|
p.Step()
|
|
return p.escapeDigits(16, true)
|
|
}
|
|
}
|
|
return p.plainRune()
|
|
}
|
|
|
|
func (p *unvisParser) unvis(input string) (string, error) {
|
|
p.Input(input)
|
|
for !p.End() {
|
|
if err := p.element(); err != nil {
|
|
return "", err
|
|
}
|
|
}
|
|
return p.Output().String(), nil
|
|
}
|
|
|
|
// Unvis takes a string formatted with the given Vis flags (though only the
|
|
// VisHTTPStyle flag is checked) and output the un-encoded version of the
|
|
// encoded string. An error is returned if any escape sequences in the input
|
|
// string were invalid.
|
|
func Unvis(input string, flags VisFlag) (string, error) {
|
|
if unknown := flags &^ visMask; unknown != 0 {
|
|
return "", unknownVisFlagsError{flags: flags}
|
|
}
|
|
p := newParser(flags)
|
|
output, err := p.unvis(input)
|
|
if err != nil {
|
|
return "", fmt.Errorf("unvis '%s': %w", input, err)
|
|
}
|
|
return output, nil
|
|
}
|