1
0
Fork 0
mirror of https://github.com/vbatts/go-mtree.git synced 2025-10-03 20:21:01 +00:00

unvis: improve performance by reducing allocations

By using a buffer, we can avoid the many small allocations that the
previous implementation made. Based on a few small benchmarks, the
performance improvement is stark (~3x faster for strings that don't
require any escaping, and ~20% faster for multi-byte UTF-8 strings):

  goos: linux
  goarch: amd64
  pkg: github.com/vbatts/go-mtree/pkg/govis
  cpu: AMD Ryzen 7 7840U w/ Radeon  780M Graphics
                    │    before    │                after                │
                    │    sec/op    │   sec/op     vs base                │
  Unvis/NoChange-16   1501.0n ± 0%   497.7n ± 1%  -66.84% (p=0.000 n=10)
  Unvis/Binary-16     1317.5n ± 3%   934.9n ± 9%  -29.04% (p=0.000 n=10)
  Unvis/ASCII-16      1325.5n ± 1%   616.8n ± 1%  -53.47% (p=0.000 n=10)
  Unvis/German-16     1884.5n ± 1%   986.9n ± 2%  -47.63% (p=0.000 n=10)
  Unvis/Russian-16     4.636µ ± 1%   3.796µ ± 1%  -18.11% (p=0.000 n=10)
  Unvis/Japanese-16    3.453µ ± 1%   2.867µ ± 1%  -16.99% (p=0.000 n=10)
  geomean              2.072µ        1.206µ       -41.77%

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
This commit is contained in:
Aleksa Sarai 2025-09-22 02:51:18 +10:00
parent 01d93a93e2
commit 70d3b19776
No known key found for this signature in database
GPG key ID: 2897FAD2B7E9446F
2 changed files with 107 additions and 55 deletions

View file

@ -22,6 +22,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"strconv" "strconv"
"strings"
"unicode" "unicode"
) )
@ -34,6 +35,7 @@ var (
// unvisParser stores the current state of the token parser. // unvisParser stores the current state of the token parser.
type unvisParser struct { type unvisParser struct {
output *strings.Builder
tokens []rune tokens []rune
idx int idx int
flags VisFlag flags VisFlag
@ -41,10 +43,18 @@ type unvisParser struct {
// Input resets the parser with a new input string. // Input resets the parser with a new input string.
func (p *unvisParser) Input(input string) { func (p *unvisParser) Input(input string) {
p.output = new(strings.Builder)
p.output.Grow(len(input)) // the output will be at most input-sized
p.tokens = []rune(input) p.tokens = []rune(input)
p.idx = 0 p.idx = 0
} }
// Output returns the internal [strings.Builder].
func (p *unvisParser) Output() *strings.Builder {
return p.output
}
// Step moves the index to the next character. // Step moves the index to the next character.
func (p *unvisParser) Step() { func (p *unvisParser) Step() {
p.idx++ p.idx++
@ -74,6 +84,7 @@ func (p *unvisParser) End() bool {
func newParser(flags VisFlag) *unvisParser { func newParser(flags VisFlag) *unvisParser {
return &unvisParser{ return &unvisParser{
output: nil,
tokens: nil, tokens: nil,
idx: 0, idx: 0,
flags: flags, flags: flags,
@ -95,62 +106,57 @@ func newParser(flags VisFlag) *unvisParser {
// <escape-hex> ::= [0-9a-f] [0-9a-f] // <escape-hex> ::= [0-9a-f] [0-9a-f]
// <escape-octal> ::= [0-7] ([0-7] ([0-7])?)? // <escape-octal> ::= [0-7] ([0-7] ([0-7])?)?
func (p *unvisParser) plainRune() ([]byte, error) { func (p *unvisParser) plainRune() error {
ch, err := p.Next() ch, err := p.Next()
if err != nil { if err != nil {
return nil, fmt.Errorf("plain rune: %w", err) return fmt.Errorf("plain rune: %w", err)
} }
return []byte(string(ch)), nil _, err = p.output.WriteRune(ch)
return err
} }
func (p *unvisParser) escapeCStyle() ([]byte, error) { func (p *unvisParser) escapeCStyle() error {
ch, err := p.Next() ch, err := p.Next()
if err != nil { if err != nil {
return nil, fmt.Errorf("escape cstyle: %w", err) return fmt.Errorf("escape cstyle: %w", err)
} }
output := ""
switch ch { switch ch {
case 'n': case 'n':
output = "\n" return p.output.WriteByte('\n')
case 'r': case 'r':
output = "\r" return p.output.WriteByte('\r')
case 'b': case 'b':
output = "\b" return p.output.WriteByte('\b')
case 'a': case 'a':
output = "\x07" return p.output.WriteByte('\x07')
case 'v': case 'v':
output = "\v" return p.output.WriteByte('\v')
case 't': case 't':
output = "\t" return p.output.WriteByte('\t')
case 'f': case 'f':
output = "\f" return p.output.WriteByte('\f')
case 's': case 's':
output = " " return p.output.WriteByte(' ')
case 'E': case 'E':
output = "\x1b" return p.output.WriteByte('\x1b')
case '\n': case '\n', '$':
// Hidden newline. // Hidden newline or marker.
case '$': return nil
// Hidden marker.
default:
// XXX: We should probably allow falling through and return "\" here...
return nil, fmt.Errorf("escape cstyle: %w %q", errUnknownEscapeChar, ch)
} }
// XXX: We should probably allow falling through and return "\" here...
return []byte(output), nil return fmt.Errorf("escape cstyle: %w %q", errUnknownEscapeChar, ch)
} }
func (p *unvisParser) escapeDigits(base int, force bool) ([]byte, error) { func (p *unvisParser) escapeDigits(base int, force bool) error {
var code int var code int
for i := int(0xFF); i > 0; i /= base { for i := int(0xFF); i > 0; i /= base {
ch, err := p.Peek() ch, err := p.Peek()
if err != nil { if err != nil {
if !force && i != 0xFF { if !force && i != 0xFF {
break break
} }
return nil, fmt.Errorf("escape base %d: %w", base, err) return fmt.Errorf("escape base %d: %w", base, err)
} }
digit, err := strconv.ParseInt(string(ch), base, 8) digit, err := strconv.ParseInt(string(ch), base, 8)
@ -158,45 +164,40 @@ func (p *unvisParser) escapeDigits(base int, force bool) ([]byte, error) {
if !force && i != 0xFF { if !force && i != 0xFF {
break break
} }
return nil, fmt.Errorf("escape base %d: %w %q: %w", base, errParseDigit, ch, err) return fmt.Errorf("escape base %d: %w %q: %w", base, errParseDigit, ch, err)
} }
code = (code * base) + int(digit) code = (code * base) + int(digit)
p.Step() // only consume token if we use it (length is variable) p.Step() // only consume token if we use it (length is variable)
} }
if code > unicode.MaxLatin1 { if code > unicode.MaxLatin1 {
return nil, fmt.Errorf("escape base %d: code %+.2x %w", base, code, errOutsideLatin1) return fmt.Errorf("escape base %d: code %+.2x %w", base, code, errOutsideLatin1)
} }
return p.output.WriteByte(byte(code))
char := byte(code & 0xFF)
return []byte{char}, nil
} }
func (p *unvisParser) escapeCtrl(mask byte) ([]byte, error) { func (p *unvisParser) escapeCtrl(mask byte) error {
ch, err := p.Next() ch, err := p.Next()
if err != nil { if err != nil {
return nil, fmt.Errorf("escape ctrl: %w", err) return fmt.Errorf("escape ctrl: %w", err)
} }
if ch > unicode.MaxLatin1 { if ch > unicode.MaxLatin1 {
return nil, fmt.Errorf("escape ctrl: code %q %w", ch, errOutsideLatin1) return fmt.Errorf("escape ctrl: code %q %w", ch, errOutsideLatin1)
} }
char := byte(ch) & 0x1f char := byte(ch) & 0x1f
if ch == '?' { if ch == '?' {
char = 0x7f char = 0x7f
} }
return []byte{mask | char}, nil return p.output.WriteByte(mask | char)
} }
func (p *unvisParser) escapeMeta() ([]byte, error) { func (p *unvisParser) escapeMeta() error {
ch, err := p.Next() ch, err := p.Next()
if err != nil { if err != nil {
return nil, fmt.Errorf("escape meta: %w", err) return fmt.Errorf("escape meta: %w", err)
} }
mask := byte(0x80) mask := byte(0x80)
switch ch { switch ch {
case '^': case '^':
// The same as "\^..." except we apply a mask. // The same as "\^..." except we apply a mask.
@ -205,28 +206,28 @@ func (p *unvisParser) escapeMeta() ([]byte, error) {
case '-': case '-':
ch, err := p.Next() ch, err := p.Next()
if err != nil { if err != nil {
return nil, fmt.Errorf("escape meta1: %w", err) return fmt.Errorf("escape meta1: %w", err)
} }
if ch > unicode.MaxLatin1 { if ch > unicode.MaxLatin1 {
return nil, fmt.Errorf("escape meta1: code %q %w", ch, errOutsideLatin1) return fmt.Errorf("escape meta1: code %q %w", ch, errOutsideLatin1)
} }
// Add mask to character. // Add mask to character.
return []byte{mask | byte(ch)}, nil return p.output.WriteByte(mask | byte(ch))
} }
return nil, fmt.Errorf("escape meta: %w %q", errUnknownEscapeChar, ch) return fmt.Errorf("escape meta: %w %q", errUnknownEscapeChar, ch)
} }
func (p *unvisParser) escapeSequence() ([]byte, error) { func (p *unvisParser) escapeSequence() error {
ch, err := p.Peek() ch, err := p.Peek()
if err != nil { if err != nil {
return nil, fmt.Errorf("escape sequence: %w", err) return fmt.Errorf("escape sequence: %w", err)
} }
switch ch { switch ch {
case '\\': case '\\':
p.Step() p.Step()
return []byte("\\"), nil return p.output.WriteByte('\\')
case '0', '1', '2', '3', '4', '5', '6', '7': case '0', '1', '2', '3', '4', '5', '6', '7':
return p.escapeDigits(8, false) return p.escapeDigits(8, false)
@ -248,10 +249,10 @@ func (p *unvisParser) escapeSequence() ([]byte, error) {
} }
} }
func (p *unvisParser) element() ([]byte, error) { func (p *unvisParser) element() error {
ch, err := p.Peek() ch, err := p.Peek()
if err != nil { if err != nil {
return nil, err return err
} }
switch ch { switch ch {
@ -271,15 +272,12 @@ func (p *unvisParser) element() ([]byte, error) {
func (p *unvisParser) unvis(input string) (string, error) { func (p *unvisParser) unvis(input string) (string, error) {
p.Input(input) p.Input(input)
var output []byte
for !p.End() { for !p.End() {
ch, err := p.element() if err := p.element(); err != nil {
if err != nil {
return "", err return "", err
} }
output = append(output, ch...)
} }
return string(output), nil return p.Output().String(), nil
} }
// Unvis takes a string formatted with the given Vis flags (though only the // Unvis takes a string formatted with the given Vis flags (though only the

View file

@ -19,6 +19,7 @@
package govis package govis
import ( import (
"crypto/rand"
"strconv" "strconv"
"testing" "testing"
@ -172,3 +173,56 @@ func TestUnvisUnicode(t *testing.T) {
}) })
} }
} }
// BenchmarkUnvis times Unvis on the Vis-encoded form of several kinds of
// input: plain alphanumerics, random binary data, and natural-language text in
// a few scripts. Each sub-benchmark first verifies that the input round-trips
// through Vis and Unvis before entering the timed loop.
func BenchmarkUnvis(b *testing.B) {
	// roundTripAndTime encodes text, checks that decoding gives the original
	// text back, and then repeatedly decodes the encoded form under b.Loop.
	roundTripAndTime := func(b *testing.B, text string) {
		encoded, err := Vis(text, DefaultVisFlags)
		require.NoErrorf(b, err, "vis(%q)", text)

		decoded, err := Unvis(encoded, DefaultVisFlags)
		require.NoErrorf(b, err, "unvis(vis(%q) = %q)", text, encoded)
		require.Equalf(b, text, decoded, "unvis(vis(%q) = %q)", text, encoded)

		for b.Loop() {
			// The result was already validated above; only timing matters here.
			_, _ = Unvis(encoded, DefaultVisFlags)
		}
	}

	// literal wraps a fixed string as an input generator.
	literal := func(text string) func(*testing.B) string {
		return func(*testing.B) string { return text }
	}

	// randomBinary produces 32 bytes read from crypto/rand as a string. It is
	// evaluated inside the sub-benchmark, so failures are reported there.
	randomBinary := func(b *testing.B) string {
		var data [32]byte
		n, err := rand.Read(data[:])
		require.NoError(b, err, "rand.Read")
		require.Equal(b, len(data), n, "rand.Read len return")
		return string(data[:])
	}

	benchmarks := []struct {
		name  string
		input func(*testing.B) string
	}{
		{"NoChange", literal("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")},
		{"Binary", randomBinary},
		// The remaining strings are taken from the collection of test strings
		// at <https://www.w3.org/2001/06/utf-8-test/quickbrown.html>.
		{"ASCII", literal("The quick brown fox jumps over the lazy dog.")},
		{"German", literal("Falsches Üben von Xylophonmusik quält jeden größeren Zwerg")},
		{"Russian", literal("В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!")},
		{"Japanese", literal("いろはにほへとちりぬるをイロハニホヘトチリヌルヲ")},
	}

	for _, bm := range benchmarks {
		b.Run(bm.name, func(b *testing.B) {
			roundTripAndTime(b, bm.input(b))
		})
	}
}