diff --git a/pkg/govis/unvis.go b/pkg/govis/unvis.go index d759f5a..7f20ad7 100644 --- a/pkg/govis/unvis.go +++ b/pkg/govis/unvis.go @@ -22,6 +22,7 @@ import ( "errors" "fmt" "strconv" + "strings" "unicode" ) @@ -34,6 +35,7 @@ var ( // unvisParser stores the current state of the token parser. type unvisParser struct { + output *strings.Builder tokens []rune idx int flags VisFlag @@ -41,10 +43,18 @@ type unvisParser struct { // Input resets the parser with a new input string. func (p *unvisParser) Input(input string) { + p.output = new(strings.Builder) + p.output.Grow(len(input)) // capacity hint: for valid UTF-8 input the output is at most input-sized + p.tokens = []rune(input) p.idx = 0 } +// Output returns the internal [strings.Builder]. +func (p *unvisParser) Output() *strings.Builder { + return p.output +} + // Step moves the index to the next character. func (p *unvisParser) Step() { p.idx++ @@ -74,6 +84,7 @@ func (p *unvisParser) End() bool { func newParser(flags VisFlag) *unvisParser { return &unvisParser{ + output: nil, tokens: nil, idx: 0, flags: flags, @@ -95,62 +106,57 @@ func newParser(flags VisFlag) *unvisParser { // ::= [0-9a-f] [0-9a-f] // ::= [0-7] ([0-7] ([0-7])?)? 
-func (p *unvisParser) plainRune() ([]byte, error) { +func (p *unvisParser) plainRune() error { ch, err := p.Next() if err != nil { - return nil, fmt.Errorf("plain rune: %w", err) + return fmt.Errorf("plain rune: %w", err) } - return []byte(string(ch)), nil + _, err = p.output.WriteRune(ch) + return err } -func (p *unvisParser) escapeCStyle() ([]byte, error) { +func (p *unvisParser) escapeCStyle() error { ch, err := p.Next() if err != nil { - return nil, fmt.Errorf("escape cstyle: %w", err) + return fmt.Errorf("escape cstyle: %w", err) } - output := "" switch ch { case 'n': - output = "\n" + return p.output.WriteByte('\n') case 'r': - output = "\r" + return p.output.WriteByte('\r') case 'b': - output = "\b" + return p.output.WriteByte('\b') case 'a': - output = "\x07" + return p.output.WriteByte('\x07') case 'v': - output = "\v" + return p.output.WriteByte('\v') case 't': - output = "\t" + return p.output.WriteByte('\t') case 'f': - output = "\f" + return p.output.WriteByte('\f') case 's': - output = " " + return p.output.WriteByte(' ') case 'E': - output = "\x1b" - case '\n': - // Hidden newline. - case '$': - // Hidden marker. - default: - // XXX: We should probably allow falling through and return "\" here... - return nil, fmt.Errorf("escape cstyle: %w %q", errUnknownEscapeChar, ch) + return p.output.WriteByte('\x1b') + case '\n', '$': + // Hidden newline or marker. + return nil } - - return []byte(output), nil + // XXX: We should probably allow falling through and return "\" here... 
+ return fmt.Errorf("escape cstyle: %w %q", errUnknownEscapeChar, ch) } -func (p *unvisParser) escapeDigits(base int, force bool) ([]byte, error) { +func (p *unvisParser) escapeDigits(base int, force bool) error { var code int - for i := int(0xFF); i > 0; i /= base { ch, err := p.Peek() if err != nil { if !force && i != 0xFF { break } - return nil, fmt.Errorf("escape base %d: %w", base, err) + return fmt.Errorf("escape base %d: %w", base, err) } digit, err := strconv.ParseInt(string(ch), base, 8) @@ -158,45 +164,40 @@ func (p *unvisParser) escapeDigits(base int, force bool) ([]byte, error) { if !force && i != 0xFF { break } - return nil, fmt.Errorf("escape base %d: %w %q: %w", base, errParseDigit, ch, err) + return fmt.Errorf("escape base %d: %w %q: %w", base, errParseDigit, ch, err) } code = (code * base) + int(digit) p.Step() // only consume token if we use it (length is variable) } - if code > unicode.MaxLatin1 { - return nil, fmt.Errorf("escape base %d: code %+.2x %w", base, code, errOutsideLatin1) + return fmt.Errorf("escape base %d: code %+.2x %w", base, code, errOutsideLatin1) } - - char := byte(code & 0xFF) - return []byte{char}, nil + return p.output.WriteByte(byte(code)) } -func (p *unvisParser) escapeCtrl(mask byte) ([]byte, error) { +func (p *unvisParser) escapeCtrl(mask byte) error { ch, err := p.Next() if err != nil { - return nil, fmt.Errorf("escape ctrl: %w", err) + return fmt.Errorf("escape ctrl: %w", err) } if ch > unicode.MaxLatin1 { - return nil, fmt.Errorf("escape ctrl: code %q %w", ch, errOutsideLatin1) + return fmt.Errorf("escape ctrl: code %q %w", ch, errOutsideLatin1) } - char := byte(ch) & 0x1f if ch == '?' 
{ char = 0x7f } - return []byte{mask | char}, nil + return p.output.WriteByte(mask | char) } -func (p *unvisParser) escapeMeta() ([]byte, error) { +func (p *unvisParser) escapeMeta() error { ch, err := p.Next() if err != nil { - return nil, fmt.Errorf("escape meta: %w", err) + return fmt.Errorf("escape meta: %w", err) } mask := byte(0x80) - switch ch { case '^': // The same as "\^..." except we apply a mask. @@ -205,28 +206,28 @@ func (p *unvisParser) escapeMeta() ([]byte, error) { case '-': ch, err := p.Next() if err != nil { - return nil, fmt.Errorf("escape meta1: %w", err) + return fmt.Errorf("escape meta1: %w", err) } if ch > unicode.MaxLatin1 { - return nil, fmt.Errorf("escape meta1: code %q %w", ch, errOutsideLatin1) + return fmt.Errorf("escape meta1: code %q %w", ch, errOutsideLatin1) } // Add mask to character. - return []byte{mask | byte(ch)}, nil + return p.output.WriteByte(mask | byte(ch)) } - return nil, fmt.Errorf("escape meta: %w %q", errUnknownEscapeChar, ch) + return fmt.Errorf("escape meta: %w %q", errUnknownEscapeChar, ch) } -func (p *unvisParser) escapeSequence() ([]byte, error) { +func (p *unvisParser) escapeSequence() error { ch, err := p.Peek() if err != nil { - return nil, fmt.Errorf("escape sequence: %w", err) + return fmt.Errorf("escape sequence: %w", err) } switch ch { case '\\': p.Step() - return []byte("\\"), nil + return p.output.WriteByte('\\') case '0', '1', '2', '3', '4', '5', '6', '7': return p.escapeDigits(8, false) @@ -248,10 +249,10 @@ func (p *unvisParser) escapeSequence() ([]byte, error) { } } -func (p *unvisParser) element() ([]byte, error) { +func (p *unvisParser) element() error { ch, err := p.Peek() if err != nil { - return nil, err + return err } switch ch { @@ -271,15 +272,12 @@ func (p *unvisParser) element() ([]byte, error) { func (p *unvisParser) unvis(input string) (string, error) { p.Input(input) - var output []byte for !p.End() { - ch, err := p.element() - if err != nil { + if err := p.element(); err != nil { return 
"", err } - output = append(output, ch...) } - return string(output), nil + return p.Output().String(), nil } // Unvis takes a string formatted with the given Vis flags (though only the diff --git a/pkg/govis/unvis_test.go b/pkg/govis/unvis_test.go index ec4218c..ae9945f 100644 --- a/pkg/govis/unvis_test.go +++ b/pkg/govis/unvis_test.go @@ -19,6 +19,7 @@ package govis import ( + "crypto/rand" "strconv" "testing" @@ -172,3 +173,56 @@ func TestUnvisUnicode(t *testing.T) { }) } } + +func BenchmarkUnvis(b *testing.B) { + doBench := func(b *testing.B, text string) { + encoded, err := Vis(text, DefaultVisFlags) + require.NoErrorf(b, err, "vis(%q)", text) + + decoded, err := Unvis(encoded, DefaultVisFlags) + require.NoErrorf(b, err, "unvis(vis(%q) = %q)", text, encoded) + require.Equalf(b, text, decoded, "unvis(vis(%q) = %q)", text, encoded) + + for b.Loop() { + _, _ = Unvis(encoded, DefaultVisFlags) + } + } + + b.Run("NoChange", func(b *testing.B) { + text := "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + doBench(b, text) + }) + + b.Run("Binary", func(b *testing.B) { + var data [32]byte + n, err := rand.Read(data[:]) + require.NoError(b, err, "rand.Read") + require.Equal(b, len(data), n, "rand.Read len return") + + text := string(data[:]) + doBench(b, text) + }) + + // The rest of these test strings come from a set of test strings collated + // in . + + b.Run("ASCII", func(b *testing.B) { + text := "The quick brown fox jumps over the lazy dog." + doBench(b, text) + }) + + b.Run("German", func(b *testing.B) { + text := "Falsches Üben von Xylophonmusik quält jeden größeren Zwerg" + doBench(b, text) + }) + + b.Run("Russian", func(b *testing.B) { + text := "В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!" + doBench(b, text) + }) + + b.Run("Japanese", func(b *testing.B) { + text := "いろはにほへとちりぬるをイロハニホヘトチリヌルヲ" + doBench(b, text) + }) +}