1
0
Fork 0
mirror of https://github.com/vbatts/go-mtree.git synced 2025-10-03 20:21:01 +00:00
go-mtree/pkg/govis/vis.go
Aleksa Sarai 47086b0654
vis: improve performance by reducing allocations
By avoiding lots of small string allocations and reallocations when
appending to the output buffer, we can get a pretty decent performance
improvement (~6x for strings that do not require escaping, and ~2x for
most other multi-byte utf8 strings).

    goos: linux
    goarch: amd64
    pkg: github.com/vbatts/go-mtree/pkg/govis
    cpu: AMD Ryzen 7 7840U w/ Radeon  780M Graphics
                    │    before    │                after                │
                    │    sec/op    │   sec/op     vs base                │
    Vis/NoChange-16   2372.5n ± 2%   379.1n ± 1%  -84.02% (p=0.000 n=10)
    Vis/Binary-16      2.104µ ± 8%   1.319µ ± 8%  -37.35% (p=0.000 n=10)
    Vis/ASCII-16      2070.0n ± 1%   737.3n ± 0%  -64.38% (p=0.000 n=10)
    Vis/German-16      3.380µ ± 1%   1.181µ ± 2%  -65.04% (p=0.000 n=10)
    Vis/Russian-16    10.927µ ± 2%   5.293µ ± 2%  -51.56% (p=0.000 n=10)
    Vis/Japanese-16    7.489µ ± 1%   3.990µ ± 0%  -46.72% (p=0.000 n=10)
    geomean            3.767µ        1.447µ       -61.58%

In theory we could get more performance if switch away from fmt.Sprintf,
but the %.N handling would be a little annoying to implement and so we
can punt on that for now.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
2025-09-23 04:40:25 +10:00

183 lines
5.3 KiB
Go

// SPDX-License-Identifier: Apache-2.0
/*
* govis: unicode aware vis(3) encoding implementation
* Copyright (C) 2017-2025 SUSE LLC.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package govis
import (
"fmt"
"strings"
"unicode"
)
func isunsafe(ch rune) bool {
return ch == '\b' || ch == '\007' || ch == '\r'
}
func isglob(ch rune) bool {
return ch == '*' || ch == '?' || ch == '[' || ch == '#'
}
// ishttp is defined by RFC 1808.
func ishttp(ch rune) bool {
// RFC1808 does not really consider characters outside of ASCII, so just to
// be safe always treat characters outside the ASCII character set as "not
// HTTP".
if ch > unicode.MaxASCII {
return false
}
return unicode.IsDigit(ch) || unicode.IsLetter(ch) ||
// Safe characters.
ch == '$' || ch == '-' || ch == '_' || ch == '.' || ch == '+' ||
// Extra characters.
ch == '!' || ch == '*' || ch == '\'' || ch == '(' ||
ch == ')' || ch == ','
}
func isgraph(ch rune) bool {
return unicode.IsGraphic(ch) && !unicode.IsSpace(ch) && ch <= unicode.MaxASCII
}
// vis converts a single *byte* into its encoding. While Go supports the
// concept of runes (and thus native utf-8 parsing), in order to make sure that
// the bit-stream will be completely maintained through an Unvis(Vis(...))
// round-trip. The downside is that Vis() will never output unicode -- but on
// the plus side this is actually a benefit on the encoding side (it will
// always work with the simple unvis(3) implementation). It also means that we
// don't have to worry about different multi-byte encodings.
func vis(output *strings.Builder, b byte, flag VisFlag) {
// Treat the single-byte character as a rune.
ch := rune(b)
// XXX: This is quite a horrible thing to support.
if flag&VisHTTPStyle == VisHTTPStyle {
if !ishttp(ch) {
_, _ = fmt.Fprintf(output, "%%%.2X", ch)
return
}
}
// Figure out if the character doesn't need to be encoded. Effectively, we
// encode most "normal" (graphical) characters as themselves unless we have
// been specifically asked not to. Note though that we *ALWAYS* encode
// everything outside ASCII.
// TODO: Switch this to much more logical code.
if ch > unicode.MaxASCII {
/* ... */
} else if flag&VisGlob == VisGlob && isglob(ch) {
/* ... */
} else if isgraph(ch) ||
(flag&VisSpace != VisSpace && ch == ' ') ||
(flag&VisTab != VisTab && ch == '\t') ||
(flag&VisNewline != VisNewline && ch == '\n') ||
(flag&VisSafe != 0 && isunsafe(ch)) {
if ch == '\\' && flag&VisNoSlash == 0 {
_ = output.WriteByte('\\')
}
_ = output.WriteByte(b)
return
}
// Try to use C-style escapes first.
if flag&VisCStyle == VisCStyle {
switch ch {
case ' ':
_, _ = output.WriteString("\\s")
return
case '\n':
_, _ = output.WriteString("\\n")
return
case '\r':
_, _ = output.WriteString("\\r")
return
case '\b':
_, _ = output.WriteString("\\b")
return
case '\a':
_, _ = output.WriteString("\\a")
return
case '\v':
_, _ = output.WriteString("\\v")
return
case '\t':
_, _ = output.WriteString("\\t")
return
case '\f':
_, _ = output.WriteString("\\f")
return
case '\x00':
// Output octal just to be safe.
_, _ = output.WriteString("\\000")
return
}
}
// For graphical characters we generate octal output (and also if it's
// being forced by the caller's flags). Also spaces should always be
// encoded as octal.
if flag&VisOctal == VisOctal || isgraph(ch) || ch&0x7f == ' ' {
// Always output three-character octal just to be safe.
_, _ = fmt.Fprintf(output, "\\%.3o", ch)
return
}
// Now we have to output meta or ctrl escapes. As far as I can tell, this
// is not actually defined by any standard -- so this logic is basically
// copied from the original vis(3) implementation. Hopefully nobody
// actually relies on this (octal and hex are better).
if flag&VisNoSlash == 0 {
_ = output.WriteByte('\\')
}
// Meta characters have 0x80 set, but are otherwise identical to control
// characters.
if b&0x80 != 0 {
b &= 0x7f
_ = output.WriteByte('M')
}
if unicode.IsControl(rune(b)) {
_ = output.WriteByte('^')
if b == 0x7f {
_ = output.WriteByte('?')
} else {
_ = output.WriteByte(b + '@')
}
} else {
_ = output.WriteByte('-')
_ = output.WriteByte(b)
}
}
// Vis encodes the provided string to a BSD-compatible encoding using BSD's
// vis() flags. However, it will correctly handle multi-byte encoding (which is
// not done properly by BSD's vis implementation).
func Vis(src string, flags VisFlag) (string, error) {
if unknown := flags &^ visMask; unknown != 0 {
return "", unknownVisFlagsError{flags: flags}
}
var output strings.Builder
output.Grow(len(src)) // vis() will always take up at least len(src) bytes
for _, ch := range []byte(src) {
vis(&output, ch, flags)
}
return output.String(), nil
}