mirror of
https://github.com/vbatts/go-mtree.git
synced 2025-10-03 20:21:01 +00:00
By using a buffer, we can avoid a bunch of small allocations that the previous implementation did. Based on a few small benchmarks, the performance improvement is very stark (~3x faster for strings that don't require any escaping, and ~20% faster for multi-byte utf8 strings):

    goos: linux
    goarch: amd64
    pkg: github.com/vbatts/go-mtree/pkg/govis
    cpu: AMD Ryzen 7 7840U w/ Radeon 780M Graphics
                      │    before    │                after                │
                      │    sec/op    │    sec/op     vs base               │
    Unvis/NoChange-16   1501.0n ± 0%    497.7n ± 1%  -66.84% (p=0.000 n=10)
    Unvis/Binary-16     1317.5n ± 3%    934.9n ± 9%  -29.04% (p=0.000 n=10)
    Unvis/ASCII-16      1325.5n ± 1%    616.8n ± 1%  -53.47% (p=0.000 n=10)
    Unvis/German-16     1884.5n ± 1%    986.9n ± 2%  -47.63% (p=0.000 n=10)
    Unvis/Russian-16     4.636µ ± 1%    3.796µ ± 1%  -18.11% (p=0.000 n=10)
    Unvis/Japanese-16    3.453µ ± 1%    2.867µ ± 1%  -16.99% (p=0.000 n=10)
    geomean              2.072µ         1.206µ       -41.77%

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
228 lines
7.3 KiB
Go
228 lines
7.3 KiB
Go
// SPDX-License-Identifier: Apache-2.0
|
||
/*
|
||
* govis: unicode aware vis(3) encoding implementation
|
||
* Copyright (C) 2017-2025 SUSE LLC.
|
||
*
|
||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
* you may not use this file except in compliance with the License.
|
||
* You may obtain a copy of the License at
|
||
*
|
||
* http://www.apache.org/licenses/LICENSE-2.0
|
||
*
|
||
* Unless required by applicable law or agreed to in writing, software
|
||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
* See the License for the specific language governing permissions and
|
||
* limitations under the License.
|
||
*/
|
||
|
||
package govis
|
||
|
||
import (
|
||
"crypto/rand"
|
||
"strconv"
|
||
"testing"
|
||
|
||
"github.com/stretchr/testify/assert"
|
||
"github.com/stretchr/testify/require"
|
||
)
|
||
|
||
func TestUnvisError(t *testing.T) {
|
||
for _, test := range []struct {
|
||
input string
|
||
err error
|
||
}{
|
||
// Octal escape codes allow you to specify invalid ASCII values.
|
||
{"\\777", errOutsideLatin1},
|
||
{"\\420\\322\\455", errOutsideLatin1},
|
||
{"\\652\\233", errOutsideLatin1},
|
||
// Escapes that end abruptly.
|
||
{"\\", errEndOfString},
|
||
{"\\J", errUnknownEscapeChar},
|
||
{"a bad slash: \\", errEndOfString},
|
||
{"testing -- \\x", errEndOfString},
|
||
{"\\xG0 test", strconv.ErrSyntax},
|
||
{" abc \\Mx", errUnknownEscapeChar},
|
||
{"\\Mx", errUnknownEscapeChar},
|
||
{"\\M-", errEndOfString},
|
||
{"\\M-\u5000", errOutsideLatin1},
|
||
{"\\M^", errEndOfString},
|
||
{"\\^", errEndOfString},
|
||
{"\\^\u5000", errOutsideLatin1},
|
||
{"\\M", errEndOfString},
|
||
} {
|
||
t.Run(test.input, func(t *testing.T) {
|
||
_, err := Unvis(test.input, DefaultVisFlags)
|
||
require.Errorf(t, err, "invalid escape string should give an error")
|
||
assert.ErrorIs(t, err, test.err, "unexpected error from invalid escape string")
|
||
})
|
||
}
|
||
}
|
||
|
||
func TestUnvisCStyleEscape(t *testing.T) {
|
||
for _, test := range []struct {
|
||
input string
|
||
expected string
|
||
}{
|
||
{"", ""},
|
||
{"\\n\\v\\t\\s", "\n\v\t "},
|
||
{"\\\\n\\tt", "\\n\tt"},
|
||
{"\\b", "\b"},
|
||
{"\\r\\b\\n", "\r\b\n"},
|
||
{"\\a\\a\\b", "\x07\x07\b"},
|
||
{"\\f\\s\\E", "\f \x1b"},
|
||
// Hidden markers. They actually aren't generated by vis(3) but for
|
||
// some reason, they're supported...
|
||
{"test\\\ning", "testing"},
|
||
{"test\\$\\$ing", "testing"},
|
||
} {
|
||
t.Run(test.input, func(t *testing.T) {
|
||
got, err := Unvis(test.input, DefaultVisFlags)
|
||
require.NoErrorf(t, err, "unvis(%q)", test.input)
|
||
assert.Equal(t, test.expected, got, "unvis(%q)", test.input)
|
||
})
|
||
}
|
||
}
|
||
|
||
func TestUnvisMetaEscape(t *testing.T) {
|
||
for _, test := range []struct {
|
||
input string
|
||
expected string
|
||
}{
|
||
{"", ""},
|
||
{"\\M^ ?\\^ ", "\x80?\x00"},
|
||
{"\\M- ?\\^?", "\xa0?\x7f"},
|
||
{"\\M-x butterfly\\M^?", "\xf8 butterfly\xff"},
|
||
{"\\M^X steady-hand \\^& needle", "\x98 steady-hand \x06 needle"},
|
||
// TODO: Add some more of these tests, but I need to have some
|
||
// secondary source to verify these outputs properly.
|
||
} {
|
||
t.Run(test.input, func(t *testing.T) {
|
||
got, err := Unvis(test.input, DefaultVisFlags)
|
||
require.NoErrorf(t, err, "unvis(%q)", test.input)
|
||
assert.Equal(t, test.expected, got, "unvis(%q)", test.input)
|
||
})
|
||
}
|
||
}
|
||
|
||
func TestUnvisOctalEscape(t *testing.T) {
|
||
for _, test := range []struct {
|
||
input string
|
||
expected string
|
||
}{
|
||
{"", ""},
|
||
{"\\1", "\001"},
|
||
{"\\01\\02\\3", "\001\002\003"},
|
||
{"\\001\\023\\32", "\001\023\032"},
|
||
{"this is a test\\0k1\\133", "this is a test\000k1\133"},
|
||
{"\\170YET\\01another test\\1\\\\82", "\170YET\001another test\001\\82"},
|
||
{"\\177MORE tests\\09a", "\177MORE tests\x009a"},
|
||
{"\\\\710more\\1215testing", "\\710more\1215testing"},
|
||
// Make sure that decoding unicode works properly, when it's been encoded as single bytes.
|
||
{"\\360\\237\\225\\264", "\U0001f574"},
|
||
{"T\\303\\234B\\304\\260TAK_UEKAE_K\\303\\266k_Sertifika_Hizmet_Sa\\304\\237lay\\304\\261c\\304\\261s\\304\\261_-_S\\303\\274r\\303\\274m_3.pem", "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem"},
|
||
// Some invalid characters...
|
||
{"\\377\\2\\225\\264", "\xff\x02\x95\xb4"},
|
||
} {
|
||
t.Run(test.input, func(t *testing.T) {
|
||
got, err := Unvis(test.input, DefaultVisFlags)
|
||
require.NoErrorf(t, err, "unvis(%q)", test.input)
|
||
assert.Equal(t, test.expected, got, "unvis(%q)", test.input)
|
||
})
|
||
}
|
||
}
|
||
|
||
func TestUnvisHexEscape(t *testing.T) {
|
||
for _, test := range []struct {
|
||
input string
|
||
expected string
|
||
}{
|
||
{"", ""},
|
||
{"\\x01", "\x01"},
|
||
{"\\x01\\x02\\x7a", "\x01\x02\x7a"},
|
||
{"this is a test\\x13\\x52\\x6f", "this is a test\x13\x52\x6f"},
|
||
{"\\x170YET\\x01a\\x22nother test\\x11", "\x170YET\x01a\x22nother test\x11"},
|
||
{"\\\\x007more\\\\x215testing", "\\x007more\\x215testing"},
|
||
// Make sure that decoding unicode works properly, when it's been encoded as single bytes.
|
||
{"\\xf0\\x9f\\x95\\xb4", "\U0001f574"},
|
||
{"T\\xc3\\x9cB\\xc4\\xb0TAK_UEKAE_K\\xc3\\xb6k_Sertifika_Hizmet_Sa\\xc4\\x9flay\\xc4\\xb1c\\xc4\\xb1s\\xc4\\xb1_-_S\\xc3\\xbcr\\xc3\\xbcm_3.pem", "TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem"},
|
||
// Some invalid characters...
|
||
{"\\xff\\x02\\x95\\xb4", "\xff\x02\x95\xb4"},
|
||
} {
|
||
t.Run(test.input, func(t *testing.T) {
|
||
got, err := Unvis(test.input, DefaultVisFlags)
|
||
require.NoErrorf(t, err, "unvis(%q)", test.input)
|
||
assert.Equal(t, test.expected, got, "unvis(%q)", test.input)
|
||
})
|
||
}
|
||
}
|
||
|
||
func TestUnvisUnicode(t *testing.T) {
|
||
// Ensure that unicode strings are not messed up by Unvis.
|
||
for _, test := range []string{
|
||
"",
|
||
"this.is.a.normal_string",
|
||
"AC_Raíz_Certicámara_S.A..pem",
|
||
"NetLock_Arany_=Class_Gold=_Főtanúsítvány.pem",
|
||
"TÜBİTAK_UEKAE_Kök_Sertifika_Hizmet_Sağlayıcısı_-_Sürüm_3.pem",
|
||
} {
|
||
t.Run(test, func(t *testing.T) {
|
||
enc, err := Unvis(test, DefaultVisFlags)
|
||
require.NoErrorf(t, err, "unvis(%q)", test)
|
||
assert.Equalf(t, test, enc, "decoding of %q should be the same as original", test)
|
||
})
|
||
}
|
||
}
|
||
|
||
func BenchmarkUnvis(b *testing.B) {
|
||
doBench := func(b *testing.B, text string) {
|
||
encoded, err := Vis(text, DefaultVisFlags)
|
||
require.NoErrorf(b, err, "vis(%q)", text)
|
||
|
||
decoded, err := Unvis(encoded, DefaultVisFlags)
|
||
require.NoErrorf(b, err, "unvis(vis(%q) = %q)", text, encoded)
|
||
require.Equalf(b, text, decoded, "unvis(vis(%q) = %q)", text, encoded)
|
||
|
||
for b.Loop() {
|
||
_, _ = Unvis(encoded, DefaultVisFlags)
|
||
}
|
||
}
|
||
|
||
b.Run("NoChange", func(b *testing.B) {
|
||
text := "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
|
||
doBench(b, text)
|
||
})
|
||
|
||
b.Run("Binary", func(b *testing.B) {
|
||
var data [32]byte
|
||
n, err := rand.Read(data[:])
|
||
require.NoError(b, err, "rand.Read")
|
||
require.Equal(b, len(data), n, "rand.Read len return")
|
||
|
||
text := string(data[:])
|
||
doBench(b, text)
|
||
})
|
||
|
||
// The rest of these test strings come from a set of test strings collated
|
||
// in <https://www.w3.org/2001/06/utf-8-test/quickbrown.html>.
|
||
|
||
b.Run("ASCII", func(b *testing.B) {
|
||
text := "The quick brown fox jumps over the lazy dog."
|
||
doBench(b, text)
|
||
})
|
||
|
||
b.Run("German", func(b *testing.B) {
|
||
text := "Falsches Üben von Xylophonmusik quält jeden größeren Zwerg"
|
||
doBench(b, text)
|
||
})
|
||
|
||
b.Run("Russian", func(b *testing.B) {
|
||
text := "В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!"
|
||
doBench(b, text)
|
||
})
|
||
|
||
b.Run("Japanese", func(b *testing.B) {
|
||
text := "いろはにほへとちりぬるをイロハニホヘトチリヌルヲ"
|
||
doBench(b, text)
|
||
})
|
||
}
|