vis: partial vis(3) port

This is a stopgap while I figure out how I should go about implementing
vis(3). It's also important to have some vis(3) implementation so I can
do integration tests on round-trips.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
Aleksa Sarai 2017-02-12 04:07:35 +11:00
parent cd1de45ba5
commit 44391840b6
No known key found for this signature in database
GPG Key ID: 9E18AA267DDB8DB4
2 changed files with 216 additions and 0 deletions

162
vis.go Normal file
View File

@ -0,0 +1,162 @@
/*
* govis: unicode aware vis(3) encoding implementation
* Copyright (C) 2017 SUSE LLC.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package govis
import (
"fmt"
"unicode"
"unicode/utf8"
)
func isunsafe(ch rune) bool {
return ch == '\b' || ch == '\007' || ch == '\r'
}
func isglob(ch rune) bool {
return ch == '*' || ch == '?' || ch == '[' || ch == '#'
}
func ishttp(ch rune) bool {
return unicode.IsDigit(ch) || unicode.IsLetter(ch) ||
// Safe characters.
ch == '$' || ch == '-' || ch == '_' || ch == '.' || ch == '+' ||
// Extra characters.
ch == '!' || ch == '*' || ch == '\'' || ch == '(' ||
ch == ')' || ch == ','
}
func mapRuneBytes(ch rune, fn func(byte) string) string {
bytes := make([]byte, utf8.RuneLen(ch))
n := utf8.EncodeRune(bytes, ch)
mapped := ""
for i := 0; i < n; i++ {
mapped += fn(bytes[i])
}
return mapped
}
// vis converts a single rune into its encoding, ensuring that it is "safe"
// (for some definition of safe). Note that some visual characters (such as
// accented characters or similar things) can be made up of several runes -- in
// order to maintain my sanity Vis() makes no attempt to handle such cases
// specially.
func vis(ch rune, flag VisFlag) (string, error) {
// XXX: Currently we are just allowing regular multi-byte characters such
// as accents and so on to be passed through without encoding. Is this
// really the best idea? In order to maintain compatibility with
// vis(3) such that an older unvis(3) will do the right thing maybe we
// should only output 7-bit ASCII? I'm not sure.
if flag&VisHTTPStyle == VisHTTPStyle {
// This is described in RFC 1808.
if !ishttp(ch) {
return mapRuneBytes(ch, func(b byte) string {
return fmt.Sprintf("%.2X", b)
}), nil
}
}
// Handle all "ordinary" characters which don't need to be encoded.
if !(flag&VisGlob == VisGlob && isglob(ch)) &&
((unicode.IsGraphic(ch) && !unicode.IsSpace(ch)) ||
(flag&VisSpace == 0 && ch == ' ') ||
(flag&VisTab == 0 && ch == '\t') ||
(flag&VisNewline == 0 && ch == '\n') ||
(flag&VisSafe == VisSafe && isunsafe(ch))) {
enc := string(ch)
if ch == '\\' && flag&VisNoSlash == 0 {
enc += "\\"
}
return enc, nil
}
if flag&VisCStyle == VisCStyle {
switch ch {
case '\n':
return "\\n", nil
case '\r':
return "\\r", nil
case '\b':
return "\\b", nil
case '\a':
return "\\a", nil
case '\v':
return "\\v", nil
case '\t':
return "\\t", nil
case '\f':
return "\\f", nil
case 0:
// TODO: Handle isoctal properly.
return "\\000", nil
}
}
// TODO: ch & 0177 is not implemented...
if flag&VisOctal == VisOctal || unicode.IsGraphic(ch) {
return mapRuneBytes(ch, func(b byte) string {
return fmt.Sprintf("\\%.3o", b)
}), nil
}
return mapRuneBytes(ch, func(b byte) string {
enc := ""
if flag&VisNoSlash == 0 {
enc += "\\"
}
// This logic is stolen from cvis, I don't understand any of it.
if b&0200 != 0 {
b &= 0177
enc += "M"
}
if unicode.IsControl(rune(b)) {
enc += "^"
if b == 0177 {
enc += "?"
} else {
enc += string(b + '@')
}
} else {
enc += fmt.Sprintf("-%s", b)
}
return enc
}), nil
}
// Vis encodes the provided string to a BSD-compatible encoding using BSD's
// vis() flags. However, it will correctly handle multi-byte encoding (which is
// not done properly by BSD's vis implementation).
func Vis(src string, flag VisFlag) (string, error) {
if !utf8.ValidString(src) {
return "", fmt.Errorf("vis: input string is invalid utf8 literal")
}
output := ""
for _, ch := range src {
encodedCh, err := vis(ch, flag)
if err != nil {
return "", err
}
output += encodedCh
}
return output, nil
}

54
vis_test.go Normal file
View File

@ -0,0 +1,54 @@
/*
* govis: unicode aware vis(3) encoding implementation
* Copyright (C) 2017 SUSE LLC.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package govis
import (
"testing"
)
func TestVisUnchanged(t *testing.T) {
for _, test := range []string{
"helloworld",
"THIS_IS_A_TEST1234",
"SomeEncodingsAreCool",
"AC_Raíz_Certicámara_S.A..pem",
} {
enc, err := Vis(test, DefaultVisFlags)
if err != nil {
t.Errorf("unexpected error with %q: %s", test, err)
}
if enc != test {
t.Errorf("expected encoding of %q to be unchanged, got %q", test, enc)
}
}
}
func TestVisChanged(t *testing.T) {
for _, test := range []string{
"hello world",
"THIS\\IS_A_TEST1234",
} {
enc, err := Vis(test, DefaultVisFlags)
if err != nil {
t.Errorf("unexpected error with %q: %s", test, err)
}
if enc == test {
t.Errorf("expected encoding of %q to be changed")
}
}
}