go-mtree/vis.go

/*
 * govis: unicode aware vis(3) encoding implementation
 * Copyright (C) 2017 SUSE LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package govis

import (
	"fmt"
	"unicode"
	"unicode/utf8"
)

// isunsafe reports whether ch is one of the three control characters this
// package groups as "unsafe": backspace, bell or carriage return.
func isunsafe(ch rune) bool {
	switch ch {
	case '\b', '\a', '\r':
		return true
	}
	return false
}

// isglob reports whether ch is one of the characters this package treats as
// glob metacharacters: '*', '?', '[' and '#'.
func isglob(ch rune) bool {
	switch ch {
	case '*', '?', '[', '#':
		return true
	default:
		return false
	}
}

// ishttp reports whether ch can be left unescaped by the VisHTTPStyle
// encoding: an ASCII letter or digit, or one of the "safe" ($ - _ . +) or
// "extra" (! * ' ( ) ,) characters.
//
// RFC 1808 only deals with ASCII, so every rune outside the ASCII range is
// rejected here, even when Unicode classifies it as a letter or a digit;
// otherwise such runes would be emitted raw instead of being %-escaped.
func ishttp(ch rune) bool {
	if ch > unicode.MaxASCII {
		return false
	}

	if unicode.IsLetter(ch) || unicode.IsDigit(ch) {
		return true
	}

	switch ch {
	case '$', '-', '_', '.', '+': // Safe characters.
		return true
	case '!', '*', '\'', '(', ')', ',': // Extra characters.
		return true
	}
	return false
}

// mapRuneBytes encodes ch as UTF-8, calls fn on every byte of that encoding
// in order, and returns the concatenation of the results.
//
// A rune that is not a valid code point (negative, a surrogate half, or above
// U+10FFFF) is encoded the way utf8.EncodeRune encodes it, i.e. as U+FFFD,
// rather than causing a panic.
func mapRuneBytes(ch rune, fn func(byte) string) string {
	// utf8.RuneLen returns -1 for invalid runes, so it cannot be used to size
	// the buffer; UTFMax bytes are enough for any rune.
	var buf [utf8.UTFMax]byte
	n := utf8.EncodeRune(buf[:], ch)

	mapped := ""
	for _, b := range buf[:n] {
		mapped += fn(b)
	}
	return mapped
}

// vis converts a single rune into its encoding, ensuring that it is "safe"
// (for some definition of safe). Note that some visual characters (such as
// accented characters or similar things) can be made up of several runes -- in
// order to maintain my sanity Vis() makes no attempt to handle such cases
// specially.
//
// The first rule that applies decides the encoding:
//
//   - with VisHTTPStyle, a rune rejected by ishttp becomes one "%XX"
//     (upper-case hex) triplet per UTF-8 byte;
//   - a rune is emitted verbatim when it is graphic and not a space, or is a
//     space, tab or newline whose flag (VisSpace, VisTab, VisNewline) is not
//     set, or is accepted by isunsafe while VisSafe is set. A backslash is
//     doubled unless VisNoSlash is set. With VisGlob, glob characters never
//     take this path;
//   - with VisCStyle, NUL and the common control characters use their C
//     escape sequences;
//   - with VisOctal, or for any remaining graphic rune, every UTF-8 byte is
//     written as a backslash followed by three octal digits;
//   - anything else is written per UTF-8 byte in the "\^C", "\M-C" or "\M^C"
//     form (without the backslash when VisNoSlash is set).
//
// The returned error is always nil at present.
func vis(ch rune, flag VisFlag) (string, error) {
	// XXX: Currently we are just allowing regular multi-byte characters such
	//      as accents and so on to be passed through without encoding. Is this
	//      really the best idea? In order to maintain compatibility with
	//      vis(3) such that an older unvis(3) will do the right thing maybe we
	//      should only output 7-bit ASCII? I'm not sure.

	if flag&VisHTTPStyle == VisHTTPStyle {
		// This is described in RFC 1808.
		if !ishttp(ch) {
			return mapRuneBytes(ch, func(b byte) string {
				// The leading '%' is what marks the two hex digits as an
				// escape; bare hex digits could not be told apart from
				// literal text when decoding.
				return fmt.Sprintf("%%%.2X", b)
			}), nil
		}
	}

	// Handle all "ordinary" characters which don't need to be encoded.
	if !(flag&VisGlob == VisGlob && isglob(ch)) &&
		((unicode.IsGraphic(ch) && !unicode.IsSpace(ch)) ||
			(flag&VisSpace == 0 && ch == ' ') ||
			(flag&VisTab == 0 && ch == '\t') ||
			(flag&VisNewline == 0 && ch == '\n') ||
			(flag&VisSafe == VisSafe && isunsafe(ch))) {
		enc := string(ch)
		if ch == '\\' && flag&VisNoSlash == 0 {
			// Escape the escape character itself.
			enc += "\\"
		}
		return enc, nil
	}

	if flag&VisCStyle == VisCStyle {
		switch ch {
		case '\n':
			return "\\n", nil
		case '\r':
			return "\\r", nil
		case '\b':
			return "\\b", nil
		case '\a':
			return "\\a", nil
		case '\v':
			return "\\v", nil
		case '\t':
			return "\\t", nil
		case '\f':
			return "\\f", nil
		case 0:
			// TODO: Handle isoctal properly.
			return "\\000", nil
		}
	}

	// TODO: ch & 0177 is not implemented...
	if flag&VisOctal == VisOctal || unicode.IsGraphic(ch) {
		return mapRuneBytes(ch, func(b byte) string {
			return fmt.Sprintf("\\%.3o", b)
		}), nil
	}

	return mapRuneBytes(ch, func(b byte) string {
		enc := ""
		if flag&VisNoSlash == 0 {
			enc += "\\"
		}

		// This follows cvis: a set high bit is reported as "M" and stripped,
		// then the remaining 7-bit value is written either as a caret
		// sequence (control characters) or as "-" followed by the character.
		if b&0200 != 0 {
			b &= 0177
			enc += "M"
		}
		if unicode.IsControl(rune(b)) {
			enc += "^"
			if b == 0177 {
				// DEL has no printable counterpart at b+'@'.
				enc += "?"
			} else {
				enc += string(rune(b + '@'))
			}
		} else {
			// b is a byte, so it must be converted rather than formatted
			// with %s (which would print "%!s(uint8=...)").
			enc += "-" + string(rune(b))
		}

		return enc
	}), nil
}

// Vis encodes the provided string to a BSD-compatible encoding using BSD's
// vis() flags. However, it will correctly handle multi-byte encoding (which is
// not done properly by BSD's vis implementation).
//
// src must be valid UTF-8; if it is not, an empty string and an error are
// returned. Otherwise every rune of src is encoded with vis according to flag
// and the encodings are concatenated in order.
func Vis(src string, flag VisFlag) (string, error) {
	if !utf8.ValidString(src) {
		return "", fmt.Errorf("vis: input string is invalid utf8 literal")
	}

	// Accumulate into a byte slice: appending to a string inside the loop
	// would copy everything written so far on every iteration. len(src) is
	// only a starting capacity; append grows it when runes get escaped.
	output := make([]byte, 0, len(src))
	for _, ch := range src {
		encodedCh, err := vis(ch, flag)
		if err != nil {
			return "", err
		}
		output = append(output, encodedCh...)
	}

	return string(output), nil
}