Add minor improvements and cleanup

This commit is contained in:
Justine Tunney 2020-10-27 03:39:46 -07:00
parent 9e3e985ae5
commit feed0d2b0e
163 changed files with 2286 additions and 2245 deletions

81
libc/unicode/strnwidth.c Normal file
View file

@ -0,0 +1,81 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/macros.h"
#include "libc/str/thompike.h"
#include "libc/unicode/unicode.h"
/**
 * Returns monospace display width of UTF-8 string.
 *
 * - Control codes are discounted, except tab which counts as one cell
 * - ANSI escape sequences are discounted
 * - East asian glyphs, emoji, etc. count as two
 *
 * Scanning stops at the first NUL byte or once n bytes have been
 * read, whichever comes first.
 *
 * @param s is NUL-terminated string
 * @param n is max bytes to consider
 * @return monospace display width
 */
int strnwidth(const char *s, size_t n) {
  wint_t c, w;
  unsigned l, r;
  enum { kAscii, kUtf8, kEsc, kCsi } t;
  for (w = r = t = l = 0; n--;) {
    if (!(c = *s++ & 0xff)) break; /* NUL ends the string early */
    switch (t) {
      case kAscii:
        if ((0x20 <= c && c <= 0x7E) || c == '\t') {
          ++l; /* printable ascii and tab occupy one cell each */
        } else if (c == 033) {
          t = kEsc;
        } else if (c >= 0300) {
          /* lead byte of a multibyte sequence */
          t = kUtf8;
          w = ThomPikeByte(c);
          /* presumably the number of continuation bytes to expect */
          r = ThomPikeLen(c) - 1;
        }
        /* any other byte (control code, stray 0200-0277) adds nothing */
        break;
      case kUtf8:
        if (ThomPikeCont(c)) {
          w = ThomPikeMerge(w, c);
          if (--r) break; /* sequence isn't finished yet */
        }
        /* sequence finished, or was cut short by a non-continuation byte */
        l += MAX(0, wcwidth(w)); /* negative widths are clamped to zero */
        t = kAscii;
        break;
      case kEsc:
        if (c == '[') {
          t = kCsi;
        } else if (!(040 <= c && c < 060)) {
          /* anything but an intermediate byte finishes the escape */
          t = kAscii;
        }
        break;
      case kCsi:
        /*
         * A control sequence is CSI, then parameter bytes (060-077),
         * then intermediate bytes (040-057), then one final byte.
         * Both of the former keep us inside the sequence; only a byte
         * outside 040-077 ends it, and that byte is itself discounted.
         * Ending on an intermediate byte would make the final byte of
         * sequences like "\033[2 q" count as a printable cell.
         */
        if (!(040 <= c && c < 0100)) {
          t = kAscii;
        }
        break;
      default:
        unreachable;
    }
  }
  return l;
}

View file

@ -17,56 +17,18 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/bits/safemacros.h"
#include "libc/conv/conv.h"
#include "libc/limits.h"
#include "libc/str/internal.h"
#include "libc/str/str.h"
#include "libc/str/tpdecode.h"
#include "libc/unicode/unicode.h"
#define kOneTrueTabWidth 8
/**
* Returns monospace display width in UTF-8 string.
* Returns monospace display width of UTF-8 string.
*
* - Control codes are discounted
* - ANSI escape sequences are discounted
* - East asian glyphs, emoji, etc. count as two
*
* @param s is NUL-terminated string
* @return monospace display width
*/
/* The parenthesized name keeps a same-named function-like macro from
   expanding here, so this defines the actual function. */
int(strwidth)(const char *s) {
/* SIZE_MAX as the byte limit means: keep scanning until the NUL */
return strnwidth(s, SIZE_MAX);
}
/* Computes the display width of at most n bytes of s, stopping early at
   a NUL byte. The parenthesized name keeps a same-named function-like
   macro from expanding here. */
int(strnwidth)(const char *s, size_t n) {
/* TODO(jart): Fix this function. */
size_t l; /* accumulated width in cells */
wint_t wc; /* most recently decoded code point */
const unsigned char *p, *pe; /* cursor and end-of-input bound */
l = 0;
if (n) {
p = (const unsigned char *)s;
/* n == SIZE_MAX means "no limit": use the largest address as the bound,
   otherwise the bound is s + n computed with integer arithmetic */
pe = (const unsigned char *)(n == SIZE_MAX ? INTPTR_MAX : (intptr_t)s + n);
for (;;) {
/* skip bytes iscont() accepts (presumably stray UTF-8 continuation bytes) */
while (p < pe && iscont(*p)) p++;
/* done at the byte limit or at the NUL terminator */
if (p == pe || !*p) break;
if (*p == L'\t') {
/* tab advances the width to the next multiple of kOneTrueTabWidth */
if (l & (kOneTrueTabWidth - 1)) {
l += kOneTrueTabWidth - (l & (kOneTrueTabWidth - 1));
} else {
l += kOneTrueTabWidth;
}
++p;
} else if (*p == L'\e') {
/* escape sequence: skip '[', ';' and digits, then consume the one
   byte that follows them as the terminator; contributes no width */
while (++p < pe && *p) {
if (*p == '[' || *p == ';' || isdigit(*p)) {
continue;
} else {
++p;
break;
}
}
} else {
/* decode one code point; the magnitude of tpdecode's result is used
   as the byte count to advance (presumably negative on bad input) */
p += abs(tpdecode((const char *)p, &wc));
/* negative widths from wcwidth are clamped to zero */
l += max(0, wcwidth(wc));
}
}
}
return l;
int strwidth(const char *s) {
/* -1 converts to the largest size_t value, so strnwidth applies no
   practical byte limit and scanning ends at the NUL terminator */
return strnwidth(s, -1);
}

View file

@ -3,19 +3,6 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § characters » unicode
*/
extern const uint8_t kEastAsianWidth[];
extern const uint32_t kEastAsianWidthBits;
extern const uint8_t kCombiningChars[];
extern const uint32_t kCombiningCharsBits;
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § strings » multibyte » unicode
*/
int wcwidth(wchar_t) pureconst;
int wcswidth(const wchar_t *) strlenesque;
int wcsnwidth(const wchar_t *, size_t) strlenesque;
@ -24,26 +11,6 @@ int strnwidth(const char *, size_t) strlenesque;
int strwidth16(const char16_t *) strlenesque;
int strnwidth16(const char16_t *, size_t) strlenesque;
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § unicode » generic typing
*/
#if __STDC_VERSION__ + 0 >= 201112
/* Type-generic front end: picks the width routine that matches the
   element type of s (wchar_t, char16_t, or plain char by default). */
#define strwidth(s) \
_Generic(*(s), wchar_t \
: wcswidth, char16_t \
: strwidth16, default \
: strwidth)(s)
/* Length-bounded counterpart. The wchar_t case must select wcsnwidth:
   wcswidth takes a single argument and cannot be called with (s, n). */
#define strnwidth(s, n) \
_Generic(*(s), wchar_t \
: wcsnwidth, char16_t \
: strnwidth16, default \
: strnwidth)(s, n)
#endif /* C11 */
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_UNICODE_UNICODE_H_ */

View file

@ -19,6 +19,11 @@
*/
#include "libc/unicode/unicode.h"
extern const uint8_t kEastAsianWidth[];
extern const uint32_t kEastAsianWidthBits;
extern const uint8_t kCombiningChars[];
extern const uint32_t kCombiningCharsBits;
/**
* Returns cell width of monospace character.
*/