Add minor improvements and cleanup

2025-10-05 14:11:01 +00:00 · 2020-10-27 03:39:46 -07:00 · 2020-10-27 03:39:46 -07:00 · feed0d2b0e
commit feed0d2b0e
parent 9e3e985ae5
163 changed files with 2286 additions and 2245 deletions
--- a/libc/unicode/strnwidth.c
+++ b/libc/unicode/strnwidth.c
@ -0,0 +1,81 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ This program is free software; you can redistribute it and/or modify         │
+│ it under the terms of the GNU General Public License as published by         │
+│ the Free Software Foundation; version 2 of the License.                      │
+│                                                                              │
+│ This program is distributed in the hope that it will be useful, but          │
+│ WITHOUT ANY WARRANTY; without even the implied warranty of                   │
+│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU             │
+│ General Public License for more details.                                     │
+│                                                                              │
+│ You should have received a copy of the GNU General Public License            │
+│ along with this program; if not, write to the Free Software                  │
+│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA                │
+│ 02110-1301 USA                                                               │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/macros.h"
+#include "libc/str/thompike.h"
+#include "libc/unicode/unicode.h"
+
+/**
+ * Returns monospace display width of UTF-8 string.
+ *
+ * - Control codes are discounted
+ * - ANSI escape sequences are discounted
+ * - East asian glyphs, emoji, etc. count as two
+ *
+ * @param s is NUL-terminated string
+ * @param n is max bytes to consider
+ * @return monospace display width
+ */
+int strnwidth(const char *s, size_t n) {
+  wint_t c, w;
+  unsigned l, r;
+  enum { kAscii, kUtf8, kEsc, kCsi } t;
+  for (w = r = t = l = 0; n--;) {
+    if ((c = *s++ & 0xff)) {
+      switch (t) {
+        case kAscii:
+          if (0x20 <= c && c <= 0x7E || c == '\t') {
+            ++l;
+          } else if (c == 033) {
+            t = kEsc;
+          } else if (c >= 0300) {
+            t = kUtf8;
+            w = ThomPikeByte(c);
+            r = ThomPikeLen(c) - 1;
+          }
+          break;
+        case kUtf8:
+          if (ThomPikeCont(c)) {
+            w = ThomPikeMerge(w, c);
+            if (--r) break;
+          }
+          l += MAX(0, wcwidth(w));
+          t = kAscii;
+          break;
+        case kEsc:
+          if (c == '[') {
+            t = kCsi;
+          } else if (!(040 <= c && c < 060)) {
+            t = kAscii;
+          }
+          break;
+        case kCsi:
+          if (!(060 <= c && c < 0100)) {
+            t = kAscii;
+          }
+          break;
+        default:
+          unreachable;
+      }
+    } else {
+      break;
+    }
+  }
+  return l;
+}