/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ │vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ ╞══════════════════════════════════════════════════════════════════════════════╡ │ Copyright 2021 Justine Alexandra Roberts Tunney │ │ │ │ Permission to use, copy, modify, and/or distribute this software for │ │ any purpose with or without fee is hereby granted, provided that the │ │ above copyright notice and this permission notice appear in all copies. │ │ │ │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/intrin/likely.h" #include "libc/dce.h" #include "libc/intrin/asan.internal.h" #include "libc/str/str.h" typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16))); static const char kUtf8Dispatch[] = { 0, 0, 1, 1, 1, 1, 1, 1, // 0300 utf8-2 1, 1, 1, 1, 1, 1, 1, 1, // 0310 1, 1, 1, 1, 1, 1, 1, 1, // 0320 1, 1, 1, 1, 1, 1, 1, 1, // 0330 2, 3, 3, 3, 3, 3, 3, 3, // 0340 utf8-3 3, 3, 3, 3, 3, 3, 3, 3, // 0350 4, 5, 5, 5, 5, 0, 0, 0, // 0360 utf8-4 0, 0, 0, 0, 0, 0, 0, 0, // 0370 }; /** * Returns true if text is utf-8. * * _isutf8 n=0 1 nanoseconds * _isutf8 n=5 661 ps/byte 1,476 mb/s * _isutf8 ascii n=22851 26 ps/byte 35 GB/s * _isutf8 unicode n=3193 543 ps/byte 1,795 mb/s * * This function considers all ASCII characters including NUL to be * valid UTF-8. The conditions for something not being valid are: * * - Incorrect sequencing of 0300 (FIRST) and 0200 (CONT) chars * - Thompson-Pike varint sequence not encodable as UTF-16 * - Overlong UTF-8 encoding * * @param size if -1 implies strlen */ noasan bool _isutf8(const void *data, size_t size) { long c; unsigned m; const char *p, *e; if (size == -1) size = data ? strlen(data) : 0; if (IsAsan()) __asan_verify(data, size); p = data; e = p + size; while (p < e) { if (!((intptr_t)p & 15)) { for (;;) { if ((m = __builtin_ia32_pmovmskb128(*(xmm_t *)p >= (xmm_t){0}) ^ 0xffff)) { m = __builtin_ctzll(m); p += m; break; } else if ((p += 16) >= e) { break; } } if (p >= e) { return true; } } if (LIKELY((c = *p++ & 255) < 0200)) continue; if (UNLIKELY(c < 0300)) return false; switch (kUtf8Dispatch[c - 0300]) { case 0: return false; case 1: if (p < e && (*p & 0300) == 0200) { ++p; break; } else { return false; // missing cont } case 2: if (p < e && (*p & 0377) < 0240) { return false; // overlong } // fallthrough case 3: if (p + 2 <= e && // (p[0] & 0300) == 0200 && // (p[1] & 0300) == 0200) { // p += 2; break; } else { return false; // missing cont } case 4: if (p < e && (*p & 0377) < 0220) { return false; // overlong } // fallthrough case 5: if (p + 3 <= e && // (((uint32_t)(p[+2] & 0377) << 030 | // (uint32_t)(p[+1] & 0377) << 020 | // (uint32_t)(p[+0] & 0377) << 010 | // (uint32_t)(p[-1] & 0377) << 000) & // 0xC0C0C000) == 0x80808000) { // p += 3; break; } else { return false; // missing cont } default: unreachable; } } return true; }