Improve new C23 checked arithmetic feature

2025-07-05 18:58:30 +00:00 · 2023-06-16 15:32:18 -07:00 · 2023-06-16 15:32:18 -07:00 · 4eebd6b9dc
commit 4eebd6b9dc
parent 2a1c588826
10 changed files with 341 additions and 15 deletions
--- a/libc/intrin/divmodti4.c
+++ b/libc/intrin/divmodti4.c
@ -0,0 +1,63 @@
+#if 0
+/*─────────────────────────────────────────────────────────────────╗
+│ To the extent possible under law, Justine Tunney has waived      │
+│ all copyright and related or neighboring rights to division,     │
+│ as it is written in the following disclaimers:                   │
+│   • http://unlicense.org/                                        │
+│   • http://creativecommons.org/publicdomain/zero/1.0/            │
+╚─────────────────────────────────────────────────────────────────*/
+#endif
+#include "third_party/compiler_rt/int_lib.h"
+
+/**
+ * Divides 128-bit signed integers w/ remainder.
+ *
+ * @param a is numerator
+ * @param b is denominator
+ * @param opt_out_rem receives euclidean division remainder if not null
+ * @return quotient or result of division
+ * @note rounds towards zero
+ */
+COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, tu_int *opt_out_rem) {
+  int k;
+  tu_int r;
+  ti_int sa, sb, sq, sr, x, y, q;
+  k = sizeof(ti_int) * CHAR_BIT - 1;
+  if (b == -1 && a == ((ti_int)1 << k)) {
+    volatile int x = 0;
+    x = 1 / x;  // raise(SIGFPE)
+  }
+  sa = a >> k;                 // sa = a < 0 ? -1 : 0
+  sb = b >> k;                 // sb = b < 0 ? -1 : 0
+  x = (a ^ sa) - sa;           // negate if sa == -1
+  y = (b ^ sb) - sb;           // negate if sb == -1
+  sq = sa ^ sb;                // sign of quotient
+  sr = sa;                     // sign of remainder
+  q = __udivmodti4(x, y, &r);  // unsigned divide
+  q = (q ^ sq) - sq;           // fix quotient sign
+  r = (r ^ sr) - sr;           // fix remainder sign
+  if (opt_out_rem) *opt_out_rem = r;
+  return q;
+}
+
+/*
+
+  Intel Kabylake i9-9900 @ 3.10GHz Client Grade
+
+    idiv32                         l:         27𝑐          9𝑛𝑠
+    idiv64                         l:         27𝑐          9𝑛𝑠
+    divmodti4 small / small        l:         42𝑐         14𝑛𝑠
+    divmodti4 small / large        l:         14𝑐          5𝑛𝑠
+    divmodti4 large / small        l:         92𝑐         30𝑛𝑠
+    divmodti4 large / large        l:        209𝑐         68𝑛𝑠
+
+  Intel Kabylake i3-8100 @ 3.60GHz Client Grade
+
+    idiv32                         l:         51𝑐         14𝑛𝑠
+    idiv64                         l:         51𝑐         14𝑛𝑠
+    divmodti4 small / small        l:         83𝑐         23𝑛𝑠
+    divmodti4 small / large        l:         26𝑐          7𝑛𝑠
+    divmodti4 large / small        l:        175𝑐         48𝑛𝑠
+    divmodti4 large / large        l:        389𝑐        107𝑛𝑠
+
+*/
--- a/libc/intrin/divti3.c
+++ b/libc/intrin/divti3.c
@ -0,0 +1,22 @@
+#if 0
+/*─────────────────────────────────────────────────────────────────╗
+│ To the extent possible under law, Justine Tunney has waived      │
+│ all copyright and related or neighboring rights to division,     │
+│ as it is written in the following disclaimers:                   │
+│   • http://unlicense.org/                                        │
+│   • http://creativecommons.org/publicdomain/zero/1.0/            │
+╚─────────────────────────────────────────────────────────────────*/
+#endif
+#include "third_party/compiler_rt/int_lib.h"
+
+/**
+ * Divides 128-bit signed integers.
+ *
+ * @param a is numerator
+ * @param b is denominator
+ * @return quotient or result of division
+ * @note rounds towards zero
+ */
+COMPILER_RT_ABI ti_int __divti3(ti_int a, ti_int b) {
+  return __divmodti4(a, b, NULL);
+}
--- a/libc/intrin/mulodi4.c
+++ b/libc/intrin/mulodi4.c
@ -0,0 +1,59 @@
+/* clang-format off */
+/*===-- mulodi4.c - Implement __mulodi4 -----------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __mulodi4 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include "third_party/compiler_rt/int_lib.h"
+
+/* Returns: a * b */
+
+/* Effects: sets *overflow to 1  if a * b overflows */
+
+COMPILER_RT_ABI di_int
+__mulodi4(di_int a, di_int b, int* overflow)
+{
+    const int N = (int)(sizeof(di_int) * CHAR_BIT);
+    const di_int MIN = (du_int)1 << (N-1);
+    const di_int MAX = ~MIN;
+    *overflow = 0; 
+    di_int result = (du_int)a * (du_int)b;
+    if (a == MIN)
+    {
+        if (b != 0 && b != 1)
+	    *overflow = 1;
+	return result;
+    }
+    if (b == MIN)
+    {
+        if (a != 0 && a != 1)
+	    *overflow = 1;
+        return result;
+    }
+    di_int sa = a >> (N - 1);
+    di_int abs_a = (a ^ sa) - sa;
+    di_int sb = b >> (N - 1);
+    di_int abs_b = (b ^ sb) - sb;
+    if (abs_a < 2 || abs_b < 2)
+        return result;
+    if (sa == sb)
+    {
+        if (abs_a > MAX / abs_b)
+            *overflow = 1;
+    }
+    else
+    {
+        if (abs_a > MIN / -abs_b)
+            *overflow = 1;
+    }
+    return result;
+}
--- a/libc/intrin/mulosi4.c
+++ b/libc/intrin/mulosi4.c
@ -0,0 +1,59 @@
+/* clang-format off */
+/*===-- mulosi4.c - Implement __mulosi4 -----------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __mulosi4 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include "third_party/compiler_rt/int_lib.h"
+
+/* Returns: a * b */
+
+/* Effects: sets *overflow to 1  if a * b overflows */
+
+COMPILER_RT_ABI si_int
+__mulosi4(si_int a, si_int b, int* overflow)
+{
+    const int N = (int)(sizeof(si_int) * CHAR_BIT);
+    const si_int MIN = (su_int)1 << (N-1);
+    const si_int MAX = ~MIN;
+    *overflow = 0; 
+    si_int result = (su_int)a * (su_int)b;
+    if (a == MIN)
+    {
+        if (b != 0 && b != 1)
+	    *overflow = 1;
+	return result;
+    }
+    if (b == MIN)
+    {
+        if (a != 0 && a != 1)
+	    *overflow = 1;
+        return result;
+    }
+    si_int sa = a >> (N - 1);
+    si_int abs_a = (a ^ sa) - sa;
+    si_int sb = b >> (N - 1);
+    si_int abs_b = (b ^ sb) - sb;
+    if (abs_a < 2 || abs_b < 2)
+        return result;
+    if (sa == sb)
+    {
+        if (abs_a > MAX / abs_b)
+            *overflow = 1;
+    }
+    else
+    {
+        if (abs_a > MIN / -abs_b)
+            *overflow = 1;
+    }
+    return result;
+}
--- a/libc/intrin/muloti4.c
+++ b/libc/intrin/muloti4.c
@ -0,0 +1,63 @@
+/* clang-format off */
+/*===-- muloti4.c - Implement __muloti4 -----------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __muloti4 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include "third_party/compiler_rt/int_lib.h"
+
+#ifdef CRT_HAS_128BIT
+
+/* Returns: a * b */
+
+/* Effects: sets *overflow to 1  if a * b overflows */
+
+COMPILER_RT_ABI ti_int
+__muloti4(ti_int a, ti_int b, int* overflow)
+{
+    const int N = (int)(sizeof(ti_int) * CHAR_BIT);
+    const ti_int MIN = (tu_int)1 << (N-1);
+    const ti_int MAX = ~MIN;
+    *overflow = 0;
+    ti_int result = (tu_int)a * (tu_int)b;
+    if (a == MIN)
+    {
+        if (b != 0 && b != 1)
+	    *overflow = 1;
+	return result;
+    }
+    if (b == MIN)
+    {
+        if (a != 0 && a != 1)
+	    *overflow = 1;
+        return result;
+    }
+    ti_int sa = a >> (N - 1);
+    ti_int abs_a = (a ^ sa) - sa;
+    ti_int sb = b >> (N - 1);
+    ti_int abs_b = (b ^ sb) - sb;
+    if (abs_a < 2 || abs_b < 2)
+        return result;
+    if (sa == sb)
+    {
+        if (abs_a > MAX / abs_b)
+            *overflow = 1;
+    }
+    else
+    {
+        if (abs_a > MIN / -abs_b)
+            *overflow = 1;
+    }
+    return result;
+}
+
+#endif /* CRT_HAS_128BIT */
--- a/libc/intrin/udivmodti4.c
+++ b/libc/intrin/udivmodti4.c
@ -0,0 +1,137 @@
+#if 0
+/*─────────────────────────────────────────────────────────────────╗
+│ To the extent possible under law, Justine Tunney has waived      │
+│ all copyright and related or neighboring rights to division,     │
+│ as it is written in the following disclaimers:                   │
+│   • http://unlicense.org/                                        │
+│   • http://creativecommons.org/publicdomain/zero/1.0/            │
+╚─────────────────────────────────────────────────────────────────*/
+#endif
+#include "third_party/compiler_rt/int_lib.h"
+
+/**
+ * Returns 128 bit division result by 64 bit.
+ *
+ * Result must fit in 64 bits. Remainder is stored in r.
+ *
+ * @see libdivide libdivide_128_div_64_to_64() division fallback
+ * @see Knuth, Volume 2, section 4.3.1, Algorithm D for correctness proof
+ * @see https://danlark.org/2020/06/14/128-bit-division/
+ */
+forceinline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v,
+                                          du_int *r) {
+  const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
+  const du_int b = 1ULL << (n_udword_bits / 2);  // Number base (32 bits)
+  du_int un1, un0;                               // Norm. dividend LSD's
+  du_int vn1, vn0;                               // Norm. divisor digits
+  du_int q1, q0;                                 // Quotient digits
+  du_int un64, un21, un10;                       // Dividend digit pairs
+  du_int rhat;                                   // Remainder
+  si_int s;                                      // Normalization shift
+  s = __builtin_clzll(v);
+  if (s > 0) {
+    // Normalize the divisor.
+    v = v << s;
+    un64 = (u1 << s) | (u0 >> (n_udword_bits - s));
+    un10 = u0 << s;  // Shift dividend left
+  } else {
+    // Avoid undefined behavior of (u0 >> 64).
+    un64 = u1;
+    un10 = u0;
+  }
+  // Break divisor up into two 32-bit digits.
+  vn1 = v >> (n_udword_bits / 2);
+  vn0 = v & 0xFFFFFFFF;
+  // Break right half of dividend into two digits.
+  un1 = un10 >> (n_udword_bits / 2);
+  un0 = un10 & 0xFFFFFFFF;
+  // Compute the first quotient digit, q1.
+  q1 = un64 / vn1;
+  rhat = un64 - q1 * vn1;
+  // q1 has at most error 2. No more than 2 iterations.
+  while (q1 >= b || q1 * vn0 > b * rhat + un1) {
+    q1 = q1 - 1;
+    rhat = rhat + vn1;
+    if (rhat >= b) break;
+  }
+  un21 = un64 * b + un1 - q1 * v;
+  // Compute the second quotient digit.
+  q0 = un21 / vn1;
+  rhat = un21 - q0 * vn1;
+  // q0 has at most error 2. No more than 2 iterations.
+  while (q0 >= b || q0 * vn0 > b * rhat + un0) {
+    q0 = q0 - 1;
+    rhat = rhat + vn1;
+    if (rhat >= b) break;
+  }
+  *r = (un21 * b + un0 - q0 * v) >> s;
+  return q1 * b + q0;
+}
+
+forceinline du_int udiv128by64to64(du_int u1, du_int u0, du_int v, du_int *r) {
+#ifdef __x86_64__
+  du_int result;
+  asm("div\t%2" : "=a"(result), "=d"(*r) : "r"(v), "0"(u0), "1"(u1) : "cc");
+  return result;
+#else
+  return udiv128by64to64default(u1, u0, v, r);
+#endif
+}
+
+/**
+ * Performs 128-bit unsigned division and remainder.
+ *
+ * @param a is dividend
+ * @param b is divisor
+ * @param rem receives remainder if not NULL
+ */
+COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) {
+  const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
+  utwords dividend, divisor, quotient, remainder;
+  si_int shift;
+  dividend.all = a;
+  divisor.all = b;
+  if (divisor.all > dividend.all) {
+    if (rem) *rem = dividend.all;
+    return 0;
+  }
+  // When the divisor fits in 64 bits, we can use an optimized path.
+  if (divisor.s.high == 0) {
+    remainder.s.high = 0;
+    if (dividend.s.high < divisor.s.low) {
+      // The result fits in 64 bits.
+      quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
+                                       divisor.s.low, &remainder.s.low);
+      quotient.s.high = 0;
+    } else {
+      // First, divide with the high part to get the remainder in
+      // dividend.s.high. After that dividend.s.high < divisor.s.low.
+      quotient.s.high = dividend.s.high / divisor.s.low;
+      dividend.s.high = dividend.s.high % divisor.s.low;
+      quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
+                                       divisor.s.low, &remainder.s.low);
+    }
+    if (rem) *rem = remainder.all;
+    return quotient.all;
+  }
+  // 0 <= shift <= 63.
+  shift = __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high);
+  divisor.all <<= shift;
+  quotient.s.high = 0;
+  quotient.s.low = 0;
+  for (; shift >= 0; --shift) {
+    quotient.s.low <<= 1;
+    // Branch free version of.
+    // if (dividend.all >= divisor.all)
+    // {
+    //    dividend.all -= divisor.all;
+    //    carry = 1;
+    // }
+    ti_int s = (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1);
+    quotient.s.low |= s & 1;
+    dividend.all -= divisor.all & s;
+    divisor.all >>= 1;
+  }
+  if (rem) *rem = dividend.all;
+  return quotient.all;
+}
--- a/libc/stdckdint.h
+++ b/libc/stdckdint.h
@ -1,8 +1,93 @@
 #ifndef COSMOPOLITAN_LIBC_STDCKDINT_H_
 #define COSMOPOLITAN_LIBC_STDCKDINT_H_
+#if !defined(MODE_DBG) &&                                                 \
+    ((defined(__GNUC__) && __GNUC__ >= 5 && !defined(__ICC)) ||           \
+     (defined(__has_builtin) && (__has_builtin(__builtin_add_overflow) && \
+                                 __has_builtin(__builtin_sub_overflow) && \
+                                 __has_builtin(__builtin_mul_overflow))))

-#define ckd_add(R, A, B) __builtin_add_overflow((A), (B), (R))
-#define ckd_sub(R, A, B) __builtin_sub_overflow((A), (B), (R))
-#define ckd_mul(R, A, B) __builtin_mul_overflow((A), (B), (R))
+#define ckd_add(res, x, y) __builtin_add_overflow((x), (y), (res))
+#define ckd_sub(res, x, y) __builtin_sub_overflow((x), (y), (res))
+#define ckd_mul(res, x, y) __builtin_mul_overflow((x), (y), (res))

+#else
+
+#define ckd_add(res, x, y) __ckd_arithmetic(add, res, x, y)
+#define ckd_sub(res, x, y) __ckd_arithmetic(sub, res, x, y)
+#define ckd_mul(res, x, y) __ckd_arithmetic(mul, res, x, y)
+
+#if defined(__SIZEOF_LONG__) && __SIZEOF_LONG__ == 8 &&               \
+    ((defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 406) || \
+     defined(__llvm__)) &&                                            \
+    !defined(__STRICT_ANSI__)
+#define __ckd_dword __int128
+#else
+#define __ckd_dword long long
+#endif
+
+#define __ckd_arithmetic(op, res, x, y)                                      \
+  (sizeof(*(res)) == sizeof(int)    ? __ckd_##op((int *)(res), (x), (y))     \
+   : sizeof(*(res)) == sizeof(long) ? __ckd_##op##l((long *)(res), (x), (y)) \
+   : sizeof(*(res)) == sizeof(__ckd_dword)                                   \
+       ? __ckd_##op##ll((__ckd_dword *)(res), (x), (y))                      \
+       : __ckd_trap())
+
+__funline int __ckd_trap(void) {
+  volatile int __x = 0;
+  return 1 / __x;
+}
+
+__funline int __ckd_add(int *__z, int __x, int __y) {
+  unsigned int __a, __b, __c;
+  *__z = __c = (__a = __x) + (__b = __y);
+  return ((__c ^ __a) & (__c ^ __b)) >> (sizeof(int) * CHAR_BIT - 1);
+}
+__funline int __ckd_addl(long *__z, long __x, long __y) {
+  unsigned long __a, __b, __c;
+  *__z = __c = (__a = __x) + (__b = __y);
+  return ((__c ^ __a) & (__c ^ __b)) >> (sizeof(long) * CHAR_BIT - 1);
+}
+__funline int __ckd_addll(__ckd_dword *__z, __ckd_dword __x, __ckd_dword __y) {
+  unsigned __ckd_dword __a, __b, __c;
+  *__z = __c = (__a = __x) + (__b = __y);
+  return ((__c ^ __a) & (__c ^ __b)) >> (sizeof(__ckd_dword) * CHAR_BIT - 1);
+}
+
+__funline int __ckd_sub(int *__z, int __x, int __y) {
+  unsigned int __a, __b, __c;
+  *__z = __c = (__a = __x) - (__b = __y);
+  return ((__a ^ __b) & (__c ^ __a)) >> (sizeof(int) * CHAR_BIT - 1);
+}
+__funline int __ckd_subl(long *__z, long __x, long __y) {
+  unsigned long __a, __b, __c;
+  *__z = __c = (__a = __x) - (__b = __y);
+  return ((__a ^ __b) & (__c ^ __a)) >> (sizeof(long) * CHAR_BIT - 1);
+}
+__funline int __ckd_subll(__ckd_dword *__z, __ckd_dword __x, __ckd_dword __y) {
+  unsigned __ckd_dword __a, __b, __c;
+  *__z = __c = (__a = __x) - (__b = __y);
+  return ((__a ^ __b) & (__c ^ __a)) >> (sizeof(__ckd_dword) * CHAR_BIT - 1);
+}
+
+int __mulosi4(int, int, int *);
+long __mulodi4(long, long, int *);
+__ckd_dword __muloti4(__ckd_dword, __ckd_dword, int *);
+
+__funline int __ckd_mul(int *__z, int __x, int __y) {
+  int __o;
+  *__z = __mulosi4(__x, __y, &__o);
+  return __o;
+}
+__funline int __ckd_mull(long *__z, long __x, long __y) {
+  int __o;
+  *__z = __mulodi4(__x, __y, &__o);
+  return __o;
+}
+__funline int __ckd_mulll(__ckd_dword *__z, __ckd_dword __x, __ckd_dword __y) {
+  int __o;
+  *__z = __muloti4(__x, __y, &__o);
+  return __o;
+}
+
+#endif
 #endif /* COSMOPOLITAN_LIBC_STDCKDINT_H_ */
--- a/libc/type2str.h
+++ b/libc/type2str.h
@ -17,6 +17,8 @@
    unsigned long:      "unsigned long",      \
    long long:          "long long",          \
    unsigned long long: "unsigned long long", \
+    __int128:           "__int128",           \
+    unsigned __int128:  "unsigned __int128",  \
    float:              "float",              \
    double:             "double",             \
    long double:        "long double")