Remove division from matrix multiplication

This change reduces llama.com CPU cycles systemically by 2.5% according to the Linux Kernel `perf stat -Bddd` utility.
2025-10-27 11:26:43 +00:00 · 2023-05-10 21:19:54 -07:00 · 2023-05-10 21:19:54 -07:00 · 1f6f9e6701
commit 1f6f9e6701
parent a88290e595
7 changed files with 191 additions and 70 deletions
--- a/examples/fastdiv.c
+++ b/examples/fastdiv.c
@ -1,62 +0,0 @@
-#if 0
-/*─────────────────────────────────────────────────────────────────╗
-│ To the extent possible under law, Justine Tunney has waived      │
-│ all copyright and related or neighboring rights to this file,    │
-│ as it is written in the following disclaimers:                   │
-│   • http://unlicense.org/                                        │
-│   • http://creativecommons.org/publicdomain/zero/1.0/            │
-╚─────────────────────────────────────────────────────────────────*/
-#endif
-#include "libc/calls/calls.h"
-#include "libc/macros.internal.h"
-#include "libc/stdio/stdio.h"
-#include "libc/sysv/consts/sig.h"
-#include "libc/testlib/ezbench.h"
-
-/**
- * @fileoverview Fast Division Using Multiplication Tutorial
- *
- * Expected program output:
- *
- *     23 / 3 = 7
- *     0x5555555555555556 1 1
- *     division   l: 16𝑐  5𝑛𝑠
- *     fast div   l:  5𝑐  2𝑛𝑠
- *     precomps   l: 70𝑐 23𝑛𝑠
- */
-
-struct Divisor {
-  uint64_t m;
-  uint8_t s;
-  uint8_t t;
-};
-
-struct Divisor GetDivisor(uint64_t d) {
-  int b;
-  uint128_t x;
-  if (!d) raise(SIGFPE);
-  b = __builtin_clzll(d) ^ 63;
-  x = -d & (((1ull << b) - 1) | (1ull << b));
-  return (struct Divisor){(x << 64) / d + 1, MIN(1, b + 1), MAX(0, b)};
-}
-
-uint64_t Divide(uint64_t x, struct Divisor d) {
-  uint128_t t;
-  uint64_t l, h;
-  t = d.m;
-  t *= x;
-  l = t;
-  h = t >> 64;
-  l = (x - h) >> d.s;
-  return (h + l) >> d.t;
-}
-
-int main(int argc, char *argv[]) {
-  printf("23 / 3 = %ld\n", Divide(23, GetDivisor(3)));
-  volatile struct Divisor v = GetDivisor(3);
-  volatile uint64_t x = 23, y = 3, z;
-  EZBENCH2("division", donothing, z = x / y);
-  EZBENCH2("fast div", donothing, z = Divide(x, v));
-  EZBENCH2("precomp ", donothing, v = GetDivisor(y));
-  return 0;
-}
--- a/libc/tinymath/magicu.c
+++ b/libc/tinymath/magicu.c
@ -0,0 +1,65 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/tinymath/magicu.h"
+
+/**
+ * Precomputes magic numbers for unsigned division by constant.
+ *
+ * The returned divisor may be passed to __magic_div() to perform
+ * unsigned integer division way faster than normal division e.g.
+ *
+ *     assert(77 / 7 == __magicu_div(77, __magicu_get(7)));
+ *
+ * @param d is intended divisor, which must not be zero
+ * @return magic divisor
+ */
+struct magicu __magicu_get(uint32_t d) {
+  // From Hacker's Delight by Henry S. Warren Jr., 9780321842688
+  // Figure 10–3. Simplified algorithm for magic number unsigned
+  int a, p;
+  struct magicu magu;
+  uint32_t p32, q, r, delta;
+  p32 = 0;                 // Avoid compiler warning.
+  a = 0;                   // Initialize "add" indicator.
+  p = 31;                  // Initialize p.
+  q = 0x7FFFFFFF / d;      // Initialize q = (2**p - 1)/d.
+  r = 0x7FFFFFFF - q * d;  // Init. r = rem(2**p - 1, d).
+  do {
+    p = p + 1;
+    if (p == 32) {
+      p32 = 1;  // Set p32 = 2**(p-32).
+    } else {
+      p32 = 2 * p32;
+    }
+    if (r + 1 >= d - r) {
+      if (q >= 0x7FFFFFFF) a = 1;
+      q = 2 * q + 1;      // Update q.
+      r = 2 * r + 1 - d;  // Update r.
+    } else {
+      if (q >= 0x80000000) a = 1;
+      q = 2 * q;
+      r = 2 * r + 1;
+    }
+    delta = d - 1 - r;
+  } while (p < 64 && p32 < delta);
+  magu.M = q + 1;       // Magic number and
+  magu.s = p - 32;      // Shift amount to return
+  if (a) magu.s |= 64;  // Sets "add" indicator
+  return magu;
+}
--- a/libc/tinymath/magicu.h
+++ b/libc/tinymath/magicu.h
@ -0,0 +1,26 @@
+#ifndef COSMOPOLITAN_LIBC_TINYMATH_MAGICU_H_
+#define COSMOPOLITAN_LIBC_TINYMATH_MAGICU_H_
+#if !(__ASSEMBLER__ + __LINKER__ + 0)
+COSMOPOLITAN_C_START_
+
+struct magicu {
+  uint32_t M;
+  uint32_t s;
+};
+
+struct magicu __magicu_get(uint32_t);
+
+/**
+ * Performs fast division using precomputed magic for constant divisor.
+ *
+ * @param x is unsigned integer that shall be divided
+ * @param d should be `__magicu_get(y)` if computing `x / y`
+ * @return result of unsigned integer division
+ */
+static inline uint32_t __magicu_div(uint32_t x, struct magicu d) {
+  return ((((uint64_t)x * d.M) >> 32) + ((d.s & 64) ? x : 0)) >> (d.s & 63);
+}
+
+COSMOPOLITAN_C_END_
+#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
+#endif /* COSMOPOLITAN_LIBC_TINYMATH_MAGICU_H_ */
--- a/test/libc/str/test.mk
+++ b/test/libc/str/test.mk
@ -47,6 +47,7 @@ TEST_LIBC_STR_DIRECTDEPS =						\
 	LIBC_TESTLIB							\
 	LIBC_X								\
 	LIBC_ZIPOS							\
+	THIRD_PARTY_COMPILER_RT						\
 	THIRD_PARTY_MBEDTLS						\
 	THIRD_PARTY_REGEX						\
 	THIRD_PARTY_ZLIB						\
--- a/test/libc/tinymath/magicu_test.c
+++ b/test/libc/tinymath/magicu_test.c
@ -0,0 +1,52 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/tinymath/magicu.h"
+#include "libc/macros.internal.h"
+#include "libc/testlib/ezbench.h"
+#include "libc/testlib/testlib.h"
+
+#define T    uint32_t
+#define TBIT (sizeof(T) * CHAR_BIT - 1)
+#define TMIN (((T) ~(T)0) > 1 ? (T)0 : (T)((uintmax_t)1 << TBIT))
+#define TMAX (((T) ~(T)0) > 1 ? (T) ~(T)0 : (T)(((uintmax_t)1 << TBIT) - 1))
+T V[] = {5,           4,        77,       4,         7,        0,
+         1,           2,        3,        4,         -1,       -2,
+         -3,          -4,       TMIN,     TMIN + 1,  TMIN + 2, TMIN + 3,
+         TMIN + 5,    TMIN + 7, TMAX,     TMAX - 1,  TMAX - 2, TMAX - 77,
+         TMAX - 3,    TMAX - 5, TMAX - 7, TMAX - 50, TMIN / 2, TMAX / 2,
+         TMAX / 2 - 3};
+
+TEST(magicu, test) {
+  int i, j;
+  for (i = 0; i < ARRAYLEN(V); ++i) {
+    if (!V[i]) continue;
+    struct magicu d = __magicu_get(V[i]);
+    for (j = 0; j < ARRAYLEN(V); ++j) {
+      EXPECT_EQ(V[j] / V[i], __magicu_div(V[j], d));
+    }
+  }
+}
+
+BENCH(cog, bench) {
+  struct magicu d = __magicu_get(7);
+  EZBENCH2("__magicu_get", donothing, __magicu_get(VEIL("r", 77)));
+  EZBENCH2("__magicu_div", donothing,
+           EXPROPRIATE(__magicu_div(VEIL("r", 77), d)));
+  EZBENCH2("/", donothing, EXPROPRIATE(VEIL("r", 77) / VEIL("r", 7)));
+}
--- a/third_party/ggml/ggml.c
+++ b/third_party/ggml/ggml.c
@ -41,6 +41,11 @@
 #include "libc/time/time.h"
 #include "third_party/aarch64/arm_neon.h"
 #include "third_party/intel/immintrin.internal.h"
+#include "libc/macros.internal.h"
+#include "libc/intrin/kprintf.h"
+#include "libc/intrin/kprintf.h"
+#include "libc/tinymath/magicu.h"
+#include "libc/tinymath/magicu.h"
 #include "third_party/libcxx/math.h"

 asm(".ident\t\"\\n\\n\
@ -7289,10 +7294,15 @@ static void ggml_compute_forward_add_q_f32(

    float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

+    assert(ne01 >= 1);
+    assert(ne02*ne01 >= 1);
+    struct magicu ne01m = __magicu_get(ne01);
+    struct magicu ne021m = __magicu_get(ne02*ne01);
+
    for (int ir = ir0; ir < ir1; ++ir) {
        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i03 = __magicu_div(ir, ne021m);
+        const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);

        // src1 and dst are same shape as src0 => same indices
@ -8422,10 +8432,15 @@ static void ggml_compute_forward_mul_mat_f32(
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

+    assert(ne01 >= 1);
+    assert(ne02*ne01 >= 1);
+    struct magicu ne01m = __magicu_get(ne01);
+    struct magicu ne021m = __magicu_get(ne02*ne01);
+
    for (int ir = ir0; ir < ir1; ++ir) {
        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i03 = __magicu_div(ir, ne021m);
+        const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);

        for (int64_t ic = 0; ic < ne11; ++ic) {
@ -8640,10 +8655,15 @@ static void ggml_compute_forward_mul_mat_f16_f32(

    ggml_fp16_t * wdata = params->wdata;

+    assert(ne01 >= 1);
+    assert(ne02*ne01 >= 1);
+    struct magicu ne01m = __magicu_get(ne01);
+    struct magicu ne021m = __magicu_get(ne02*ne01);
+
    for (int ir = ir0; ir < ir1; ++ir) {
        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i03 = __magicu_div(ir, ne021m);
+        const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);

        const int i13 = i03;
@ -8852,10 +8872,15 @@ static void ggml_compute_forward_mul_mat_q_f32(
    void * wdata = params->wdata;
    const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

+    assert(ne01 >= 1);
+    assert(ne02*ne01 >= 1);
+    struct magicu ne01m = __magicu_get(ne01);
+    struct magicu ne021m = __magicu_get(ne02*ne01);
+
    for (int ir = ir0; ir < ir1; ++ir) {
        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
+        const int i03 = __magicu_div(ir, ne021m);
+        const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);

        const int i13 = i03;
--- a/third_party/ggml/ggml.mk
+++ b/third_party/ggml/ggml.mk
@ -58,6 +58,20 @@ $(THIRD_PARTY_GGML_A_OBJS): private					\
 			-mfma
 endif

+o/rel/third_party/ggml/ggml.o						\
+o/opt/third_party/ggml/ggml.o: private					\
+		OVERRIDE_CFLAGS +=					\
+			-fomit-frame-pointer				\
+			-x-no-pg
+
+ifeq ($(ARCH), x86_64)
+o/rel/third_party/ggml/ggml.o						\
+o/opt/third_party/ggml/ggml.o: private					\
+		OVERRIDE_CFLAGS +=					\
+			-fschedule-insns2				\
+			-mred-zone
+endif
+
 ################################################################################
 # command for running inference on large language models
 # make -j8 o//third_party/ggml/llama.com