mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-03-02 23:18:44 +00:00
Remove division from matrix multiplication
This change reduces llama.com CPU cycles systemically by 2.5% according to the Linux Kernel `perf stat -Bddd` utility.
This commit is contained in:
parent
a88290e595
commit
1f6f9e6701
7 changed files with 191 additions and 70 deletions
|
@ -1,62 +0,0 @@
|
|||
#if 0
|
||||
/*─────────────────────────────────────────────────────────────────╗
|
||||
│ To the extent possible under law, Justine Tunney has waived │
|
||||
│ all copyright and related or neighboring rights to this file, │
|
||||
│ as it is written in the following disclaimers: │
|
||||
│ • http://unlicense.org/ │
|
||||
│ • http://creativecommons.org/publicdomain/zero/1.0/ │
|
||||
╚─────────────────────────────────────────────────────────────────*/
|
||||
#endif
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/stdio/stdio.h"
|
||||
#include "libc/sysv/consts/sig.h"
|
||||
#include "libc/testlib/ezbench.h"
|
||||
|
||||
/**
|
||||
* @fileoverview Fast Division Using Multiplication Tutorial
|
||||
*
|
||||
* Expected program output:
|
||||
*
|
||||
* 23 / 3 = 7
|
||||
* 0x5555555555555556 1 1
|
||||
* division l: 16𝑐 5𝑛𝑠
|
||||
* fast div l: 5𝑐 2𝑛𝑠
|
||||
* precomps l: 70𝑐 23𝑛𝑠
|
||||
*/
|
||||
|
||||
struct Divisor {
|
||||
uint64_t m;
|
||||
uint8_t s;
|
||||
uint8_t t;
|
||||
};
|
||||
|
||||
struct Divisor GetDivisor(uint64_t d) {
|
||||
int b;
|
||||
uint128_t x;
|
||||
if (!d) raise(SIGFPE);
|
||||
b = __builtin_clzll(d) ^ 63;
|
||||
x = -d & (((1ull << b) - 1) | (1ull << b));
|
||||
return (struct Divisor){(x << 64) / d + 1, MIN(1, b + 1), MAX(0, b)};
|
||||
}
|
||||
|
||||
uint64_t Divide(uint64_t x, struct Divisor d) {
|
||||
uint128_t t;
|
||||
uint64_t l, h;
|
||||
t = d.m;
|
||||
t *= x;
|
||||
l = t;
|
||||
h = t >> 64;
|
||||
l = (x - h) >> d.s;
|
||||
return (h + l) >> d.t;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
printf("23 / 3 = %ld\n", Divide(23, GetDivisor(3)));
|
||||
volatile struct Divisor v = GetDivisor(3);
|
||||
volatile uint64_t x = 23, y = 3, z;
|
||||
EZBENCH2("division", donothing, z = x / y);
|
||||
EZBENCH2("fast div", donothing, z = Divide(x, v));
|
||||
EZBENCH2("precomp ", donothing, v = GetDivisor(y));
|
||||
return 0;
|
||||
}
|
65
libc/tinymath/magicu.c
Normal file
65
libc/tinymath/magicu.c
Normal file
|
@ -0,0 +1,65 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2023 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/tinymath/magicu.h"
|
||||
|
||||
/**
|
||||
* Precomputes magic numbers for unsigned division by constant.
|
||||
*
|
||||
* The returned divisor may be passed to __magic_div() to perform
|
||||
* unsigned integer division way faster than normal division e.g.
|
||||
*
|
||||
* assert(77 / 7 == __magicu_div(77, __magicu_get(7)));
|
||||
*
|
||||
* @param d is intended divisor, which must not be zero
|
||||
* @return magic divisor
|
||||
*/
|
||||
struct magicu __magicu_get(uint32_t d) {
|
||||
// From Hacker's Delight by Henry S. Warren Jr., 9780321842688
|
||||
// Figure 10–3. Simplified algorithm for magic number unsigned
|
||||
int a, p;
|
||||
struct magicu magu;
|
||||
uint32_t p32, q, r, delta;
|
||||
p32 = 0; // Avoid compiler warning.
|
||||
a = 0; // Initialize "add" indicator.
|
||||
p = 31; // Initialize p.
|
||||
q = 0x7FFFFFFF / d; // Initialize q = (2**p - 1)/d.
|
||||
r = 0x7FFFFFFF - q * d; // Init. r = rem(2**p - 1, d).
|
||||
do {
|
||||
p = p + 1;
|
||||
if (p == 32) {
|
||||
p32 = 1; // Set p32 = 2**(p-32).
|
||||
} else {
|
||||
p32 = 2 * p32;
|
||||
}
|
||||
if (r + 1 >= d - r) {
|
||||
if (q >= 0x7FFFFFFF) a = 1;
|
||||
q = 2 * q + 1; // Update q.
|
||||
r = 2 * r + 1 - d; // Update r.
|
||||
} else {
|
||||
if (q >= 0x80000000) a = 1;
|
||||
q = 2 * q;
|
||||
r = 2 * r + 1;
|
||||
}
|
||||
delta = d - 1 - r;
|
||||
} while (p < 64 && p32 < delta);
|
||||
magu.M = q + 1; // Magic number and
|
||||
magu.s = p - 32; // Shift amount to return
|
||||
if (a) magu.s |= 64; // Sets "add" indicator
|
||||
return magu;
|
||||
}
|
26
libc/tinymath/magicu.h
Normal file
26
libc/tinymath/magicu.h
Normal file
|
@ -0,0 +1,26 @@
|
|||
#ifndef COSMOPOLITAN_LIBC_TINYMATH_MAGICU_H_
|
||||
#define COSMOPOLITAN_LIBC_TINYMATH_MAGICU_H_
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
struct magicu {
|
||||
uint32_t M;
|
||||
uint32_t s;
|
||||
};
|
||||
|
||||
struct magicu __magicu_get(uint32_t);
|
||||
|
||||
/**
|
||||
* Performs fast division using precomputed magic for constant divisor.
|
||||
*
|
||||
* @param x is unsigned integer that shall be divided
|
||||
* @param d should be `__magicu_get(y)` if computing `x / y`
|
||||
* @return result of unsigned integer division
|
||||
*/
|
||||
static inline uint32_t __magicu_div(uint32_t x, struct magicu d) {
|
||||
return ((((uint64_t)x * d.M) >> 32) + ((d.s & 64) ? x : 0)) >> (d.s & 63);
|
||||
}
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_LIBC_TINYMATH_MAGICU_H_ */
|
|
@ -47,6 +47,7 @@ TEST_LIBC_STR_DIRECTDEPS = \
|
|||
LIBC_TESTLIB \
|
||||
LIBC_X \
|
||||
LIBC_ZIPOS \
|
||||
THIRD_PARTY_COMPILER_RT \
|
||||
THIRD_PARTY_MBEDTLS \
|
||||
THIRD_PARTY_REGEX \
|
||||
THIRD_PARTY_ZLIB \
|
||||
|
|
52
test/libc/tinymath/magicu_test.c
Normal file
52
test/libc/tinymath/magicu_test.c
Normal file
|
@ -0,0 +1,52 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2023 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/tinymath/magicu.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/testlib/ezbench.h"
|
||||
#include "libc/testlib/testlib.h"
|
||||
|
||||
#define T uint32_t
|
||||
#define TBIT (sizeof(T) * CHAR_BIT - 1)
|
||||
#define TMIN (((T) ~(T)0) > 1 ? (T)0 : (T)((uintmax_t)1 << TBIT))
|
||||
#define TMAX (((T) ~(T)0) > 1 ? (T) ~(T)0 : (T)(((uintmax_t)1 << TBIT) - 1))
|
||||
T V[] = {5, 4, 77, 4, 7, 0,
|
||||
1, 2, 3, 4, -1, -2,
|
||||
-3, -4, TMIN, TMIN + 1, TMIN + 2, TMIN + 3,
|
||||
TMIN + 5, TMIN + 7, TMAX, TMAX - 1, TMAX - 2, TMAX - 77,
|
||||
TMAX - 3, TMAX - 5, TMAX - 7, TMAX - 50, TMIN / 2, TMAX / 2,
|
||||
TMAX / 2 - 3};
|
||||
|
||||
TEST(magicu, test) {
|
||||
int i, j;
|
||||
for (i = 0; i < ARRAYLEN(V); ++i) {
|
||||
if (!V[i]) continue;
|
||||
struct magicu d = __magicu_get(V[i]);
|
||||
for (j = 0; j < ARRAYLEN(V); ++j) {
|
||||
EXPECT_EQ(V[j] / V[i], __magicu_div(V[j], d));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BENCH(cog, bench) {
|
||||
struct magicu d = __magicu_get(7);
|
||||
EZBENCH2("__magicu_get", donothing, __magicu_get(VEIL("r", 77)));
|
||||
EZBENCH2("__magicu_div", donothing,
|
||||
EXPROPRIATE(__magicu_div(VEIL("r", 77), d)));
|
||||
EZBENCH2("/", donothing, EXPROPRIATE(VEIL("r", 77) / VEIL("r", 7)));
|
||||
}
|
41
third_party/ggml/ggml.c
vendored
41
third_party/ggml/ggml.c
vendored
|
@ -41,6 +41,11 @@
|
|||
#include "libc/time/time.h"
|
||||
#include "third_party/aarch64/arm_neon.h"
|
||||
#include "third_party/intel/immintrin.internal.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/intrin/kprintf.h"
|
||||
#include "libc/intrin/kprintf.h"
|
||||
#include "libc/tinymath/magicu.h"
|
||||
#include "libc/tinymath/magicu.h"
|
||||
#include "third_party/libcxx/math.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
|
@ -7289,10 +7294,15 @@ static void ggml_compute_forward_add_q_f32(
|
|||
|
||||
float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
||||
|
||||
assert(ne01 >= 1);
|
||||
assert(ne02*ne01 >= 1);
|
||||
struct magicu ne01m = __magicu_get(ne01);
|
||||
struct magicu ne021m = __magicu_get(ne02*ne01);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// src0 indices
|
||||
const int i03 = ir/(ne02*ne01);
|
||||
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int i03 = __magicu_div(ir, ne021m);
|
||||
const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
|
||||
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
// src1 and dst are same shape as src0 => same indices
|
||||
|
@ -8422,10 +8432,15 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
assert(ne01 >= 1);
|
||||
assert(ne02*ne01 >= 1);
|
||||
struct magicu ne01m = __magicu_get(ne01);
|
||||
struct magicu ne021m = __magicu_get(ne02*ne01);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// src0 indices
|
||||
const int i03 = ir/(ne02*ne01);
|
||||
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int i03 = __magicu_div(ir, ne021m);
|
||||
const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
|
||||
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
for (int64_t ic = 0; ic < ne11; ++ic) {
|
||||
|
@ -8640,10 +8655,15 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
|
||||
ggml_fp16_t * wdata = params->wdata;
|
||||
|
||||
assert(ne01 >= 1);
|
||||
assert(ne02*ne01 >= 1);
|
||||
struct magicu ne01m = __magicu_get(ne01);
|
||||
struct magicu ne021m = __magicu_get(ne02*ne01);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// src0 indices
|
||||
const int i03 = ir/(ne02*ne01);
|
||||
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int i03 = __magicu_div(ir, ne021m);
|
||||
const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
|
||||
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
const int i13 = i03;
|
||||
|
@ -8852,10 +8872,15 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
void * wdata = params->wdata;
|
||||
const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
|
||||
|
||||
assert(ne01 >= 1);
|
||||
assert(ne02*ne01 >= 1);
|
||||
struct magicu ne01m = __magicu_get(ne01);
|
||||
struct magicu ne021m = __magicu_get(ne02*ne01);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// src0 indices
|
||||
const int i03 = ir/(ne02*ne01);
|
||||
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int i03 = __magicu_div(ir, ne021m);
|
||||
const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
|
||||
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
const int i13 = i03;
|
||||
|
|
14
third_party/ggml/ggml.mk
vendored
14
third_party/ggml/ggml.mk
vendored
|
@ -58,6 +58,20 @@ $(THIRD_PARTY_GGML_A_OBJS): private \
|
|||
-mfma
|
||||
endif
|
||||
|
||||
o/rel/third_party/ggml/ggml.o \
|
||||
o/opt/third_party/ggml/ggml.o: private \
|
||||
OVERRIDE_CFLAGS += \
|
||||
-fomit-frame-pointer \
|
||||
-x-no-pg
|
||||
|
||||
ifeq ($(ARCH), x86_64)
|
||||
o/rel/third_party/ggml/ggml.o \
|
||||
o/opt/third_party/ggml/ggml.o: private \
|
||||
OVERRIDE_CFLAGS += \
|
||||
-fschedule-insns2 \
|
||||
-mred-zone
|
||||
endif
|
||||
|
||||
################################################################################
|
||||
# command for running inference on large language models
|
||||
# make -j8 o//third_party/ggml/llama.com
|
||||
|
|
Loading…
Add table
Reference in a new issue