diff --git a/Makefile b/Makefile index 1d7bdd243..62f70de67 100644 --- a/Makefile +++ b/Makefile @@ -144,8 +144,8 @@ include libc/stdio/stdio.mk # │ include third_party/libcxx/libcxx.mk # │ include net/net.mk # │ include third_party/vqsort/vqsort.mk # │ -include third_party/ggml/ggml.mk # │ include libc/log/log.mk # │ +include third_party/ggml/ggml.mk # │ include third_party/bzip2/bzip2.mk # │ include dsp/core/core.mk # │ include libc/x/x.mk # │ diff --git a/libc/stdio/fread.c b/libc/stdio/fread.c index 67ee940f4..6c2c36c84 100644 --- a/libc/stdio/fread.c +++ b/libc/stdio/fread.c @@ -18,6 +18,7 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/intrin/describeflags.internal.h" #include "libc/intrin/strace.internal.h" +#include "libc/runtime/runtime.h" #include "libc/stdio/lock.internal.h" #include "libc/stdio/stdio.h" diff --git a/libc/tinymath/__cexpf.c b/libc/tinymath/__cexpf.c index 73af76aa6..9cb7332ea 100644 --- a/libc/tinymath/__cexpf.c +++ b/libc/tinymath/__cexpf.c @@ -35,7 +35,6 @@ Copyright 2005-2014 Rich Felker, et. al.\""); asm(".include \"libc/disclaimer.inc\""); /* clang-format off */ - /* origin: FreeBSD /usr/src/lib/msun/src/k_expf.c */ /*- * Copyright (c) 2011 David Schultz diff --git a/libc/tinymath/cosdf.c b/libc/tinymath/cosdf.c index 5c8890e8b..c519dba57 100644 --- a/libc/tinymath/cosdf.c +++ b/libc/tinymath/cosdf.c @@ -60,7 +60,7 @@ C1 = 0x155553e1053a42.0p-57, /* 0.0416666233237390631894 */ C2 = -0x16c087e80f1e27.0p-62, /* -0.00138867637746099294692 */ C3 = 0x199342e0ee5069.0p-68; /* 0.0000243904487962774090654 */ -float __cosdf(double x) +noinstrument float __cosdf(double x) { double_t r, w, z; diff --git a/libc/tinymath/expm1f.c b/libc/tinymath/expm1f.c new file mode 100644 index 000000000..ca9b619a0 --- /dev/null +++ b/libc/tinymath/expm1f.c @@ -0,0 +1,103 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ Optimized Routines │ +│ Copyright (c) 1999-2022, Arm Limited. │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/math.h" +#include "libc/tinymath/hornerf.internal.h" +#include "libc/tinymath/internal.h" +#include "third_party/libcxx/math.h" + +asm(".ident\t\"\\n\\n\ +Optimized Routines (MIT License)\\n\ +Copyright 2022 ARM Limited\""); +asm(".include \"libc/disclaimer.inc\""); +/* clang-format off */ + +#define Shift (0x1.8p23f) +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) +#define AbsMask (0x7fffffff) +#define InfLimit \ + (0x1.644716p6) /* Smallest value of x for which expm1(x) overflows. */ +#define NegLimit \ + (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to 1. */ + +#define C(i) __expm1f_poly[i] + +/* Generated using fpminimax, see tools/expm1f.sollya for details. */ +const float __expm1f_poly[] = {0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5, + 0x1.12287cp-7, 0x1.6b55a2p-10}; + +/* Approximation for exp(x) - 1 using polynomial on a reduced interval. + The maximum error is 1.51 ULP: + expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. */ +float +expm1f (float x) +{ + uint32_t ix = asuint (x); + uint32_t ax = ix & AbsMask; + + /* Tiny: |x| < 0x1p-23. expm1(x) is closely approximated by x. + Inf: x == +Inf => expm1(x) = x. */ + if (ax <= 0x34000000 || (ix == 0x7f800000)) + return x; + + /* +/-NaN. */ + if (ax > 0x7f800000) + return __math_invalidf (x); + + if (x >= InfLimit) + return __math_oflowf (0); + + if (x <= NegLimit || ix == 0xff800000) + return -1; + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + float j = fmaf (InvLn2, x, Shift) - Shift; + int32_t i = j; + float f = fmaf (j, -Ln2hi, x); + f = fmaf (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + float p = fmaf (f * f, HORNER_4 (f, C), f); + /* Assemble the result, using a slight rearrangement to achieve acceptable + accuracy. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^(i - 1). */ + float t = ldexpf (0.5f, i); + /* expm1(x) ~= 2 * (p * t + (t - 1/2)). */ + return 2 * fmaf (p, t, t - 0.5f); +} diff --git a/libc/tinymath/fabsf.S b/libc/tinymath/fabsf.S deleted file mode 100644 index 9060da2a7..000000000 --- a/libc/tinymath/fabsf.S +++ /dev/null @@ -1,31 +0,0 @@ -/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ -│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/macros.internal.h" - -// Returns absolute value of 𝑥. -// -// @param 𝑥 is float passed in lower quarter on %xmm0 -// @return absolute value in %xmm0 -fabsf: .leafprologue - .profilable - movd %xmm0,%eax - and $0x7fffffff,%eax - movd %eax,%xmm0 - .leafepilogue - .endfn fabsf,globl diff --git a/libc/tinymath/expm1f.S b/libc/tinymath/fabsf.c similarity index 79% rename from libc/tinymath/expm1f.S rename to libc/tinymath/fabsf.c index 4b2c71a8e..c8f2e1d2b 100644 --- a/libc/tinymath/expm1f.S +++ b/libc/tinymath/fabsf.c @@ -1,7 +1,7 @@ -/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ -│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ ╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ │ │ │ Permission to use, copy, modify, and/or distribute this software for │ │ any purpose with or without fee is hereby granted, provided that the │ @@ -16,12 +16,13 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/macros.internal.h" +#include "third_party/libcxx/math.h" -// Returns 𝑒^x-1. -// -// @param 𝑥 is float scalar in low quarter of %xmm0 -// @return float scalar in low quarter of %xmm0 -expm1f: ezlea expm1l,ax - jmp _f2ld2 - .endfn expm1f,globl +float fabsf(float x) { + union { + float f; + uint32_t i; + } u = {x}; + u.i &= 0x7fffffff; + return u.f; +} diff --git a/libc/runtime/fenv.S b/libc/tinymath/fenv.S similarity index 100% rename from libc/runtime/fenv.S rename to libc/tinymath/fenv.S diff --git a/libc/tinymath/fmaf.c b/libc/tinymath/fmaf.c new file mode 100644 index 000000000..30a9e0b55 --- /dev/null +++ b/libc/tinymath/fmaf.c @@ -0,0 +1,127 @@ +/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8 -*-│ +│vi: set et ft=c ts=8 tw=8 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ Musl Libc │ +│ Copyright © 2005-2014 Rich Felker, et al. │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/math.h" +#include "libc/runtime/fenv.h" + +asm(".ident\t\"\\n\\n\ +Fused Multiply Add (MIT License)\\n\ +Copyright (c) 2005-2011 David Schultz \""); +asm(".ident\t\"\\n\\n\ +Musl libc (MIT License)\\n\ +Copyright 2005-2014 Rich Felker, et. al.\""); +asm(".include \"libc/disclaimer.inc\""); +/* clang-format off */ + +/* origin: FreeBSD /usr/src/lib/msun/src/s_fmaf.c */ +/*- + * Copyright (c) 2005-2011 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Fused multiply-add: Compute x * y + z with a single rounding error. + * + * A double has more than twice as much precision than a float, so + * direct double-precision arithmetic suffices, except where double + * rounding occurs. + */ +float fmaf(float x, float y, float z) +{ + // #pragma STDC FENV_ACCESS ON + double xy, result; + union {double f; uint64_t i;} u; + int e; + + xy = (double)x * y; + result = xy + z; + u.f = result; + e = u.i>>52 & 0x7ff; + /* Common case: The double precision result is fine. */ + if ((u.i & 0x1fffffff) != 0x10000000 || /* not a halfway case */ + e == 0x7ff || /* NaN */ + (result - xy == z && result - z == xy) || /* exact */ + fegetround() != FE_TONEAREST) /* not round-to-nearest */ + { + /* + underflow may not be raised correctly, example: + fmaf(0x1p-120f, 0x1p-120f, 0x1p-149f) + */ +#if defined(FE_INEXACT) && defined(FE_UNDERFLOW) + if (e < 0x3ff-126 && e >= 0x3ff-149 && fetestexcept(FE_INEXACT)) { + feclearexcept(FE_INEXACT); + /* TODO: gcc and clang bug workaround */ + volatile float vz = z; + result = xy + vz; + if (fetestexcept(FE_INEXACT)) + feraiseexcept(FE_UNDERFLOW); + else + feraiseexcept(FE_INEXACT); + } +#endif + z = result; + return z; + } + + /* + * If result is inexact, and exactly halfway between two float values, + * we need to adjust the low-order bit in the direction of the error. + */ + double err; + int neg = u.i >> 63; + if (neg == (z > xy)) + err = xy - result + z; + else + err = z - result + xy; + if (neg == (err < 0)) + u.i++; + else + u.i--; + z = u.f; + return z; +} diff --git a/libc/tinymath/horner_wrap.internal.h b/libc/tinymath/horner_wrap.internal.h new file mode 100644 index 000000000..984c728c7 --- /dev/null +++ b/libc/tinymath/horner_wrap.internal.h @@ -0,0 +1,39 @@ +#ifndef COSMOPOLITAN_LIBC_TINYMATH_HORNER_WRAP_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_TINYMATH_HORNER_WRAP_INTERNAL_H_ + +/* + * Helper macros for Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i)) +#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i)) +#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i)) +#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i)) +#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i)) +#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i)) +#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i)) +#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i)) +#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i)) +#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i)) +#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i)) +#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i)) + +#define HORNER_1(x, c) HORNER_1_ (x, c, 0) +#define HORNER_2(x, c) HORNER_2_ (x, c, 0) +#define HORNER_3(x, c) HORNER_3_ (x, c, 0) +#define HORNER_4(x, c) HORNER_4_ (x, c, 0) +#define HORNER_5(x, c) HORNER_5_ (x, c, 0) +#define HORNER_6(x, c) HORNER_6_ (x, c, 0) +#define HORNER_7(x, c) HORNER_7_ (x, c, 0) +#define HORNER_8(x, c) HORNER_8_ (x, c, 0) +#define HORNER_9(x, c) HORNER_9_ (x, c, 0) +#define HORNER_10(x, c) HORNER_10_(x, c, 0) +#define HORNER_11(x, c) HORNER_11_(x, c, 0) +#define HORNER_12(x, c) HORNER_12_(x, c, 0) +// clang-format on + +#endif /* COSMOPOLITAN_LIBC_TINYMATH_HORNER_WRAP_INTERNAL_H_ */ diff --git a/libc/tinymath/hornerf.internal.h b/libc/tinymath/hornerf.internal.h new file mode 100644 index 000000000..b78e48395 --- /dev/null +++ b/libc/tinymath/hornerf.internal.h @@ -0,0 +1,24 @@ +#ifndef COSMOPOLITAN_LIBC_TINYMATH_HORNERF_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_TINYMATH_HORNERF_INTERNAL_H_ +#include "third_party/libcxx/math.h" +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +/* + * Helper macros for double-precision Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if WANT_VMATH +#define FMA(x, y, z) vfmaq_f32(z, x, y) +#else +#define FMA fmaf +#endif + +#include "libc/tinymath/horner_wrap.internal.h" + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_LIBC_TINYMATH_HORNERF_INTERNAL_H_ */ diff --git a/libc/tinymath/logf.S b/libc/tinymath/logf.S deleted file mode 100644 index 7255a6b37..000000000 --- a/libc/tinymath/logf.S +++ /dev/null @@ -1,27 +0,0 @@ -/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ -│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/macros.internal.h" - -// Returns natural logarithm of 𝑥. -// -// @param 𝑥 is double scalar in low quarter of %xmm0 -// @return float scalar in low quarter of %xmm0 -logf: ezlea logl,ax - jmp _f2ld2 - .endfn logf,globl diff --git a/libc/tinymath/logf.c b/libc/tinymath/logf.c new file mode 100644 index 000000000..8271e586f --- /dev/null +++ b/libc/tinymath/logf.c @@ -0,0 +1,104 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ Optimized Routines │ +│ Copyright (c) 1999-2022, Arm Limited. │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/intrin/likely.h" +#include "libc/math.h" +#include "libc/tinymath/internal.h" +#include "libc/tinymath/logf_data.internal.h" + +asm(".ident\t\"\\n\\n\ +Optimized Routines (MIT License)\\n\ +Copyright 2022 ARM Limited\""); +asm(".include \"libc/disclaimer.inc\""); +/* clang-format off */ + +/* + * Single-precision log function. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* +LOGF_TABLE_BITS = 4 +LOGF_POLY_ORDER = 4 + +ULP error: 0.818 (nearest rounding.) +Relative error: 1.957 * 2^-26 (before rounding.) +*/ + +#define T __logf_data.tab +#define A __logf_data.poly +#define Ln2 __logf_data.ln2 +#define N (1 << LOGF_TABLE_BITS) +#define OFF 0x3f330000 + +float logf(float x) +{ + double_t z, r, r2, y, y0, invc, logc; + uint32_t ix, iz, tmp; + int k, i; + + ix = asuint(x); + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && UNLIKELY(ix == 0x3f800000)) + return 0; + if (UNLIKELY(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzerof(1); + if (ix == 0x7f800000) /* log(inf) == inf. */ + return x; + if ((ix & 0x80000000) || ix * 2 >= 0xff000000) + return __math_invalidf(x); + /* x is subnormal, normalize it. */ + ix = asuint(x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; + k = (int32_t)tmp >> 23; /* arithmetic shift */ + iz = ix - (tmp & 0xff800000); + invc = T[i].invc; + logc = T[i].logc; + z = (double_t)asfloat(iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ + r = z * invc - 1; + y0 = logc + (double_t)k * Ln2; + + /* Pipelined polynomial evaluation to approximate log1p(r). */ + r2 = r * r; + y = A[1] * r + A[2]; + y = A[0] * r2 + y; + y = y * r2 + (y0 + r); + return eval_as_float(y); +} diff --git a/libc/tinymath/logf_data.c b/libc/tinymath/logf_data.c new file mode 100644 index 000000000..1e98942ff --- /dev/null +++ b/libc/tinymath/logf_data.c @@ -0,0 +1,66 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ Optimized Routines │ +│ Copyright (c) 1999-2022, Arm Limited. │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/tinymath/logf_data.internal.h" + +asm(".ident\t\"\\n\\n\ +Double-precision math functions (MIT License)\\n\ +Copyright 2018 ARM Limited\""); +asm(".include \"libc/disclaimer.inc\""); +/* clang-format off */ + +/* + * Data definition for logf. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +const struct logf_data __logf_data = { + .tab = { + { 0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2 }, + { 0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2 }, + { 0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2 }, + { 0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3 }, + { 0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3 }, + { 0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3 }, + { 0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4 }, + { 0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4 }, + { 0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5 }, + { 0x1p+0, 0x0p+0 }, + { 0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5 }, + { 0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4 }, + { 0x1.b2036576afce6p-1, 0x1.526e57720db08p-3 }, + { 0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3 }, + { 0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2 }, + { 0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2 }, + }, + .ln2 = 0x1.62e42fefa39efp-1, + .poly = { + -0x1.00ea348b88334p-2, 0x1.5575b0be00b6ap-2, -0x1.ffffef20a4123p-2, + } +}; diff --git a/libc/tinymath/logf_data.internal.h b/libc/tinymath/logf_data.internal.h new file mode 100644 index 000000000..14cdbac5f --- /dev/null +++ b/libc/tinymath/logf_data.internal.h @@ -0,0 +1,20 @@ +#ifndef COSMOPOLITAN_LIBC_TINYMATH_LOGF_DATA_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_TINYMATH_LOGF_DATA_INTERNAL_H_ + +#define LOGF_TABLE_BITS 4 +#define LOGF_POLY_ORDER 4 + +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +extern _Hide const struct logf_data { + struct { + double invc, logc; + } tab[1 << LOGF_TABLE_BITS]; + double ln2; + double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ +} __logf_data; + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_LIBC_TINYMATH_LOGF_DATA_INTERNAL_H_ */ diff --git a/libc/tinymath/sincosf.S b/libc/tinymath/sincosf.S deleted file mode 100644 index 57177755c..000000000 --- a/libc/tinymath/sincosf.S +++ /dev/null @@ -1,40 +0,0 @@ -/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ -│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/macros.internal.h" - -// Returns sine and cosine of 𝑥. -// -// @param 𝑥 is float scalar in low quarter of %xmm0 -// @param %rdi is float *out_sin -// @param %rsi is float *out_cos -// @domain -(3π/8) < 𝑥 < 3π/8 for best accuracy -sincosf: - push %rbp - mov %rsp,%rbp - .profilable - push %rax - movss %xmm0,4(%rsp) - flds 4(%rsp) - fsincos - fxch - fstps (%rdi) - fstps (%rsi) - leave - ret - .endfn sincosf,globl diff --git a/libc/tinymath/sincosf.c b/libc/tinymath/sincosf.c new file mode 100644 index 000000000..0f2777550 --- /dev/null +++ b/libc/tinymath/sincosf.c @@ -0,0 +1,104 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ Optimized Routines │ +│ Copyright (c) 1999-2022, Arm Limited. │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/intrin/likely.h" +#include "libc/math.h" +#include "libc/tinymath/sincosf.internal.h" + +asm(".ident\t\"\\n\\n\ +Optimized Routines (MIT License)\\n\ +Copyright 2022 ARM Limited\""); +asm(".include \"libc/disclaimer.inc\""); +/* clang-format off */ + +/* Fast sincosf implementation. Worst-case ULP is 0.5607, maximum relative + error is 0.5303 * 2^-23. A single-step range reduction is used for + small values. Large inputs have their range reduced using fast integer + arithmetic. */ +void +sincosf (float y, float *sinp, float *cosp) +{ + double x = y; + double s; + int n; + const sincos_t *p = &__sincosf_table[0]; + + if (abstop12 (y) < abstop12 (pio4f)) + { + double x2 = x * x; + + if (UNLIKELY (abstop12 (y) < abstop12 (0x1p-12f))) + { + if (UNLIKELY (abstop12 (y) < abstop12 (0x1p-126f))) + /* Force underflow for tiny y. */ + FORCE_EVAL (x2); + *sinp = y; + *cosp = 1.0f; + return; + } + + sincosf_poly (x, x2, p, 0, sinp, cosp); + } + else if (abstop12 (y) < abstop12 (120.0f)) + { + x = reduce_fast (x, p, &n); + + /* Setup the signs for sin and cos. */ + s = p->sign[n & 3]; + + if (n & 2) + p = &__sincosf_table[1]; + + sincosf_poly (x * s, x * x, p, n, sinp, cosp); + } + else if (LIKELY (abstop12 (y) < abstop12 (INFINITY))) + { + uint32_t xi = asuint (y); + int sign = xi >> 31; + + x = reduce_large (xi, &n); + + /* Setup signs for sin and cos - include original sign. */ + s = p->sign[(n + sign) & 3]; + + if ((n + sign) & 2) + p = &__sincosf_table[1]; + + sincosf_poly (x * s, x * x, p, n, sinp, cosp); + } + else + { + /* Return NaN if Inf or NaN for both sin and cos. */ + *sinp = *cosp = y - y; +#if WANT_ERRNO + /* Needed to set errno for +-Inf, the add is a hack to work + around a gcc register allocation issue: just passing y + affects code generation in the fast path. */ + __math_invalidf (y + y); +#endif + } +} diff --git a/libc/tinymath/sincosf.internal.h b/libc/tinymath/sincosf.internal.h new file mode 100644 index 000000000..5dfcf7729 --- /dev/null +++ b/libc/tinymath/sincosf.internal.h @@ -0,0 +1,160 @@ +#ifndef COSMOPOLITAN_LIBC_TINYMATH_SINCOSF_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_TINYMATH_SINCOSF_INTERNAL_H_ +#include "libc/tinymath/internal.h" +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ +/* clang-format off */ + +/* + * Header for sinf, cosf and sincosf. + * + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* 2PI * 2^-64. */ +static const double pi63 = 0x1.921FB54442D18p-62; +/* PI / 4. */ +static const float pio4f = 0x1.921FB6p-1f; + +/* The constants and polynomials for sine and cosine. */ +typedef struct +{ + double sign[4]; /* Sign of sine in quadrants 0..3. */ + double hpi_inv; /* 2 / PI ( * 2^24 if !TOINT_INTRINSICS). */ + double hpi; /* PI / 2. */ + double c0, c1, c2, c3, c4; /* Cosine polynomial. */ + double s1, s2, s3; /* Sine polynomial. */ +} sincos_t; + +/* Polynomial data (the cosine polynomial is negated in the 2nd entry). */ +extern const sincos_t __sincosf_table[2] _Hide; + +/* Table with 4/PI to 192 bit precision. */ +extern const uint32_t __inv_pio4[] _Hide; + +/* Top 12 bits of the float representation with the sign bit cleared. */ +static inline uint32_t +abstop12 (float x) +{ + return (asuint (x) >> 20) & 0x7ff; +} + +/* Compute the sine and cosine of inputs X and X2 (X squared), using the + polynomial P and store the results in SINP and COSP. N is the quadrant, + if odd the cosine and sine polynomials are swapped. */ +static inline void +sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp, + float *cosp) +{ + double x3, x4, x5, x6, s, c, c1, c2, s1; + + x4 = x2 * x2; + x3 = x2 * x; + c2 = p->c3 + x2 * p->c4; + s1 = p->s2 + x2 * p->s3; + + /* Swap sin/cos result based on quadrant. */ + float *tmp = (n & 1 ? cosp : sinp); + cosp = (n & 1 ? sinp : cosp); + sinp = tmp; + + c1 = p->c0 + x2 * p->c1; + x5 = x3 * x2; + x6 = x4 * x2; + + s = x + x3 * p->s1; + c = c1 + x4 * p->c2; + + *sinp = s + x5 * s1; + *cosp = c + x6 * c2; +} + +/* Return the sine of inputs X and X2 (X squared) using the polynomial P. + N is the quadrant, and if odd the cosine polynomial is used. */ +static inline float +sinf_poly (double x, double x2, const sincos_t *p, int n) +{ + double x3, x4, x6, x7, s, c, c1, c2, s1; + + if ((n & 1) == 0) + { + x3 = x * x2; + s1 = p->s2 + x2 * p->s3; + + x7 = x3 * x2; + s = x + x3 * p->s1; + + return s + x7 * s1; + } + else + { + x4 = x2 * x2; + c2 = p->c3 + x2 * p->c4; + c1 = p->c0 + x2 * p->c1; + + x6 = x4 * x2; + c = c1 + x4 * p->c2; + + return c + x6 * c2; + } +} + +/* Fast range reduction using single multiply-subtract. Return the modulo of + X as a value between -PI/4 and PI/4 and store the quadrant in NP. + The values for PI/2 and 2/PI are accessed via P. Since PI/2 as a double + is accurate to 55 bits and the worst-case cancellation happens at 6 * PI/4, + the result is accurate for |X| <= 120.0. */ +static inline double +reduce_fast (double x, const sincos_t *p, int *np) +{ + double r; +#if TOINT_INTRINSICS + /* Use fast round and lround instructions when available. */ + r = x * p->hpi_inv; + *np = converttoint (r); + return x - roundtoint (r) * p->hpi; +#else + /* Use scaled float to int conversion with explicit rounding. + hpi_inv is prescaled by 2^24 so the quadrant ends up in bits 24..31. + This avoids inaccuracies introduced by truncating negative values. */ + r = x * p->hpi_inv; + int n = ((int32_t)r + 0x800000) >> 24; + *np = n; + return x - n * p->hpi; +#endif +} + +/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic. + XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored). + Return the modulo between -PI/4 and PI/4 and store the quadrant in NP. + Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit + multiply computes the exact 2.62-bit fixed-point modulo. Since the result + can have at most 29 leading zeros after the binary point, the double + precision result is accurate to 33 bits. */ +static inline double +reduce_large (uint32_t xi, int *np) +{ + const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15]; + int shift = (xi >> 23) & 7; + uint64_t n, res0, res1, res2; + + xi = (xi & 0xffffff) | 0x800000; + xi <<= shift; + + res0 = xi * arr[0]; + res1 = (uint64_t)xi * arr[4]; + res2 = (uint64_t)xi * arr[8]; + res0 = (res2 >> 32) | (res0 << 32); + res0 += res1; + + n = (res0 + (1ULL << 61)) >> 62; + res0 -= n << 62; + double x = (int64_t)res0; + *np = n; + return x * pi63; +} + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_LIBC_TINYMATH_SINCOSF_INTERNAL_H_ */ diff --git a/libc/tinymath/sincosf_data.c b/libc/tinymath/sincosf_data.c new file mode 100644 index 000000000..a7a9164ea --- /dev/null +++ b/libc/tinymath/sincosf_data.c @@ -0,0 +1,86 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ Optimized Routines │ +│ Copyright (c) 1999-2022, Arm Limited. │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/tinymath/sincosf.internal.h" + +asm(".ident\t\"\\n\\n\ +Optimized Routines (MIT License)\\n\ +Copyright 2022 ARM Limited\""); +asm(".include \"libc/disclaimer.inc\""); +/* clang-format off */ + +/* The constants and polynomials for sine and cosine. The 2nd entry + computes -cos (x) rather than cos (x) to get negation for free. */ +const sincos_t __sincosf_table[2] = +{ + { + { 1.0, -1.0, -1.0, 1.0 }, +#if TOINT_INTRINSICS + 0x1.45F306DC9C883p-1, +#else + 0x1.45F306DC9C883p+23, +#endif + 0x1.921FB54442D18p0, + 0x1p0, + -0x1.ffffffd0c621cp-2, + 0x1.55553e1068f19p-5, + -0x1.6c087e89a359dp-10, + 0x1.99343027bf8c3p-16, + -0x1.555545995a603p-3, + 0x1.1107605230bc4p-7, + -0x1.994eb3774cf24p-13 + }, + { + { 1.0, -1.0, -1.0, 1.0 }, +#if TOINT_INTRINSICS + 0x1.45F306DC9C883p-1, +#else + 0x1.45F306DC9C883p+23, +#endif + 0x1.921FB54442D18p0, + -0x1p0, + 0x1.ffffffd0c621cp-2, + -0x1.55553e1068f19p-5, + 0x1.6c087e89a359dp-10, + -0x1.99343027bf8c3p-16, + -0x1.555545995a603p-3, + 0x1.1107605230bc4p-7, + -0x1.994eb3774cf24p-13 + } +}; + +/* Table with 4/PI to 192 bit precision. To avoid unaligned accesses + only 8 new bits are added per entry, making the table 4 times larger. */ +const uint32_t __inv_pio4[24] = +{ + 0xa2, 0xa2f9, 0xa2f983, 0xa2f9836e, + 0xf9836e4e, 0x836e4e44, 0x6e4e4415, 0x4e441529, + 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1, + 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, + 0x34ddc0db, 0xddc0db62, 0xc0db6295, 0xdb629599, + 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041 +}; diff --git a/libc/tinymath/sindf.c b/libc/tinymath/sindf.c index 56e1cde54..f8e742000 100644 --- a/libc/tinymath/sindf.c +++ b/libc/tinymath/sindf.c @@ -60,7 +60,7 @@ S2 = 0x111110896efbb2.0p-59, /* 0.0083333293858894631756 */ S3 = -0x1a00f9e2cae774.0p-65, /* -0.000198393348360966317347 */ S4 = 0x16cd878c3b46a7.0p-71; /* 0.0000027183114939898219064 */ -float __sindf(double x) +noinstrument float __sindf(double x) { double_t r, s, w, z; diff --git a/third_party/ggml/ggml.c b/third_party/ggml/ggml.c index a2955b621..4bdc97df5 100644 --- a/third_party/ggml/ggml.c +++ b/third_party/ggml/ggml.c @@ -362,7 +362,7 @@ static const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) }; // This is also true for POWER9. #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { +forceinline float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); return table_f32_f16[s]; @@ -507,7 +507,7 @@ static inline int hsum_i32_4(const __m128i a) { #if __AVX2__ || __AVX512F__ // spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { +forceinline __m256i bytes_from_bits_32(const uint8_t * x) { uint32_t x32; memcpy(&x32, x, sizeof(uint32_t)); const __m256i shuf_mask = _mm256_set_epi64x( @@ -521,7 +521,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +forceinline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { // Load 16 bytes from memory __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); @@ -539,14 +539,14 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) } // add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { +forceinline __m256 sum_i16_pairs_float(const __m256i x) { const __m256i ones = _mm256_set1_epi16(1); const __m256i summed_pairs = _mm256_madd_epi16(ones, x); return _mm256_cvtepi32_ps(summed_pairs); } // multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +forceinline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { // Get absolute values of x vectors const __m256i ax = _mm256_sign_epi8(x, x); // Sign the values of the y vectors diff --git a/third_party/math.h b/third_party/math.h new file mode 100755 index 000000000..e69de29bb