diff --git a/libc/calls/getcpucount.c b/libc/calls/getcpucount.c index 500ce3c76..f1433d7f8 100644 --- a/libc/calls/getcpucount.c +++ b/libc/calls/getcpucount.c @@ -95,6 +95,6 @@ __attribute__((__constructor__)) static void _getcpucount_init(void) { * * @return cpu count or 0 if it couldn't be determined */ -unsigned _getcpucount(void) { +int _getcpucount(void) { return g_cpucount; } diff --git a/libc/calls/struct/sched_param.h b/libc/calls/struct/sched_param.h index d30339c33..e445ef056 100644 --- a/libc/calls/struct/sched_param.h +++ b/libc/calls/struct/sched_param.h @@ -2,6 +2,7 @@ #define COSMOPOLITAN_LIBC_CALLS_STRUCT_SCHED_PARAM_H_ #include "libc/calls/struct/timespec.h" #if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ struct sched_param { int32_t sched_priority; @@ -15,5 +16,6 @@ int sched_rr_get_interval(int, struct timespec *); int sched_setparam(int, const struct sched_param *); int sched_setscheduler(int, int, const struct sched_param *); +COSMOPOLITAN_C_END_ #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_SCHED_PARAM_H_ */ diff --git a/libc/intrin/fmax.c b/libc/intrin/fmax.c index d4ebd3bd1..0719e6d7d 100644 --- a/libc/intrin/fmax.c +++ b/libc/intrin/fmax.c @@ -26,10 +26,10 @@ * signed zeroes. */ double fmax(double x, double y) { - if (__builtin_isnan(x)) return y; - if (__builtin_isnan(y)) return x; - if (__builtin_signbit(x) != __builtin_signbit(y)) { - return __builtin_signbit(x) ? y : x; /* C99 Annex F.9.9.2 */ + if (isnan(x)) return y; + if (isnan(y)) return x; + if (signbit(x) != signbit(y)) { + return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */ } return x < y ? y : x; } diff --git a/libc/intrin/fmaxf.c b/libc/intrin/fmaxf.c index b2d7613fd..522352bde 100644 --- a/libc/intrin/fmaxf.c +++ b/libc/intrin/fmaxf.c @@ -26,10 +26,10 @@ * signed zeroes. */ float fmaxf(float x, float y) { - if (__builtin_isnan(x)) return y; - if (__builtin_isnan(y)) return x; - if (__builtin_signbitf(x) != __builtin_signbitf(y)) { - return __builtin_signbitf(x) ? y : x; /* C99 Annex F.9.9.2 */ + if (isnan(x)) return y; + if (isnan(y)) return x; + if (signbit(x) != signbit(y)) { + return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */ } return x < y ? y : x; } diff --git a/libc/intrin/fmaxl.c b/libc/intrin/fmaxl.c index ef4ee057a..392c0700c 100644 --- a/libc/intrin/fmaxl.c +++ b/libc/intrin/fmaxl.c @@ -27,10 +27,10 @@ * signed zeroes. */ long double fmaxl(long double x, long double y) { - if (__builtin_isnan(x)) return y; - if (__builtin_isnan(y)) return x; - if (__builtin_signbitl(x) != __builtin_signbitl(y)) { - return __builtin_signbitl(x) ? y : x; /* C99 Annex F.9.9.2 */ + if (isnan(x)) return y; + if (isnan(y)) return x; + if (signbit(x) != signbit(y)) { + return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */ } return x < y ? y : x; } diff --git a/libc/math.h b/libc/math.h index 2441f4c40..45d7ba16d 100644 --- a/libc/math.h +++ b/libc/math.h @@ -88,7 +88,6 @@ typedef double double_t; #define isnan(x) __builtin_isnan(x) #define isfinite(x) __builtin_isfinite(x) #define isnormal(x) __builtin_isnormal(x) -#define signbit(x) __builtin_signbit(x) #define isgreater(x, y) __builtin_isgreater(x, y) #define isgreaterequal(x, y) __builtin_isgreaterequal(x, y) #define isless(x, y) __builtin_isless(x, y) @@ -99,6 +98,11 @@ typedef double double_t; #define fpclassify(x) \ __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, x) +#define signbit(x) \ + (sizeof(x) == sizeof(double) ? __builtin_signbit(x) \ + : sizeof(x) == sizeof(float) ? __builtin_signbitf(x) \ + : __builtin_signbitl(x)) + extern int signgam; double acos(double); @@ -305,7 +309,7 @@ void sincos(double, double *, double *); void sincosf(float, float *, float *); void sincosl(long double, long double *, long double *); -float fsumf(const float *, size_t); +double fsumf(const float *, size_t); double fsum(const double *, size_t); double j0(double); diff --git a/libc/runtime/runtime.h b/libc/runtime/runtime.h index fecb3f2a6..8e76eeb0d 100644 --- a/libc/runtime/runtime.h +++ b/libc/runtime/runtime.h @@ -99,7 +99,7 @@ void _intsort(int *, size_t); void _longsort(long *, size_t); bool _isheap(void *); int NtGetVersion(void) pureconst; -unsigned _getcpucount(void) pureconst; +int _getcpucount(void) pureconst; long _missingno(); void __oom_hook(size_t); void _loadxmm(void *); diff --git a/libc/tinymath/acos.c b/libc/tinymath/acos.c index 40a1cb667..939a87abf 100644 --- a/libc/tinymath/acos.c +++ b/libc/tinymath/acos.c @@ -141,3 +141,7 @@ double acos(double x) w = R(z)*s+c; return 2*(df+w); } + +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 +__strong_reference(acos, acosl); +#endif diff --git a/libc/tinymath/acosf.c b/libc/tinymath/acosf.c index 28e1d3f47..15c5c902d 100644 --- a/libc/tinymath/acosf.c +++ b/libc/tinymath/acosf.c @@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\ Musl libc (MIT License)\\n\ Copyright 2005-2014 Rich Felker, et. al.\""); asm(".include \"libc/disclaimer.inc\""); -/* clang-format off */ +// clang-format off /* origin: FreeBSD /usr/src/lib/msun/src/e_acosf.c */ /* diff --git a/libc/tinymath/acosh.c b/libc/tinymath/acosh.c index 6259fed68..35286d201 100644 --- a/libc/tinymath/acosh.c +++ b/libc/tinymath/acosh.c @@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\ Musl libc (MIT License)\\n\ Copyright 2005-2014 Rich Felker, et. al.\""); asm(".include \"libc/disclaimer.inc\""); -/* clang-format off */ +// clang-format off /** * Returns inverse hyperbolic cosine of 𝑥. @@ -53,3 +53,7 @@ double acosh(double x) /* |x| >= 0x1p26 or nan */ return log(x) + 0.693147180559945309417232121458176568; } + +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 +__strong_reference(acosh, acoshl); +#endif diff --git a/libc/tinymath/acoshl.c b/libc/tinymath/acoshl.c index 6116e043d..132aa4a60 100644 --- a/libc/tinymath/acoshl.c +++ b/libc/tinymath/acoshl.c @@ -38,6 +38,7 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/math.h" #include "libc/tinymath/freebsd.internal.h" +#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024) asm(".ident\t\"\\n\\n\ FreeBSD libm (BSD-2 License)\\n\ @@ -62,8 +63,6 @@ asm(".include \"libc/disclaimer.inc\""); #error "Unsupported long double format" #endif -#define BIAS (LDBL_MAX_EXP - 1) - static const double one = 1.0; @@ -108,3 +107,5 @@ acoshl(long double x) RETURNI(log1pl(t+sqrtl(2.0*t+t*t))); } } + +#endif /* long double is long */ diff --git a/libc/tinymath/acosl.c b/libc/tinymath/acosl.c index bd9d47693..7958cd307 100644 --- a/libc/tinymath/acosl.c +++ b/libc/tinymath/acosl.c @@ -28,6 +28,7 @@ #include "libc/math.h" #include "libc/tinymath/invtrigl.internal.h" #include "libc/tinymath/ldshape.internal.h" +#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024) asm(".ident\t\"\\n\\n\ fdlibm (fdlibm license)\\n\ @@ -54,22 +55,20 @@ asm(".include \"libc/disclaimer.inc\""); * Converted to long double by David Schultz . */ -/** - * Returns arc cosine of 𝑥. - * - * @define atan2(fabs(sqrt((1-𝑥)*(1+𝑥))),𝑥) - * @domain -1 ≤ 𝑥 ≤ 1 - */ -long double acosl(long double x) { -#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 - return acos(x); -#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384 #if LDBL_MANT_DIG == 64 #define CLEARBOTTOM(u) (u.i.m &= -1ULL << 32) #elif LDBL_MANT_DIG == 113 #define CLEARBOTTOM(u) (u.i.lo = 0) #endif +/** + * Returns arc cosine of 𝑥. + * + * @define atan2(fabs(sqrt((1-𝑥)*(1+𝑥))),𝑥) + * @domain -1 ≤ 𝑥 ≤ 1 + */ +long double acosl(long double x) +{ union ldshape u = {x}; long double z, s, c, f; uint16_t e = u.i.se & 0x7fff; @@ -102,8 +101,6 @@ long double acosl(long double x) { f = u.f; c = (z - f*f)/(s + f); return 2*(__invtrigl_R(z)*s + c + f); - -#else -#error "architecture unsupported" -#endif } + +#endif /* long double is long */ diff --git a/libc/tinymath/asinf.c b/libc/tinymath/asinf.c index 0e288a0d0..2af22e009 100644 --- a/libc/tinymath/asinf.c +++ b/libc/tinymath/asinf.c @@ -35,7 +35,7 @@ asm(".ident\t\"\\n\\n\ Musl libc (MIT License)\\n\ Copyright 2005-2014 Rich Felker, et. al.\""); asm(".include \"libc/disclaimer.inc\""); -/* clang-format off */ +// clang-format off /* origin: FreeBSD /usr/src/lib/msun/src/e_asinf.c */ /* diff --git a/libc/tinymath/asinh.c b/libc/tinymath/asinh.c index fab4113f9..dcbbf87f5 100644 --- a/libc/tinymath/asinh.c +++ b/libc/tinymath/asinh.c @@ -64,3 +64,7 @@ double asinh(double x) } return s ? -x : x; } + +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 +__strong_reference(asinh, asinhl); +#endif diff --git a/libc/tinymath/asinhl.c b/libc/tinymath/asinhl.c index 26640a763..ec6992e14 100644 --- a/libc/tinymath/asinhl.c +++ b/libc/tinymath/asinhl.c @@ -38,6 +38,7 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/math.h" #include "libc/tinymath/freebsd.internal.h" +#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024) asm(".ident\t\"\\n\\n\ FreeBSD libm (BSD-2 License)\\n\ @@ -65,8 +66,6 @@ asm(".include \"libc/disclaimer.inc\""); #error "Unsupported long double format" #endif -#define BIAS (LDBL_MAX_EXP - 1) - static const double one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */ huge= 1.00000000000000000000e+300; @@ -110,3 +109,5 @@ asinhl(long double x) } RETURNI((hx & 0x8000) == 0 ? w : -w); } + +#endif /* long double is long */ diff --git a/libc/tinymath/cacos.c b/libc/tinymath/cacos.c index e37badfe2..7b60e4257 100644 --- a/libc/tinymath/cacos.c +++ b/libc/tinymath/cacos.c @@ -33,9 +33,7 @@ asm(".ident\t\"\\n\\n\ Musl libc (MIT License)\\n\ Copyright 2005-2014 Rich Felker, et. al.\""); asm(".include \"libc/disclaimer.inc\""); -/* clang-format off */ - - +// clang-format off // FIXME: Hull et al. "Implementing the complex arcsine and arccosine functions using exception handling" 1997 diff --git a/libc/tinymath/catan.c b/libc/tinymath/catan.c index 8a0263db4..ec2a6d489 100644 --- a/libc/tinymath/catan.c +++ b/libc/tinymath/catan.c @@ -145,3 +145,7 @@ double complex catan(double complex z) w = CMPLX(w, 0.25 * log(a)); return w; } + +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 +__strong_reference(catan, catanl); +#endif diff --git a/libc/tinymath/catanl.c b/libc/tinymath/catanl.c index 098de31e6..491ea59d1 100644 --- a/libc/tinymath/catanl.c +++ b/libc/tinymath/catanl.c @@ -2,32 +2,26 @@ │vi: set et ft=c ts=8 tw=8 fenc=utf-8 :vi│ ╚──────────────────────────────────────────────────────────────────────────────╝ │ │ -│ Musl Libc │ -│ Copyright © 2005-2014 Rich Felker, et al. │ +│ OpenBSD /usr/src/lib/libm/src/s_catanl.c │ │ │ -│ Permission is hereby granted, free of charge, to any person obtaining │ -│ a copy of this software and associated documentation files (the │ -│ "Software"), to deal in the Software without restriction, including │ -│ without limitation the rights to use, copy, modify, merge, publish, │ -│ distribute, sublicense, and/or sell copies of the Software, and to │ -│ permit persons to whom the Software is furnished to do so, subject to │ -│ the following conditions: │ +│ Copyright (c) 2008 Stephen L. Moshier │ │ │ -│ The above copyright notice and this permission notice shall be │ -│ included in all copies or substantial portions of the Software. │ +│ Permission to use, copy, modify, and distribute this software for any │ +│ purpose with or without fee is hereby granted, provided that the above │ +│ copyright notice and this permission notice appear in all copies. │ │ │ -│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ -│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ -│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ -│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ -│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ -│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ -│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES │ +│ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF │ +│ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR │ +│ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES │ +│ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN │ +│ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF │ │ │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/complex.h" #include "libc/math.h" #include "libc/tinymath/complex.internal.h" +#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024) asm(".ident\t\"\\n\\n\ OpenBSD libm (ISC License)\\n\ @@ -38,22 +32,6 @@ Copyright 2005-2014 Rich Felker, et. al.\""); asm(".include \"libc/disclaimer.inc\""); // clang-format off -/* origin: OpenBSD /usr/src/lib/libm/src/s_catanl.c */ -/* - * Copyright (c) 2008 Stephen L. Moshier - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ /* * Complex circular arc tangent * @@ -97,13 +75,6 @@ asm(".include \"libc/disclaimer.inc\""); * 2.9e-17. See also clog(). */ - -#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 -long double complex catanl(long double complex z) -{ - return catan(z); -} -#else static const long double PIL = 3.141592653589793238462643383279502884197169L; static const long double DP1 = 3.14159265358979323829596852490908531763125L; static const long double DP2 = 1.6667485837041756656403424829301998703007e-19L; @@ -149,4 +120,4 @@ long double complex catanl(long double complex z) return w; } -#endif +#endif /* long double is long */ diff --git a/libc/tinymath/fmin.c b/libc/tinymath/fmin.c index a382ebf85..08af674ea 100644 --- a/libc/tinymath/fmin.c +++ b/libc/tinymath/fmin.c @@ -26,10 +26,10 @@ * signed zeroes. */ double fmin(double x, double y) { - if (__builtin_isnan(x)) return y; - if (__builtin_isnan(y)) return x; - if (__builtin_signbit(x) != __builtin_signbit(y)) { - return __builtin_signbit(x) ? x : y; /* C99 Annex F.9.9.2 */ + if (isnan(x)) return y; + if (isnan(y)) return x; + if (signbit(x) != signbit(y)) { + return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */ } return x < y ? x : y; } diff --git a/libc/tinymath/fminf.c b/libc/tinymath/fminf.c index a00d0a7b0..4f2e3aa00 100644 --- a/libc/tinymath/fminf.c +++ b/libc/tinymath/fminf.c @@ -17,6 +17,7 @@ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/math.h" +#include "libc/tinymath/freebsd.internal.h" /** * Returns minimum of two floats. @@ -26,10 +27,10 @@ * signed zeroes. */ float fminf(float x, float y) { - if (__builtin_isnan(x)) return y; - if (__builtin_isnan(y)) return x; - if (__builtin_signbitf(x) != __builtin_signbitf(y)) { - return __builtin_signbitf(x) ? x : y; /* C99 Annex F.9.9.2 */ + if (isnan(x)) return y; + if (isnan(y)) return x; + if (signbit(x) != signbit(y)) { + return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */ } return x < y ? x : y; } diff --git a/libc/tinymath/fminl.c b/libc/tinymath/fminl.c index 98230af5a..ccbf7c049 100644 --- a/libc/tinymath/fminl.c +++ b/libc/tinymath/fminl.c @@ -27,10 +27,10 @@ * signed zeroes. */ long double fminl(long double x, long double y) { - if (__builtin_isnan(x)) return y; - if (__builtin_isnan(y)) return x; - if (__builtin_signbitl(x) != __builtin_signbitl(y)) { - return __builtin_signbitl(x) ? x : y; /* C99 Annex F.9.9.2 */ + if (isnan(x)) return y; + if (isnan(y)) return x; + if (signbit(x) != signbit(y)) { + return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */ } return x < y ? x : y; } diff --git a/libc/tinymath/fsumf.c b/libc/tinymath/fsumf.c index c0c42ba7e..4be39a87b 100644 --- a/libc/tinymath/fsumf.c +++ b/libc/tinymath/fsumf.c @@ -22,8 +22,8 @@ /** * Adds floats in array. */ -float fsumf(const float *p, size_t n) { - float s; +double fsumf(const float *p, size_t n) { + double s; size_t i; if (n > 8) return fsumf(p, n / 2) + fsumf(p + n / 2, n - n / 2); for (s = i = 0; i < n; ++i) s += p[i]; diff --git a/test/libc/str/longsort_test.c b/test/libc/str/longsort_test.c index d6ab1d807..78d7a6295 100644 --- a/test/libc/str/longsort_test.c +++ b/test/libc/str/longsort_test.c @@ -27,6 +27,18 @@ #include "libc/testlib/testlib.h" #include "third_party/vqsort/vqsort.h" +void InsertionSort(int *A, int n) { + for (int i = 1; i < n; i++) { + int key = A[i]; + int j = i - 1; + while (j >= 0 && A[j] > key) { + A[j + 1] = A[j]; + j--; + } + A[j + 1] = key; + } +} + int CompareLong(const void *a, const void *b) { const long *x = a; const long *y = b; @@ -145,14 +157,14 @@ int CompareInt(const void *a, const void *b) { return 0; } -TEST(_intsort, test) { +TEST(InsertionSort, test) { size_t n = 5000; int *a = gc(calloc(n, sizeof(int))); int *b = gc(calloc(n, sizeof(int))); rngset(a, n * sizeof(int), 0, 0); memcpy(b, a, n * sizeof(int)); qsort(a, n, sizeof(int), CompareInt); - _intsort(b, n); + InsertionSort(b, n); ASSERT_EQ(0, memcmp(b, a, n * sizeof(int))); } @@ -218,13 +230,14 @@ TEST(radix_sort_int32, test) { ASSERT_EQ(0, memcmp(b, a, n * sizeof(int))); } -BENCH(_intsort, bench) { +BENCH(InsertionSort, bench) { printf("\n"); size_t n = 10000; int *p1 = gc(malloc(n * sizeof(int))); int *p2 = gc(malloc(n * sizeof(int))); rngset(p1, n * sizeof(int), 0, 0); - EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n)); + EZBENCH2("InsertionSort", memcpy(p2, p1, n * sizeof(int)), + InsertionSort(p2, n)); #ifdef __x86_64__ if (X86_HAVE(AVX2)) { EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)), diff --git a/test/libc/tinymath/remainder_test.c b/test/libc/tinymath/remainder_test.c index c8eb912f5..bf5138bbe 100644 --- a/test/libc/tinymath/remainder_test.c +++ b/test/libc/tinymath/remainder_test.c @@ -24,7 +24,6 @@ #include "libc/runtime/runtime.h" #include "libc/testlib/ezbench.h" #include "libc/testlib/testlib.h" -#include "libc/tinymath/tinymath.h" #include "libc/x/x.h" float remainderf2(float, float); diff --git a/third_party/ggml/README.cosmo b/third_party/ggml/README.cosmo index 8afbd578e..f71191337 100644 --- a/third_party/ggml/README.cosmo +++ b/third_party/ggml/README.cosmo @@ -27,3 +27,5 @@ LOCAL CHANGES - Refactor headers per cosmo convention - Replace code like 'ggjt' with READ32BE("ggjt") - Remove C++ exceptions; use Die() function instead + - Removed division from matrix multiplication. + - Let quantizer convert between ggmt formats diff --git a/third_party/ggml/common.cc b/third_party/ggml/common.cc index e9f95ed1f..d780d4f9f 100644 --- a/third_party/ggml/common.cc +++ b/third_party/ggml/common.cc @@ -34,6 +34,7 @@ #include "libc/stdio/stdio.h" #include "libc/str/str.h" #include "libc/sysv/consts/fileno.h" +#include "third_party/ggml/llama_util.h" #include "third_party/libcxx/algorithm" #include "third_party/libcxx/cassert" #include "third_party/libcxx/cstring" @@ -50,13 +51,6 @@ Copyright (c) 2023 Georgi Gerganov\""); asm(".include \"libc/disclaimer.inc\""); // clang-format off -static bool is_integer_str(const char *s) { - if (*s == '-') ++s; - if (!*s) return false; - while (isdigit(*s)) ++s; - return !*s; -} - static std::string replace_all(std::string const& original, std::string const& before, std::string const& after) { @@ -92,7 +86,7 @@ static bool append_file_to_prompt(const char *path, gpt_params & params) { } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - params.n_threads = std::min(20, std::max(1, (int)(_getcpucount() * 0.75))); + params.n_threads = std::min(20, std::max(1, _getcpucount() >> 1)); bool invalid_param = false; std::string arg; diff --git a/third_party/ggml/common.h b/third_party/ggml/common.h index cffd92363..8cd4562ba 100644 --- a/third_party/ggml/common.h +++ b/third_party/ggml/common.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- c++; c-basic-offset:4 -*- #ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_ #define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_ #include "libc/calls/struct/termios.h" @@ -21,7 +21,7 @@ struct gpt_params { int32_t seed = -1; // RNG seed int32_t verbose = 0; // Logging verbosity - int32_t n_threads = std::min(1, (int)(_getcpucount() * 0.75)); + int32_t n_threads = std::max(1, _getcpucount() >> 1); int32_t n_predict = -1; // new tokens to predict int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size diff --git a/third_party/ggml/fp16.c b/third_party/ggml/fp16.c index 2491addd9..71d46d09c 100644 --- a/third_party/ggml/fp16.c +++ b/third_party/ggml/fp16.c @@ -78,7 +78,15 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) { } void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { - for (size_t i = 0; i < n; i++) { + size_t i = 0; +#ifdef __F16C__ + for (; i + 7 < n; i += 8) { + __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i)); + __m256 y_vec = _mm256_cvtph_ps(x_vec); + _mm256_storeu_ps(y + i, y_vec); + } +#endif + for (; i < n; i++) { y[i] = GGML_FP16_TO_FP32(x[i]); } } diff --git a/third_party/ggml/fp16.h b/third_party/ggml/fp16.h index 37e746bc2..c544a242b 100644 --- a/third_party/ggml/fp16.h +++ b/third_party/ggml/fp16.h @@ -3,9 +3,6 @@ #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ -#define GGML_GELU_FP16 -#define GGML_SILU_FP16 - #ifdef __ARM_NEON // we use the built-in 16-bit float type typedef __fp16 ggml_fp16_t; diff --git a/third_party/ggml/fp16.internal.h b/third_party/ggml/fp16.internal.h index 17fe2b99d..b59e28825 100644 --- a/third_party/ggml/fp16.internal.h +++ b/third_party/ggml/fp16.internal.h @@ -8,6 +8,9 @@ #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ +#define GGML_GELU_FP16 +#define GGML_SILU_FP16 + extern ggml_fp16_t table_gelu_f16[1 << 16]; extern ggml_fp16_t table_silu_f16[1 << 16]; extern ggml_fp16_t table_exp_f16[1 << 16]; diff --git a/third_party/ggml/ggjt.v1.q4_0.c b/third_party/ggml/ggjt.v1.q4_0.c index c445f67a2..3c9476422 100644 --- a/third_party/ggml/ggjt.v1.q4_0.c +++ b/third_party/ggml/ggjt.v1.q4_0.c @@ -613,23 +613,37 @@ void ggml_vec_dot_v1_q4_0_q8_0(const int n, float * restrict s, const void * res __m256 acc = _mm256_setzero_ps(); // Main loop - for (int i = 0; i < nb; ++i) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( d, q, acc ); +#define WORK(I) \ + /* Compute combined scale for the block */ \ + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \ + __m256i bx = bytes_from_nibbles_32(x[I].qs); \ + /* Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */ \ + const __m256i off = _mm256_set1_epi8( 8 ); \ + bx = _mm256_sub_epi8( bx, off ); \ + __m256i by = _mm256_loadu_si256((const __m256i *)y[I].qs); \ + const __m256 q = mul_sum_i8_pairs_float(bx, by); \ + /* Multiply q with scale and accumulate */ \ + acc = _mm256_fmadd_ps( d, q, acc ) + int i = 0; + for (; i + 12 < nb; i += 12) { + _mm_prefetch(x+i+12, 3); + _mm_prefetch(x+i+15, 3); + _mm_prefetch(x+i+18, 3); + _mm_prefetch(x+i+21, 3); + _mm_prefetch(y+i+12, 3); + _mm_prefetch(y+i+14, 3); + _mm_prefetch(y+i+16, 3); + _mm_prefetch(y+i+18, 3); + _mm_prefetch(y+i+20, 3); + _mm_prefetch(y+i+22, 3); + for (int j = 0; j < 12; ++j) { + WORK(i+j); + } } + for (; i < nb; ++i) { + WORK(i); + } +#undef WORK *s = hsum_float_8(acc); #elif defined(__AVX__) diff --git a/third_party/ggml/ggml.c b/third_party/ggml/ggml.c index 6a3f07401..603bafa20 100644 --- a/third_party/ggml/ggml.c +++ b/third_party/ggml/ggml.c @@ -1784,9 +1784,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); - // // Main loop - // #define WORK(I) \ /* Compute combined scale for the block */ \ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \ @@ -2702,9 +2700,15 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE - ggml_float sum = 0.0; - for (int i = 0; i < n; ++i) { - sum += (ggml_float)x[i]; + int i = 0; + ggml_float sum = 0; +#if __AVX__ || __AVX2__ || __AVX512F__ + for (; i + 8 <= n; i += 8) { + sum += hsum_float_8(_mm256_loadu_ps(x + i)); + } +#endif + for (; i < n; ++i) { + sum += x[i]; } *s = sum; #else @@ -2802,6 +2806,7 @@ const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = "f16", [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", + [GGML_TYPE_Q4_2] = "q4_2", [GGML_TYPE_Q5_0] = "q5_0", [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", @@ -8113,7 +8118,7 @@ static void ggml_compute_forward_alibi_f32( assert(ne1 + n_past == ne0); (void) n_past; // add alibi to src0 (KQ_scaled) - const int n_heads_log2_floor = 1 << _bsr(n_head); + const int n_heads_log2_floor = 1 << _bsr(n_head); // [jart] const float m0 = exp2f(-8.0f / n_heads_log2_floor); const float m1 = exp2f(-4.0f / n_heads_log2_floor); diff --git a/third_party/ggml/ggml.h b/third_party/ggml/ggml.h index c8f4dc138..5cf9e4de6 100644 --- a/third_party/ggml/ggml.h +++ b/third_party/ggml/ggml.h @@ -1,3 +1,4 @@ +// -*- c; c-basic-offset:4 -*- #ifndef COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ #define COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ #if !(__ASSEMBLER__ + __LINKER__ + 0) diff --git a/third_party/ggml/ggml.mk b/third_party/ggml/ggml.mk index e1c768350..3aee03ac9 100644 --- a/third_party/ggml/ggml.mk +++ b/third_party/ggml/ggml.mk @@ -128,6 +128,7 @@ THIRD_PARTY_GGML_LLAMA_DIRECTDEPS = \ LIBC_STR \ LIBC_STUBS \ LIBC_SYSV \ + LIBC_SYSV_CALLS \ LIBC_THREAD \ LIBC_TINYMATH \ LIBC_ZIPOS \ @@ -180,6 +181,7 @@ o/$(MODE)/third_party/ggml/companionai.txt.zip.o: private \ THIRD_PARTY_GGML_COMS = \ $(THIRD_PARTY_GGML_LLAMA) \ + o/$(MODE)/third_party/ggml/quantize.com \ o/$(MODE)/third_party/ggml/perplexity.com THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg) diff --git a/third_party/ggml/llama.cc b/third_party/ggml/llama.cc index f104bf962..610f54b22 100644 --- a/third_party/ggml/llama.cc +++ b/third_party/ggml/llama.cc @@ -31,6 +31,7 @@ #include "libc/intrin/bits.h" #include "libc/macros.internal.h" #include "libc/stdio/stdio.h" +#include "libc/sysv/consts/posix.h" #include "third_party/ggml/fp16.h" #include "third_party/ggml/ggml.h" #include "third_party/ggml/llama_util.h" @@ -443,8 +444,9 @@ struct llama_file_loader { llama_hparams hparams; llama_vocab vocab; - llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) - : file(fname, "rb") { + llama_file_loader(const char * fname, size_t file_idx, + llama_load_tensors_map & tensors_map) + : file(fname, "rb") { // fprintf(stderr, "llama.cpp: loading model from %s\n", fname); read_magic(); read_hparams(); @@ -568,8 +570,9 @@ struct llama_file_saver { write_vocab(); } void write_magic() { + ggjt_v2(); file.write_u32(READ32BE("ggjt")); // magic - file.write_u32(1); // version + file.write_u32(2); // version } void write_hparams(enum llama_ftype new_ftype) { const llama_hparams & hparams = any_file_loader->hparams; @@ -2003,16 +2006,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), ggml_type_name(tensor.type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? - - // quantize only 2D tensors - quantize &= (tensor.ne.size() == 2); - - // uncomment this to keep the output layer in FP16 - //if (tensor.name == "output.weight") { - // quantize = false; - //} + // only quantize 2d weights that aren't the output layer + bool quantize = + tensor.ne.size() == 2 && + tensor.type != quantized_type && + _endswith(tensor.name.c_str(), "weight") && + tensor.name != "output.weight"; enum ggml_type new_type; void * new_data; @@ -2024,6 +2023,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_data = tensor.data; new_size = tensor.size; printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + } else if (quantized_type == GGML_TYPE_F16) { + GGML_ASSERT(tensor.type == GGML_TYPE_F32); + size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); + new_type = quantized_type; + new_size = nelements * 2; + work.resize(new_size); + new_data = work.addr; + ggml_fp32_to_fp16_row((const float *)tensor.data, (ggml_fp16_t *)new_data, nelements); } else { new_type = quantized_type; float * f32_data; diff --git a/third_party/ggml/llama.h b/third_party/ggml/llama.h index 041794768..9c28a60dc 100644 --- a/third_party/ggml/llama.h +++ b/third_party/ggml/llama.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- c++; c-basic-offset:4 -*- #ifndef LLAMA_H #define LLAMA_H #include "libc/intrin/bits.h" diff --git a/third_party/ggml/llama_util.h b/third_party/ggml/llama_util.h index 05184945d..ae43af616 100755 --- a/third_party/ggml/llama_util.h +++ b/third_party/ggml/llama_util.h @@ -1,12 +1,11 @@ -// Internal header to be included only by llama.cpp. -// Contains wrappers around OS interfaces. - +// -*- c++; c-basic-offset:4 -*- #ifndef LLAMA_UTIL_H #define LLAMA_UTIL_H #include "libc/calls/struct/rlimit.h" #include "libc/dce.h" #include "libc/fmt/fmt.h" #include "libc/runtime/sysconf.h" +#include "libc/str/str.h" #include "libc/sysv/consts/madv.h" #include "libc/sysv/consts/map.h" #include "libc/sysv/consts/prot.h" @@ -22,6 +21,9 @@ #include "third_party/libcxx/vector" // clang-format off +// Internal header to be included only by llama.cpp. +// Contains wrappers around OS interfaces. + #define LLAMA_ASSERT(x) \ do { \ if (!(x)) { \ @@ -47,6 +49,13 @@ static void Die(const char *fmt, ...) { exit(1); } +static inline bool is_integer_str(const char *s) { + if (*s == '-') ++s; + if (!*s) return false; + while (isdigit(*s)) ++s; + return !*s; +} + struct llama_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; diff --git a/third_party/ggml/main.cc b/third_party/ggml/main.cc index cb8f1b845..8a399f825 100644 --- a/third_party/ggml/main.cc +++ b/third_party/ggml/main.cc @@ -28,6 +28,7 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/assert.h" #include "libc/calls/calls.h" +#include "libc/calls/struct/sched_param.h" #include "libc/calls/struct/sigaction.h" #include "libc/calls/struct/stat.h" #include "libc/fmt/fmt.h" @@ -37,9 +38,11 @@ #include "libc/nexgen32e/x86feature.h" #include "libc/runtime/runtime.h" #include "libc/stdio/stdio.h" +#include "libc/sysv/consts/ioprio.h" #include "libc/sysv/consts/map.h" #include "libc/sysv/consts/msync.h" #include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/prio.h" #include "libc/sysv/consts/prot.h" #include "libc/sysv/consts/sig.h" #include "third_party/ggml/common.h" @@ -66,7 +69,6 @@ static int n_past; static int n_remain; static int n_consumed; static bool input_noecho; -static bool is_antiprompt; //////////////////////////////////////////////////////////////////////////////// @@ -103,7 +105,7 @@ static int CompareTime(struct timespec a, struct timespec b) { //////////////////////////////////////////////////////////////////////////////// // ux explanatory logging for llama.com developers -#if 1 +#if 0 #define DEVLOG(...) (void)0 #else #define DEVLOG(...) if (g_devlog) fprintf(g_devlog, __VA_ARGS__) @@ -187,16 +189,16 @@ static bool has_antiprompt(std::string::size_type *out_index = nullptr, static void finish_initializing_prompt() { prompt_status = kPromptFinished; if (params.interactive) { - std::string::size_type pos; + std::string::size_type ap_index; is_interacting = true; - if (has_antiprompt(&pos)) { + if (has_antiprompt(&ap_index)) { console_set_color(con_st, CONSOLE_COLOR_PROMPT); - printf("%s", last_output.substr(pos).c_str()); - last_output.clear(); + printf("%s", last_output.substr(ap_index).c_str()); fflush(stdout); } console_set_color(con_st, CONSOLE_COLOR_USER_INPUT); } + last_output.clear(); } //////////////////////////////////////////////////////////////////////////////// @@ -208,8 +210,16 @@ static int on_missing_feature(const char *name) { return 1; } +void MakeProcessNice(void) { + setpriority(PRIO_PROCESS, 0, 10); + ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + struct sched_param param = {sched_get_priority_min(SCHED_IDLE)}; + sched_setscheduler(0, SCHED_IDLE, ¶m); +} + int main(int argc, char ** argv) { + MakeProcessNice(); ShowCrashReports(); setvbuf(stdin, NULL, _IONBF, 0); setvbuf(stdout, NULL, _IONBF, 0); @@ -439,8 +449,7 @@ int main(int argc, char ** argv) { remember_init(); - is_antiprompt = false; - input_noecho = params.verbose <= 0; + input_noecho = params.verbose <= 0; n_past = 0; n_remain = params.n_predict; @@ -561,6 +570,9 @@ int main(int argc, char ** argv) { fprintf(stderr, EPHEMERAL("loading weights...")); } + // tracks if last character written to stdout was newline + bool got_newline = false; + while ((n_remain != 0 || params.interactive) && !is_terminated) { // perform evaluation @@ -678,7 +690,8 @@ int main(int argc, char ** argv) { finish_initializing_prompt(); } - if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + if (prompt_status == kPromptFinished && + (int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token DEVLOG("out of user input, sample next token w/ embd_inp.size()=%d n_consumed=%d\n", (int)embd_inp.size(), n_consumed); @@ -808,21 +821,25 @@ int main(int argc, char ** argv) { // --prompt 'Question: How old are you?\nAnswer: ' // --reverse-prompt $'\n' // + bool is_antiprompt; std::string ap_text; - std::string::size_type ap_index; std::string::size_type ap_extra; - is_antiprompt = has_antiprompt(&ap_index, &ap_text); + std::string::size_type ap_index; + if (prompt_status == kPromptFinished) { + is_antiprompt = has_antiprompt(&ap_index, &ap_text); + } else { + is_antiprompt = false; + } // display text - bool got_newline = false; - if (!input_noecho) { + if (!input_noecho && embd.size()) { std::string printme; for (auto id : embd) { printme.append(llama_token_to_str(ctx, id)); } if (is_antiprompt) { - ap_extra = last_output.size() - (ap_index + ap_text.size()); - printme.erase(printme.size() - MIN(printme.size(), ap_extra)); + ap_extra = last_output.size() - ap_index; + printme.erase(std::max(0, (int)(printme.size() - ap_extra))); } if (printme.size()) { got_newline = printme[printme.size() - 1] == '\n'; @@ -832,6 +849,7 @@ int main(int argc, char ** argv) { } if (is_antiprompt) { if (!params.interactive) { + DEVLOG("exiting due to antiprompt\n"); if (!got_newline) { printf("\n"); } diff --git a/third_party/ggml/quantize.cc b/third_party/ggml/quantize.cc index fe5a0c8f5..1472051bb 100644 --- a/third_party/ggml/quantize.cc +++ b/third_party/ggml/quantize.cc @@ -25,9 +25,12 @@ │ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ │ │ ╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/log/log.h" +#include "libc/runtime/runtime.h" #include "third_party/ggml/common.h" #include "third_party/ggml/ggml.h" #include "third_party/ggml/llama.h" +#include "third_party/ggml/llama_util.h" #include "third_party/libcxx/map" #include "third_party/libcxx/vector" @@ -38,46 +41,26 @@ asm(".include \"libc/disclaimer.inc\""); // clang-format off static const std::map LLAMA_FTYPE_MAP = { - {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0}, - {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1}, - {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2}, - {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0}, - {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1}, - {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0}, + {"f16", LLAMA_FTYPE_MOSTLY_F16 }, + {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0}, + {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1}, + {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2}, + {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0}, + {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1}, + {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0}, }; -bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) { - auto it = LLAMA_FTYPE_MAP.find(ftype_str); - if (it != LLAMA_FTYPE_MAP.end()) { - ftype = it->second; - ftype_str_out = it->first; - return true; - } - // try to parse as an integer - // try { - int ftype_int = std::stoi(ftype_str); - for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) { - if (it->second == ftype_int) { - ftype = it->second; - ftype_str_out = it->first; - return true; - } - } - // } - // catch (...) { - // // stoi failed - // } - return false; -} - // usage: -// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads] +// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type [nthreads] // int main(int argc, char ** argv) { + ShowCrashReports(); + + ggjt_v2(); ggml_time_init(); if (argc < 3) { - fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]); + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthreads]\n", argv[0]); for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) { fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second); } @@ -91,60 +74,27 @@ int main(int argc, char ** argv) { ggml_free(ctx); } - // parse command line arguments const std::string fname_inp = argv[1]; - std::string fname_out; - int nthread; - llama_ftype ftype; + const std::string fname_out = argv[2]; - int arg_idx = 2; - std::string ftype_str; - if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) { - // argv[2] is the ftype - std::string fpath; - const size_t pos = fname_inp.find_last_of('/'); - if (pos != std::string::npos) { - fpath = fname_inp.substr(0, pos + 1); - } - // export as [inp path]/ggml-model-[ftype].bin - fname_out = fpath + "ggml-model-" + ftype_str + ".bin"; - arg_idx++; - } - else { - // argv[2] is the output path - fname_out = argv[arg_idx]; - arg_idx++; - - if (argc <= arg_idx) { - fprintf(stderr, "%s: missing ftype\n", __func__); - return 1; - } - // argv[3] is the ftype - if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) { - fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); - return 1; - } - arg_idx++; + if (fname_inp == fname_out) { + fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str()); + exit(1); } - // parse nthreads - if (argc > arg_idx) { - // try { - nthread = std::stoi(argv[arg_idx]); - // } - // catch (const std::exception & e) { - // Die("%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what()); - // return 1; - // } + enum llama_ftype ftype; + if (!is_integer_str(argv[3])) { + auto it = LLAMA_FTYPE_MAP.find(argv[3]); + if (it == LLAMA_FTYPE_MAP.end()) { + fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]); + return 1; + } + ftype = it->second; } else { - nthread = 0; + ftype = (enum llama_ftype)atoi(argv[3]); } - fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); - if (nthread > 0) { - fprintf(stderr, " using %d threads", nthread); - } - fprintf(stderr, "\n"); + int nthread = argc > 4 ? atoi(argv[4]) : 0; const int64_t t_main_start_us = ggml_time_us(); diff --git a/third_party/intel/f16cintrin.internal.h b/third_party/intel/f16cintrin.internal.h index 67337d68c..71f09ec9e 100644 --- a/third_party/intel/f16cintrin.internal.h +++ b/third_party/intel/f16cintrin.internal.h @@ -18,10 +18,18 @@ __funline float _cvtsh_ss(unsigned short __S) { return __builtin_ia32_vec_ext_v4sf(__A, 0); } +/** + * Converts four half-precision (16-bit) floating point values to + * single-precision floating point values. + */ __funline __m128 _mm_cvtph_ps(__m128i __A) { return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A); } +/** + * Converts eight half-precision (16-bit) floating point values to + * single-precision floating point values. + */ __funline __m256 _mm256_cvtph_ps(__m128i __A) { return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A); } @@ -37,6 +45,10 @@ __funline __m128i _mm_cvtps_ph(__m128 __A, const int __I) { return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I); } +/** + * Converts eight single-precision floating point values to + * half-precision (16-bit) floating point values. + */ __funline __m128i _mm256_cvtps_ph(__m256 __A, const int __I) { return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I); } diff --git a/third_party/radpajama/common-gptneox.h b/third_party/radpajama/common-gptneox.h index 004a95d5e..05e285617 100644 --- a/third_party/radpajama/common-gptneox.h +++ b/third_party/radpajama/common-gptneox.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- c++; c-basic-offset:4 -*- #ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_ #define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_ #include "libc/macros.internal.h" diff --git a/third_party/radpajama/gptneox-util.h b/third_party/radpajama/gptneox-util.h index c05b3333b..817ef6066 100644 --- a/third_party/radpajama/gptneox-util.h +++ b/third_party/radpajama/gptneox-util.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- c++; c-basic-offset:4 -*- #ifndef GPTNEOX_UTIL_H #define GPTNEOX_UTIL_H #include "libc/calls/calls.h" diff --git a/third_party/radpajama/gptneox.cc b/third_party/radpajama/gptneox.cc index bf142851b..66152e1e4 100644 --- a/third_party/radpajama/gptneox.cc +++ b/third_party/radpajama/gptneox.cc @@ -28,6 +28,8 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "third_party/radpajama/gptneox.h" #include "libc/intrin/bits.h" +#include "libc/str/str.h" +#include "libc/sysv/consts/posix.h" #include "third_party/ggml/fp16.h" #include "third_party/ggml/ggml.h" #include "third_party/ggml/llama_util.h" @@ -77,7 +79,7 @@ static const size_t MiB = 1024*1024; // needs modifications in ggml // TODO: Modify for gptneox, how are these values actually determined? -// TODO: This is now priority, +// TODO: This is now priority, static const std::map & MEM_REQ_SCRATCH0() { static std::map _MEM_REQ_SCRATCH0 = { @@ -446,7 +448,8 @@ struct gptneox_load_tensors_map { enum gptneox_file_version { GPTNEOX_FILE_VERSION_GGML, GPTNEOX_FILE_VERSION_GGMF_V1, // added version field and scores in vocab - GPTNEOX_FILE_VERSION_GGJT_V1, // added padding + GPTNEOX_FILE_VERSION_GGJT_V1, // adopted unified aligned mappable layout + GPTNEOX_FILE_VERSION_GGJT_V2, // changed quantization format }; struct gptneox_file_loader { @@ -473,10 +476,16 @@ struct gptneox_file_loader { if (magic == READ32BE("ggml") && version == 0) { file_version = GPTNEOX_FILE_VERSION_GGML; + ggjt_v1(); } else if (magic == READ32BE("ggmf") && version == 1) { file_version = GPTNEOX_FILE_VERSION_GGMF_V1; + ggjt_v1(); } else if (magic == READ32BE("ggjt") && version == 1) { file_version = GPTNEOX_FILE_VERSION_GGJT_V1; + ggjt_v1(); + } else if (magic == READ32BE("ggjt") && version == 2) { + file_version = GPTNEOX_FILE_VERSION_GGJT_V2; + ggjt_v2(); } else { Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", magic, version); @@ -566,17 +575,20 @@ struct gptneox_file_loader { struct gptneox_file_saver { gptneox_file file; gptneox_file_loader * any_file_loader; - gptneox_file_saver(const char * fname, gptneox_file_loader * any_file_loader, enum gptneox_ftype new_ftype) - : file(fname, "wb"), any_file_loader(any_file_loader) { + gptneox_file_saver(const char * fname, + gptneox_file_loader * any_file_loader, + enum gptneox_ftype new_ftype) + : file(fname, "wb"), + any_file_loader(any_file_loader) { fprintf(stderr, "gptneox.cpp: saving model to %s\n", fname); - ggjt_v1(); write_magic(); write_hparams(new_ftype); write_vocab(); } void write_magic() { + ggjt_v2(); file.write_u32(READ32BE("ggjt")); // magic - file.write_u32(1); // version + file.write_u32(2); // version } void write_hparams(enum gptneox_ftype new_ftype) { const gptneox_hparams & hparams = any_file_loader->hparams; @@ -887,7 +899,8 @@ static const char *gptneox_file_version_name(gptneox_file_version version) { switch (version) { case GPTNEOX_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; case GPTNEOX_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)"; + case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; + case GPTNEOX_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; default: GPTNEOX_ASSERT(false); } } @@ -940,7 +953,7 @@ static void gptneox_model_load_internal( model.hparams = ml->file_loaders.at(0)->hparams; gptneox_file_version file_version = ml->file_loaders.at(0)->file_version; auto & hparams = model.hparams; - + { switch (hparams.n_layer) { case 16: { @@ -951,7 +964,7 @@ static void gptneox_model_load_internal( } break; } - // # : we extend the model type settings for RedPajama models. + // # : we extend the model type settings for RedPajama models. case 32:{ if (hparams.n_embd == 2560) { model.type = e_model::MODEL_3B; @@ -1195,7 +1208,7 @@ static bool gptneox_eval_internal( model.layers[il].c_attn_attn_b, cur), cur); } - + // Split QKV and make contiguous struct ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd/n_head, @@ -1225,7 +1238,7 @@ static bool gptneox_eval_internal( ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)); Vcur = ggml_cpy(ctx0, Vcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)); - + // MARK: gptneox RoPE Q and K, before cache // Bit 2 for gptneox style (2) // Bit 1 is zero for dont skip n_past +(0), use (2+1) = (3) if rope is applied to cache of k (after cache only) @@ -1241,7 +1254,7 @@ static bool gptneox_eval_internal( ggml_element_size(Vcur) * n_embd, 0); Vcur = ggml_transpose(ctx0, Vcur); - + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_embd * N, // num elements in current context (up to n_embd*n_ctx but usually less) ggml_element_size(kv_self.k) * n_embd * (il * n_ctx + n_past)); @@ -1250,12 +1263,12 @@ static bool gptneox_eval_internal( n_embd, ggml_element_size(kv_self.v) * n_ctx, ggml_element_size(kv_self.v) * ((il * n_ctx * n_embd) + n_past)); - + // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); //} - + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) struct ggml_tensor * Q = ggml_permute(ctx0, @@ -1284,7 +1297,7 @@ static bool gptneox_eval_internal( struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V_trans = ggml_view_3d(ctx0, kv_self.v, n_past + N, @@ -1312,10 +1325,10 @@ static bool gptneox_eval_internal( } lctx.use_buf(ctx0, 1); - + if (hparams.use_parallel_residual == 1) { //printf("use_parallel_residual == 1\n"); - + // This is independent of the self-attention result, so it could be done in parallel to the self-attention struct ggml_tensor * outAttn = cur; @@ -1359,7 +1372,7 @@ static bool gptneox_eval_internal( inpL = ggml_add(ctx0, inpL, cur); } else if (hparams.use_parallel_residual == 0) { //printf("use_parallel_residual == 0\n"); - + // This takes the self-attention residual output as input to Feedforward struct ggml_tensor * outAttn = cur; struct ggml_tensor * inpFF = ggml_add(ctx0, outAttn, inpL); @@ -2093,6 +2106,7 @@ int gptneox_model_copy( static void gptneox_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum gptneox_ftype ftype, int nthread) { ggml_type quantized_type; switch (ftype) { + case GPTNEOX_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; case GPTNEOX_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; case GPTNEOX_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; case GPTNEOX_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; @@ -2124,21 +2138,17 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const tensor.data = read_data.addr; model_loader->load_data_for(tensor); - printf("[%4zu/%4zu] %36s - %16s, type = %6s, ", + printf("[%4zu/%4zu] %50s - %16s, type = %6s, ", ++idx, model_loader->tensors_map.tensors.size(), tensor.name.c_str(), gptneox_format_tensor_shape(tensor.ne).c_str(), ggml_type_name(tensor.type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? - - // quantize only 2D tensors - quantize &= (tensor.ne.size() == 2); - - // uncomment this to keep the output layer in FP16 - //if (tensor.name == "output.weight") { - // quantize = false; - //} + // only quantize 2d weights that aren't the output layer + bool quantize = + tensor.ne.size() == 2 && + tensor.type != quantized_type && + _endswith(tensor.name.c_str(), "weight") && + tensor.name != "output.weight"; enum ggml_type new_type; void * new_data; @@ -2150,6 +2160,14 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const new_data = tensor.data; new_size = tensor.size; printf("size = %8.3f MiB\n", tensor.size/1024.0/1024.0); + } else if (quantized_type == GGML_TYPE_F16) { + GPTNEOX_ASSERT(tensor.type == GGML_TYPE_F32); + size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); + new_type = quantized_type; + new_size = nelements * 2; + work.resize(new_size); + new_data = work.addr; + ggml_fp32_to_fp16_row((const float *)tensor.data, (ggml_fp16_t *)new_data, nelements); } else { new_type = quantized_type; float * f32_data; diff --git a/third_party/radpajama/gptneox.h b/third_party/radpajama/gptneox.h index 25755e127..aaf62fcc9 100644 --- a/third_party/radpajama/gptneox.h +++ b/third_party/radpajama/gptneox.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- c++; c-basic-offset:4 -*- #ifndef GPTNEOX_H #define GPTNEOX_H // clang-format off diff --git a/third_party/radpajama/quantize-gptneox.cc b/third_party/radpajama/quantize-gptneox.cc index 4d68f8325..826117bae 100644 --- a/third_party/radpajama/quantize-gptneox.cc +++ b/third_party/radpajama/quantize-gptneox.cc @@ -28,6 +28,7 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/log/log.h" #include "third_party/ggml/ggml.h" +#include "third_party/ggml/llama_util.h" #include "third_party/libcxx/cstdio" #include "third_party/libcxx/map" #include "third_party/libcxx/string" @@ -35,13 +36,14 @@ // clang-format off static const std::map GPTNEOX_FTYPE_MAP = { - {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0}, - {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1}, - {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2}, - //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3}, - {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0}, - {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1}, - {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0}, + {"f16", GPTNEOX_FTYPE_MOSTLY_F16}, + {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0}, + {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1}, + {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2}, + //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3}, + {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0}, + {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1}, + {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0}, }; // usage: @@ -50,7 +52,7 @@ static const std::map GPTNEOX_FTYPE_MAP = { int main(int argc, char ** argv) { ShowCrashReports(); - ggjt_v1(); + ggjt_v2(); ggml_time_init(); if (argc < 4) { @@ -71,8 +73,13 @@ int main(int argc, char ** argv) { const std::string fname_inp = argv[1]; const std::string fname_out = argv[2]; + if (fname_inp == fname_out) { + fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str()); + exit(1); + } + enum gptneox_ftype ftype; - if (argv[3][0] == 'q') { + if (!is_integer_str(argv[3])) { auto it = GPTNEOX_FTYPE_MAP.find(argv[3]); if (it == GPTNEOX_FTYPE_MAP.end()) { fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]); diff --git a/tool/emacs/cosmo-c-builtins.el b/tool/emacs/cosmo-c-builtins.el index e54408191..ae255a493 100644 --- a/tool/emacs/cosmo-c-builtins.el +++ b/tool/emacs/cosmo-c-builtins.el @@ -72,6 +72,7 @@ "__builtin_extract_return_addr" "__builtin_isnan" "__builtin_signbit" + "__builtin_signbitf" "__builtin_signbitl" "__builtin_ffs" "__builtin_ffsl"