Make more ML improvements

- Fix UX issues with llama.com
- Do housekeeping on libm code
- Add more vectorization to GGML
- Get GGJT quantizer programs working well
- Have the quantizer keep the output layer as f16
- Prefetching improves performance by ~15% when using fewer threads (a sketch of the pattern follows this list)
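A minimal sketch of the software-prefetch pattern this commit adds to the GGML dot products, assuming an x86-64 compiler with SSE; the function name, block size, and prefetch distance here are illustrative rather than the exact values used in ggml.c:

#include <xmmintrin.h>
#include <stddef.h>

static float sum_with_prefetch(const float *x, size_t n) {
    float s = 0;
    size_t i = 0;
    for (; i + 16 <= n; i += 16) {
        // hint the cache hierarchy to start loading data a few blocks ahead;
        // prefetch may run past the end of the array, which is harmless
        _mm_prefetch((const char *)(x + i + 64), _MM_HINT_T0);
        for (size_t j = 0; j < 16; ++j) s += x[i + j];
    }
    for (; i < n; ++i) s += x[i];  // scalar tail
    return s;
}

The benefit presumably depends on how saturated memory bandwidth already is, which would explain why the ~15% win shows up at lower thread counts.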
This commit is contained in:
Justine Tunney 2023-05-16 08:07:23 -07:00
parent 80db9de173
commit e7eb0b3070
GPG key ID: BE714B4575D6E328
46 changed files with 340 additions and 289 deletions

View file

@ -95,6 +95,6 @@ __attribute__((__constructor__)) static void _getcpucount_init(void) {
*
* @return cpu count or 0 if it couldn't be determined
*/
unsigned _getcpucount(void) {
int _getcpucount(void) {
return g_cpucount;
}

View file

@ -2,6 +2,7 @@
#define COSMOPOLITAN_LIBC_CALLS_STRUCT_SCHED_PARAM_H_
#include "libc/calls/struct/timespec.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
struct sched_param {
int32_t sched_priority;
@ -15,5 +16,6 @@ int sched_rr_get_interval(int, struct timespec *);
int sched_setparam(int, const struct sched_param *);
int sched_setscheduler(int, int, const struct sched_param *);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_SCHED_PARAM_H_ */

View file

@ -26,10 +26,10 @@
* signed zeroes.
*/
double fmax(double x, double y) {
if (__builtin_isnan(x)) return y;
if (__builtin_isnan(y)) return x;
if (__builtin_signbit(x) != __builtin_signbit(y)) {
return __builtin_signbit(x) ? y : x; /* C99 Annex F.9.9.2 */
if (isnan(x)) return y;
if (isnan(y)) return x;
if (signbit(x) != signbit(y)) {
return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */
}
return x < y ? y : x;
}

View file

@ -26,10 +26,10 @@
* signed zeroes.
*/
float fmaxf(float x, float y) {
if (__builtin_isnan(x)) return y;
if (__builtin_isnan(y)) return x;
if (__builtin_signbitf(x) != __builtin_signbitf(y)) {
return __builtin_signbitf(x) ? y : x; /* C99 Annex F.9.9.2 */
if (isnan(x)) return y;
if (isnan(y)) return x;
if (signbit(x) != signbit(y)) {
return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */
}
return x < y ? y : x;
}

View file

@ -27,10 +27,10 @@
* signed zeroes.
*/
long double fmaxl(long double x, long double y) {
if (__builtin_isnan(x)) return y;
if (__builtin_isnan(y)) return x;
if (__builtin_signbitl(x) != __builtin_signbitl(y)) {
return __builtin_signbitl(x) ? y : x; /* C99 Annex F.9.9.2 */
if (isnan(x)) return y;
if (isnan(y)) return x;
if (signbit(x) != signbit(y)) {
return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */
}
return x < y ? y : x;
}

View file

@ -88,7 +88,6 @@ typedef double double_t;
#define isnan(x) __builtin_isnan(x)
#define isfinite(x) __builtin_isfinite(x)
#define isnormal(x) __builtin_isnormal(x)
#define signbit(x) __builtin_signbit(x)
#define isgreater(x, y) __builtin_isgreater(x, y)
#define isgreaterequal(x, y) __builtin_isgreaterequal(x, y)
#define isless(x, y) __builtin_isless(x, y)
@ -99,6 +98,11 @@ typedef double double_t;
#define fpclassify(x) \
__builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, x)
#define signbit(x) \
(sizeof(x) == sizeof(double) ? __builtin_signbit(x) \
: sizeof(x) == sizeof(float) ? __builtin_signbitf(x) \
: __builtin_signbitl(x))
extern int signgam;
double acos(double);
@ -305,7 +309,7 @@ void sincos(double, double *, double *);
void sincosf(float, float *, float *);
void sincosl(long double, long double *, long double *);
float fsumf(const float *, size_t);
double fsumf(const float *, size_t);
double fsum(const double *, size_t);
double j0(double);
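A small usage sketch of the sizeof-dispatch idiom behind the reworked signbit macro above, assuming GCC or Clang (the __builtin_signbit* family); my_signbit is a stand-in name, not the libc macro:

#include <stdio.h>

#define my_signbit(x)                                   \
  (sizeof(x) == sizeof(double) ? __builtin_signbit(x)   \
   : sizeof(x) == sizeof(float) ? __builtin_signbitf(x) \
                                : __builtin_signbitl(x))

int main(void) {
  // negative zero has its sign bit set at every precision
  printf("%d %d %d\n", !!my_signbit(-0.0f), !!my_signbit(-0.0), !!my_signbit(0.0L));
  return 0;  // prints: 1 1 0
}

Making signbit type-generic is what lets the fmax/fmin family in this commit call plain isnan/signbit instead of picking a per-precision builtin by hand.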

View file

@ -99,7 +99,7 @@ void _intsort(int *, size_t);
void _longsort(long *, size_t);
bool _isheap(void *);
int NtGetVersion(void) pureconst;
unsigned _getcpucount(void) pureconst;
int _getcpucount(void) pureconst;
long _missingno();
void __oom_hook(size_t);
void _loadxmm(void *);

View file

@ -141,3 +141,7 @@ double acos(double x)
w = R(z)*s+c;
return 2*(df+w);
}
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
__strong_reference(acos, acosl);
#endif

View file

@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: FreeBSD /usr/src/lib/msun/src/e_acosf.c */
/*

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/**
* Returns inverse hyperbolic cosine of 𝑥.
@ -53,3 +53,7 @@ double acosh(double x)
/* |x| >= 0x1p26 or nan */
return log(x) + 0.693147180559945309417232121458176568;
}
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
__strong_reference(acosh, acoshl);
#endif

View file

@ -38,6 +38,7 @@
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"
#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
asm(".ident\t\"\\n\\n\
FreeBSD libm (BSD-2 License)\\n\
@ -62,8 +63,6 @@ asm(".include \"libc/disclaimer.inc\"");
#error "Unsupported long double format"
#endif
#define BIAS (LDBL_MAX_EXP - 1)
static const double
one = 1.0;
@ -108,3 +107,5 @@ acoshl(long double x)
RETURNI(log1pl(t+sqrtl(2.0*t+t*t)));
}
}
#endif /* long double is long */

View file

@ -28,6 +28,7 @@
#include "libc/math.h"
#include "libc/tinymath/invtrigl.internal.h"
#include "libc/tinymath/ldshape.internal.h"
#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
asm(".ident\t\"\\n\\n\
fdlibm (fdlibm license)\\n\
@ -54,22 +55,20 @@ asm(".include \"libc/disclaimer.inc\"");
* Converted to long double by David Schultz <das@FreeBSD.ORG>.
*/
/**
* Returns arc cosine of 𝑥.
*
* @define atan2(fabs(sqrt((1-𝑥)*(1+𝑥))),𝑥)
* @domain -1 𝑥 1
*/
long double acosl(long double x) {
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return acos(x);
#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
#if LDBL_MANT_DIG == 64
#define CLEARBOTTOM(u) (u.i.m &= -1ULL << 32)
#elif LDBL_MANT_DIG == 113
#define CLEARBOTTOM(u) (u.i.lo = 0)
#endif
/**
* Returns arc cosine of 𝑥.
*
* @define atan2(fabs(sqrt((1-𝑥)*(1+𝑥))),𝑥)
* @domain -1 𝑥 1
*/
long double acosl(long double x)
{
union ldshape u = {x};
long double z, s, c, f;
uint16_t e = u.i.se & 0x7fff;
@ -102,8 +101,6 @@ long double acosl(long double x) {
f = u.f;
c = (z - f*f)/(s + f);
return 2*(__invtrigl_R(z)*s + c + f);
#else
#error "architecture unsupported"
#endif
}
#endif /* long double is long */

View file

@ -35,7 +35,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: FreeBSD /usr/src/lib/msun/src/e_asinf.c */
/*

View file

@ -64,3 +64,7 @@ double asinh(double x)
}
return s ? -x : x;
}
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
__strong_reference(asinh, asinhl);
#endif

View file

@ -38,6 +38,7 @@
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"
#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
asm(".ident\t\"\\n\\n\
FreeBSD libm (BSD-2 License)\\n\
@ -65,8 +66,6 @@ asm(".include \"libc/disclaimer.inc\"");
#error "Unsupported long double format"
#endif
#define BIAS (LDBL_MAX_EXP - 1)
static const double
one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
huge= 1.00000000000000000000e+300;
@ -110,3 +109,5 @@ asinhl(long double x)
}
RETURNI((hx & 0x8000) == 0 ? w : -w);
}
#endif /* long double is long */

View file

@ -33,9 +33,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
// FIXME: Hull et al. "Implementing the complex arcsine and arccosine functions using exception handling" 1997

View file

@ -145,3 +145,7 @@ double complex catan(double complex z)
w = CMPLX(w, 0.25 * log(a));
return w;
}
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
__strong_reference(catan, catanl);
#endif

View file

@ -2,32 +2,26 @@
vi: set et ft=c ts=8 tw=8 fenc=utf-8 :vi
Musl Libc
Copyright © 2005-2014 Rich Felker, et al.
OpenBSD /usr/src/lib/libm/src/s_catanl.c
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
*/
#include "libc/complex.h"
#include "libc/math.h"
#include "libc/tinymath/complex.internal.h"
#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
asm(".ident\t\"\\n\\n\
OpenBSD libm (ISC License)\\n\
@ -38,22 +32,6 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* origin: OpenBSD /usr/src/lib/libm/src/s_catanl.c */
/*
* Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Complex circular arc tangent
*
@ -97,13 +75,6 @@ asm(".include \"libc/disclaimer.inc\"");
* 2.9e-17. See also clog().
*/
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
long double complex catanl(long double complex z)
{
return catan(z);
}
#else
static const long double PIL = 3.141592653589793238462643383279502884197169L;
static const long double DP1 = 3.14159265358979323829596852490908531763125L;
static const long double DP2 = 1.6667485837041756656403424829301998703007e-19L;
@ -149,4 +120,4 @@ long double complex catanl(long double complex z)
return w;
}
#endif
#endif /* long double is long */

View file

@ -26,10 +26,10 @@
* signed zeroes.
*/
double fmin(double x, double y) {
if (__builtin_isnan(x)) return y;
if (__builtin_isnan(y)) return x;
if (__builtin_signbit(x) != __builtin_signbit(y)) {
return __builtin_signbit(x) ? x : y; /* C99 Annex F.9.9.2 */
if (isnan(x)) return y;
if (isnan(y)) return x;
if (signbit(x) != signbit(y)) {
return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */
}
return x < y ? x : y;
}

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"
/**
* Returns minimum of two floats.
@ -26,10 +27,10 @@
* signed zeroes.
*/
float fminf(float x, float y) {
if (__builtin_isnan(x)) return y;
if (__builtin_isnan(y)) return x;
if (__builtin_signbitf(x) != __builtin_signbitf(y)) {
return __builtin_signbitf(x) ? x : y; /* C99 Annex F.9.9.2 */
if (isnan(x)) return y;
if (isnan(y)) return x;
if (signbit(x) != signbit(y)) {
return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */
}
return x < y ? x : y;
}

View file

@ -27,10 +27,10 @@
* signed zeroes.
*/
long double fminl(long double x, long double y) {
if (__builtin_isnan(x)) return y;
if (__builtin_isnan(y)) return x;
if (__builtin_signbitl(x) != __builtin_signbitl(y)) {
return __builtin_signbitl(x) ? x : y; /* C99 Annex F.9.9.2 */
if (isnan(x)) return y;
if (isnan(y)) return x;
if (signbit(x) != signbit(y)) {
return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */
}
return x < y ? x : y;
}

View file

@ -22,8 +22,8 @@
/**
* Adds floats in array.
*/
float fsumf(const float *p, size_t n) {
float s;
double fsumf(const float *p, size_t n) {
double s;
size_t i;
if (n > 8) return fsumf(p, n / 2) + fsumf(p + n / 2, n - n / 2);
for (s = i = 0; i < n; ++i) s += p[i];
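A hedged illustration of why the fsumf accumulator above was widened from float to double; the constants are arbitrary and chosen only to make the rounding error visible:

#include <stdio.h>

int main(void) {
  float  fs = 0;
  double ds = 0;
  for (int i = 0; i < 10000000; i++) {
    fs += 0.1f;  // float accumulator: low bits round away once the sum is large
    ds += 0.1f;  // double accumulator keeps them
  }
  printf("float  accumulator: %.2f\n", fs);  // visibly far from 1000000
  printf("double accumulator: %.2f\n", ds);  // about 1000000.01
  return 0;
}

The recursive halving for n > 8 in the code above (pairwise summation) bounds error growth further; the wider accumulator takes care of the short runs.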

View file

@ -27,6 +27,18 @@
#include "libc/testlib/testlib.h"
#include "third_party/vqsort/vqsort.h"
void InsertionSort(int *A, int n) {
for (int i = 1; i < n; i++) {
int key = A[i];
int j = i - 1;
while (j >= 0 && A[j] > key) {
A[j + 1] = A[j];
j--;
}
A[j + 1] = key;
}
}
int CompareLong(const void *a, const void *b) {
const long *x = a;
const long *y = b;
@ -145,14 +157,14 @@ int CompareInt(const void *a, const void *b) {
return 0;
}
TEST(_intsort, test) {
TEST(InsertionSort, test) {
size_t n = 5000;
int *a = gc(calloc(n, sizeof(int)));
int *b = gc(calloc(n, sizeof(int)));
rngset(a, n * sizeof(int), 0, 0);
memcpy(b, a, n * sizeof(int));
qsort(a, n, sizeof(int), CompareInt);
_intsort(b, n);
InsertionSort(b, n);
ASSERT_EQ(0, memcmp(b, a, n * sizeof(int)));
}
@ -218,13 +230,14 @@ TEST(radix_sort_int32, test) {
ASSERT_EQ(0, memcmp(b, a, n * sizeof(int)));
}
BENCH(_intsort, bench) {
BENCH(InsertionSort, bench) {
printf("\n");
size_t n = 10000;
int *p1 = gc(malloc(n * sizeof(int)));
int *p2 = gc(malloc(n * sizeof(int)));
rngset(p1, n * sizeof(int), 0, 0);
EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n));
EZBENCH2("InsertionSort", memcpy(p2, p1, n * sizeof(int)),
InsertionSort(p2, n));
#ifdef __x86_64__
if (X86_HAVE(AVX2)) {
EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)),

View file

@ -24,7 +24,6 @@
#include "libc/runtime/runtime.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
#include "libc/tinymath/tinymath.h"
#include "libc/x/x.h"
float remainderf2(float, float);

View file

@ -27,3 +27,5 @@ LOCAL CHANGES
- Refactor headers per cosmo convention
- Replace code like 'ggjt' with READ32BE("ggjt") (see the sketch after this list)
- Remove C++ exceptions; use Die() function instead
- Remove division from matrix multiplication
- Let quantizer convert between ggjt formats
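A small sketch of the magic-number convention mentioned in the list above; MY_READ32BE is a local stand-in for the READ32BE macro from libc/intrin/bits.h, packing a four-character tag into a big-endian 32-bit word:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MY_READ32BE(s)                                                  \
  ((uint32_t)(uint8_t)(s)[0] << 24 | (uint32_t)(uint8_t)(s)[1] << 16 |  \
   (uint32_t)(uint8_t)(s)[2] << 8 | (uint32_t)(uint8_t)(s)[3])

int main(void) {
  printf("0x%08" PRIx32 "\n", MY_READ32BE("ggjt"));  // 0x67676a74, the GGJT magic
  return 0;
}

This is why the file writers below emit file.write_u32(READ32BE("ggjt")) rather than a hand-written hex constant.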

View file

@ -34,6 +34,7 @@
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/fileno.h"
#include "third_party/ggml/llama_util.h"
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/cassert"
#include "third_party/libcxx/cstring"
@ -50,13 +51,6 @@ Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
static bool is_integer_str(const char *s) {
if (*s == '-') ++s;
if (!*s) return false;
while (isdigit(*s)) ++s;
return !*s;
}
static std::string replace_all(std::string const& original,
std::string const& before,
std::string const& after) {
@ -92,7 +86,7 @@ static bool append_file_to_prompt(const char *path, gpt_params & params) {
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.n_threads = std::min(20, std::max(1, (int)(_getcpucount() * 0.75)));
params.n_threads = std::min(20, std::max(1, _getcpucount() >> 1));
bool invalid_param = false;
std::string arg;

View file

@ -1,4 +1,4 @@
// -*- c++ -*-
// -*- c++; c-basic-offset:4 -*-
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
#include "libc/calls/struct/termios.h"
@ -21,7 +21,7 @@
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t verbose = 0; // Logging verbosity
int32_t n_threads = std::min(1, (int)(_getcpucount() * 0.75));
int32_t n_threads = std::max(1, _getcpucount() >> 1);
int32_t n_predict = -1; // new tokens to predict
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size

View file

@ -78,7 +78,15 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
}
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
for (size_t i = 0; i < n; i++) {
size_t i = 0;
#ifdef __F16C__
for (; i + 7 < n; i += 8) {
__m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
__m256 y_vec = _mm256_cvtph_ps(x_vec);
_mm256_storeu_ps(y + i, y_vec);
}
#endif
for (; i < n; i++) {
y[i] = GGML_FP16_TO_FP32(x[i]);
}
}

View file

@ -3,9 +3,6 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define GGML_GELU_FP16
#define GGML_SILU_FP16
#ifdef __ARM_NEON
// we use the built-in 16-bit float type
typedef __fp16 ggml_fp16_t;

View file

@ -8,6 +8,9 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define GGML_GELU_FP16
#define GGML_SILU_FP16
extern ggml_fp16_t table_gelu_f16[1 << 16];
extern ggml_fp16_t table_silu_f16[1 << 16];
extern ggml_fp16_t table_exp_f16[1 << 16];

View file

@ -613,23 +613,37 @@ void ggml_vec_dot_v1_q4_0_q8_0(const int n, float * restrict s, const void * res
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; ++i) {
/* Compute combined scale for the block */
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
__m256i bx = bytes_from_nibbles_32(x[i].qs);
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
bx = _mm256_sub_epi8( bx, off );
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps( d, q, acc );
#define WORK(I) \
/* Compute combined scale for the block */ \
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \
__m256i bx = bytes_from_nibbles_32(x[I].qs); \
/* Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */ \
const __m256i off = _mm256_set1_epi8( 8 ); \
bx = _mm256_sub_epi8( bx, off ); \
__m256i by = _mm256_loadu_si256((const __m256i *)y[I].qs); \
const __m256 q = mul_sum_i8_pairs_float(bx, by); \
/* Multiply q with scale and accumulate */ \
acc = _mm256_fmadd_ps( d, q, acc )
int i = 0;
for (; i + 12 < nb; i += 12) {
_mm_prefetch(x+i+12, 3);
_mm_prefetch(x+i+15, 3);
_mm_prefetch(x+i+18, 3);
_mm_prefetch(x+i+21, 3);
_mm_prefetch(y+i+12, 3);
_mm_prefetch(y+i+14, 3);
_mm_prefetch(y+i+16, 3);
_mm_prefetch(y+i+18, 3);
_mm_prefetch(y+i+20, 3);
_mm_prefetch(y+i+22, 3);
for (int j = 0; j < 12; ++j) {
WORK(i+j);
}
}
for (; i < nb; ++i) {
WORK(i);
}
#undef WORK
*s = hsum_float_8(acc);
#elif defined(__AVX__)

View file

@ -1784,9 +1784,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
//
// Main loop
//
#define WORK(I) \
/* Compute combined scale for the block */ \
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \
@ -2702,9 +2700,15 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
ggml_float sum = 0.0;
for (int i = 0; i < n; ++i) {
sum += (ggml_float)x[i];
int i = 0;
ggml_float sum = 0;
#if __AVX__ || __AVX2__ || __AVX512F__
for (; i + 8 <= n; i += 8) {
sum += hsum_float_8(_mm256_loadu_ps(x + i));
}
#endif
for (; i < n; ++i) {
sum += x[i];
}
*s = sum;
#else
@ -2802,6 +2806,7 @@ const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = {
[GGML_TYPE_F16] = "f16",
[GGML_TYPE_Q4_0] = "q4_0",
[GGML_TYPE_Q4_1] = "q4_1",
[GGML_TYPE_Q4_2] = "q4_2",
[GGML_TYPE_Q5_0] = "q5_0",
[GGML_TYPE_Q5_1] = "q5_1",
[GGML_TYPE_Q8_0] = "q8_0",
@ -8113,7 +8118,7 @@ static void ggml_compute_forward_alibi_f32(
assert(ne1 + n_past == ne0); (void) n_past;
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << _bsr(n_head);
const int n_heads_log2_floor = 1 << _bsr(n_head); // [jart]
const float m0 = exp2f(-8.0f / n_heads_log2_floor);
const float m1 = exp2f(-4.0f / n_heads_log2_floor);

View file

@ -1,3 +1,4 @@
// -*- c; c-basic-offset:4 -*-
#ifndef COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
#define COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)

View file

@ -128,6 +128,7 @@ THIRD_PARTY_GGML_LLAMA_DIRECTDEPS = \
LIBC_STR \
LIBC_STUBS \
LIBC_SYSV \
LIBC_SYSV_CALLS \
LIBC_THREAD \
LIBC_TINYMATH \
LIBC_ZIPOS \
@ -180,6 +181,7 @@ o/$(MODE)/third_party/ggml/companionai.txt.zip.o: private \
THIRD_PARTY_GGML_COMS = \
$(THIRD_PARTY_GGML_LLAMA) \
o/$(MODE)/third_party/ggml/quantize.com \
o/$(MODE)/third_party/ggml/perplexity.com
THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg)

View file

@ -31,6 +31,7 @@
#include "libc/intrin/bits.h"
#include "libc/macros.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/sysv/consts/posix.h"
#include "third_party/ggml/fp16.h"
#include "third_party/ggml/ggml.h"
#include "third_party/ggml/llama_util.h"
@ -443,8 +444,9 @@ struct llama_file_loader {
llama_hparams hparams;
llama_vocab vocab;
llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
: file(fname, "rb") {
llama_file_loader(const char * fname, size_t file_idx,
llama_load_tensors_map & tensors_map)
: file(fname, "rb") {
// fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
read_magic();
read_hparams();
@ -568,8 +570,9 @@ struct llama_file_saver {
write_vocab();
}
void write_magic() {
ggjt_v2();
file.write_u32(READ32BE("ggjt")); // magic
file.write_u32(1); // version
file.write_u32(2); // version
}
void write_hparams(enum llama_ftype new_ftype) {
const llama_hparams & hparams = any_file_loader->hparams;
@ -2003,16 +2006,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
ggml_type_name(tensor.type));
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
// quantize only 2D tensors
quantize &= (tensor.ne.size() == 2);
// uncomment this to keep the output layer in FP16
//if (tensor.name == "output.weight") {
// quantize = false;
//}
// only quantize 2d weights that aren't the output layer
bool quantize =
tensor.ne.size() == 2 &&
tensor.type != quantized_type &&
_endswith(tensor.name.c_str(), "weight") &&
tensor.name != "output.weight";
enum ggml_type new_type;
void * new_data;
@ -2024,6 +2023,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_data = tensor.data;
new_size = tensor.size;
printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
} else if (quantized_type == GGML_TYPE_F16) {
GGML_ASSERT(tensor.type == GGML_TYPE_F32);
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
new_type = quantized_type;
new_size = nelements * 2;
work.resize(new_size);
new_data = work.addr;
ggml_fp32_to_fp16_row((const float *)tensor.data, (ggml_fp16_t *)new_data, nelements);
} else {
new_type = quantized_type;
float * f32_data;

View file

@ -1,4 +1,4 @@
// -*- c++ -*-
// -*- c++; c-basic-offset:4 -*-
#ifndef LLAMA_H
#define LLAMA_H
#include "libc/intrin/bits.h"

View file

@ -1,12 +1,11 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
// -*- c++; c-basic-offset:4 -*-
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include "libc/calls/struct/rlimit.h"
#include "libc/dce.h"
#include "libc/fmt/fmt.h"
#include "libc/runtime/sysconf.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/madv.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
@ -22,6 +21,9 @@
#include "third_party/libcxx/vector"
// clang-format off
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
#define LLAMA_ASSERT(x) \
do { \
if (!(x)) { \
@ -47,6 +49,13 @@ static void Die(const char *fmt, ...) {
exit(1);
}
static inline bool is_integer_str(const char *s) {
if (*s == '-') ++s;
if (!*s) return false;
while (isdigit(*s)) ++s;
return !*s;
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;

View file

@ -28,6 +28,7 @@
*/
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/sched_param.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/stat.h"
#include "libc/fmt/fmt.h"
@ -37,9 +38,11 @@
#include "libc/nexgen32e/x86feature.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/sysv/consts/ioprio.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/msync.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/prio.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/sig.h"
#include "third_party/ggml/common.h"
@ -66,7 +69,6 @@ static int n_past;
static int n_remain;
static int n_consumed;
static bool input_noecho;
static bool is_antiprompt;
////////////////////////////////////////////////////////////////////////////////
@ -103,7 +105,7 @@ static int CompareTime(struct timespec a, struct timespec b) {
////////////////////////////////////////////////////////////////////////////////
// ux explanatory logging for llama.com developers
#if 1
#if 0
#define DEVLOG(...) (void)0
#else
#define DEVLOG(...) if (g_devlog) fprintf(g_devlog, __VA_ARGS__)
@ -187,16 +189,16 @@ static bool has_antiprompt(std::string::size_type *out_index = nullptr,
static void finish_initializing_prompt() {
prompt_status = kPromptFinished;
if (params.interactive) {
std::string::size_type pos;
std::string::size_type ap_index;
is_interacting = true;
if (has_antiprompt(&pos)) {
if (has_antiprompt(&ap_index)) {
console_set_color(con_st, CONSOLE_COLOR_PROMPT);
printf("%s", last_output.substr(pos).c_str());
last_output.clear();
printf("%s", last_output.substr(ap_index).c_str());
fflush(stdout);
}
console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
}
last_output.clear();
}
////////////////////////////////////////////////////////////////////////////////
@ -208,8 +210,16 @@ static int on_missing_feature(const char *name) {
return 1;
}
void MakeProcessNice(void) {
setpriority(PRIO_PROCESS, 0, 10);
ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
struct sched_param param = {sched_get_priority_min(SCHED_IDLE)};
sched_setscheduler(0, SCHED_IDLE, &param);
}
int main(int argc, char ** argv) {
MakeProcessNice();
ShowCrashReports();
setvbuf(stdin, NULL, _IONBF, 0);
setvbuf(stdout, NULL, _IONBF, 0);
@ -439,8 +449,7 @@ int main(int argc, char ** argv) {
remember_init();
is_antiprompt = false;
input_noecho = params.verbose <= 0;
input_noecho = params.verbose <= 0;
n_past = 0;
n_remain = params.n_predict;
@ -561,6 +570,9 @@ int main(int argc, char ** argv) {
fprintf(stderr, EPHEMERAL("loading weights..."));
}
// tracks if last character written to stdout was newline
bool got_newline = false;
while ((n_remain != 0 || params.interactive) && !is_terminated) {
// perform evaluation
@ -678,7 +690,8 @@ int main(int argc, char ** argv) {
finish_initializing_prompt();
}
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
if (prompt_status == kPromptFinished &&
(int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
DEVLOG("out of user input, sample next token w/ embd_inp.size()=%d n_consumed=%d\n",
(int)embd_inp.size(), n_consumed);
@ -808,21 +821,25 @@ int main(int argc, char ** argv) {
// --prompt 'Question: How old are you?\nAnswer: '
// --reverse-prompt $'\n'
//
bool is_antiprompt;
std::string ap_text;
std::string::size_type ap_index;
std::string::size_type ap_extra;
is_antiprompt = has_antiprompt(&ap_index, &ap_text);
std::string::size_type ap_index;
if (prompt_status == kPromptFinished) {
is_antiprompt = has_antiprompt(&ap_index, &ap_text);
} else {
is_antiprompt = false;
}
// display text
bool got_newline = false;
if (!input_noecho) {
if (!input_noecho && embd.size()) {
std::string printme;
for (auto id : embd) {
printme.append(llama_token_to_str(ctx, id));
}
if (is_antiprompt) {
ap_extra = last_output.size() - (ap_index + ap_text.size());
printme.erase(printme.size() - MIN(printme.size(), ap_extra));
ap_extra = last_output.size() - ap_index;
printme.erase(std::max(0, (int)(printme.size() - ap_extra)));
}
if (printme.size()) {
got_newline = printme[printme.size() - 1] == '\n';
@ -832,6 +849,7 @@ int main(int argc, char ** argv) {
}
if (is_antiprompt) {
if (!params.interactive) {
DEVLOG("exiting due to antiprompt\n");
if (!got_newline) {
printf("\n");
}

View file

@ -25,9 +25,12 @@
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/log/log.h"
#include "libc/runtime/runtime.h"
#include "third_party/ggml/common.h"
#include "third_party/ggml/ggml.h"
#include "third_party/ggml/llama.h"
#include "third_party/ggml/llama_util.h"
#include "third_party/libcxx/map"
#include "third_party/libcxx/vector"
@ -38,46 +41,26 @@ asm(".include \"libc/disclaimer.inc\"");
// clang-format off
static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
{"f16", LLAMA_FTYPE_MOSTLY_F16 },
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
};
bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
auto it = LLAMA_FTYPE_MAP.find(ftype_str);
if (it != LLAMA_FTYPE_MAP.end()) {
ftype = it->second;
ftype_str_out = it->first;
return true;
}
// try to parse as an integer
// try {
int ftype_int = std::stoi(ftype_str);
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
if (it->second == ftype_int) {
ftype = it->second;
ftype_str_out = it->first;
return true;
}
}
// }
// catch (...) {
// // stoi failed
// }
return false;
}
// usage:
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type [nthreads]
//
int main(int argc, char ** argv) {
ShowCrashReports();
ggjt_v2();
ggml_time_init();
if (argc < 3) {
fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthreads]\n", argv[0]);
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
}
@ -91,60 +74,27 @@ int main(int argc, char ** argv) {
ggml_free(ctx);
}
// parse command line arguments
const std::string fname_inp = argv[1];
std::string fname_out;
int nthread;
llama_ftype ftype;
const std::string fname_out = argv[2];
int arg_idx = 2;
std::string ftype_str;
if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
// argv[2] is the ftype
std::string fpath;
const size_t pos = fname_inp.find_last_of('/');
if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1);
}
// export as [inp path]/ggml-model-[ftype].bin
fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
arg_idx++;
}
else {
// argv[2] is the output path
fname_out = argv[arg_idx];
arg_idx++;
if (argc <= arg_idx) {
fprintf(stderr, "%s: missing ftype\n", __func__);
return 1;
}
// argv[3] is the ftype
if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1;
}
arg_idx++;
if (fname_inp == fname_out) {
fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str());
exit(1);
}
// parse nthreads
if (argc > arg_idx) {
// try {
nthread = std::stoi(argv[arg_idx]);
// }
// catch (const std::exception & e) {
// Die("%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
// return 1;
// }
enum llama_ftype ftype;
if (!is_integer_str(argv[3])) {
auto it = LLAMA_FTYPE_MAP.find(argv[3]);
if (it == LLAMA_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
return 1;
}
ftype = it->second;
} else {
nthread = 0;
ftype = (enum llama_ftype)atoi(argv[3]);
}
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
if (nthread > 0) {
fprintf(stderr, " using %d threads", nthread);
}
fprintf(stderr, "\n");
int nthread = argc > 4 ? atoi(argv[4]) : 0;
const int64_t t_main_start_us = ggml_time_us();

View file

@ -18,10 +18,18 @@ __funline float _cvtsh_ss(unsigned short __S) {
return __builtin_ia32_vec_ext_v4sf(__A, 0);
}
/**
* Converts four half-precision (16-bit) floating point values to
* single-precision floating point values.
*/
__funline __m128 _mm_cvtph_ps(__m128i __A) {
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A);
}
/**
* Converts eight half-precision (16-bit) floating point values to
* single-precision floating point values.
*/
__funline __m256 _mm256_cvtph_ps(__m128i __A) {
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A);
}
@ -37,6 +45,10 @@ __funline __m128i _mm_cvtps_ph(__m128 __A, const int __I) {
return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I);
}
/**
* Converts eight single-precision floating point values to
* half-precision (16-bit) floating point values.
*/
__funline __m128i _mm256_cvtps_ph(__m256 __A, const int __I) {
return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I);
}
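A hedged round-trip sketch for the F16C intrinsics documented above, assuming the file is compiled with -mf16c on x86-64; the input values are arbitrary but all exactly representable in half precision:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  float in[8] = {1, 2, 3, 4, 0.5f, -0.25f, 100, -0.0f};
  // eight f32 -> eight packed f16 in an __m128i, then back to f32
  __m128i half = _mm256_cvtps_ph(_mm256_loadu_ps(in), _MM_FROUND_TO_NEAREST_INT);
  float out[8];
  _mm256_storeu_ps(out, _mm256_cvtph_ps(half));
  for (int i = 0; i < 8; i++) printf("%g ", out[i]);
  printf("\n");  // every value survives the f32 -> f16 -> f32 round trip
  return 0;
}

This is the same conversion ggml_fp16_to_fp32_row above now performs eight elements at a time.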

View file

@ -1,4 +1,4 @@
// -*- c++ -*-
// -*- c++; c-basic-offset:4 -*-
#ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
#define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
#include "libc/macros.internal.h"

View file

@ -1,4 +1,4 @@
// -*- c++ -*-
// -*- c++; c-basic-offset:4 -*-
#ifndef GPTNEOX_UTIL_H
#define GPTNEOX_UTIL_H
#include "libc/calls/calls.h"

View file

@ -28,6 +28,8 @@
*/
#include "third_party/radpajama/gptneox.h"
#include "libc/intrin/bits.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/posix.h"
#include "third_party/ggml/fp16.h"
#include "third_party/ggml/ggml.h"
#include "third_party/ggml/llama_util.h"
@ -77,7 +79,7 @@ static const size_t MiB = 1024*1024;
// needs modifications in ggml
// TODO: Modify for gptneox, how are these values actually determined?
// TODO: This is now priority,
// TODO: This is now priority,
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
@ -446,7 +448,8 @@ struct gptneox_load_tensors_map {
enum gptneox_file_version {
GPTNEOX_FILE_VERSION_GGML,
GPTNEOX_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
GPTNEOX_FILE_VERSION_GGJT_V1, // added padding
GPTNEOX_FILE_VERSION_GGJT_V1, // adopted unified aligned mappable layout
GPTNEOX_FILE_VERSION_GGJT_V2, // changed quantization format
};
struct gptneox_file_loader {
@ -473,10 +476,16 @@ struct gptneox_file_loader {
if (magic == READ32BE("ggml") && version == 0) {
file_version = GPTNEOX_FILE_VERSION_GGML;
ggjt_v1();
} else if (magic == READ32BE("ggmf") && version == 1) {
file_version = GPTNEOX_FILE_VERSION_GGMF_V1;
ggjt_v1();
} else if (magic == READ32BE("ggjt") && version == 1) {
file_version = GPTNEOX_FILE_VERSION_GGJT_V1;
ggjt_v1();
} else if (magic == READ32BE("ggjt") && version == 2) {
file_version = GPTNEOX_FILE_VERSION_GGJT_V2;
ggjt_v2();
} else {
Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
magic, version);
@ -566,17 +575,20 @@ struct gptneox_file_loader {
struct gptneox_file_saver {
gptneox_file file;
gptneox_file_loader * any_file_loader;
gptneox_file_saver(const char * fname, gptneox_file_loader * any_file_loader, enum gptneox_ftype new_ftype)
: file(fname, "wb"), any_file_loader(any_file_loader) {
gptneox_file_saver(const char * fname,
gptneox_file_loader * any_file_loader,
enum gptneox_ftype new_ftype)
: file(fname, "wb"),
any_file_loader(any_file_loader) {
fprintf(stderr, "gptneox.cpp: saving model to %s\n", fname);
ggjt_v1();
write_magic();
write_hparams(new_ftype);
write_vocab();
}
void write_magic() {
ggjt_v2();
file.write_u32(READ32BE("ggjt")); // magic
file.write_u32(1); // version
file.write_u32(2); // version
}
void write_hparams(enum gptneox_ftype new_ftype) {
const gptneox_hparams & hparams = any_file_loader->hparams;
@ -887,7 +899,8 @@ static const char *gptneox_file_version_name(gptneox_file_version version) {
switch (version) {
case GPTNEOX_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
case GPTNEOX_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
case GPTNEOX_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
default: GPTNEOX_ASSERT(false);
}
}
@ -940,7 +953,7 @@ static void gptneox_model_load_internal(
model.hparams = ml->file_loaders.at(0)->hparams;
gptneox_file_version file_version = ml->file_loaders.at(0)->file_version;
auto & hparams = model.hparams;
{
switch (hparams.n_layer) {
case 16: {
@ -951,7 +964,7 @@ static void gptneox_model_load_internal(
}
break;
}
// # <RedPajama>: we extend the model type settings for RedPajama models.
// # <RedPajama>: we extend the model type settings for RedPajama models.
case 32:{
if (hparams.n_embd == 2560) {
model.type = e_model::MODEL_3B;
@ -1195,7 +1208,7 @@ static bool gptneox_eval_internal(
model.layers[il].c_attn_attn_b, cur),
cur);
}
// Split QKV and make contiguous
struct ggml_tensor * Qcur = ggml_view_3d(ctx0, cur,
n_embd/n_head,
@ -1225,7 +1238,7 @@ static bool gptneox_eval_internal(
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
Vcur = ggml_cpy(ctx0, Vcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
// MARK: gptneox RoPE Q and K, before cache
// Bit 2 for gptneox style (2)
// Bit 1 is zero for dont skip n_past +(0), use (2+1) = (3) if rope is applied to cache of k (after cache only)
@ -1241,7 +1254,7 @@ static bool gptneox_eval_internal(
ggml_element_size(Vcur) * n_embd,
0);
Vcur = ggml_transpose(ctx0, Vcur);
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
n_embd * N, // num elements in current context (up to n_embd*n_ctx but usually less)
ggml_element_size(kv_self.k) * n_embd * (il * n_ctx + n_past));
@ -1250,12 +1263,12 @@ static bool gptneox_eval_internal(
n_embd,
ggml_element_size(kv_self.v) * n_ctx,
ggml_element_size(kv_self.v) * ((il * n_ctx * n_embd) + n_past));
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
//}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
struct ggml_tensor * Q =
ggml_permute(ctx0,
@ -1284,7 +1297,7 @@ static bool gptneox_eval_internal(
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
struct ggml_tensor * V_trans = ggml_view_3d(ctx0, kv_self.v,
n_past + N,
@ -1312,10 +1325,10 @@ static bool gptneox_eval_internal(
}
lctx.use_buf(ctx0, 1);
if (hparams.use_parallel_residual == 1) {
//printf("use_parallel_residual == 1\n");
// This is independent of the self-attention result, so it could be done in parallel to the self-attention
struct ggml_tensor * outAttn = cur;
@ -1359,7 +1372,7 @@ static bool gptneox_eval_internal(
inpL = ggml_add(ctx0, inpL, cur);
} else if (hparams.use_parallel_residual == 0) {
//printf("use_parallel_residual == 0\n");
// This takes the self-attention residual output as input to Feedforward
struct ggml_tensor * outAttn = cur;
struct ggml_tensor * inpFF = ggml_add(ctx0, outAttn, inpL);
@ -2093,6 +2106,7 @@ int gptneox_model_copy(
static void gptneox_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum gptneox_ftype ftype, int nthread) {
ggml_type quantized_type;
switch (ftype) {
case GPTNEOX_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
case GPTNEOX_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case GPTNEOX_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case GPTNEOX_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
@ -2124,21 +2138,17 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const
tensor.data = read_data.addr;
model_loader->load_data_for(tensor);
printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
printf("[%4zu/%4zu] %50s - %16s, type = %6s, ",
++idx, model_loader->tensors_map.tensors.size(),
tensor.name.c_str(), gptneox_format_tensor_shape(tensor.ne).c_str(),
ggml_type_name(tensor.type));
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
// quantize only 2D tensors
quantize &= (tensor.ne.size() == 2);
// uncomment this to keep the output layer in FP16
//if (tensor.name == "output.weight") {
// quantize = false;
//}
// only quantize 2d weights that aren't the output layer
bool quantize =
tensor.ne.size() == 2 &&
tensor.type != quantized_type &&
_endswith(tensor.name.c_str(), "weight") &&
tensor.name != "output.weight";
enum ggml_type new_type;
void * new_data;
@ -2150,6 +2160,14 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const
new_data = tensor.data;
new_size = tensor.size;
printf("size = %8.3f MiB\n", tensor.size/1024.0/1024.0);
} else if (quantized_type == GGML_TYPE_F16) {
GPTNEOX_ASSERT(tensor.type == GGML_TYPE_F32);
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
new_type = quantized_type;
new_size = nelements * 2;
work.resize(new_size);
new_data = work.addr;
ggml_fp32_to_fp16_row((const float *)tensor.data, (ggml_fp16_t *)new_data, nelements);
} else {
new_type = quantized_type;
float * f32_data;

View file

@ -1,4 +1,4 @@
// -*- c++ -*-
// -*- c++; c-basic-offset:4 -*-
#ifndef GPTNEOX_H
#define GPTNEOX_H
// clang-format off

View file

@ -28,6 +28,7 @@
*/
#include "libc/log/log.h"
#include "third_party/ggml/ggml.h"
#include "third_party/ggml/llama_util.h"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/map"
#include "third_party/libcxx/string"
@ -35,13 +36,14 @@
// clang-format off
static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
{"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
{"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
{"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
//{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
{"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
{"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
{"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
{"f16", GPTNEOX_FTYPE_MOSTLY_F16},
{"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
{"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
{"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
//{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
{"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
{"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
{"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
};
// usage:
@ -50,7 +52,7 @@ static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
int main(int argc, char ** argv) {
ShowCrashReports();
ggjt_v1();
ggjt_v2();
ggml_time_init();
if (argc < 4) {
@ -71,8 +73,13 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
if (fname_inp == fname_out) {
fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str());
exit(1);
}
enum gptneox_ftype ftype;
if (argv[3][0] == 'q') {
if (!is_integer_str(argv[3])) {
auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
if (it == GPTNEOX_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);

View file

@ -72,6 +72,7 @@
"__builtin_extract_return_addr"
"__builtin_isnan"
"__builtin_signbit"
"__builtin_signbitf"
"__builtin_signbitl"
"__builtin_ffs"
"__builtin_ffsl"