Add support for new GGJT v2 quantizers

This change makes quantized models (e.g. q4_0) go 10% faster on Macs;
however, it doesn't offer much improvement on Intel PC hardware.

This change syncs llama.cpp 699b1ad7fe6f7b9e41d3cb41e61a8cc3ea5fc6b5,
which recently made a breaking change to nearly all its file formats
without any migration path. Since that'll break hundreds upon hundreds
of models on websites like HuggingFace, llama.com will support both
file formats, because llama.com will never ever break the GGJT file
format.
Justine Tunney 2023-05-13 08:08:32 -07:00
parent ba49e86e20
commit 5a4cf9560f
24 changed files with 4074 additions and 1805 deletions
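
The dual-format support works by routing every quantizer through global
function-pointer tables that are swapped at load time (see ggjt.v1.c
below). A minimal sketch of the idea, assuming a hypothetical loader
helper, that ggjt_v1() is visible via ggml.h, and the 0x67676a74
("ggjt") magic that llama.cpp writes in its file headers:

#include "third_party/ggml/ggml.h"

// Hypothetical dispatch (the real logic lives in the model loader,
// not in this diff): install the legacy v1 quantizer vtables when an
// old file is detected, otherwise keep the current v2 defaults.
static void select_quantizers(uint32_t magic, uint32_t version) {
    if (magic == 0x67676a74u && version == 1) {
        ggjt_v1();  // defined in third_party/ggml/ggjt.v1.c below
    }
}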

third_party/ggml/fp16.c vendored Normal file (+125 lines)

@@ -0,0 +1,125 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
GGML
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "third_party/ggml/fp16.h"
#include "libc/str/str.h"
#include "third_party/ggml/fp16.internal.h"
#include "third_party/libcxx/math.h"
asm(".ident\t\"\\n\\n\
GGML (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
#define B8(c,s ) B7(c,s, c), B7(c,s, s)
// precomputed table for expanding 8 bits to 8 bytes (shl 4)
const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) };
#endif
//
// global data
//
// precomputed gelu table for f16 (128 KB)
ggml_fp16_t table_gelu_f16[1 << 16];
// precomputed silu table for f16 (128 KB)
ggml_fp16_t table_silu_f16[1 << 16];
// precomputed exp table for f16 (128 KB)
ggml_fp16_t table_exp_f16[1 << 16];
// precomputed f32 table for f16 (256 KB)
float table_f32_f16[1 << 16];
// note: do not use these inside ggml.c
// these are meant to be used via the ggml.h API
float ggml_fp16_to_fp32(ggml_fp16_t x) {
return (float) GGML_FP16_TO_FP32(x);
}
ggml_fp16_t ggml_fp32_to_fp16(float x) {
return GGML_FP32_TO_FP16(x);
}
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
for (size_t i = 0; i < n; i++) {
y[i] = GGML_FP16_TO_FP32(x[i]);
}
}
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
size_t i = 0;
#if defined(__F16C__)
for (; i + 7 < n; i += 8) {
__m256 x_vec = _mm256_loadu_ps(x + i);
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storeu_si128((__m128i *)(y + i), y_vec);
}
for (; i + 3 < n; i += 4) {
__m128 x_vec = _mm_loadu_ps(x + i);
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
#endif
for (; i < n; i++) {
y[i] = GGML_FP32_TO_FP16(x[i]);
}
}
static const float GELU_COEF_A = 0.044715f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
inline static float ggml_gelu_f32(float x) {
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
return x/(1.0f + expf(-x));
}
void ggml_fp16_init(void) {
ggml_fp16_t ii;
for (int i = 0; i < (1 << 16); ++i) {
uint16_t ui = i;
memcpy(&ii, &ui, sizeof(ii));
const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
}
}
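
Callers must run ggml_fp16_init() once before any conversion that goes
through the lookup tables. A minimal usage sketch, assuming only the
API declared in fp16.h (the next file):

#include "third_party/ggml/fp16.h"

int main(void) {
    ggml_fp16_init();                        // fills the 64K-entry tables
    ggml_fp16_t h = ggml_fp32_to_fp16(0.5f); // 0.5 is exact in binary16
    float f = ggml_fp16_to_fp32(h);          // round-trips to exactly 0.5f
    return f == 0.5f ? 0 : 1;
}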

third_party/ggml/fp16.h vendored Normal file (+27 lines)

@@ -0,0 +1,27 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_F16_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_F16_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define GGML_GELU_FP16
#define GGML_SILU_FP16
#ifdef __ARM_NEON
// we use the built-in 16-bit float type
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
#endif
void ggml_fp16_init(void);
// convert FP16 <-> FP32
float ggml_fp16_to_fp32(ggml_fp16_t x);
ggml_fp16_t ggml_fp32_to_fp16(float x);
void ggml_fp16_to_fp32_row(const ggml_fp16_t* x, float* y, size_t n);
void ggml_fp32_to_fp16_row(const float* x, ggml_fp16_t* y, size_t n);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_F16_H_ */

third_party/ggml/fp16.internal.h vendored Normal file (+188 lines)

@@ -0,0 +1,188 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_
#include "libc/literal.h"
#include "libc/str/str.h"
#include "third_party/ggml/fp16.h"
#include "third_party/intel/immintrin.internal.h"
#include "third_party/libcxx/math.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
extern ggml_fp16_t table_gelu_f16[1 << 16];
extern ggml_fp16_t table_silu_f16[1 << 16];
extern ggml_fp16_t table_exp_f16[1 << 16];
extern float table_f32_f16[1 << 16];
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
extern const uint64_t table_b2b_u[1 << 8];
#endif
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t* y,
const ggml_fp16_t* x) {
const uint16_t* i16 = (const uint16_t*)x;
for (int i = 0; i < n; ++i) {
y[i] = table_gelu_f16[i16[i]];
}
}
// clang-format off
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#ifdef __ARM_NEON
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
#define GGML_FP16_TO_FP32(x) ((float) (x))
#define GGML_FP32_TO_FP16(x) (x)
#else
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // __ARM_NEON
// On ARM NEON it's quicker to convert directly (a plain __fp16 cast
// compiles to a hardware instruction) than to call into
// ggml_lookup_fp16_to_fp32, so GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16
// are already defined above for NEON. The same applies to POWER9.
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_ */
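
A quick sanity check for the branch-free conversions above, as a
sketch: these bit patterns hold on the x86 paths where ggml_fp16_t is
uint16_t; on NEON the type is __fp16 and the macros are plain casts, so
the comparisons don't apply there.

#include "libc/assert.h"
#include "third_party/ggml/fp16.internal.h"

static void fp16_sanity(void) {
    assert(GGML_COMPUTE_FP32_TO_FP16(1.0f) == 0x3C00);      // IEEE binary16 1.0
    assert(GGML_COMPUTE_FP16_TO_FP32(0x3C00) == 1.0f);
    assert(GGML_COMPUTE_FP16_TO_FP32(0x0001) == 0x1p-24f);  // smallest subnormal
}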

third_party/ggml/ggjt.v1.c vendored Normal file (+155 lines)

@@ -0,0 +1,155 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "third_party/ggml/ggjt.v1.q4_0.h"
#include "third_party/ggml/ggjt.v1.q4_1.h"
#include "third_party/ggml/ggjt.v1.q4_2.h"
#include "third_party/ggml/ggjt.v1.q5_0.h"
#include "third_party/ggml/ggjt.v1.q5_1.h"
#include "third_party/ggml/ggjt.v1.q8_0.h"
#include "third_party/ggml/ggjt.v1.q8_1.h"
#include "third_party/ggml/ggml.h"
// clang-format off
static const int ggjt_v1_blck_size[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = 1,
[GGML_TYPE_F16] = 1,
[GGML_TYPE_Q4_0] = V1_QK4_0,
[GGML_TYPE_Q4_1] = V1_QK4_1,
[GGML_TYPE_Q4_2] = V1_QK4_2,
[GGML_TYPE_Q5_0] = V1_QK5_0,
[GGML_TYPE_Q5_1] = V1_QK5_1,
[GGML_TYPE_Q8_0] = V1_QK8_0,
[GGML_TYPE_Q8_1] = V1_QK8_1,
[GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1,
};
static const size_t ggjt_v1_type_size[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = sizeof(float),
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
[GGML_TYPE_Q4_0] = sizeof(block_v1_q4_0),
[GGML_TYPE_Q4_1] = sizeof(block_v1_q4_1),
[GGML_TYPE_Q4_2] = sizeof(block_v1_q4_2),
[GGML_TYPE_Q5_0] = sizeof(block_v1_q5_0),
[GGML_TYPE_Q5_1] = sizeof(block_v1_q5_1),
[GGML_TYPE_Q8_0] = sizeof(block_v1_q8_0),
[GGML_TYPE_Q8_1] = sizeof(block_v1_q8_1),
[GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t),
};
static const char *const ggjt_v1_type_name[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = "f32",
[GGML_TYPE_F16] = "f16",
[GGML_TYPE_Q4_0] = "q4_0",
[GGML_TYPE_Q4_1] = "q4_1",
[GGML_TYPE_Q4_2] = "q4_2",
[GGML_TYPE_Q5_0] = "q5_0",
[GGML_TYPE_Q5_1] = "q5_1",
[GGML_TYPE_Q8_0] = "q8_0",
[GGML_TYPE_Q8_1] = "q8_1",
[GGML_TYPE_I8] = "i8",
[GGML_TYPE_I16] = "i16",
[GGML_TYPE_I32] = "i32",
};
static const bool ggjt_v1_is_quantized[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = false,
[GGML_TYPE_F16] = false,
[GGML_TYPE_Q4_0] = true,
[GGML_TYPE_Q4_1] = true,
[GGML_TYPE_Q4_2] = true,
[GGML_TYPE_Q5_0] = true,
[GGML_TYPE_Q5_1] = true,
[GGML_TYPE_Q8_0] = true,
[GGML_TYPE_Q8_1] = true,
[GGML_TYPE_I8] = false,
[GGML_TYPE_I16] = false,
[GGML_TYPE_I32] = false,
};
static const quantize_fns_t ggjt_v1_quantize_fns[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = {
.dequantize_row_q = dequantize_row_v1_q4_0,
.quantize_row_q = quantize_row_v1_q4_0,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q4_0_reference,
.quantize_row_q_dot = quantize_row_v1_q8_0,
.vec_dot_q = ggml_vec_dot_v1_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q4_1] = {
.dequantize_row_q = dequantize_row_v1_q4_1,
.quantize_row_q = quantize_row_v1_q4_1,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q4_1_reference,
.quantize_row_q_dot = quantize_row_v1_q8_1,
.vec_dot_q = ggml_vec_dot_v1_q4_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q4_2] = {
.dequantize_row_q = dequantize_row_v1_q4_2,
.quantize_row_q = quantize_row_v1_q4_2,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q4_2_reference,
.quantize_row_q_dot = quantize_row_v1_q8_0,
.vec_dot_q = ggml_vec_dot_v1_q4_2_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q5_0] = {
.dequantize_row_q = dequantize_row_v1_q5_0,
.quantize_row_q = quantize_row_v1_q5_0,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q5_0_reference,
.quantize_row_q_dot = quantize_row_v1_q8_0,
.vec_dot_q = ggml_vec_dot_v1_q5_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q5_1] = {
.dequantize_row_q = dequantize_row_v1_q5_1,
.quantize_row_q = quantize_row_v1_q5_1,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q5_1_reference,
.quantize_row_q_dot = quantize_row_v1_q8_1,
.vec_dot_q = ggml_vec_dot_v1_q5_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q8_0] = {
.dequantize_row_q = dequantize_row_v1_q8_0,
.quantize_row_q = quantize_row_v1_q8_0,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q8_0_reference,
.quantize_row_q_dot = quantize_row_v1_q8_0,
.vec_dot_q = ggml_vec_dot_v1_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q8_1] = {
.dequantize_row_q = NULL, // TODO
.quantize_row_q = quantize_row_v1_q8_1,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q8_1_reference,
.quantize_row_q_dot = quantize_row_v1_q8_1,
.vec_dot_q = NULL, // TODO
.vec_dot_type = GGML_TYPE_Q8_1,
},
};
void ggjt_v1(void) {
GGML_BLCK_SIZE = ggjt_v1_blck_size;
GGML_TYPE_SIZE = ggjt_v1_type_size;
GGML_TYPE_NAME = ggjt_v1_type_name;
GGML_IS_QUANTIZED = ggjt_v1_is_quantized;
quantize_fns = ggjt_v1_quantize_fns;
}
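
Once ggjt_v1() has run, generic ggml code sizes tensors through the
swapped pointers. For example (a sketch, assuming GGML_BLCK_SIZE and
GGML_TYPE_SIZE are the mutable globals in ggml.h that the assignments
above target):

static size_t v1_row_bytes(enum ggml_type t, int n) {
    // n elements -> n / block_size blocks of type_size bytes each,
    // e.g. 4096 q4_0 elements -> 128 blocks * 20 bytes = 2560 bytes
    return (size_t)n / GGML_BLCK_SIZE[t] * GGML_TYPE_SIZE[t];
}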

third_party/ggml/ggjt.v1.internal.h vendored Normal file (+134 lines)

@@ -0,0 +1,134 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_
#include "libc/str/str.h"
#include "third_party/intel/immintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
// clang-format off
#ifdef __AVX__
// horizontally add 8 floats
static inline float hsum_float_8(const __m256 x) {
__m128 res = _mm256_extractf128_ps(x, 1);
res = _mm_add_ps(res, _mm256_castps256_ps128(x));
res = _mm_add_ps(res, _mm_movehl_ps(res, res));
res = _mm_add_ss(res, _mm_movehdup_ps(res));
return _mm_cvtss_f32(res);
}
#endif /* AVX */
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
// Unpack 16 4-bit fields into 16 bytes
// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval
static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) {
// Load 8 bytes from memory
__m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
// Expand bytes into uint16_t values
__m128i bytes = _mm_cvtepu8_epi16( tmp );
// Unpack values into individual bytes
const __m128i lowMask = _mm_set1_epi8( 0xF );
__m128i high = _mm_andnot_si128( lowMask, bytes );
__m128i low = _mm_and_si128( lowMask, bytes );
high = _mm_slli_epi16( high, 4 );
bytes = _mm_or_si128( low, high );
return bytes;
}
#endif /* AVX || AVX2 || AVX512 */
#if defined(__AVX2__) || defined(__AVX512F__)
// spread 32 bits to 32 bytes { 0x00, 0xFF }
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
uint32_t x32;
memcpy(&x32, x, sizeof(uint32_t));
const __m256i shuf_mask = _mm256_set_epi64x(
0x0303030303030303, 0x0202020202020202,
0x0101010101010101, 0x0000000000000000);
__m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
bytes = _mm256_or_si256(bytes, bit_mask);
return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
}
#endif /* AVX2 || AVX512 */
#if defined(__AVX2__) || defined(__AVX512F__)
// add int16_t pairwise and return as float vector
static inline __m256 sum_i16_pairs_float(const __m256i x) {
const __m256i ones = _mm256_set1_epi16(1);
const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
return _mm256_cvtepi32_ps(summed_pairs);
}
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
// Load 16 bytes from memory
__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
// Expand bytes into uint16_t values
__m256i bytes = _mm256_cvtepu8_epi16( tmp );
// Unpack values into individual bytes
const __m256i lowMask = _mm256_set1_epi8( 0xF );
__m256i high = _mm256_andnot_si256( lowMask, bytes );
__m256i low = _mm256_and_si256( lowMask, bytes );
high = _mm256_slli_epi16( high, 4 );
bytes = _mm256_or_si256( low, high );
return bytes;
}
// multiply int8_t, add results pairwise twice and return as float vector
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
// Get absolute values of x vectors
const __m256i ax = _mm256_sign_epi8(x, x);
// Sign the values of the y vectors
const __m256i sy = _mm256_sign_epi8(y, x);
#ifdef __AVXVNNI__
const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs);
#else
// Perform multiplication and create 16-bit values
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
return sum_i16_pairs_float(dot);
#endif
}
static inline __m128i packNibbles( __m256i bytes ) {
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
#if defined(__AVX512F__)
const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
#else
const __m256i lowByte = _mm256_set1_epi16( 0xFF );
__m256i high = _mm256_andnot_si256( lowByte, bytes );
__m256i low = _mm256_and_si256( lowByte, bytes );
high = _mm256_srli_epi16( high, 4 );
bytes = _mm256_or_si256( low, high );
// Compress uint16_t lanes into bytes
__m128i r0 = _mm256_castsi256_si128( bytes );
__m128i r1 = _mm256_extracti128_si256( bytes, 1 );
return _mm_packus_epi16( r0, r1 );
#endif
}
#elif defined(__AVX__)
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) {
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
const __m128i lowByte = _mm_set1_epi16( 0xFF );
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
__m128i low = _mm_and_si128( lowByte, bytes1 );
high = _mm_srli_epi16( high, 4 );
bytes1 = _mm_or_si128( low, high );
high = _mm_andnot_si128( lowByte, bytes2 );
low = _mm_and_si128( lowByte, bytes2 );
high = _mm_srli_epi16( high, 4 );
bytes2 = _mm_or_si128( low, high );
return _mm_packus_epi16( bytes1, bytes2);
}
#endif /* __AVX__ */
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_ */
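
For reference, the scalar equivalent of bytes_from_nibbles_32 above, as
a clarity sketch (not part of the diff): each input byte holds two
quants, low nibble first, and each nibble lands in its own output byte
in the [0..15] interval.

static inline void bytes_from_nibbles_32_ref(const uint8_t *in, uint8_t out[32]) {
    for (int j = 0; j < 16; j++) {
        out[2*j + 0] = in[j] & 0x0F;  // low nibble of byte j
        out[2*j + 1] = in[j] >> 4;    // high nibble of byte j
    }
}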

third_party/ggml/ggjt.v1.q4_0.c vendored Normal file (+700 lines)

@@ -0,0 +1,700 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
GGML
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "third_party/ggml/ggjt.v1.q4_0.h"
#include "libc/assert.h"
#include "libc/macros.internal.h"
#include "libc/str/str.h"
#include "third_party/aarch64/arm_neon.h"
#include "third_party/ggml/ggjt.v1.internal.h"
#include "third_party/ggml/ggjt.v1.q8_0.h"
#include "third_party/intel/immintrin.internal.h"
#include "third_party/libcxx/math.h"
// clang-format off
// quantization for the ggjt.v1.q4_0 file format
static_assert(sizeof(block_v1_q4_0) == sizeof(float) + V1_QK4_0 / 2,
"wrong q4_0 block size/padding");
static_assert(sizeof(block_v1_q8_0) == sizeof(float) + V1_QK8_0,
"wrong q8_0 block size/padding");
// reference implementation for deterministic creation of model files
void quantize_row_v1_q4_0_reference(const float * restrict x, block_v1_q4_0 * restrict y, int k) {
assert(k % V1_QK4_0 == 0);
const int nb = k / V1_QK4_0;
uint8_t pp[V1_QK4_0/2];
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int l = 0; l < V1_QK4_0; l++) {
const float v = x[i*V1_QK4_0 + l];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -8;
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
for (int l = 0; l < V1_QK4_0; l += 2) {
const float v0 = x[i*V1_QK4_0 + l + 0]*id;
const float v1 = x[i*V1_QK4_0 + l + 1]*id;
const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8);
const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8);
assert(vi0 < 16);
assert(vi1 < 16);
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(y[i].qs, pp, sizeof(pp));
}
}
void quantize_row_v1_q4_0(const float * restrict x, void * restrict vy, int k) {
assert(k % V1_QK4_0 == 0);
const int nb = k / V1_QK4_0;
block_v1_q4_0 * restrict y = vy;
#if defined(__POWER9_VECTOR__)
const vector float v85 = vec_splats(8.5f);
const vector signed int v15 = vec_splats(15);
for (int i = 0; i < nb; i++) {
float max = 0.0f;
float min = 0.0f;
vector float srcv [8];
vector float maxv[8];
vector float minv[8];
for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l);
// reduce signed max/min over the source values, matching the NEON and
// AVX paths (the sign of the extreme matters for the magnitude-based
// scale below)
for (int l = 0; l < 4; l++) maxv[2*l] = vec_max(srcv[2*l], srcv[2*l+1]);
maxv[0] = vec_max(maxv[0], maxv[2]);
maxv[4] = vec_max(maxv[4], maxv[6]);
maxv[0] = vec_max(maxv[0], maxv[4]);
for (int l = 0; l < 4; l++) minv[2*l] = vec_min(srcv[2*l], srcv[2*l+1]);
minv[0] = vec_min(minv[0], minv[2]);
minv[4] = vec_min(minv[4], minv[6]);
minv[0] = vec_min(minv[0], minv[4]);
max = MAX(
MAX(vec_extract(maxv[0], 0), vec_extract(maxv[0], 1)),
MAX(vec_extract(maxv[0], 2), vec_extract(maxv[0], 3)));
min = MIN(
MIN(vec_extract(minv[0], 0), vec_extract(minv[0], 1)),
MIN(vec_extract(minv[0], 2), vec_extract(minv[0], 3)));
const float magnitude = max >= fabsf(min) ? max : min;
const float d = magnitude / -8;
const float id = d ? 1.0/d : 0.0;
y[i].d = d;
const vector float vid = vec_splats(id);
uint8_t * restrict pb = y[i].qs;
for (int l = 0; l < 8; l++) {
const vector float vf = vec_madd(srcv[l], vid, v85);
const vector signed int vi = vec_signed(vf);
const vector signed int vc = vec_min(vi, v15);
pb[2*l + 0] = vec_extract(vc, 0) | (vec_extract(vc, 1) << 4);
pb[2*l + 1] = vec_extract(vc, 2) | (vec_extract(vc, 3) << 4);
}
}
#elif __ARM_NEON
for (int i = 0; i < nb; i++) {
float32x4_t srcv [8];
float32x4_t maxv[8];
float32x4_t minv[8];
for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l+1]);
for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l+2]);
for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l+4]);
for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l+1]);
for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l+2]);
for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l+4]);
const float max = vmaxvq_f32(maxv[0]);
const float min = vminvq_f32(minv[0]);
const float magnitude = max >= fabsf(min) ? max : min;
const float d = magnitude / -8;
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
for (int l = 0; l < 8; l++) {
const float32x4_t v = vmulq_n_f32(srcv[l], id);
const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
const int32x4_t vi = vcvtq_s32_f32(vf);
const int32x4_t vc = vminq_s32(vi, vdupq_n_s32(15));
y[i].qs[2*l + 0] = vgetq_lane_s32(vc, 0) | (vgetq_lane_s32(vc, 1) << 4);
y[i].qs[2*l + 1] = vgetq_lane_s32(vc, 2) | (vgetq_lane_s32(vc, 3) << 4);
}
}
#elif defined(__AVX2__)
for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors
__m256 v0 = _mm256_loadu_ps( x );
__m256 v1 = _mm256_loadu_ps( x + 8 );
__m256 v2 = _mm256_loadu_ps( x + 16 );
__m256 v3 = _mm256_loadu_ps( x + 24 );
x += 32;
// Compute max for the block
__m256 max = _mm256_max_ps( v0, v1 );
__m256 maxTmp = _mm256_max_ps( v2, v3 );
max = _mm256_max_ps( max, maxTmp );
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) );
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
const float maxScalar = _mm_cvtss_f32( max4 );
// Compute min for the block
__m256 min = _mm256_min_ps( v0, v1 );
__m256 minTmp = _mm256_min_ps( v2, v3 );
min = _mm256_min_ps( min, minTmp );
__m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) );
min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
const float minScalar = _mm_cvtss_f32( min4 );
// Quantize these floats
const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar;
const float d = magnitude / -8.0f;
y[i].d = d;
const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f;
const __m256 mul = _mm256_set1_ps( id );
// Apply the multiplier
v0 = _mm256_mul_ps( v0, mul );
v1 = _mm256_mul_ps( v1, mul );
v2 = _mm256_mul_ps( v2, mul );
v3 = _mm256_mul_ps( v3, mul );
// Round to nearest integer
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
// Convert floats to integers
__m256i i0 = _mm256_cvtps_epi32( v0 );
__m256i i1 = _mm256_cvtps_epi32( v1 );
__m256i i2 = _mm256_cvtps_epi32( v2 );
__m256i i3 = _mm256_cvtps_epi32( v3 );
// Convert int32 to int16
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
// Convert int16 to int8
i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
// We got our precious signed bytes, but the order is now wrong
// These AVX2 pack instructions process 16-byte pieces independently
// The following instruction is fixing the order
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
i0 = _mm256_permutevar8x32_epi32( i0, perm );
// Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
const __m256i off = _mm256_set1_epi8( 8 );
i0 = _mm256_add_epi8( i0, off );
const __m256i maxNibble = _mm256_set1_epi8( 15 );
i0 = _mm256_min_epi8( i0, maxNibble );
// Compress the vector into 4 bit/value, and store
__m128i res = packNibbles( i0 );
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
}
#elif defined(__AVX__)
for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors
__m256 v0 = _mm256_loadu_ps( x );
__m256 v1 = _mm256_loadu_ps( x + 8 );
__m256 v2 = _mm256_loadu_ps( x + 16 );
__m256 v3 = _mm256_loadu_ps( x + 24 );
x += 32;
// Compute max for the block
__m256 max = _mm256_max_ps( v0, v1 );
__m256 maxTmp = _mm256_max_ps( v2, v3 );
max = _mm256_max_ps( max, maxTmp );
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) );
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
const float maxScalar = _mm_cvtss_f32( max4 );
// Compute min for the block
__m256 min = _mm256_min_ps( v0, v1 );
__m256 minTmp = _mm256_min_ps( v2, v3 );
min = _mm256_min_ps( min, minTmp );
__m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) );
min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
const float minScalar = _mm_cvtss_f32( min4 );
// Quantize these floats
const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar;
const float d = magnitude / -8.0f;
y[i].d = d;
const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f;
const __m256 mul = _mm256_set1_ps( id );
// Apply the multiplier
v0 = _mm256_mul_ps( v0, mul );
v1 = _mm256_mul_ps( v1, mul );
v2 = _mm256_mul_ps( v2, mul );
v3 = _mm256_mul_ps( v3, mul );
// Round to nearest integer
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
// Convert floats to integers
__m256i i0 = _mm256_cvtps_epi32( v0 );
__m256i i1 = _mm256_cvtps_epi32( v1 );
__m256i i2 = _mm256_cvtps_epi32( v2 );
__m256i i3 = _mm256_cvtps_epi32( v3 );
// Since we don't have in AVX some necessary functions,
// we split the registers in half and call AVX2 analogs from SSE
__m128i ni0 = _mm256_castsi256_si128( i0 );
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
__m128i ni2 = _mm256_castsi256_si128( i1 );
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
__m128i ni4 = _mm256_castsi256_si128( i2 );
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
__m128i ni6 = _mm256_castsi256_si128( i3 );
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
// Convert int32 to int16
ni0 = _mm_packs_epi32( ni0, ni1 );
ni2 = _mm_packs_epi32( ni2, ni3 );
ni4 = _mm_packs_epi32( ni4, ni5 );
ni6 = _mm_packs_epi32( ni6, ni7 );
// Convert int16 to int8
ni0 = _mm_packs_epi16( ni0, ni2 );
ni4 = _mm_packs_epi16( ni4, ni6 );
// Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
const __m128i off = _mm_set1_epi8( 8 );
ni0 = _mm_add_epi8( ni0, off );
ni4 = _mm_add_epi8( ni4, off );
const __m128i maxNibble = _mm_set1_epi8( 15 );
ni0 = _mm_min_epi8( ni0, maxNibble );
ni4 = _mm_min_epi8( ni4, maxNibble );
// Compress the vector into 4 bit/value, and store
__m128i res = packNibbles( ni0, ni4 );
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
float max = 0.0f;
float min = 0.0f;
v128_t srcv [8];
v128_t maxv[8];
v128_t minv[8];
for (int l = 0; l < 8; l++) srcv[l] = wasm_v128_load(x + i*32 + 4*l);
for (int l = 0; l < 4; l++) maxv[2*l] = wasm_f32x4_max(srcv[2*l], srcv[2*l+1]);
for (int l = 0; l < 2; l++) maxv[4*l] = wasm_f32x4_max(maxv[4*l], maxv[4*l+2]);
for (int l = 0; l < 1; l++) maxv[8*l] = wasm_f32x4_max(maxv[8*l], maxv[8*l+4]);
for (int l = 0; l < 4; l++) minv[2*l] = wasm_f32x4_min(srcv[2*l], srcv[2*l+1]);
for (int l = 0; l < 2; l++) minv[4*l] = wasm_f32x4_min(minv[4*l], minv[4*l+2]);
for (int l = 0; l < 1; l++) minv[8*l] = wasm_f32x4_min(minv[8*l], minv[8*l+4]);
max = MAX(
MAX(wasm_f32x4_extract_lane(maxv[0], 0), wasm_f32x4_extract_lane(maxv[0], 1)),
MAX(wasm_f32x4_extract_lane(maxv[0], 2), wasm_f32x4_extract_lane(maxv[0], 3)));
min = MIN(
MIN(wasm_f32x4_extract_lane(minv[0], 0), wasm_f32x4_extract_lane(minv[0], 1)),
MIN(wasm_f32x4_extract_lane(minv[0], 2), wasm_f32x4_extract_lane(minv[0], 3)));
const float magnitude = max >= fabsf(min) ? max : min;
const float d = magnitude / -8;
const float id = d ? 1.0/d : 0.0;
y[i].d = d;
for (int l = 0; l < 8; l++) {
const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
}
}
#else
// scalar
quantize_row_v1_q4_0_reference(x, y, k);
#endif
}
size_t ggml_quantize_v1_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % V1_QK4_0 == 0);
const int nb = k / V1_QK4_0;
for (int j = 0; j < n; j += k) {
block_v1_q4_0 * restrict y = (block_v1_q4_0 *)dst + j/V1_QK4_0;
quantize_row_v1_q4_0_reference(src + j, y, k);
for (int i = 0; i < nb; i++) {
for (int l = 0; l < V1_QK4_0; l += 2) {
const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
const uint8_t vi1 = y[i].qs[l/2] >> 4;
hist[vi0]++;
hist[vi1]++;
}
}
}
return (n/V1_QK4_0*sizeof(block_v1_q4_0));
}
void dequantize_row_v1_q4_0(const void * restrict vx, float * restrict y, int k) {
assert(k % V1_QK4_0 == 0);
const int nb = k / V1_QK4_0;
const block_v1_q4_0 * restrict x = vx;
#if defined(__AVX2__)
for (int i = 0; i < nb; i++) {
// scale factor
const __m256 d_v = _mm256_broadcast_ss(&x[i].d);
const uint8_t * restrict pp = x[i].qs;
for (int l = 0; l < V1_QK4_0; l += 32) {
// Load 32x4-bit integers into 32x8-bit integers
__m256i vx8 = bytes_from_nibbles_32(pp+l/2);
// Subtract 8 from the integers
vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8));
// Convert to 16-bit int
const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0));
const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1));
// Convert to 32-bit int -> float 32
const __m256 vf[4] = {
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))),
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))),
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))),
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1)))
};
// Scale and store
for (int j = 0; j < 4; j++) {
const __m256 result = _mm256_mul_ps(vf[j], d_v);
_mm256_storeu_ps(y + i * V1_QK4_0 + l + j*8, result);
}
}
}
#elif defined(__ARM_NEON)
for (int i = 0; i < nb; i++) {
const float32x4_t vd = vdupq_n_f32(x[i].d);
const uint8_t * restrict pp = x[i].qs;
for (int l = 0; l < V1_QK4_0; l += 16) {
// Load 16x4-bit integers into 8x8-bit integers
const uint8x8_t v8 = vld1_u8(pp + l/2);
// Expand 4-bit qs to 8-bit bytes
const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F));
const uint8x8_t v1 = vshr_n_u8(v8, 4);
// Convert to signed 8-bit integers
const int8x8_t vs_0 = vreinterpret_s8_u8(v0);
const int8x8_t vs_1 = vreinterpret_s8_u8(v1);
// Subtract 8 from each byte
const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8));
const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8));
// Interleave and combine
const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1);
const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1);
const int8x16_t vq = vcombine_s8(vx_0, vx_1);
// convert to 2x int16x8_t
const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq));
const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq));
// convert to 4x float32x4_t
const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0)));
const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0)));
const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1)));
const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1)));
// Multiply by d
const float32x4_t r0 = vmulq_f32(vf_0, vd);
const float32x4_t r1 = vmulq_f32(vf_1, vd);
const float32x4_t r2 = vmulq_f32(vf_2, vd);
const float32x4_t r3 = vmulq_f32(vf_3, vd);
// Store
vst1q_f32(y + i*V1_QK4_0 + l + 0, r0);
vst1q_f32(y + i*V1_QK4_0 + l + 4, r1);
vst1q_f32(y + i*V1_QK4_0 + l + 8, r2);
vst1q_f32(y + i*V1_QK4_0 + l + 12, r3);
}
}
#else
// scalar
for (int i = 0; i < nb; i++) {
const float d = x[i].d;
const uint8_t * restrict pp = x[i].qs;
for (int l = 0; l < V1_QK4_0; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0x0F;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
//printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1);
y[i*V1_QK4_0 + l + 0] = v0;
y[i*V1_QK4_0 + l + 1] = v1;
assert(!isnan(y[i*V1_QK4_0 + l + 0]));
assert(!isnan(y[i*V1_QK4_0 + l + 1]));
}
}
#endif
}
void ggml_vec_dot_v1_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
const int nb = n / V1_QK8_0;
assert(n % V1_QK8_0 == 0);
assert(nb % 2 == 0);
const block_v1_q4_0 * restrict x = vx;
const block_v1_q8_0 * restrict y = vy;
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i += 2) {
const block_v1_q4_0 * restrict x0 = &x[i + 0];
const block_v1_q4_0 * restrict x1 = &x[i + 1];
const block_v1_q8_0 * restrict y0 = &y[i + 0];
const block_v1_q8_0 * restrict y1 = &y[i + 1];
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const int8x16_t s8b = vdupq_n_s8(0x8);
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
// 4-bit -> 8-bit
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
// sub 8
const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
// interleave
const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
// load y
const int8x16_t v1_0l = vld1q_s8(y0->qs);
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
// dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; ++i) {
/* Compute combined scale for the block */
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
__m256i bx = bytes_from_nibbles_32(x[i].qs);
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
bx = _mm256_sub_epi8( bx, off );
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps( d, q, acc );
}
*s = hsum_float_8(acc);
#elif defined(__AVX__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
__m128i i32[2];
for (int j = 0; j < 2; ++j) {
// Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
__m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j);
__m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j));
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m128i off = _mm_set1_epi8( 8 );
bx = _mm_sub_epi8( bx, off );
// Get absolute values of x vectors
const __m128i ax = _mm_sign_epi8(bx, bx);
// Sign the values of the y vectors
const __m128i sy = _mm_sign_epi8(by, bx);
// Perform multiplication and create 16-bit values
const __m128i dot = _mm_maddubs_epi16(ax, sy);
const __m128i ones = _mm_set1_epi16(1);
i32[j] = _mm_madd_epi16(ones, dot);
}
// Convert int32_t to float
__m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
// Apply the scale, and accumulate
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
}
*s = hsum_float_8(acc);
#else
// scalar
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const float d0 = x[i].d;
const float d1 = y[i].d;
const uint8_t * restrict p0 = x[i].qs;
const int8_t * restrict p1 = y[i].qs;
int sumi = 0;
for (int j = 0; j < V1_QK8_0/2; j++) {
const uint8_t v0 = p0[j];
const int i0 = (int8_t) (v0 & 0x0F) - 8;
const int i1 = (int8_t) (v0 >> 4) - 8;
const int i2 = p1[2*j + 0];
const int i3 = p1[2*j + 1];
sumi += i0*i2 + i1*i3;
}
sumf += d0*d1*sumi;
}
*s = sumf;
#endif
}
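
To make the q4_0 layout concrete: every 32 floats collapse into one
20-byte block, a float scale d = magnitude / -8 plus 16 bytes of
nibbles, and dequantization reconstructs (q - 8) * d for each stored
nibble q in [0..15]. A round-trip sketch using the reference paths
above:

#include "third_party/ggml/ggjt.v1.q4_0.h"

void q4_0_roundtrip_demo(void) {
    float in[V1_QK4_0], out[V1_QK4_0];
    block_v1_q4_0 blk;
    for (int l = 0; l < V1_QK4_0; l++) in[l] = (l - 16) / 16.0f;  // [-1.0, 0.9375]
    quantize_row_v1_q4_0_reference(in, &blk, V1_QK4_0);
    dequantize_row_v1_q4_0(&blk, out, V1_QK4_0);
    // here blk.d == 0.125f and every out[l] is within blk.d/2 of in[l]
}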

third_party/ggml/ggjt.v1.q4_0.h vendored Normal file (+22 lines)

@@ -0,0 +1,22 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_0_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_0_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define V1_QK4_0 32
typedef struct {
float d; // delta
uint8_t qs[V1_QK4_0 / 2]; // nibbles / quants
} block_v1_q4_0;
void dequantize_row_v1_q4_0(const void* restrict, float* restrict, int);
size_t ggml_quantize_v1_q4_0(const float*, void*, int, int, int64_t*);
void quantize_row_v1_q4_0(const float* restrict, void* restrict, int);
void quantize_row_v1_q4_0_reference(const float* restrict,
block_v1_q4_0* restrict, int);
void ggml_vec_dot_v1_q4_0_q8_0(const int, float* restrict, const void* restrict,
const void* restrict);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_0_H_ */
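
The intended call pattern pairs these routines with the q8_0 side named
in the vtable earlier: activations are quantized once per row via
quantize_row_q_dot, then dotted against the q4_0 weights. A sketch,
assuming the q8_0 API from ggjt.v1.q8_0.h and n a multiple of 64 (the
dot kernel asserts an even block count):

#include "third_party/ggml/ggjt.v1.q4_0.h"
#include "third_party/ggml/ggjt.v1.q8_0.h"

void q4_0_dot_demo(const block_v1_q4_0 *w, const float *act, float *out, int n) {
    block_v1_q8_0 q8[n / V1_QK8_0];    // VLA, fine for a demo
    quantize_row_v1_q8_0(act, q8, n);  // quantize activations to q8_0
    ggml_vec_dot_v1_q4_0_q8_0(n, out, w, q8);
}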

third_party/ggml/ggjt.v1.q4_1.c vendored Normal file (+472 lines)

@@ -0,0 +1,472 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
GGML
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "third_party/ggml/ggjt.v1.q4_1.h"
#include "libc/assert.h"
#include "libc/macros.internal.h"
#include "libc/str/str.h"
#include "third_party/aarch64/arm_neon.h"
#include "third_party/ggml/ggjt.v1.internal.h"
#include "third_party/ggml/ggjt.v1.q4_1.h"
#include "third_party/ggml/ggjt.v1.q8_1.h"
#include "third_party/intel/immintrin.internal.h"
#include "third_party/libcxx/math.h"
// clang-format off
// quantization for the ggjt.v1.q4_1 file format
static_assert(sizeof(block_v1_q4_1) == 2 * sizeof(float) + V1_QK4_1 / 2,
"wrong q4_1 block size/padding");
static_assert(sizeof(block_v1_q8_1) == 3*sizeof(float) + V1_QK8_1,
"wrong q8_1 block size/padding");
void quantize_row_v1_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
assert(k % V1_QK4_1 == 0);
const int nb = k / V1_QK4_1;
block_v1_q4_1 * restrict y = vy;
uint8_t pp[V1_QK4_1/2];
for (int i = 0; i < nb; i++) {
float min = FLT_MAX;
float max = -FLT_MAX;
for (int l = 0; l < V1_QK4_1; l++) {
const float v = x[i*V1_QK4_1 + l];
if (v < min) min = v;
if (v > max) max = v;
}
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
y[i].m = min;
for (int l = 0; l < V1_QK4_1; l += 2) {
const float v0 = (x[i*V1_QK4_1 + l + 0] - min)*id;
const float v1 = (x[i*V1_QK4_1 + l + 1] - min)*id;
const uint8_t vi0 = roundf(v0);
const uint8_t vi1 = roundf(v1);
assert(vi0 < 16);
assert(vi1 < 16);
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(y[i].qs, pp, sizeof(pp));
}
}
void quantize_row_v1_q4_1(const float * restrict x, void * restrict vy, int k) {
assert(k % V1_QK4_1 == 0);
const int nb = k / V1_QK4_1;
block_v1_q4_1 * restrict y = vy;
#if defined(__AVX2__)
for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors
__m256 v0 = _mm256_loadu_ps( x );
__m256 v1 = _mm256_loadu_ps( x + 8 );
__m256 v2 = _mm256_loadu_ps( x + 16 );
__m256 v3 = _mm256_loadu_ps( x + 24 );
x += 32;
// Compute max for the block
__m256 vmax;
vmax = _mm256_max_ps( v0, v1 );
vmax = _mm256_max_ps( vmax, v2 );
vmax = _mm256_max_ps( vmax, v3 );
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( vmax, 1 ), _mm256_castps256_ps128( vmax ) );
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
const float maxScalar = _mm_cvtss_f32( max4 );
// Compute min for the block
__m256 vmin;
vmin = _mm256_min_ps( v0, v1 );
vmin = _mm256_min_ps( vmin, v2 );
vmin = _mm256_min_ps( vmin, v3 );
__m128 min4 = _mm_min_ps( _mm256_extractf128_ps( vmin, 1 ), _mm256_castps256_ps128( vmin ) );
min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
const float minScalar = _mm_cvtss_f32( min4 );
// Quantize these floats
const float d = (maxScalar - minScalar) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].m = minScalar;
y[i].d = d;
// x = (x-min)*id
const __m256 mul = _mm256_set1_ps( id );
const __m256 off = _mm256_set1_ps( minScalar );
v0 = _mm256_mul_ps( _mm256_sub_ps( v0, off ), mul );
v1 = _mm256_mul_ps( _mm256_sub_ps( v1, off ), mul );
v2 = _mm256_mul_ps( _mm256_sub_ps( v2, off ), mul );
v3 = _mm256_mul_ps( _mm256_sub_ps( v3, off ), mul );
// Round to nearest integer
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
// Convert floats to integers
__m256i i0 = _mm256_cvtps_epi32( v0 );
__m256i i1 = _mm256_cvtps_epi32( v1 );
__m256i i2 = _mm256_cvtps_epi32( v2 );
__m256i i3 = _mm256_cvtps_epi32( v3 );
// Convert int32 to int16
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
// Convert int16 to int8
i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
// We got our precious signed bytes, but the order is now wrong
// These AVX2 pack instructions process 16-byte pieces independently
// The following instruction is fixing the order
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
i0 = _mm256_permutevar8x32_epi32( i0, perm );
// Compress the vector into 4 bit/value, and store
__m128i res = packNibbles( i0 );
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
}
#elif __ARM_NEON
for (int i = 0; i < nb; i++) {
float32x4_t srcv[8];
float32x4_t minv[8];
float32x4_t maxv[8];
for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*V1_QK4_1 + 4*l);
for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
const float min = vminvq_f32(minv[0]);
const float max = vmaxvq_f32(maxv[0]);
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
y[i].m = min;
const float32x4_t minv0 = vdupq_n_f32(min);
for (int l = 0; l < 8; l++) {
const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest
const int32x4_t vi = vcvtq_s32_f32(vf);
y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
}
}
#else
// scalar
quantize_row_v1_q4_1_reference(x, vy, k);
#endif
}
size_t ggml_quantize_v1_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % V1_QK4_1 == 0);
const int nb = k / V1_QK4_1;
for (int j = 0; j < n; j += k) {
block_v1_q4_1 * restrict y = (block_v1_q4_1 *)dst + j/V1_QK4_1;
quantize_row_v1_q4_1_reference(src + j, y, k);
for (int i = 0; i < nb; i++) {
for (int l = 0; l < V1_QK4_1; l += 2) {
const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
const uint8_t vi1 = y[i].qs[l/2] >> 4;
hist[vi0]++;
hist[vi1]++;
}
}
}
return (n/V1_QK4_1*sizeof(block_v1_q4_1));
}
void dequantize_row_v1_q4_1(const void * restrict vx, float * restrict y, int k) {
assert(k % V1_QK4_1 == 0);
const int nb = k / V1_QK4_1;
const block_v1_q4_1 * restrict x = vx;
#if defined(__AVX2__)
for (int i = 0; i < nb; i++) {
const __m256 d_v = _mm256_broadcast_ss(&x[i].d);
const __m256 d_m = _mm256_broadcast_ss(&x[i].m);
const uint8_t * restrict pp = x[i].qs;
for (int l = 0; l < V1_QK4_1; l += 32) {
// Load 32x4-bit integers into 32x8-bit integers
__m256i vx8 = bytes_from_nibbles_32(pp+l/2);
// Convert to 16-bit int
const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0));
const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1));
// Convert to 32-bit int -> float 32
const __m256 vf[4] = {
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))),
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))),
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))),
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1)))
};
// Scale, add m and store
for (int j = 0; j < 4; j++) {
const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m);
_mm256_storeu_ps(y + i * V1_QK4_1 + l + j*8, result);
}
}
}
#elif defined(__ARM_NEON)
for (int i = 0; i < nb; i++) {
const float32x4_t vd = vdupq_n_f32(x[i].d);
const float32x4_t vm = vdupq_n_f32(x[i].m);
const uint8_t * restrict pp = x[i].qs;
for (int l = 0; l < V1_QK4_1; l += 16) {
// Load 16x4-bit integers into 8x8-bit integers
const uint8x8_t v8 = vld1_u8(pp + l/2);
// Expand 4-bit qs to 8-bit bytes
const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F));
const uint8x8_t v1 = vshr_n_u8(v8, 4);
// Interleave and combine
const uint8x8_t vx_0 = vzip1_u8(v0, v1);
const uint8x8_t vx_1 = vzip2_u8(v0, v1);
const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
// convert to 2x uint16x8_t
const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
// convert to 4x float32x4_t
const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
// multiply by d and add m
const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
// Store
vst1q_f32(y + i*V1_QK4_1 + l + 0, r0);
vst1q_f32(y + i*V1_QK4_1 + l + 4, r1);
vst1q_f32(y + i*V1_QK4_1 + l + 8, r2);
vst1q_f32(y + i*V1_QK4_1 + l + 12, r3);
}
}
#else
for (int i = 0; i < nb; i++) {
const float d = x[i].d;
const float m = x[i].m;
const uint8_t * restrict pp = x[i].qs;
for (int l = 0; l < V1_QK4_1; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0x0F;
const int8_t vi1 = vi >> 4;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*V1_QK4_1 + l + 0] = v0;
y[i*V1_QK4_1 + l + 1] = v1;
assert(!isnan(y[i*V1_QK4_1 + l + 0]));
assert(!isnan(y[i*V1_QK4_1 + l + 1]));
}
}
#endif
}
void ggml_vec_dot_v1_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
const int nb = n / V1_QK8_1;
assert(n % V1_QK8_1 == 0);
assert(nb % 2 == 0);
const block_v1_q4_1 * restrict x = vx;
const block_v1_q8_1 * restrict y = vy;
// TODO: add AVX / WASM SIMD / etc
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
float summs = 0;
for (int i = 0; i < nb; i += 2) {
const block_v1_q4_1 * restrict x0 = &x[i + 0];
const block_v1_q4_1 * restrict x1 = &x[i + 1];
const block_v1_q8_1 * restrict y0 = &y[i + 0];
const block_v1_q8_1 * restrict y1 = &y[i + 1];
summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1);
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
// 4-bit -> 8-bit
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
// interleave
const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h);
const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h);
// load y
const int8x16_t v1_0l = vld1q_s8(y0->qs);
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
// dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
float summs = 0;
// Main loop
for (int i = 0; i < nb; ++i) {
const float * d0 = &x[i].d;
const float * d1 = &y[i].d;
summs += x[i].m * (y[i].s0 + y[i].s1);
const __m256 d0v = _mm256_broadcast_ss( d0 );
const __m256 d1v = _mm256_broadcast_ss( d1 );
// Compute combined scales
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
const __m256 xy = mul_sum_i8_pairs_float(bx, by);
// Accumulate d0*d1*x*y
acc = _mm256_fmadd_ps( d0d1, xy, acc );
}
*s = hsum_float_8(acc) + summs;
#else
// scalar
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const float d0 = x[i].d;
const float m0 = x[i].m;
const float d1 = y[i].d;
const uint8_t * restrict p0 = x[i].qs;
const int8_t * restrict p1 = y[i].qs;
// TODO: this is very slow ..
for (int j = 0; j < V1_QK8_1/2; j++) {
const uint8_t v0 = p0[j];
const float f0 = d0*(v0 & 0x0F) + m0;
const float f1 = d0*(v0 >> 4) + m0;
const float f2 = d1*p1[2*j + 0];
const float f3 = d1*p1[2*j + 1];
sumf += f0*f2 + f1*f3;
}
}
*s = sumf;
#endif
}
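
Note: the summs accumulation in ggml_vec_dot_v1_q4_1_q8_1 works because each
q4_1 value dequantizes to d0*q + m, so the block dot product against q8_1
splits into d0*d1*sum(q*p) plus m*(d1*sum(p)), and d1*sum(p) is exactly what
the q8_1 block precomputes as s0 + s1. A minimal standalone sketch verifying
the identity (hypothetical values; q and p are illustrative names, not the
vendored block layout):

#include <stdio.h>

int main(void) {
    const float d0 = 0.5f, m = -1.0f;    // q4_1 scale and min
    const float d1 = 0.25f;              // q8_1 scale
    const int q[4] = {0, 7, 15, 3};      // unsigned 4-bit quants
    const int p[4] = {-8, 120, -1, 33};  // signed 8-bit quants
    float lhs = 0, sum_qp = 0, sum_p = 0;
    for (int l = 0; l < 4; l++) {
        lhs += (d0*q[l] + m) * (d1*p[l]);  // dequantize, then multiply
        sum_qp += q[l]*p[l];
        sum_p += p[l];
    }
    const float rhs = d0*d1*sum_qp + m*(d1*sum_p);  // d1*sum_p plays s0+s1
    printf("lhs=%f rhs=%f\n", lhs, rhs);            // equal up to fp rounding
    return 0;
}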

22
third_party/ggml/ggjt.v1.q4_1.h vendored Normal file

@ -0,0 +1,22 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_1_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_1_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define V1_QK4_1 32
typedef struct {
float d; // delta
float m; // min
uint8_t qs[V1_QK4_1 / 2]; // nibbles / quants
} block_v1_q4_1;
void dequantize_row_v1_q4_1(const void* restrict, float* restrict, int);
size_t ggml_quantize_v1_q4_1(const float*, void*, int, int, int64_t*);
void quantize_row_v1_q4_1(const float* restrict, void* restrict, int);
void ggml_vec_dot_v1_q4_1_q8_1(const int, float* restrict, const void* restrict,
const void* restrict);
void quantize_row_v1_q4_1_reference(const float* restrict, void* restrict, int);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_1_H_ */
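
Note: a hypothetical round trip through the q4_1 API declared above (a
sketch assuming only this header; row lengths must be multiples of
V1_QK4_1, and stdio is used only for the printout):

#include "third_party/ggml/ggjt.v1.q4_1.h"
#include <stdio.h>

int main(void) {
    enum { N = 64 };                     // two 32-element blocks
    float src[N], back[N];
    block_v1_q4_1 q[N / V1_QK4_1];
    for (int i = 0; i < N; i++) src[i] = i / 9.0f;
    quantize_row_v1_q4_1(src, q, N);     // fp32 -> 4-bit blocks
    dequantize_row_v1_q4_1(q, back, N);  // back to fp32 (lossy)
    printf("src[5]=%f back[5]=%f\n", src[5], back[5]);
    return 0;
}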

297
third_party/ggml/ggjt.v1.q4_2.c vendored Normal file

@ -0,0 +1,297 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
GGML
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "third_party/ggml/ggjt.v1.q4_2.h"
#include "libc/assert.h"
#include "libc/macros.internal.h"
#include "libc/str/str.h"
#include "third_party/aarch64/arm_neon.h"
#include "third_party/ggml/fp16.internal.h"
#include "third_party/ggml/ggjt.v1.internal.h"
#include "third_party/ggml/ggjt.v1.q8_0.h"
#include "third_party/intel/immintrin.internal.h"
#include "third_party/libcxx/math.h"
// clang-format off
static_assert(sizeof(block_v1_q4_2) == sizeof(ggml_fp16_t) + V1_QK4_2 / 2,
"wrong q4_2 block size/padding");
// reference implementation for deterministic creation of model files
void quantize_row_v1_q4_2_reference(const float * restrict x, block_v1_q4_2 * restrict y, int k) {
assert(k % V1_QK4_2 == 0);
const int nb = k / V1_QK4_2;
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int l = 0; l < V1_QK4_2; l++) {
const float v = x[i*V1_QK4_2 + l];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -8;
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
for (int l = 0; l < V1_QK4_2; l += 2) {
const float v0 = x[i*V1_QK4_2 + l + 0]*id;
const float v1 = x[i*V1_QK4_2 + l + 1]*id;
const uint8_t vi0 = MIN(15, (uint8_t)(v0 + 8.5f));
const uint8_t vi1 = MIN(15, (uint8_t)(v1 + 8.5f));
assert(vi0 < 16);
assert(vi1 < 16);
y[i].qs[l/2] = vi0 | (vi1 << 4);
}
}
}
void quantize_row_v1_q4_2(const float * restrict x, void * restrict vy, int k) {
assert(k % V1_QK4_2 == 0);
block_v1_q4_2 * restrict y = vy;
quantize_row_v1_q4_2_reference(x, y, k);
}
size_t ggml_quantize_v1_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % V1_QK4_2 == 0);
const int nb = k / V1_QK4_2;
for (int j = 0; j < n; j += k) {
block_v1_q4_2 * restrict y = (block_v1_q4_2 *)dst + j/V1_QK4_2;
quantize_row_v1_q4_2_reference(src + j, y, k);
for (int i = 0; i < nb; i++) {
for (int l = 0; l < V1_QK4_2; l += 2) {
const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
const uint8_t vi1 = y[i].qs[l/2] >> 4;
hist[vi0]++;
hist[vi1]++;
}
}
}
return (n/V1_QK4_2*sizeof(block_v1_q4_2));
}
void dequantize_row_v1_q4_2(const void * restrict vx, float * restrict y, int k) {
assert(k % V1_QK4_2 == 0);
const int nb = k / V1_QK4_2;
const block_v1_q4_2 * restrict x = vx;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d);
const uint8_t * restrict pp = x[i].qs;
for (int l = 0; l < V1_QK4_2; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0x0F;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*V1_QK4_2 + l + 0] = v0;
y[i*V1_QK4_2 + l + 1] = v1;
assert(!isnan(y[i*V1_QK4_2 + l + 0]));
assert(!isnan(y[i*V1_QK4_2 + l + 1]));
}
}
}
void ggml_vec_dot_v1_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
const int nb = n / V1_QK8_0;
assert(n % V1_QK8_0 == 0);
assert(nb % 2 == 0);
assert(V1_QK8_0 == 2*V1_QK4_2);
const block_v1_q4_2 * restrict x = vx;
const block_v1_q8_0 * restrict y = vy;
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i += 2) {
const block_v1_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0];
const block_v1_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1];
const block_v1_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0];
const block_v1_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1];
const block_v1_q8_0 * restrict y0 = &y[i + 0];
const block_v1_q8_0 * restrict y1 = &y[i + 1];
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const int8x16_t s8b = vdupq_n_s8(0x8);
const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs));
// 4-bit -> 8-bit
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
// sub 8
const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
// interleave
const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
// load y
const int8x16_t v1_0l = vld1q_s8(y0->qs);
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
sumv0 = vmlaq_n_f32(sumv0, vaddq_f32(
vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)),
vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d);
sumv1 = vmlaq_n_f32(sumv1, vaddq_f32(
vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)),
vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
sumv0 = vmlaq_n_f32(sumv0, vaddq_f32(
vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)),
vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d);
sumv1 = vmlaq_n_f32(sumv1, vaddq_f32(
vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)),
vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d);
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; i++) {
/* Compute combined scale for the block */
const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d));
__m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
__m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
__m256i bx = _mm256_set_m128i(bx1, bx0);
// Now we have a vector with bytes in the [ 0 .. 15 ] interval. Offset them into the [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8(8);
bx = _mm256_sub_epi8(bx, off);
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps(d, q, acc);
}
*s = hsum_float_8(acc);
#else
// scalar
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const uint8_t * restrict x0 = x[2*i + 0].qs;
const uint8_t * restrict x1 = x[2*i + 1].qs;
const int8_t * restrict y0 = y[i].qs;
const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
int sumi_0 = 0;
int sumi_1 = 0;
for (int j = 0; j < V1_QK8_0/4; j++) {
const uint8_t v0 = x0[j];
const uint8_t v1 = x1[j];
const int i0_0 = (int8_t) (v0 & 0x0F) - 8;
const int i1_0 = (int8_t) (v0 >> 4) - 8;
const int i0_1 = (int8_t) (v1 & 0x0F) - 8;
const int i1_1 = (int8_t) (v1 >> 4) - 8;
const int i2_0 = y0[2*j + 0];
const int i3_0 = y0[2*j + 1];
const int i2_1 = y0[2*(j + V1_QK8_0/4) + 0];
const int i3_1 = y0[2*(j + V1_QK8_0/4) + 1];
sumi_0 += i0_0*i2_0 + i1_0*i3_0;
sumi_1 += i0_1*i2_1 + i1_1*i3_1;
}
sumf += (d0 * y[i].d) * sumi_0;
sumf += (d1 * y[i].d) * sumi_1;
}
*s = sumf;
#endif
}
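
Note: a worked numeric sketch (standalone; values hypothetical) of the
encode rule in quantize_row_v1_q4_2_reference. Picking d = max/-8, where
max is the signed value of largest magnitude, maps that extreme onto an
edge of the nibble range, and the +8.5f bias rounds and shifts everything
into [0, 15]:

#include <stdio.h>

int main(void) {
    const float max = -3.2f;             // signed value of largest magnitude
    const float d = max / -8;            // 0.4
    const float id = d ? 1.0f/d : 0.0f;
    const float v = -3.2f;
    unsigned vi = (unsigned)(v*id + 8.5f);
    if (vi > 15) vi = 15;                // clamp, mirroring MIN(15, ...)
    printf("d=%g quant=%u dequant=%g\n", d, vi, ((int)vi - 8)*d);
    return 0;                            // prints d=0.4 quant=0 dequant=-3.2
}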

23
third_party/ggml/ggjt.v1.q4_2.h vendored Normal file

@ -0,0 +1,23 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_2_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_2_H_
#include "third_party/ggml/fp16.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define V1_QK4_2 16
typedef struct {
ggml_fp16_t d; // delta
uint8_t qs[V1_QK4_2 / 2]; // nibbles / quants
} block_v1_q4_2;
void dequantize_row_v1_q4_2(const void* restrict, float* restrict, int);
void quantize_row_v1_q4_2(const float* restrict, void* restrict, int);
size_t ggml_quantize_v1_q4_2(const float*, void*, int, int, int64_t*);
void ggml_vec_dot_v1_q4_2_q8_0(const int, float* restrict, const void* restrict,
const void* restrict);
void quantize_row_v1_q4_2_reference(const float* restrict,
block_v1_q4_2* restrict, int);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_2_H_ */

350
third_party/ggml/ggjt.v1.q5_0.c vendored Normal file

@ -0,0 +1,350 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
GGML
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "third_party/ggml/ggjt.v1.q5_0.h"
#include "libc/assert.h"
#include "libc/macros.internal.h"
#include "third_party/aarch64/arm_neon.h"
#include "third_party/ggml/fp16.internal.h"
#include "third_party/ggml/ggjt.v1.internal.h"
#include "third_party/ggml/ggjt.v1.q8_0.h"
// clang-format off
static_assert(sizeof(block_v1_q5_0) ==
sizeof(ggml_fp16_t) + sizeof(uint32_t) + V1_QK5_0 / 2,
"wrong q5_0 block size/padding");
void quantize_row_v1_q5_0_reference(const float * restrict x, block_v1_q5_0 * restrict y, int k) {
assert(k % V1_QK5_0 == 0);
const int nb = k / V1_QK5_0;
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int l = 0; l < V1_QK5_0; l++) {
const float v = x[i*V1_QK5_0 + l];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -16;
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
uint32_t qh = 0;
for (int l = 0; l < V1_QK5_0; l += 2) {
const float v0 = x[i*V1_QK5_0 + l + 0]*id;
const float v1 = x[i*V1_QK5_0 + l + 1]*id;
const uint32_t vi0 = MIN(31, (int) (v0 + 16.5f));
const uint32_t vi1 = MIN(31, (int) (v1 + 16.5f));
y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4);
// get the 5th bit and store it in qh at the right position
qh |= ((vi0 & 0x10) >> 4) << (l + 0);
qh |= ((vi1 & 0x10) >> 4) << (l + 1);
}
memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
}
}
void quantize_row_v1_q5_0(const float * restrict x, void * restrict vy, int k) {
assert(k % V1_QK5_0 == 0);
block_v1_q5_0 * restrict y = vy;
quantize_row_v1_q5_0_reference(x, y, k);
}
size_t ggml_quantize_v1_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % V1_QK5_0 == 0);
const int nb = k / V1_QK5_0;
for (int j = 0; j < n; j += k) {
block_v1_q5_0 * restrict y = (block_v1_q5_0 *)dst + j/V1_QK5_0;
quantize_row_v1_q5_0_reference(src + j, y, k);
for (int i = 0; i < nb; i++) {
uint32_t qh;
memcpy(&qh, &y[i].qh, sizeof(qh));
for (int l = 0; l < V1_QK5_0; l += 2) {
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
// map the 5-bit values into 16 histogram bins
const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
const uint8_t vi1 = ((y[i].qs[l/2] >> 4) | vh1) / 2;
hist[vi0]++;
hist[vi1]++;
}
}
}
return (n/V1_QK5_0*sizeof(block_v1_q5_0));
}
void dequantize_row_v1_q5_0(const void * restrict vx, float * restrict y, int k) {
assert(k % V1_QK5_0 == 0);
const int nb = k / V1_QK5_0;
const block_v1_q5_0 * restrict x = vx;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d);
const uint8_t * restrict pp = x[i].qs;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
for (int l = 0; l < V1_QK5_0; l += 2) {
const uint8_t vi = pp[l/2];
// extract the 5th bit from qh
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
const int8_t vi0 = (vi & 0x0F) | vh0;
const int8_t vi1 = (vi >> 4) | vh1;
const float v0 = (vi0 - 16)*d;
const float v1 = (vi1 - 16)*d;
y[i*V1_QK5_0 + l + 0] = v0;
y[i*V1_QK5_0 + l + 1] = v1;
assert(!isnan(y[i*V1_QK5_0 + l + 0]));
assert(!isnan(y[i*V1_QK5_0 + l + 1]));
}
}
}
void ggml_vec_dot_v1_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
const int nb = n / V1_QK8_0;
assert(n % V1_QK8_0 == 0);
assert(nb % 2 == 0);
assert(V1_QK8_0 == V1_QK5_0);
const block_v1_q5_0 * restrict x = vx;
const block_v1_q8_0 * restrict y = vy;
#if defined(__ARM_NEON)
float32x4_t sumv = vdupq_n_f32(0.0f);
uint64_t tmp[4];
for (int i = 0; i < nb; ++i) {
const block_v1_q5_0 * restrict x0 = &x[i];
const block_v1_q8_0 * restrict y0 = &y[i];
const uint8x16_t m4b = vdupq_n_u8(0x0F);
const int8x16_t s16b = vdupq_n_s8(0x10);
// extract the 5th bit
uint32_t qh;
memcpy(&qh, x0->qh, sizeof(qh));
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
tmp[3] = table_b2b_u[(qh >> 24) ];
const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
const uint8x16_t v0 = vld1q_u8(x0->qs);
// 4-bit -> 8-bit
const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, m4b));
const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4));
// interleave
const int8x16_t v0lz = vzip1q_s8(v0l, v0h);
const int8x16_t v0hz = vzip2q_s8(v0l, v0h);
// add high bit and sub 16
const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0lz, qhl), s16b);
const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0hz, qhh), s16b);
// load y
const int8x16_t v1l = vld1q_s8(y0->qs);
const int8x16_t v1h = vld1q_s8(y0->qs + 16);
const float x0d = GGML_FP16_TO_FP32(x0->d);
#if defined(__ARM_FEATURE_DOTPROD)
sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0lf, v1l),
vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
#endif
}
*s = vaddvq_f32(sumv);
#elif defined(__wasm_simd128__)
v128_t sumv = wasm_f32x4_splat(0.0f);
uint64_t tmp[4];
for (int i = 0; i < nb; ++i) {
const block_v1_q5_0 * restrict x0 = &x[i];
const block_v1_q8_0 * restrict y0 = &y[i];
const v128_t m4b = wasm_i8x16_splat(0x0F);
const v128_t s16b = wasm_i8x16_splat(0x10);
// extract the 5th bit
uint32_t qh;
memcpy(&qh, x0->qh, sizeof(qh));
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
tmp[3] = table_b2b_u[(qh >> 24) ];
const v128_t qhl = wasm_v128_load(tmp + 0);
const v128_t qhh = wasm_v128_load(tmp + 2);
const v128_t v0 = wasm_v128_load(x0->qs);
// 4-bit -> 8-bit
const v128_t v0l = wasm_v128_and (v0, m4b);
const v128_t v0h = wasm_u8x16_shr(v0, 4);
// interleave
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
// add high bit and sub 16
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
// load y
const v128_t v1l = wasm_v128_load(y0->qs);
const v128_t v1h = wasm_v128_load(y0->qs + 16);
// int8x16 -> int16x8
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
const float x0d = GGML_FP16_TO_FP32(x0->d);
// dot product
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
wasm_i32x4_add(
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
}
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; i++) {
/* Compute combined scale for the block */
const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i bxhi = bytes_from_bits_32(x[i].qh);
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
bx = _mm256_or_si256(bx, bxhi);
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps(d, q, acc);
}
*s = hsum_float_8(acc);
#else
// scalar
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const uint8_t * restrict x0 = x[i].qs;
const int8_t * restrict y0 = y[i].qs;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
const float d = GGML_FP16_TO_FP32(x[i].d);
int sxy = 0;
for (int j = 0; j < V1_QK8_0/2; j++) {
const uint8_t v0 = x0[j];
const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
const int x1_0 = ((v0 >> 4) | x1_0h) - 16;
const int y0_0 = y0[2*j + 0];
const int y1_0 = y0[2*j + 1];
sxy += x0_0*y0_0 + x1_0*y1_0;
}
sumf += (d*sxy)*y[i].d;
}
*s = sumf;
#endif
}
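
Note: the qh word packs bit 4 of each of the block's 32 five-bit quants,
one bit per element index, which is what the table_b2b_u lookups in the
SIMD paths rebuild sixteen lanes at a time. A minimal standalone sketch
(hypothetical data) of the pack/unpack round trip for two elements:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint32_t quants[2] = {27, 4};  // two 5-bit values at l = 0, 1
    uint32_t qh = 0;
    uint8_t nib = 0;
    for (int l = 0; l < 2; l++) {
        nib |= (quants[l] & 0x0F) << (4*l);     // low 4 bits -> packed nibbles
        qh  |= ((quants[l] & 0x10) >> 4) << l;  // 5th bit -> bit l of qh
    }
    for (int l = 0; l < 2; l++) {
        const uint8_t vh = ((qh >> l) & 1) << 4;
        const uint8_t vi = ((nib >> (4*l)) & 0x0F) | vh;
        printf("l=%d decoded=%u\n", l, vi);     // prints 27, then 4
    }
    return 0;
}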

24
third_party/ggml/ggjt.v1.q5_0.h vendored Normal file

@ -0,0 +1,24 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_0_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_0_H_
#include "third_party/ggml/fp16.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define V1_QK5_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qh[4]; // 5th bit of quants
uint8_t qs[V1_QK5_0 / 2]; // nibbles / quants
} block_v1_q5_0;
void dequantize_row_v1_q5_0(const void* restrict, float* restrict, int);
void quantize_row_v1_q5_0(const float* restrict, void* restrict, int);
size_t ggml_quantize_v1_q5_0(const float*, void*, int, int, int64_t*);
void ggml_vec_dot_v1_q5_0_q8_0(const int, float* restrict, const void* restrict,
const void* restrict);
void quantize_row_v1_q5_0_reference(const float* restrict,
block_v1_q5_0* restrict, int);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_0_H_ */

352
third_party/ggml/ggjt.v1.q5_1.c vendored Normal file

@ -0,0 +1,352 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/ggml/ggjt.v1.q5_1.h"
#include "libc/assert.h"
#include "libc/math.h"
#include "libc/str/str.h"
#include "third_party/aarch64/arm_neon.h"
#include "third_party/ggml/fp16.internal.h"
#include "third_party/ggml/ggjt.v1.internal.h"
#include "third_party/ggml/ggjt.v1.q8_1.h"
// clang-format off
static_assert(sizeof(block_v1_q5_1) ==
2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + V1_QK5_1 / 2,
"wrong q5_1 block size/padding");
void quantize_row_v1_q5_1_reference(const float * restrict x, block_v1_q5_1 * restrict y, int k) {
assert(k % V1_QK5_1 == 0);
const int nb = k / V1_QK5_1;
for (int i = 0; i < nb; i++) {
float min = FLT_MAX;
float max = -FLT_MAX;
for (int l = 0; l < V1_QK5_1; l++) {
const float v = x[i*V1_QK5_1 + l];
if (v < min) min = v;
if (v > max) max = v;
}
const float d = (max - min) / ((1 << 5) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].m = GGML_FP32_TO_FP16(min);
uint32_t qh = 0;
for (int l = 0; l < V1_QK5_1; l += 2) {
const float v0 = (x[i*V1_QK5_1 + l + 0] - min)*id;
const float v1 = (x[i*V1_QK5_1 + l + 1] - min)*id;
const uint32_t vi0 = (int) (v0 + 0.5f);
const uint32_t vi1 = (int) (v1 + 0.5f);
y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4);
// get the 5th bit and store it in qh at the right position
qh |= ((vi0 & 0x10) >> 4) << (l + 0);
qh |= ((vi1 & 0x10) >> 4) << (l + 1);
}
memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
}
}
void quantize_row_v1_q5_1(const float * restrict x, void * restrict vy, int k) {
assert(k % V1_QK5_1 == 0);
block_v1_q5_1 * restrict y = vy;
quantize_row_v1_q5_1_reference(x, y, k);
}
size_t ggml_quantize_v1_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % V1_QK5_1 == 0);
const int nb = k / V1_QK5_1;
for (int j = 0; j < n; j += k) {
block_v1_q5_1 * restrict y = (block_v1_q5_1 *)dst + j/V1_QK5_1;
quantize_row_v1_q5_1_reference(src + j, y, k);
for (int i = 0; i < nb; i++) {
uint32_t qh;
memcpy(&qh, &y[i].qh, sizeof(qh));
for (int l = 0; l < V1_QK5_1; l += 2) {
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
// map the 5-bit values into 16 histogram bins
const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
const uint8_t vi1 = ((y[i].qs[l/2] >> 4) | vh1) / 2;
hist[vi0]++;
hist[vi1]++;
}
}
}
return (n/V1_QK5_1*sizeof(block_v1_q5_1));
}
void dequantize_row_v1_q5_1(const void * restrict vx, float * restrict y, int k) {
assert(k % V1_QK5_1 == 0);
const int nb = k / V1_QK5_1;
const block_v1_q5_1 * restrict x = vx;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d);
const float m = GGML_FP16_TO_FP32(x[i].m);
const uint8_t * restrict pp = x[i].qs;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
for (int l = 0; l < V1_QK5_1; l += 2) {
const uint8_t vi = pp[l/2];
// extract the 5th bit from qh
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
const uint8_t vi0 = (vi & 0x0F) | vh0;
const uint8_t vi1 = (vi >> 4) | vh1;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*V1_QK5_1 + l + 0] = v0;
y[i*V1_QK5_1 + l + 1] = v1;
assert(!isnan(y[i*V1_QK5_1 + l + 0]));
assert(!isnan(y[i*V1_QK5_1 + l + 1]));
}
}
}
void ggml_vec_dot_v1_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
const int nb = n / V1_QK8_1;
assert(n % V1_QK8_1 == 0);
assert(nb % 2 == 0);
assert(V1_QK8_1 == V1_QK5_1);
const block_v1_q5_1 * restrict x = vx;
const block_v1_q8_1 * restrict y = vy;
#if defined(__ARM_NEON)
float32x4_t sumv = vdupq_n_f32(0.0f);
float summs = 0.0f;
uint64_t tmp[4];
for (int i = 0; i < nb; ++i) {
const block_v1_q5_1 * restrict x0 = &x[i];
const block_v1_q8_1 * restrict y0 = &y[i];
summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
// extract the 5th bit
uint32_t qh;
memcpy(&qh, x0->qh, sizeof(qh));
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
tmp[3] = table_b2b_u[(qh >> 24) ];
const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
const uint8x16_t v0 = vld1q_u8(x0->qs);
// 4-bit -> 8-bit
const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, vdupq_n_u8(0x0F)));
const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4));
// interleave
const int8x16_t v0lz = vzip1q_s8(v0l, v0h);
const int8x16_t v0hz = vzip2q_s8(v0l, v0h);
// add
const int8x16_t v0lf = vorrq_s8(v0lz, qhl);
const int8x16_t v0hf = vorrq_s8(v0hz, qhh);
// load y
const int8x16_t v1l = vld1q_s8(y0->qs);
const int8x16_t v1h = vld1q_s8(y0->qs + 16);
const float x0d = GGML_FP16_TO_FP32(x0->d);
#if defined(__ARM_FEATURE_DOTPROD)
sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0lf, v1l),
vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
#endif
}
*s = vaddvq_f32(sumv) + summs;
#elif defined(__wasm_simd128__)
v128_t sumv = wasm_f32x4_splat(0.0f);
float summs = 0.0f;
uint64_t tmp[4];
for (int i = 0; i < nb; ++i) {
const block_v1_q5_1 * restrict x0 = &x[i];
const block_v1_q8_1 * restrict y0 = &y[i];
summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
const v128_t m4b = wasm_i8x16_splat(0x0F);
// extract the 5th bit
uint32_t qh;
memcpy(&qh, x0->qh, sizeof(qh));
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
tmp[3] = table_b2b_u[(qh >> 24) ];
const v128_t qhl = wasm_v128_load(tmp + 0);
const v128_t qhh = wasm_v128_load(tmp + 2);
const v128_t v0 = wasm_v128_load(x0->qs);
// 4-bit -> 8-bit
const v128_t v0l = wasm_v128_and (v0, m4b);
const v128_t v0h = wasm_u8x16_shr(v0, 4);
// interleave
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
// add high bit
const v128_t v0lf = wasm_v128_or(v0lz, qhl);
const v128_t v0hf = wasm_v128_or(v0hz, qhh);
// load y
const v128_t v1l = wasm_v128_load(y0->qs);
const v128_t v1h = wasm_v128_load(y0->qs + 16);
// int8x16 -> int16x8
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
const float x0d = GGML_FP16_TO_FP32(x0->d);
// dot product
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
wasm_i32x4_add(
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
}
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
float summs = 0.0f;
// Main loop
for (int i = 0; i < nb; i++) {
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1);
__m256i bx = bytes_from_nibbles_32(x[i].qs);
__m256i bxhi = bytes_from_bits_32(x[i].qh);
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
bx = _mm256_or_si256(bx, bxhi);
const __m256 dy = _mm256_broadcast_ss(&y[i].d);
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
}
*s = hsum_float_8(acc) + summs;
#else
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const uint8_t * restrict x0 = x[i].qs;
const int8_t * restrict y0 = y[i].qs;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
const float d = GGML_FP16_TO_FP32(x[i].d);
const float m = GGML_FP16_TO_FP32(x[i].m);
int sxy = 0;
for (int j = 0; j < V1_QK8_1/2; j++) {
const uint8_t v0 = x0[j];
const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
const int x0_0 = (v0 & 0x0F) | x0_0h;
const int x1_0 = (v0 >> 4) | x1_0h;
const int y0_0 = y0[2*j + 0];
const int y1_0 = y0[2*j + 1];
sxy += x0_0*y0_0 + x1_0*y1_0;
}
sumf += (d*sxy)*y[i].d + m*(y[i].s0 + y[i].s1);
}
*s = sumf;
#endif
}
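
Note: unlike the symmetric q5_0, q5_1 is affine: d = (max - min)/31 and
values are offset by min before scaling, so decode is vi*d + m. A worked
numeric sketch (standalone; values hypothetical):

#include <stdio.h>

int main(void) {
    const float min = -1.0f, max = 30.0f;
    const float d = (max - min) / ((1 << 5) - 1);         // 31/31 = 1.0
    const float id = d ? 1.0f/d : 0.0f;
    const float v = 14.2f;
    const unsigned vi = (unsigned)((v - min)*id + 0.5f);  // rounds to 15
    printf("q=%u dequant=%g\n", vi, vi*d + min);          // dequant = 14
    return 0;
}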

25
third_party/ggml/ggjt.v1.q5_1.h vendored Normal file

@ -0,0 +1,25 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_1_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_1_H_
#include "third_party/ggml/fp16.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define V1_QK5_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qh[4]; // 5th bit of quants
uint8_t qs[V1_QK5_1 / 2]; // nibbles / quants
} block_v1_q5_1;
void dequantize_row_v1_q5_1(const void* restrict, float* restrict, int);
void quantize_row_v1_q5_1(const float* restrict, void* restrict, int);
size_t ggml_quantize_v1_q5_1(const float*, void*, int, int, int64_t*);
void ggml_vec_dot_v1_q5_1_q8_1(const int, float* restrict, const void* restrict,
const void* restrict);
void quantize_row_v1_q5_1_reference(const float* restrict,
block_v1_q5_1* restrict, int);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_1_H_ */

325
third_party/ggml/ggjt.v1.q8_0.c vendored Normal file

@ -0,0 +1,325 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/ggml/ggjt.v1.q8_0.h"
#include "libc/assert.h"
#include "libc/macros.internal.h"
#include "third_party/aarch64/arm_neon.h"
#include "third_party/ggml/ggjt.v1.internal.h"
#include "third_party/ggml/ggjt.v1.q8_0.h"
#include "third_party/intel/immintrin.internal.h"
#include "third_party/libcxx/math.h"
// clang-format off
static_assert(sizeof(block_v1_q8_0) == sizeof(float) + V1_QK8_0,
"wrong q8_0 block size/padding");
// reference implementation for deterministic creation of model files
void quantize_row_v1_q8_0_reference(const float * restrict x, block_v1_q8_0 * restrict y, int k) {
assert(k % V1_QK8_0 == 0);
const int nb = k / V1_QK8_0;
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
for (int l = 0; l < V1_QK8_0; l++) {
const float v = x[i*V1_QK8_0 + l];
amax = MAX(amax, fabsf(v));
}
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
for (int l = 0; l < V1_QK8_0; ++l) {
const float v0 = x[i*V1_QK8_0 + l]*id;
y[i].qs[l] = roundf(v0);
}
}
}
void quantize_row_v1_q8_0(const float * restrict x, void * restrict vy, int k) {
assert(V1_QK8_0 == 32);
assert(k % V1_QK8_0 == 0);
const int nb = k / V1_QK8_0;
block_v1_q8_0 * restrict y = vy;
#if defined(__ARM_NEON)
for (int i = 0; i < nb; i++) {
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
const float amax = vmaxvq_f32(amaxv[0]);
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
for (int l = 0; l < 8; l++) {
const float32x4_t v = vmulq_n_f32(srcv[l], id);
const int32x4_t vi = vcvtnq_s32_f32(v);
y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
}
}
#elif defined(__AVX2__) || defined(__AVX__)
for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors
__m256 v0 = _mm256_loadu_ps( x );
__m256 v1 = _mm256_loadu_ps( x + 8 );
__m256 v2 = _mm256_loadu_ps( x + 16 );
__m256 v3 = _mm256_loadu_ps( x + 24 );
x += 32;
// Compute max(abs(e)) for the block
const __m256 signBit = _mm256_set1_ps( -0.0f );
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
const float maxScalar = _mm_cvtss_f32( max4 );
// Quantize these floats
const float d = maxScalar / 127.f;
y[i].d = d;
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
const __m256 mul = _mm256_set1_ps( id );
// Apply the multiplier
v0 = _mm256_mul_ps( v0, mul );
v1 = _mm256_mul_ps( v1, mul );
v2 = _mm256_mul_ps( v2, mul );
v3 = _mm256_mul_ps( v3, mul );
// Round to nearest integer
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
// Convert floats to integers
__m256i i0 = _mm256_cvtps_epi32( v0 );
__m256i i1 = _mm256_cvtps_epi32( v1 );
__m256i i2 = _mm256_cvtps_epi32( v2 );
__m256i i3 = _mm256_cvtps_epi32( v3 );
#if defined(__AVX2__)
// Convert int32 to int16
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
// Convert int16 to int8
i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
// We now have our signed bytes, but their order is wrong, because the
// AVX2 pack instructions handle the two 16-byte lanes independently.
// The following permute fixes the order.
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
i0 = _mm256_permutevar8x32_epi32( i0, perm );
_mm256_storeu_si256((__m256i *)y[i].qs, i0);
#else
// Since AVX lacks some of the integer instructions we need,
// we split the registers in half and use their SSE counterparts
__m128i ni0 = _mm256_castsi256_si128( i0 );
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
__m128i ni2 = _mm256_castsi256_si128( i1 );
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
__m128i ni4 = _mm256_castsi256_si128( i2 );
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
__m128i ni6 = _mm256_castsi256_si128( i3 );
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
// Convert int32 to int16
ni0 = _mm_packs_epi32( ni0, ni1 );
ni2 = _mm_packs_epi32( ni2, ni3 );
ni4 = _mm_packs_epi32( ni4, ni5 );
ni6 = _mm_packs_epi32( ni6, ni7 );
// Convert int16 to int8
ni0 = _mm_packs_epi16( ni0, ni2 );
ni4 = _mm_packs_epi16( ni4, ni6 );
_mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
#else
// scalar
quantize_row_v1_q8_0_reference(x, y, k);
#endif
}
size_t ggml_quantize_v1_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % V1_QK8_0 == 0);
const int nb = k / V1_QK8_0;
for (int j = 0; j < n; j += k) {
block_v1_q8_0 * restrict y = (block_v1_q8_0 *)dst + j/V1_QK8_0;
quantize_row_v1_q8_0_reference(src + j, y, k);
for (int i = 0; i < nb; i++) {
for (int l = 0; l < V1_QK8_0; ++l) {
const int8_t vi = y[i].qs[l];
hist[vi/16 + 8]++;
}
}
}
return (n/V1_QK8_0*sizeof(block_v1_q8_0));
}
void dequantize_row_v1_q8_0(const void * restrict vx, float * restrict y, int k) {
assert(k % V1_QK8_0 == 0);
const int nb = k / V1_QK8_0;
const block_v1_q8_0 * restrict x = vx;
for (int i = 0; i < nb; i++) {
const float d = x[i].d;
const int8_t * restrict pp = x[i].qs;
for (int l = 0; l < V1_QK8_0; ++l) {
y[i*V1_QK8_0 + l] = pp[l]*d;
}
}
}
void ggml_vec_dot_v1_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
const int nb = n / V1_QK8_0;
assert(n % V1_QK8_0 == 0);
assert(nb % 2 == 0);
const block_v1_q8_0 * restrict x = vx;
const block_v1_q8_0 * restrict y = vy;
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i += 2) {
const block_v1_q8_0 * restrict x0 = &x[i + 0];
const block_v1_q8_0 * restrict x1 = &x[i + 1];
const block_v1_q8_0 * restrict y0 = &y[i + 0];
const block_v1_q8_0 * restrict y1 = &y[i + 1];
const int8x16_t x0_0 = vld1q_s8(x0->qs);
const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
const int8x16_t x1_0 = vld1q_s8(x1->qs);
const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
// load y
const int8x16_t y0_0 = vld1q_s8(y0->qs);
const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
const int8x16_t y1_0 = vld1q_s8(y1->qs);
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
#else
const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
const __m256 q = mul_sum_i8_pairs_float(bx, by);
// Multiply q with scale and accumulate
acc = _mm256_fmadd_ps( d, q, acc );
}
*s = hsum_float_8(acc);
#else
// scalar
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const int8_t * restrict x0 = x[i].qs;
const int8_t * restrict y0 = y[i].qs;
int sumi = 0;
for (int j = 0; j < V1_QK8_0; j++) {
const int v0 = x0[j];
const int v1 = y0[j];
sumi += v0*v1;
}
sumf += (x[i].d*y[i].d)*sumi;
}
*s = sumf;
#endif
}
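
Note: a hypothetical usage sketch of the q8_0 kernels above (standalone,
assuming only this file's header; the asserts require n to be a positive
multiple of 2*V1_QK8_0):

#include "third_party/ggml/ggjt.v1.q8_0.h"
#include <stdio.h>

int main(void) {
    enum { N = 64 };                          // nb = 2 blocks, so nb % 2 == 0
    float src[N], dot;
    block_v1_q8_0 q[N / V1_QK8_0];
    for (int i = 0; i < N; i++) src[i] = (i - 32) / 7.0f;
    quantize_row_v1_q8_0(src, q, N);          // fp32 -> 8-bit blocks
    ggml_vec_dot_v1_q8_0_q8_0(N, &dot, q, q); // quantized self dot product
    printf("dot=%f\n", dot);                  // ~ sum of src[i]^2
    return 0;
}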

22
third_party/ggml/ggjt.v1.q8_0.h vendored Normal file

@ -0,0 +1,22 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_0_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_0_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define V1_QK8_0 32
typedef struct {
float d; // delta
int8_t qs[V1_QK8_0]; // quants
} block_v1_q8_0;
void dequantize_row_v1_q8_0(const void* restrict, float* restrict, int);
void quantize_row_v1_q8_0(const float* restrict, void* restrict, int);
size_t ggml_quantize_v1_q8_0(const float*, void*, int, int, int64_t*);
void ggml_vec_dot_v1_q8_0_q8_0(const int, float* restrict, const void* restrict,
const void* restrict);
void quantize_row_v1_q8_0_reference(const float* restrict,
block_v1_q8_0* restrict, int);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_0_H_ */

240
third_party/ggml/ggjt.v1.q8_1.c vendored Normal file

@ -0,0 +1,240 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/ggml/ggjt.v1.q8_1.h"
#include "libc/assert.h"
#include "libc/macros.internal.h"
#include "third_party/aarch64/arm_neon.h"
#include "third_party/ggml/ggml.h"
#include "third_party/intel/immintrin.internal.h"
#include "third_party/libcxx/math.h"
// clang-format off
static_assert(sizeof(block_v1_q8_1) == 3 * sizeof(float) + V1_QK8_1,
"wrong q8_1 block size/padding");
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
// horizontally add 8 int32_t
static inline int hsum_i32_8(const __m256i a) {
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}
// horizontally add 4 int32_t (used by the AVX-only path below; mirrors the
// hsum_i32_4 helper in upstream llama.cpp, which this file otherwise lacks,
// and without which the !__AVX2__ branch would not compile)
static inline int hsum_i32_4(const __m128i a) {
const __m128i hi64 = _mm_unpackhi_epi64(a, a);
const __m128i sum64 = _mm_add_epi32(hi64, a);
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}
#endif /* AVX || AVX2 || AVX512F */
// reference implementation for deterministic creation of model files
void quantize_row_v1_q8_1_reference(const float * restrict x, block_v1_q8_1 * restrict y, int k) {
assert(V1_QK8_1 == 32);
assert(k % V1_QK8_1 == 0);
const int nb = k / V1_QK8_1;
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
for (int l = 0; l < V1_QK8_1; l++) {
const float v = x[i*V1_QK8_1 + l];
amax = MAX(amax, fabsf(v));
}
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
int sum0 = 0;
int sum1 = 0;
for (int l = 0; l < V1_QK8_1/2; ++l) {
const float v0 = x[i*V1_QK8_1 + l]*id;
const float v1 = x[i*V1_QK8_1 + V1_QK8_1/2 + l]*id;
y[i].qs[ l] = roundf(v0);
y[i].qs[V1_QK8_1/2 + l] = roundf(v1);
sum0 += y[i].qs[ l];
sum1 += y[i].qs[V1_QK8_1/2 + l];
}
y[i].s0 = d * sum0;
y[i].s1 = d * sum1;
}
}
void quantize_row_v1_q8_1(const float * restrict x, void * restrict vy, int k) {
assert(k % V1_QK8_1 == 0);
const int nb = k / V1_QK8_1;
block_v1_q8_1 * restrict y = vy;
#if defined(__ARM_NEON)
for (int i = 0; i < nb; i++) {
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
const float amax = vmaxvq_f32(amaxv[0]);
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
int32x4_t accv0 = vdupq_n_s32(0);
int32x4_t accv1 = vdupq_n_s32(0);
// low half
for (int l = 0; l < 4; l++) {
const float32x4_t v = vmulq_n_f32(srcv[l], id);
const int32x4_t vi = vcvtnq_s32_f32(v);
y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
accv0 = vaddq_s32(accv0, vi);
}
// high half
for (int l = 4; l < 8; l++) {
const float32x4_t v = vmulq_n_f32(srcv[l], id);
const int32x4_t vi = vcvtnq_s32_f32(v);
y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
accv1 = vaddq_s32(accv1, vi);
}
const int32_t sum0 = vaddvq_s32(accv0);
const int32_t sum1 = vaddvq_s32(accv1);
y[i].s0 = d * sum0;
y[i].s1 = d * sum1;
}
#elif defined(__AVX2__) || defined(__AVX__)
for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors
__m256 v0 = _mm256_loadu_ps( x );
__m256 v1 = _mm256_loadu_ps( x + 8 );
__m256 v2 = _mm256_loadu_ps( x + 16 );
__m256 v3 = _mm256_loadu_ps( x + 24 );
x += 32;
// Compute max(abs(e)) for the block
const __m256 signBit = _mm256_set1_ps( -0.0f );
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
const float maxScalar = _mm_cvtss_f32( max4 );
// Quantize these floats
const float d = maxScalar / 127.f;
y[i].d = d;
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
const __m256 mul = _mm256_set1_ps( id );
// Apply the multiplier
v0 = _mm256_mul_ps( v0, mul );
v1 = _mm256_mul_ps( v1, mul );
v2 = _mm256_mul_ps( v2, mul );
v3 = _mm256_mul_ps( v3, mul );
// Round to nearest integer
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
// Convert floats to integers
__m256i i0 = _mm256_cvtps_epi32( v0 );
__m256i i1 = _mm256_cvtps_epi32( v1 );
__m256i i2 = _mm256_cvtps_epi32( v2 );
__m256i i3 = _mm256_cvtps_epi32( v3 );
#if defined(__AVX2__)
// Compute the two half sums of the quants and set y[i].s0 and y[i].s1
y[i].s0 = d * hsum_i32_8(_mm256_add_epi32(i0, i1));
y[i].s1 = d * hsum_i32_8(_mm256_add_epi32(i2, i3));
// Convert int32 to int16
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
// Convert int16 to int8
i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
// We now have our signed bytes, but their order is wrong, because the
// AVX2 pack instructions handle the two 16-byte lanes independently.
// The following permute fixes the order.
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
i0 = _mm256_permutevar8x32_epi32( i0, perm );
_mm256_storeu_si256((__m256i *)y[i].qs, i0);
#else
// Since AVX lacks some of the integer instructions we need,
// we split the registers in half and use their SSE counterparts
__m128i ni0 = _mm256_castsi256_si128( i0 );
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
__m128i ni2 = _mm256_castsi256_si128( i1 );
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
__m128i ni4 = _mm256_castsi256_si128( i2 );
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
__m128i ni6 = _mm256_castsi256_si128( i3 );
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
// Compute the two half sums of the quants and set y[i].s0 and y[i].s1
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
y[i].s0 = d * hsum_i32_4(s0);
y[i].s1 = d * hsum_i32_4(s1);
// Convert int32 to int16
ni0 = _mm_packs_epi32( ni0, ni1 );
ni2 = _mm_packs_epi32( ni2, ni3 );
ni4 = _mm_packs_epi32( ni4, ni5 );
ni6 = _mm_packs_epi32( ni6, ni7 );
// Convert int16 to int8
ni0 = _mm_packs_epi16( ni0, ni2 );
ni4 = _mm_packs_epi16( ni4, ni6 );
_mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
#else
// scalar
quantize_row_v1_q8_1_reference(x, y, k);
#endif
}
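
Note: one subtlety worth flagging in quantize_row_v1_q8_1 (and q8_0): the
reference path uses roundf, which rounds halfway cases away from zero,
while vcvtnq_s32_f32 and _MM_ROUND_NEAREST round ties to even, so exact
.5 ties can land on different quants; presumably this is why model files
are created with the _reference variants. A small standalone illustration:

#include <math.h>
#include <stdio.h>

int main(void) {
    const float v = 2.5f;
    // roundf: ties away from zero; rintf: ties to even under the default
    // rounding mode, matching the SIMD conversion instructions above
    printf("roundf=%g rintf=%g\n", roundf(v), rintf(v));  // 3 vs 2
    return 0;
}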

20
third_party/ggml/ggjt.v1.q8_1.h vendored Normal file

@ -0,0 +1,20 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_1_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_1_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define V1_QK8_1 32
typedef struct {
float d; // delta
float s0; // d * sum(qs[i]) low
float s1; // d * sum(qs[i]) high
int8_t qs[V1_QK8_1]; // quants
} block_v1_q8_1;
void quantize_row_v1_q8_1(const float* restrict, void* restrict, int);
void quantize_row_v1_q8_1_reference(const float* restrict,
block_v1_q8_1* restrict, int);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_1_H_ */
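Dequantization is the mirror image. A minimal sketch; this helper is hypothetical and not part of the header above:

    // Hypothetical helper (not in the header above): reconstruct the
    // 32 floats of one block from its scale and int8 quants.
    static void dequantize_block_v1_q8_1(const block_v1_q8_1 *b, float *y) {
        for (int j = 0; j < V1_QK8_1; j++) {
            y[j] = b->d * b->qs[j];  // x ≈ d * q
        }
    }

The precomputed s0/s1 sums exist so dot products against the offset 4-bit formats can fold their constant term without re-summing the int8 quants.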

2271
third_party/ggml/ggml.c vendored

File diff suppressed because it is too large

third_party/ggml/ggml.h vendored

@@ -205,20 +205,6 @@ COSMOPOLITAN_C_START_
} \
} while (0)
-#ifdef __ARM_NEON
-// we use the built-in 16-bit float type
-typedef __fp16 ggml_fp16_t;
-#else
-typedef uint16_t ggml_fp16_t;
-#endif
-// convert FP16 <-> FP32
-GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
-GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
-GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
struct ggml_object;
struct ggml_context;
@@ -385,6 +371,13 @@ COSMOPOLITAN_C_START_
bool no_alloc; // don't allocate memory for the tensor data
};
+//
+// compatibility
+//
+GGML_API void ggjt_v2(void);
+GGML_API void ggjt_v1(void);
// misc
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -926,6 +919,12 @@ COSMOPOLITAN_C_START_
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+extern const int *GGML_BLCK_SIZE;
+extern const size_t *GGML_TYPE_SIZE;
+extern const bool *GGML_IS_QUANTIZED;
+extern const char *const *GGML_TYPE_NAME;
+extern const quantize_fns_t *quantize_fns;
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ */
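The compatibility shims work by indirection: the per-type metadata arrays are exported as pointers (the externs above) that ggjt_v1() and ggjt_v2() repoint at generation-specific tables. A hedged sketch of the mechanism; the array names, sizes, and values here are invented for illustration:

    // Illustrative pointer-swap dispatch. The real tables live in ggml.c
    // and ggjt.v1.c and cover GGML_TYPE_SIZE, GGML_TYPE_NAME, and the
    // quantize_fns table as well; N_SKETCH and the values are made up.
    enum { N_SKETCH = 16 };
    static const int v1_blck[N_SKETCH] = { 1, 1, 32, 32, 32 /* ... */ };
    static const int v2_blck[N_SKETCH] = { 1, 1, 32, 32, 0  /* type dropped */ };

    const int *GGML_BLCK_SIZE = v2_blck;  // newest generation by default

    void ggjt_v1(void) { GGML_BLCK_SIZE = v1_blck; }  // old file detected
    void ggjt_v2(void) { GGML_BLCK_SIZE = v2_blck; }  // new file detected

Entries a generation doesn't define can stay zero, which is what the loader's consistency check further down keys on.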

third_party/ggml/ggml.mk vendored

@@ -10,12 +10,35 @@ PKGS += THIRD_PARTY_GGML
THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_A
THIRD_PARTY_GGML = $(THIRD_PARTY_GGML_A_DEPS) $(THIRD_PARTY_GGML_A)
THIRD_PARTY_GGML_A = o/$(MODE)/third_party/ggml/ggml.a
-THIRD_PARTY_GGML_A_HDRS = third_party/ggml/ggml.h
-THIRD_PARTY_GGML_A_SRCS = third_party/ggml/ggml.c
THIRD_PARTY_GGML_A_OBJS = $(THIRD_PARTY_GGML_A_SRCS:%.c=o/$(MODE)/%.o)
THIRD_PARTY_GGML_A_FILES = $(THIRD_PARTY_GGML_A_SRCS) $(THIRD_PARTY_GGML_A_HDRS)
THIRD_PARTY_GGML_A_CHECKS = $(THIRD_PARTY_GGML_A).pkg $(THIRD_PARTY_GGML_A_HDRS:%=o/$(MODE)/%.ok)
+THIRD_PARTY_GGML_A_HDRS = \
+third_party/ggml/fp16.h \
+third_party/ggml/ggml.h \
+third_party/ggml/ggjt.v1.q4_0.h \
+third_party/ggml/ggjt.v1.q4_1.h \
+third_party/ggml/ggjt.v1.q4_2.h \
+third_party/ggml/ggjt.v1.q5_0.h \
+third_party/ggml/ggjt.v1.q5_1.h \
+third_party/ggml/ggjt.v1.q8_0.h \
+third_party/ggml/ggjt.v1.q8_1.h \
+third_party/ggml/fp16.internal.h \
+third_party/ggml/ggjt.v1.internal.h
+THIRD_PARTY_GGML_A_SRCS = \
+third_party/ggml/fp16.c \
+third_party/ggml/ggml.c \
+third_party/ggml/ggjt.v1.c \
+third_party/ggml/ggjt.v1.q4_0.c \
+third_party/ggml/ggjt.v1.q4_1.c \
+third_party/ggml/ggjt.v1.q4_2.c \
+third_party/ggml/ggjt.v1.q5_0.c \
+third_party/ggml/ggjt.v1.q5_1.c \
+third_party/ggml/ggjt.v1.q8_0.c \
+third_party/ggml/ggjt.v1.q8_1.c
THIRD_PARTY_GGML_A_DIRECTDEPS = \
LIBC_CALLS \
LIBC_INTRIN \

third_party/ggml/llama.cpp vendored

@@ -30,6 +30,8 @@
#include "libc/assert.h"
#include "libc/intrin/bits.h"
#include "libc/macros.internal.h"
#include "libc/stdio/stdio.h"
#include "third_party/ggml/fp16.h"
#include "third_party/ggml/ggml.h"
#include "third_party/ggml/llama_util.h"
#include "third_party/libcxx/algorithm"
@@ -431,7 +433,8 @@ struct llama_load_tensors_map {
enum llama_file_version {
LLAMA_FILE_VERSION_GGML,
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
-LLAMA_FILE_VERSION_GGJT_V1, // added padding
+LLAMA_FILE_VERSION_GGJT_V1, // adopted unified aligned mappable layout
+LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
};
struct llama_file_loader {
@@ -458,10 +461,16 @@ struct llama_file_loader {
if (magic == READ32BE("ggml") && version == 0) {
file_version = LLAMA_FILE_VERSION_GGML;
+ggjt_v1();
} else if (magic == READ32BE("ggmf") && version == 1) {
file_version = LLAMA_FILE_VERSION_GGMF_V1;
+ggjt_v1();
} else if (magic == READ32BE("ggjt") && version == 1) {
file_version = LLAMA_FILE_VERSION_GGJT_V1;
+ggjt_v1();
+} else if (magic == READ32BE("ggjt") && version == 2) {
+file_version = LLAMA_FILE_VERSION_GGJT_V2;
+ggjt_v2();
} else {
Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
magic, version);
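File sniffing thus reduces to a lookup on the (magic, version) pair. A condensed sketch; MAGIC is a simplified stand-in for the READ32BE macro used above:

    #include <stdint.h>

    // Simplified stand-in for READ32BE: pack four ASCII bytes big-endian,
    // so the on-disk magic "ggjt" compares equal to 0x67676a74.
    #define MAGIC(s) ((uint32_t)(s)[0] << 24 | (uint32_t)(s)[1] << 16 | \
                      (uint32_t)(s)[2] << 8 | (uint32_t)(s)[3])

    // Returns which quantizer generation to install: 1 for ggjt_v1(),
    // 2 for ggjt_v2(), 0 for an unrecognized header.
    static int classify(uint32_t magic, uint32_t version) {
        if (magic == MAGIC("ggml") && version == 0) return 1;
        if (magic == MAGIC("ggmf") && version == 1) return 1;
        if (magic == MAGIC("ggjt") && version == 1) return 1;
        if (magic == MAGIC("ggjt") && version == 2) return 2;
        return 0;
    }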
@@ -870,8 +879,9 @@ bool llama_mlock_supported() {
static const char *llama_file_version_name(llama_file_version version) {
switch (version) {
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
-case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
+case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (pre #613 with sharded files and no mmap support)";
+case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
default: LLAMA_ASSERT(false);
}
}
@@ -951,6 +961,19 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
}
+// check for consistency between ftype and version. for example,
+// Q4_2 was removed when GGJT v2 was introduced, so we reject it
+// unless the file is using an earlier version number.
+if (((unsigned)hparams.ftype >= GGML_TYPE_COUNT ||
+!GGML_BLCK_SIZE[hparams.ftype] ||
+!GGML_TYPE_SIZE[hparams.ftype] ||
+!GGML_TYPE_NAME[hparams.ftype])) {
+fprintf(stderr, "%s: error: '%s' isn't specified by '%s'\n",
+__func__, llama_ftype_name(hparams.ftype),
+llama_file_version_name(file_version));
+exit(1);
+}
if (vocab_only) {
return;
}
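Since the version shims install tables whose entries are zero for types that generation lacks, the consistency test is just a bounds check plus three table probes. A sketch, assuming GGML_TYPE_COUNT and the extern tables from ggml.h:

    // Sketch: an ftype is usable under the currently-installed tables
    // iff every metadata table has a nonzero entry for it.
    static int ftype_is_consistent(unsigned ftype) {
        return ftype < GGML_TYPE_COUNT &&
               GGML_BLCK_SIZE[ftype] &&
               GGML_TYPE_SIZE[ftype] &&
               GGML_TYPE_NAME[ftype];
    }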


@@ -47,6 +47,8 @@
"__AES__"
"__AVX__"
"__AVX2__"
"__AVX512F__"
"__AVXVNNI__"
"__ABM__"
"__BMI__"
"__BMI2__"