mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 11:37:35 +00:00
e7eb0b3070
- Fix UX issues with llama.com - Do housekeeping on libm code - Add more vectorization to GGML - Get GGJT quantizer programs working well - Have the quantizer keep the output layer as f16c - Prefetching improves performance 15% if you use fewer threads
135 lines
5.8 KiB
C
135 lines
5.8 KiB
C
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
╚──────────────────────────────────────────────────────────────────────────────╝
│                                                                              │
│  GGML                                                                        │
│  Copyright (c) 2023 Georgi Gerganov                                          │
│                                                                              │
│  Permission is hereby granted, free of charge, to any person obtaining       │
│  a copy of this software and associated documentation files (the             │
│  "Software"), to deal in the Software without restriction, including         │
│  without limitation the rights to use, copy, modify, merge, publish,         │
│  distribute, sublicense, and/or sell copies of the Software, and to          │
│  permit persons to whom the Software is furnished to do so, subject to       │
│  the following conditions:                                                   │
│                                                                              │
│  The above copyright notice and this permission notice shall be              │
│  included in all copies or substantial portions of the Software.             │
│                                                                              │
│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
│                                                                              │
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "third_party/ggml/fp16.h"
|
|
#include "libc/str/str.h"
|
|
#include "third_party/ggml/fp16.internal.h"
|
|
#include "third_party/libcxx/math.h"
|
|
|
|
asm(".ident\t\"\\n\\n\
|
|
GGML (MIT License)\\n\
|
|
Copyright (c) 2023 Georgi Gerganov\"");
|
|
asm(".include \"libc/disclaimer.inc\"");
|
|
// clang-format off
|
|
|
|
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
// Recursive generator for the 256-entry byte-expansion tables below.
//
//   clr - two hex digits emitted for a byte whose index bit is 0
//   set - two hex digits emitted for a byte whose index bit is 1
//   acc - hex digits accumulated so far (most significant byte first)
//
// Every level doubles the number of entries by appending either `clr` or
// `set` to `acc`. B8 decides the most significant byte (index bit 7) and B1
// decides the least significant byte (index bit 0), finally gluing a `0x`
// prefix onto the sixteen accumulated digits.
#define B1(clr, set, acc) 0x ## acc ## clr , 0x ## acc ## set
#define B2(clr, set, acc) B1(clr, set, acc ## clr), B1(clr, set, acc ## set)
#define B3(clr, set, acc) B2(clr, set, acc ## clr), B2(clr, set, acc ## set)
#define B4(clr, set, acc) B3(clr, set, acc ## clr), B3(clr, set, acc ## set)
#define B5(clr, set, acc) B4(clr, set, acc ## clr), B4(clr, set, acc ## set)
#define B6(clr, set, acc) B5(clr, set, acc ## clr), B5(clr, set, acc ## set)
#define B7(clr, set, acc) B6(clr, set, acc ## clr), B6(clr, set, acc ## set)
#define B8(clr, set)      B7(clr, set, clr), B7(clr, set, set)

// Precomputed tables that expand the 8 bits of an index into 8 bytes.
// Byte k of entry [i] is derived from bit k of i.

// byte k = ( bit k) << 4, i.e. 0x10 where the bit is set, 0x00 otherwise
const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) };

// byte k = (!bit k) << 4, i.e. 0x00 where the bit is set, 0x10 otherwise
const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) };
#endif
|
|
|
|
//
|
|
// global data
|
|
//
|
|
|
|
// precomputed gelu table for f16 (128 KB)
|
|
ggml_fp16_t table_gelu_f16[1 << 16];
|
|
|
|
// precomputed silu table for f16 (128 KB)
|
|
ggml_fp16_t table_silu_f16[1 << 16];
|
|
|
|
// precomputed exp table for f16 (128 KB)
|
|
ggml_fp16_t table_exp_f16[1 << 16];
|
|
|
|
// precomputed f32 table for f16 (256 KB)
|
|
float table_f32_f16[1 << 16];
|
|
|
|
// note: do not use these inside ggml.c
|
|
// these are meant to be used via the ggml.h API
|
|
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
|
return (float) GGML_FP16_TO_FP32(x);
|
|
}
|
|
|
|
// Narrows a single float to half precision by applying the
// GGML_FP32_TO_FP16 macro (defined outside this file).
ggml_fp16_t ggml_fp32_to_fp16(float x) {
    const ggml_fp16_t narrowed = GGML_FP32_TO_FP16(x);
    return narrowed;
}
|
|
|
|
// Converts a row of n half-precision values in x to floats in y.
//
//   x - source array, read at indices [0, n)
//   y - destination array, written at indices [0, n)
//   n - number of elements to convert
//
// When compiled with F16C support, full groups of eight elements go through
// the vector conversion instruction using unaligned loads and stores; any
// remaining elements (or all of them without F16C) use GGML_FP16_TO_FP32.
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
    size_t done = 0;
#ifdef __F16C__
    while (done + 7 < n) {
        // 128 bits = eight packed halves, widened to eight packed floats
        const __m128i halves = _mm_loadu_si128((const __m128i *)(x + done));
        const __m256 floats = _mm256_cvtph_ps(halves);
        _mm256_storeu_ps(y + done, floats);
        done += 8;
    }
#endif
    // scalar tail
    while (done < n) {
        y[done] = GGML_FP16_TO_FP32(x[done]);
        ++done;
    }
}
|
|
|
|
// Converts a row of n floats in x to half-precision values in y.
//
//   x - source array, read at indices [0, n)
//   y - destination array, written at indices [0, n)
//   n - number of elements to convert
//
// When compiled with F16C support, the row is consumed in groups of eight,
// then groups of four, using the vector conversion instruction with the
// _MM_FROUND_TO_NEAREST_INT rounding control. Whatever is left (or the whole
// row without F16C) is converted one element at a time with
// GGML_FP32_TO_FP16.
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
    size_t done = 0;
#ifdef __F16C__
    // eight floats -> eight halves (one full 128-bit store)
    while (done + 7 < n) {
        const __m256 floats8 = _mm256_loadu_ps(x + done);
        const __m128i halves8 = _mm256_cvtps_ph(floats8, _MM_FROUND_TO_NEAREST_INT);
        _mm_storeu_si128((__m128i *)(y + done), halves8);
        done += 8;
    }
    // four floats -> four halves (only the low 64 bits are stored)
    while (done + 3 < n) {
        const __m128 floats4 = _mm_loadu_ps(x + done);
        const __m128i halves4 = _mm_cvtps_ph(floats4, _MM_FROUND_TO_NEAREST_INT);
        _mm_storel_epi64((__m128i *)(y + done), halves4);
        done += 4;
    }
#endif
    // scalar tail
    while (done < n) {
        y[done] = GGML_FP32_TO_FP16(x[done]);
        ++done;
    }
}
|
|
|
|
static const float GELU_COEF_A = 0.044715f;
|
|
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
|
|
|
inline static float ggml_gelu_f32(float x) {
|
|
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
|
}
|
|
|
|
// Sigmoid Linear Unit (SiLU) function
|
|
inline static float ggml_silu_f32(float x) {
|
|
return x/(1.0f + expf(-x));
|
|
}
|
|
|
|
void ggml_fp16_init(void) {
|
|
ggml_fp16_t ii;
|
|
for (int i = 0; i < (1 << 16); ++i) {
|
|
uint16_t ui = i;
|
|
memcpy(&ii, &ui, sizeof(ii));
|
|
const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
|
|
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
|
table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
|
|
table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
|
|
}
|
|
}
|