Mirror of https://github.com/jart/cosmopolitan.git, synced 2025-07-12 14:09:12 +00:00
Make more ML improvements

- Fix UX issues with llama.com
- Do housekeeping on libm code
- Add more vectorization to GGML
- Get GGJT quantizer programs working well
- Have the quantizer keep the output layer as f16
- Prefetching improves performance 15% if you use fewer threads
This commit is contained in: parent 80db9de173, commit e7eb0b3070

46 changed files with 340 additions and 289 deletions
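Note: the last two bullets are connected. With fewer threads contending for memory bandwidth, the software prefetches added in this commit can hide DRAM latency instead of fighting over cache, which is presumably the regime where the 15% figure was measured.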
third_party/ggml/README.cosmo (vendored): 2 changes

@@ -27,3 +27,5 @@ LOCAL CHANGES
 - Refactor headers per cosmo convention
 - Replace code like 'ggjt' with READ32BE("ggjt")
 - Remove C++ exceptions; use Die() function instead
+- Removed division from matrix multiplication.
+- Let quantizer convert between ggmt formats
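A note on the READ32BE change mentioned in the README: the macro (from libc/intrin/bits.h) turns a 4-byte tag into its big-endian 32-bit value, which is what the loaders below compare file magics against. A minimal sketch of the equivalent computation (read32be here is a stand-in, not the cosmo macro itself):

    #include <stdint.h>
    #include <stdio.h>

    /* builds the big-endian 32-bit value for a 4-byte tag, mirroring
       what READ32BE("ggjt") evaluates to */
    static uint32_t read32be(const unsigned char p[4]) {
        return (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
               (uint32_t)p[2] << 8 | (uint32_t)p[3];
    }

    int main(void) {
        printf("%#x\n", read32be((const unsigned char *)"ggjt")); /* 0x67676a74 */
        return 0;
    }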
third_party/ggml/common.cc (vendored): 10 changes

@@ -34,6 +34,7 @@
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/fileno.h"
+#include "third_party/ggml/llama_util.h"
 #include "third_party/libcxx/algorithm"
 #include "third_party/libcxx/cassert"
 #include "third_party/libcxx/cstring"

@@ -50,13 +51,6 @@ Copyright (c) 2023 Georgi Gerganov\"");
 asm(".include \"libc/disclaimer.inc\"");
 // clang-format off

-static bool is_integer_str(const char *s) {
-    if (*s == '-') ++s;
-    if (!*s) return false;
-    while (isdigit(*s)) ++s;
-    return !*s;
-}
-
 static std::string replace_all(std::string const& original,
                                std::string const& before,
                                std::string const& after) {

@@ -92,7 +86,7 @@ static bool append_file_to_prompt(const char *path, gpt_params & params) {
 }

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    params.n_threads = std::min(20, std::max(1, (int)(_getcpucount() * 0.75)));
+    params.n_threads = std::min(20, std::max(1, _getcpucount() >> 1));

     bool invalid_param = false;
     std::string arg;
third_party/ggml/common.h (vendored): 4 changes

@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
 #define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
 #include "libc/calls/struct/termios.h"

@@ -21,7 +21,7 @@
 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t verbose = 0; // Logging verbosity
-    int32_t n_threads = std::min(1, (int)(_getcpucount() * 0.75));
+    int32_t n_threads = std::max(1, _getcpucount() >> 1);
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx = 512; // context size
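For reference, the new default thread policy is half the logical cores, clamped to at least 1 (and capped at 20 in common.cc). A standalone sketch of that policy, using POSIX sysconf as a stand-in for cosmo's _getcpucount:

    #include <stdio.h>
    #include <unistd.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
        int ncpu = (int)sysconf(_SC_NPROCESSORS_ONLN); /* stand-in for _getcpucount() */
        int n_threads = MIN(20, MAX(1, ncpu >> 1));    /* half the cores, clamped to 1..20 */
        printf("n_threads = %d\n", n_threads);
        return 0;
    }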
third_party/ggml/fp16.c (vendored): 10 changes

@@ -78,7 +78,15 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 }

 void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
-    for (size_t i = 0; i < n; i++) {
+    size_t i = 0;
+#ifdef __F16C__
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
     }
 }
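The vectorized path above converts eight half floats per iteration with the F16C instructions documented further down in f16cintrin.internal.h. A minimal round-trip sketch, assuming a host with F16C (compile with -mf16c):

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        float in[8] = {1, 2, 3, 4, 5, 6, 7, 8}, out[8];
        uint16_t half[8];
        /* fp32 -> fp16 with round-to-nearest, then fp16 -> fp32 */
        __m128i h = _mm256_cvtps_ph(_mm256_loadu_ps(in), _MM_FROUND_TO_NEAREST_INT);
        _mm_storeu_si128((__m128i *)half, h);
        __m256 f = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)half));
        _mm256_storeu_ps(out, f);
        for (int i = 0; i < 8; i++) printf("%g ", out[i]);
        printf("\n");
        return 0;
    }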
third_party/ggml/fp16.h (vendored): 3 changes

@@ -3,9 +3,6 @@
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
 COSMOPOLITAN_C_START_

-#define GGML_GELU_FP16
-#define GGML_SILU_FP16
-
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
 typedef __fp16 ggml_fp16_t;
third_party/ggml/fp16.internal.h (vendored): 3 changes

@@ -8,6 +8,9 @@
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
 COSMOPOLITAN_C_START_

+#define GGML_GELU_FP16
+#define GGML_SILU_FP16
+
 extern ggml_fp16_t table_gelu_f16[1 << 16];
 extern ggml_fp16_t table_silu_f16[1 << 16];
 extern ggml_fp16_t table_exp_f16[1 << 16];
third_party/ggml/ggjt.v1.q4_0.c (vendored): 46 changes

@@ -613,23 +613,37 @@ void ggml_vec_dot_v1_q4_0_q8_0(const int n, float * restrict s, const void * res
     __m256 acc = _mm256_setzero_ps();

     // Main loop
-    for (int i = 0; i < nb; ++i) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps( d, q, acc );
-    }
+#define WORK(I) \
+    /* Compute combined scale for the block */ \
+    const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \
+    __m256i bx = bytes_from_nibbles_32(x[I].qs); \
+    /* Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */ \
+    const __m256i off = _mm256_set1_epi8( 8 ); \
+    bx = _mm256_sub_epi8( bx, off ); \
+    __m256i by = _mm256_loadu_si256((const __m256i *)y[I].qs); \
+    const __m256 q = mul_sum_i8_pairs_float(bx, by); \
+    /* Multiply q with scale and accumulate */ \
+    acc = _mm256_fmadd_ps( d, q, acc )
+    int i = 0;
+    for (; i + 12 < nb; i += 12) {
+        _mm_prefetch(x+i+12, 3);
+        _mm_prefetch(x+i+15, 3);
+        _mm_prefetch(x+i+18, 3);
+        _mm_prefetch(x+i+21, 3);
+        _mm_prefetch(y+i+12, 3);
+        _mm_prefetch(y+i+14, 3);
+        _mm_prefetch(y+i+16, 3);
+        _mm_prefetch(y+i+18, 3);
+        _mm_prefetch(y+i+20, 3);
+        _mm_prefetch(y+i+22, 3);
+        for (int j = 0; j < 12; ++j) {
+            WORK(i+j);
+        }
+    }
+    for (; i < nb; ++i) {
+        WORK(i);
+    }
+#undef WORK

     *s = hsum_float_8(acc);
 #elif defined(__AVX__)
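A note on the prefetch calls above: the literal 3 is the value of _MM_HINT_T0 in the usual intrinsics headers, i.e. prefetch into all cache levels. A reduced sketch of the pattern (illustrative names and distances, not the kernel's):

    #include <immintrin.h>

    /* Walks an array in chunks of 12, issuing a prefetch for the next
       chunk so its cache lines are in flight while the current chunk
       computes; the tail loop handles the remainder. */
    static float sum_with_prefetch(const float *x, int n) {
        float acc = 0;
        int i = 0;
        for (; i + 12 < n; i += 12) {
            _mm_prefetch((const char *)(x + i + 12), _MM_HINT_T0);
            for (int j = 0; j < 12; ++j) acc += x[i + j];
        }
        for (; i < n; ++i) acc += x[i];
        return acc;
    }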
third_party/ggml/ggml.c (vendored): 17 changes

@@ -1784,9 +1784,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();

-    //
     // Main loop
-    //
+#define WORK(I) \
     /* Compute combined scale for the block */ \
     const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \

@@ -2702,9 +2700,15 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {

 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
+    int i = 0;
+    ggml_float sum = 0;
+#if __AVX__ || __AVX2__ || __AVX512F__
+    for (; i + 8 <= n; i += 8) {
+        sum += hsum_float_8(_mm256_loadu_ps(x + i));
+    }
+#endif
+    for (; i < n; ++i) {
+        sum += x[i];
     }
     *s = sum;
 #else

@@ -2802,6 +2806,7 @@ const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F16] = "f16",
     [GGML_TYPE_Q4_0] = "q4_0",
     [GGML_TYPE_Q4_1] = "q4_1",
+    [GGML_TYPE_Q4_2] = "q4_2",
     [GGML_TYPE_Q5_0] = "q5_0",
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",

@@ -8113,7 +8118,7 @@ static void ggml_compute_forward_alibi_f32(
     assert(ne1 + n_past == ne0); (void) n_past;

     // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << _bsr(n_head);
+    const int n_heads_log2_floor = 1 << _bsr(n_head); // [jart]

     const float m0 = exp2f(-8.0f / n_heads_log2_floor);
     const float m1 = exp2f(-4.0f / n_heads_log2_floor);
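hsum_float_8, used by ggml_vec_sum_f32 above, horizontally adds the eight lanes of a __m256. The repo has its own definition; one common way to write that reduction, shown here as a sketch:

    #include <immintrin.h>

    /* horizontal sum of the 8 floats in v; not necessarily the repo's
       exact implementation */
    static inline float hsum_float_8_sketch(__m256 v) {
        __m128 lo = _mm256_castps256_ps128(v);
        __m128 hi = _mm256_extractf128_ps(v, 1);
        lo = _mm_add_ps(lo, hi);                    /* 8 lanes -> 4 */
        lo = _mm_add_ps(lo, _mm_movehl_ps(lo, lo)); /* 4 lanes -> 2 */
        lo = _mm_add_ss(lo, _mm_movehdup_ps(lo));   /* 2 lanes -> 1 */
        return _mm_cvtss_f32(lo);
    }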
third_party/ggml/ggml.h (vendored): 1 change

@@ -1,3 +1,4 @@
+// -*- c; c-basic-offset:4 -*-
 #ifndef COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
 #define COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
third_party/ggml/ggml.mk (vendored): 2 changes

@@ -128,6 +128,7 @@ THIRD_PARTY_GGML_LLAMA_DIRECTDEPS = \
	LIBC_STR \
	LIBC_STUBS \
	LIBC_SYSV \
+	LIBC_SYSV_CALLS \
	LIBC_THREAD \
	LIBC_TINYMATH \
	LIBC_ZIPOS \

@@ -180,6 +181,7 @@ o/$(MODE)/third_party/ggml/companionai.txt.zip.o: private \

 THIRD_PARTY_GGML_COMS = \
	$(THIRD_PARTY_GGML_LLAMA) \
-	o/$(MODE)/third_party/ggml/quantize.com
+	o/$(MODE)/third_party/ggml/quantize.com \
+	o/$(MODE)/third_party/ggml/perplexity.com

 THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg)
third_party/ggml/llama.cc (vendored): 33 changes

@@ -31,6 +31,7 @@
 #include "libc/intrin/bits.h"
 #include "libc/macros.internal.h"
 #include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/posix.h"
 #include "third_party/ggml/fp16.h"
 #include "third_party/ggml/ggml.h"
 #include "third_party/ggml/llama_util.h"

@@ -443,8 +444,9 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;

-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
-    : file(fname, "rb") {
+    llama_file_loader(const char * fname, size_t file_idx,
+                      llama_load_tensors_map & tensors_map)
+        : file(fname, "rb") {
         // fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();

@@ -568,8 +570,9 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
+        ggjt_v2();
         file.write_u32(READ32BE("ggjt")); // magic
-        file.write_u32(1); // version
+        file.write_u32(2); // version
     }
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;

@@ -2003,16 +2006,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));

-        // This used to be a regex, but <regex> has an extreme cost to compile times.
-        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
-
-        // quantize only 2D tensors
-        quantize &= (tensor.ne.size() == 2);
-
-        // uncomment this to keep the output layer in FP16
-        //if (tensor.name == "output.weight") {
-        //    quantize = false;
-        //}
+        // only quantize 2d weights that aren't the output layer
+        bool quantize =
+            tensor.ne.size() == 2 &&
+            tensor.type != quantized_type &&
+            _endswith(tensor.name.c_str(), "weight") &&
+            tensor.name != "output.weight";

         enum ggml_type new_type;
         void * new_data;

@@ -2024,6 +2023,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = tensor.data;
             new_size = tensor.size;
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+        } else if (quantized_type == GGML_TYPE_F16) {
+            GGML_ASSERT(tensor.type == GGML_TYPE_F32);
+            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+            new_type = quantized_type;
+            new_size = nelements * 2;
+            work.resize(new_size);
+            new_data = work.addr;
+            ggml_fp32_to_fp16_row((const float *)tensor.data, (ggml_fp16_t *)new_data, nelements);
         } else {
             new_type = quantized_type;
             float * f32_data;
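The rewritten quantize predicate leans on cosmo's _endswith. For illustration, a portable equivalent of the same check (helper names here are hypothetical):

    #include <stdbool.h>
    #include <string.h>

    /* true iff s ends with suffix, like the _endswith() call above */
    static bool ends_with(const char *s, const char *suffix) {
        size_t n = strlen(s), m = strlen(suffix);
        return n >= m && !memcmp(s + n - m, suffix, m);
    }

    /* only quantize 2d weight tensors that aren't the output layer */
    static bool should_quantize(int ndims, const char *name) {
        return ndims == 2 && ends_with(name, "weight") &&
               strcmp(name, "output.weight") != 0;
    }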
third_party/ggml/llama.h (vendored): 2 changes

@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef LLAMA_H
 #define LLAMA_H
 #include "libc/intrin/bits.h"
third_party/ggml/llama_util.h (vendored): 15 changes

@@ -1,12 +1,11 @@
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef LLAMA_UTIL_H
 #define LLAMA_UTIL_H
 #include "libc/calls/struct/rlimit.h"
 #include "libc/dce.h"
 #include "libc/fmt/fmt.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/str/str.h"
 #include "libc/sysv/consts/madv.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"

@@ -22,6 +21,9 @@
 #include "third_party/libcxx/vector"
 // clang-format off

+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
 #define LLAMA_ASSERT(x) \
     do { \
         if (!(x)) { \

@@ -47,6 +49,13 @@ static void Die(const char *fmt, ...) {
     exit(1);
 }

+static inline bool is_integer_str(const char *s) {
+    if (*s == '-') ++s;
+    if (!*s) return false;
+    while (isdigit(*s)) ++s;
+    return !*s;
+}
+
 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
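is_integer_str accepts an optional leading minus followed by one or more digits and nothing else. A self-contained check of that behavior (the function body is reproduced from the hunk above):

    #include <assert.h>
    #include <ctype.h>
    #include <stdbool.h>

    static inline bool is_integer_str(const char *s) {
        if (*s == '-') ++s;
        if (!*s) return false;
        while (isdigit(*s)) ++s;
        return !*s;
    }

    int main(void) {
        assert(is_integer_str("42") && is_integer_str("-7"));
        assert(!is_integer_str("") && !is_integer_str("-"));
        assert(!is_integer_str("q4_0") && !is_integer_str("12x"));
        return 0;
    }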
third_party/ggml/main.cc (vendored): 48 changes

@@ -28,6 +28,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
+#include "libc/calls/struct/sched_param.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/stat.h"
 #include "libc/fmt/fmt.h"

@@ -37,9 +38,11 @@
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/ioprio.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/msync.h"
 #include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/prio.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/sig.h"
 #include "third_party/ggml/common.h"

@@ -66,7 +69,6 @@ static int n_past;
 static int n_remain;
 static int n_consumed;
 static bool input_noecho;
-static bool is_antiprompt;

 ////////////////////////////////////////////////////////////////////////////////

@@ -103,7 +105,7 @@ static int CompareTime(struct timespec a, struct timespec b) {
 ////////////////////////////////////////////////////////////////////////////////
 // ux explanatory logging for llama.com developers

-#if 1
+#if 0
 #define DEVLOG(...) (void)0
 #else
 #define DEVLOG(...) if (g_devlog) fprintf(g_devlog, __VA_ARGS__)

@@ -187,16 +189,16 @@ static bool has_antiprompt(std::string::size_type *out_index = nullptr,
 static void finish_initializing_prompt() {
     prompt_status = kPromptFinished;
     if (params.interactive) {
-        std::string::size_type pos;
+        std::string::size_type ap_index;
         is_interacting = true;
-        if (has_antiprompt(&pos)) {
+        if (has_antiprompt(&ap_index)) {
             console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-            printf("%s", last_output.substr(pos).c_str());
-            last_output.clear();
+            printf("%s", last_output.substr(ap_index).c_str());
             fflush(stdout);
         }
         console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
     }
+    last_output.clear();
 }

 ////////////////////////////////////////////////////////////////////////////////

@@ -208,8 +210,16 @@ static int on_missing_feature(const char *name) {
     return 1;
 }

+void MakeProcessNice(void) {
+    setpriority(PRIO_PROCESS, 0, 10);
+    ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+    struct sched_param param = {sched_get_priority_min(SCHED_IDLE)};
+    sched_setscheduler(0, SCHED_IDLE, &param);
+}
+
 int main(int argc, char ** argv) {
+    MakeProcessNice();
     ShowCrashReports();
     setvbuf(stdin, NULL, _IONBF, 0);
     setvbuf(stdout, NULL, _IONBF, 0);

@@ -439,8 +449,7 @@ int main(int argc, char ** argv) {

     remember_init();

-    is_antiprompt = false;
     input_noecho = params.verbose <= 0;

     n_past = 0;
     n_remain = params.n_predict;

@@ -561,6 +570,9 @@ int main(int argc, char ** argv) {
         fprintf(stderr, EPHEMERAL("loading weights..."));
     }

+    // tracks if last character written to stdout was newline
+    bool got_newline = false;
+
     while ((n_remain != 0 || params.interactive) && !is_terminated) {

         // perform evaluation

@@ -678,7 +690,8 @@ int main(int argc, char ** argv) {
             finish_initializing_prompt();
         }

-        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
+        if (prompt_status == kPromptFinished &&
+            (int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
             DEVLOG("out of user input, sample next token w/ embd_inp.size()=%d n_consumed=%d\n",
                    (int)embd_inp.size(), n_consumed);

@@ -808,21 +821,25 @@ int main(int argc, char ** argv) {
         //     --prompt 'Question: How old are you?\nAnswer: '
         //     --reverse-prompt $'\n'
         //
+        bool is_antiprompt;
         std::string ap_text;
-        std::string::size_type ap_index;
         std::string::size_type ap_extra;
-        is_antiprompt = has_antiprompt(&ap_index, &ap_text);
+        std::string::size_type ap_index;
+        if (prompt_status == kPromptFinished) {
+            is_antiprompt = has_antiprompt(&ap_index, &ap_text);
+        } else {
+            is_antiprompt = false;
+        }

         // display text
-        bool got_newline = false;
-        if (!input_noecho) {
+        if (!input_noecho && embd.size()) {
             std::string printme;
             for (auto id : embd) {
                 printme.append(llama_token_to_str(ctx, id));
             }
             if (is_antiprompt) {
-                ap_extra = last_output.size() - (ap_index + ap_text.size());
-                printme.erase(printme.size() - MIN(printme.size(), ap_extra));
+                ap_extra = last_output.size() - ap_index;
+                printme.erase(std::max(0, (int)(printme.size() - ap_extra)));
             }
             if (printme.size()) {
                 got_newline = printme[printme.size() - 1] == '\n';

@@ -832,6 +849,7 @@ int main(int argc, char ** argv) {
         }
         if (is_antiprompt) {
             if (!params.interactive) {
+                DEVLOG("exiting due to antiprompt\n");
                 if (!got_newline) {
                     printf("\n");
                 }
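MakeProcessNice drops CPU, I/O, and scheduler priority so a long inference run stays out of the way of interactive work. A sketch that exercises the two portable calls and checks their effect (Linux semantics assumed; ioprio_set is omitted because it is a raw syscall on most libcs):

    #include <sched.h>
    #include <stdio.h>
    #include <sys/resource.h>

    int main(void) {
        setpriority(PRIO_PROCESS, 0, 10);          /* nice value 10 */
        struct sched_param param = {0};
        sched_setscheduler(0, SCHED_IDLE, &param); /* idle scheduling class */
        printf("nice=%d policy=%d\n",
               getpriority(PRIO_PROCESS, 0), sched_getscheduler(0));
        return 0;
    }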
third_party/ggml/quantize.cc (vendored): 108 changes

@@ -25,9 +25,12 @@
 │ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                             │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/log/log.h"
+#include "libc/runtime/runtime.h"
 #include "third_party/ggml/common.h"
 #include "third_party/ggml/ggml.h"
 #include "third_party/ggml/llama.h"
+#include "third_party/ggml/llama_util.h"
 #include "third_party/libcxx/map"
 #include "third_party/libcxx/vector"

@@ -38,46 +41,26 @@ asm(".include \"libc/disclaimer.inc\"");
 // clang-format off

 static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
-  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+    {"f16",  LLAMA_FTYPE_MOSTLY_F16 },
+    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
+    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
 };

-bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
-    auto it = LLAMA_FTYPE_MAP.find(ftype_str);
-    if (it != LLAMA_FTYPE_MAP.end()) {
-        ftype = it->second;
-        ftype_str_out = it->first;
-        return true;
-    }
-    // try to parse as an integer
-    // try {
-        int ftype_int = std::stoi(ftype_str);
-        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
-            if (it->second == ftype_int) {
-                ftype = it->second;
-                ftype_str_out = it->first;
-                return true;
-            }
-        }
-    // }
-    // catch (...) {
-    //     // stoi failed
-    // }
-    return false;
-}
-
 // usage:
-//  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type [nthreads]
 //
 int main(int argc, char ** argv) {
     ShowCrashReports();

+    ggjt_v2();
     ggml_time_init();

     if (argc < 3) {
-        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
             fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
         }

@@ -91,60 +74,27 @@ int main(int argc, char ** argv) {
         ggml_free(ctx);
     }

-    // parse command line arguments
     const std::string fname_inp = argv[1];
-    std::string fname_out;
-    int nthread;
-    llama_ftype ftype;
+    const std::string fname_out = argv[2];

-    int arg_idx = 2;
-    std::string ftype_str;
-    if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
-        // argv[2] is the ftype
-        std::string fpath;
-        const size_t pos = fname_inp.find_last_of('/');
-        if (pos != std::string::npos) {
-            fpath = fname_inp.substr(0, pos + 1);
-        }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
-        arg_idx++;
-    }
-    else {
-        // argv[2] is the output path
-        fname_out = argv[arg_idx];
-        arg_idx++;
-
-        if (argc <= arg_idx) {
-            fprintf(stderr, "%s: missing ftype\n", __func__);
-            return 1;
-        }
-        // argv[3] is the ftype
-        if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
-            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
-            return 1;
-        }
-        arg_idx++;
+    if (fname_inp == fname_out) {
+        fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str());
+        exit(1);
     }

-    // parse nthreads
-    if (argc > arg_idx) {
-        // try {
-            nthread = std::stoi(argv[arg_idx]);
-        // }
-        // catch (const std::exception & e) {
-        //     Die("%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
-        //     return 1;
-        // }
+    enum llama_ftype ftype;
+    if (!is_integer_str(argv[3])) {
+        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
+        if (it == LLAMA_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
     } else {
-        nthread = 0;
+        ftype = (enum llama_ftype)atoi(argv[3]);
     }

-    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
-    if (nthread > 0) {
-        fprintf(stderr, " using %d threads", nthread);
-    }
-    fprintf(stderr, "\n");
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;

     const int64_t t_main_start_us = ggml_time_us();
third_party/intel/f16cintrin.internal.h (vendored): 12 changes

@@ -18,10 +18,18 @@ __funline float _cvtsh_ss(unsigned short __S) {
   return __builtin_ia32_vec_ext_v4sf(__A, 0);
 }

+/**
+ * Converts four half-precision (16-bit) floating point values to
+ * single-precision floating point values.
+ */
 __funline __m128 _mm_cvtph_ps(__m128i __A) {
   return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A);
 }

+/**
+ * Converts eight half-precision (16-bit) floating point values to
+ * single-precision floating point values.
+ */
 __funline __m256 _mm256_cvtph_ps(__m128i __A) {
   return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A);
 }

@@ -37,6 +45,10 @@ __funline __m128i _mm_cvtps_ph(__m128 __A, const int __I) {
   return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I);
 }

+/**
+ * Converts eight single-precision floating point values to
+ * half-precision (16-bit) floating point values.
+ */
 __funline __m128i _mm256_cvtps_ph(__m256 __A, const int __I) {
   return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I);
 }
third_party/radpajama/common-gptneox.h (vendored): 2 changes

@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
 #define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
 #include "libc/macros.internal.h"
third_party/radpajama/gptneox-util.h (vendored): 2 changes

@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef GPTNEOX_UTIL_H
 #define GPTNEOX_UTIL_H
 #include "libc/calls/calls.h"
third_party/radpajama/gptneox.cc (vendored): 76 changes

@@ -28,6 +28,8 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/radpajama/gptneox.h"
 #include "libc/intrin/bits.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/posix.h"
 #include "third_party/ggml/fp16.h"
 #include "third_party/ggml/ggml.h"
 #include "third_party/ggml/llama_util.h"

@@ -77,7 +79,7 @@ static const size_t MiB = 1024*1024;
 // needs modifications in ggml

 // TODO: Modify for gptneox, how are these values actually determined?
-// TODO: This is now priority, 
+// TODO: This is now priority,
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {

@@ -446,7 +448,8 @@ struct gptneox_load_tensors_map {
 enum gptneox_file_version {
     GPTNEOX_FILE_VERSION_GGML,
     GPTNEOX_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
-    GPTNEOX_FILE_VERSION_GGJT_V1, // added padding
+    GPTNEOX_FILE_VERSION_GGJT_V1, // adopted unified aligned mappable layout
+    GPTNEOX_FILE_VERSION_GGJT_V2, // changed quantization format
 };

 struct gptneox_file_loader {

@@ -473,10 +476,16 @@ struct gptneox_file_loader {

         if (magic == READ32BE("ggml") && version == 0) {
             file_version = GPTNEOX_FILE_VERSION_GGML;
+            ggjt_v1();
         } else if (magic == READ32BE("ggmf") && version == 1) {
             file_version = GPTNEOX_FILE_VERSION_GGMF_V1;
+            ggjt_v1();
         } else if (magic == READ32BE("ggjt") && version == 1) {
             file_version = GPTNEOX_FILE_VERSION_GGJT_V1;
+            ggjt_v1();
+        } else if (magic == READ32BE("ggjt") && version == 2) {
+            file_version = GPTNEOX_FILE_VERSION_GGJT_V2;
+            ggjt_v2();
         } else {
             Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                 magic, version);

@@ -566,17 +575,20 @@ struct gptneox_file_loader {
 struct gptneox_file_saver {
     gptneox_file file;
     gptneox_file_loader * any_file_loader;
-    gptneox_file_saver(const char * fname, gptneox_file_loader * any_file_loader, enum gptneox_ftype new_ftype)
-    : file(fname, "wb"), any_file_loader(any_file_loader) {
+    gptneox_file_saver(const char * fname,
+                       gptneox_file_loader * any_file_loader,
+                       enum gptneox_ftype new_ftype)
+        : file(fname, "wb"),
+          any_file_loader(any_file_loader) {
         fprintf(stderr, "gptneox.cpp: saving model to %s\n", fname);
-        ggjt_v1();
         write_magic();
         write_hparams(new_ftype);
         write_vocab();
     }
     void write_magic() {
+        ggjt_v2();
         file.write_u32(READ32BE("ggjt")); // magic
-        file.write_u32(1); // version
+        file.write_u32(2); // version
     }
     void write_hparams(enum gptneox_ftype new_ftype) {
         const gptneox_hparams & hparams = any_file_loader->hparams;

@@ -887,7 +899,8 @@ static const char *gptneox_file_version_name(gptneox_file_version version) {
     switch (version) {
         case GPTNEOX_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case GPTNEOX_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
+        case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case GPTNEOX_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
         default: GPTNEOX_ASSERT(false);
     }
 }

@@ -940,7 +953,7 @@ static void gptneox_model_load_internal(
     model.hparams = ml->file_loaders.at(0)->hparams;
     gptneox_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
-    
+
     {
         switch (hparams.n_layer) {
             case 16: {

@@ -951,7 +964,7 @@ static void gptneox_model_load_internal(
                 }
                 break;
             }
-            // # <RedPajama>: we extend the model type settings for RedPajama models. 
+            // # <RedPajama>: we extend the model type settings for RedPajama models.
             case 32:{
                 if (hparams.n_embd == 2560) {
                     model.type = e_model::MODEL_3B;

@@ -1195,7 +1208,7 @@ static bool gptneox_eval_internal(
                             model.layers[il].c_attn_attn_b, cur),
                         cur);
             }
-            
+
             // Split QKV and make contiguous
             struct ggml_tensor * Qcur = ggml_view_3d(ctx0, cur,
                         n_embd/n_head,

@@ -1225,7 +1238,7 @@ static bool gptneox_eval_internal(
                         ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
             Vcur = ggml_cpy(ctx0, Vcur,
                         ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
-            
+
             // MARK: gptneox RoPE Q and K, before cache
             // Bit 2 for gptneox style (2)
             // Bit 1 is zero for dont skip n_past +(0), use (2+1) = (3) if rope is applied to cache of k (after cache only)

@@ -1241,7 +1254,7 @@ static bool gptneox_eval_internal(
                     ggml_element_size(Vcur) * n_embd,
                     0);
             Vcur = ggml_transpose(ctx0, Vcur);
-            
+
             struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
                     n_embd * N, // num elements in current context (up to n_embd*n_ctx but usually less)
                     ggml_element_size(kv_self.k) * n_embd * (il * n_ctx + n_past));

@@ -1250,12 +1263,12 @@ static bool gptneox_eval_internal(
                     n_embd,
                     ggml_element_size(kv_self.v) * n_ctx,
                     ggml_element_size(kv_self.v) * ((il * n_ctx * n_embd) + n_past));
-            
+
             // important: storing RoPE-ed version of K in the KV cache!
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             //}
-            
+
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                         ggml_permute(ctx0,

@@ -1284,7 +1297,7 @@ static bool gptneox_eval_internal(
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-            
+
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             struct ggml_tensor * V_trans = ggml_view_3d(ctx0, kv_self.v,
                     n_past + N,

@@ -1312,10 +1325,10 @@ static bool gptneox_eval_internal(
         }

         lctx.use_buf(ctx0, 1);
-        
+
         if (hparams.use_parallel_residual == 1) {
             //printf("use_parallel_residual == 1\n");
-            
+
             // This is independent of the self-attention result, so it could be done in parallel to the self-attention
             struct ggml_tensor * outAttn = cur;

@@ -1359,7 +1372,7 @@ static bool gptneox_eval_internal(
             inpL = ggml_add(ctx0, inpL, cur);
         } else if (hparams.use_parallel_residual == 0) {
             //printf("use_parallel_residual == 0\n");
-            
+
             // This takes the self-attention residual output as input to Feedforward
             struct ggml_tensor * outAttn = cur;
             struct ggml_tensor * inpFF = ggml_add(ctx0, outAttn, inpL);

@@ -2093,6 +2106,7 @@ int gptneox_model_copy(
 static void gptneox_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum gptneox_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
+        case GPTNEOX_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
         case GPTNEOX_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
        case GPTNEOX_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
        case GPTNEOX_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;

@@ -2124,21 +2138,17 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const
         tensor.data = read_data.addr;
         model_loader->load_data_for(tensor);

-        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
+        printf("[%4zu/%4zu] %50s - %16s, type = %6s, ",
               ++idx, model_loader->tensors_map.tensors.size(),
               tensor.name.c_str(), gptneox_format_tensor_shape(tensor.ne).c_str(),
               ggml_type_name(tensor.type));

-        // This used to be a regex, but <regex> has an extreme cost to compile times.
-        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
-
-        // quantize only 2D tensors
-        quantize &= (tensor.ne.size() == 2);
-
-        // uncomment this to keep the output layer in FP16
-        //if (tensor.name == "output.weight") {
-        //    quantize = false;
-        //}
+        // only quantize 2d weights that aren't the output layer
+        bool quantize =
+            tensor.ne.size() == 2 &&
+            tensor.type != quantized_type &&
+            _endswith(tensor.name.c_str(), "weight") &&
+            tensor.name != "output.weight";

         enum ggml_type new_type;
         void * new_data;

@@ -2150,6 +2160,14 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const
             new_data = tensor.data;
             new_size = tensor.size;
             printf("size = %8.3f MiB\n", tensor.size/1024.0/1024.0);
+        } else if (quantized_type == GGML_TYPE_F16) {
+            GPTNEOX_ASSERT(tensor.type == GGML_TYPE_F32);
+            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+            new_type = quantized_type;
+            new_size = nelements * 2;
+            work.resize(new_size);
+            new_data = work.addr;
+            ggml_fp32_to_fp16_row((const float *)tensor.data, (ggml_fp16_t *)new_data, nelements);
         } else {
             new_type = quantized_type;
             float * f32_data;
third_party/radpajama/gptneox.h (vendored): 2 changes

@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef GPTNEOX_H
 #define GPTNEOX_H
 // clang-format off
third_party/radpajama/quantize-gptneox.cc (vendored): 25 changes

@@ -28,6 +28,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/log/log.h"
 #include "third_party/ggml/ggml.h"
+#include "third_party/ggml/llama_util.h"
 #include "third_party/libcxx/cstdio"
 #include "third_party/libcxx/map"
 #include "third_party/libcxx/string"

@@ -35,13 +36,14 @@
 // clang-format off

 static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
-  {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
-  //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
-  {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
+    {"f16",  GPTNEOX_FTYPE_MOSTLY_F16},
+    {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
+    //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
+    {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
 };

 // usage:

@@ -50,7 +52,7 @@ static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
 int main(int argc, char ** argv) {
     ShowCrashReports();

-    ggjt_v1();
+    ggjt_v2();
     ggml_time_init();

     if (argc < 4) {

@@ -71,8 +73,13 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];

+    if (fname_inp == fname_out) {
+        fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str());
+        exit(1);
+    }
+
     enum gptneox_ftype ftype;
-    if (argv[3][0] == 'q') {
+    if (!is_integer_str(argv[3])) {
         auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
         if (it == GPTNEOX_FTYPE_MAP.end()) {
             fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);