From 5b42cdc663b78572f6c3013945365a36800f8530 Mon Sep 17 00:00:00 2001 From: Ariel Nunez <54999+ingenieroariel@users.noreply.github.com> Date: Thu, 11 May 2023 07:00:06 -0400 Subject: [PATCH] Added build infrastucture for radpajama --- Makefile | 1 + third_party/radpajama/radpajama.cc | 1889 ++++++++++++++++++---------- third_party/radpajama/radpajama.mk | 127 ++ third_party/third_party.mk | 1 + 4 files changed, 1359 insertions(+), 659 deletions(-) create mode 100644 third_party/radpajama/radpajama.mk diff --git a/Makefile b/Makefile index 4e6b018e2..82805239d 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,7 @@ include net/net.mk # │ include third_party/vqsort/vqsort.mk # │ include libc/log/log.mk # │ include third_party/ggml/ggml.mk # │ +#include third_party/radpajama/radpajama.mk # │ include third_party/bzip2/bzip2.mk # │ include dsp/core/core.mk # │ include libc/x/x.mk # │ diff --git a/third_party/radpajama/radpajama.cc b/third_party/radpajama/radpajama.cc index ae098d550..77db3c3fc 100644 --- a/third_party/radpajama/radpajama.cc +++ b/third_party/radpajama/radpajama.cc @@ -26,31 +26,26 @@ │ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ │ │ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "third_party/ggml/llama.h" #include "libc/assert.h" +#include "libc/calls/calls.h" +#include "libc/calls/struct/sigaction.h" +#include "libc/calls/struct/stat.h" #include "libc/intrin/bits.h" -#include "third_party/ggml/ggml.h" -#include "third_party/ggml/llama_util.h" -#include "third_party/libcxx/algorithm" -#include "third_party/libcxx/array" -#include "third_party/libcxx/atomic" -#include "third_party/libcxx/cassert" -#include "third_party/libcxx/cinttypes" -#include "third_party/libcxx/climits" -#include "third_party/libcxx/cstdint" -#include "third_party/libcxx/cstdio" -#include "third_party/libcxx/cstring" -#include "third_party/libcxx/ctime" -#include "third_party/libcxx/fstream" -#include "third_party/libcxx/initializer_list" +#include "libc/log/log.h" +#include "libc/nexgen32e/x86feature.h" +#include "libc/stdio/stdio.h" +#include "libc/sysv/consts/map.h" +#include "libc/sysv/consts/msync.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/prot.h" +#include "libc/sysv/consts/sig.h" #include "third_party/libcxx/map" -#include "third_party/libcxx/memory" -#include "third_party/libcxx/mutex" -#include "third_party/libcxx/queue" -#include "third_party/libcxx/random" -#include "third_party/libcxx/sstream" -#include "third_party/libcxx/thread" -#include "third_party/libcxx/unordered_map" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/iostream" +#include "third_party/libcxx/string" +#include "third_party/libcxx/vector" +#include "third_party/radpajama/ggml.h" +#include "third_party/radpajama/common.h" asm(".ident\t\"\\n\\n\ llama.cpp (MIT License)\\n\ @@ -58,163 +53,202 @@ Copyright (c) 2023 Georgi Gerganov\""); asm(".include \"libc/disclaimer.inc\""); // clang-format off -#define LLAMA_USE_SCRATCH -#define LLAMA_MAX_SCRATCH_BUFFERS 16 -// available llama models + +// TODO: Add back in n_ctx (max_position_embeddings) to ggml model, it is currently hard-coded to 2048 max for llama + +#define GPTNEOX_USE_SCRATCH +#define GPTNEOX_MAX_SCRATCH_BUFFERS 16 + +// available open-assistant based gptneox models +// OpenAssistant/stablelm-7b-sft-v7-epoch-3 +// OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5 enum e_model { MODEL_UNKNOWN, + MODEL_3B, // StabilityAI Base Alpha 3B MODEL_7B, - MODEL_13B, - MODEL_30B, - 
MODEL_65B, + MODEL_12B, + MODEL_20B, }; -static const size_t MB = 1024*1024; +static const size_t MiB = 1024*1024; +static console_state con_st; +static gptneox_context ** g_ctx; + +void sigint_handler(int signo) { + set_console_color(con_st, CONSOLE_COLOR_DEFAULT); + printf("\n"); // this also force flush stdout. + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting=true; + } else { + gptneox_print_timings(*g_ctx); + _exit(130); + } + } +} // computed for n_ctx == 2048 // TODO: dynamically determine these sizes +// TODO: To load the stablelm 3B model on my test XR will require some tricks, small ggml context size, mmap support, among others, but is maybe feasible, is a smaller n_ctx required? 512 instead of 2048/4096? Does mmap work as desired on iOS? // needs modifications in ggml +// TODO: Modify for gptneox, how are these values actually determined? +// TODO: This is now priority, static const std::map & MEM_REQ_SCRATCH0() { static std::map _MEM_REQ_SCRATCH0 = { - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + { MODEL_3B, 128ull * MiB }, + { MODEL_7B, 512ull * MiB }, + { MODEL_12B, 512ull * MiB }, + { MODEL_20B, 512ull * MiB }, }; return _MEM_REQ_SCRATCH0; } +// TODO: Modify for gptneox, how are these values actually determined? static const std::map & MEM_REQ_SCRATCH1() { static std::map _MEM_REQ_SCRATCH1 = { - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + { MODEL_3B, 128ull * MiB }, + { MODEL_7B, 512ull * MiB }, + { MODEL_12B, 512ull * MiB }, + { MODEL_20B, 512ull * MiB }, }; return _MEM_REQ_SCRATCH1; } +// TODO: Modify for gptneox, how are these values actually determined? // 2*n_embd*n_ctx*n_layer*sizeof(float16) +// llama 7B: 2 * 768 * 32 * 2 = 98304 static const std::map & MEM_REQ_KV_SELF() { static std::map _MEM_REQ_KV_SELF = { - { MODEL_7B, 1026ull * MB }, - { MODEL_13B, 1608ull * MB }, - { MODEL_30B, 3124ull * MB }, - { MODEL_65B, 5120ull * MB }, + { MODEL_3B, 512ull * MiB }, + { MODEL_7B, 1026ull * MiB }, + { MODEL_12B, 1608ull * MiB }, + { MODEL_20B, 1608ull * MiB }, }; return _MEM_REQ_KV_SELF; } +// TODO: Modify for gptneox, how are these values actually determined? // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled static const std::map & MEM_REQ_EVAL() { static std::map _MEM_REQ_EVAL = { - { MODEL_7B, 768ull * MB }, - { MODEL_13B, 1024ull * MB }, - { MODEL_30B, 1280ull * MB }, - { MODEL_65B, 1536ull * MB }, + { MODEL_3B, 512ull * MiB }, + { MODEL_7B, 768ull * MiB }, + { MODEL_12B, 1024ull * MiB }, + { MODEL_20B, 1024ull * MiB }, }; return _MEM_REQ_EVAL; } -// default hparams (LLaMA 7B) -struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_mult = 256; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; +// default hparams (GPT-NeoX oasst 12B) +struct gptneox_hparams { + uint32_t n_vocab = 50288; + uint32_t n_ctx = 4096; // this is provided as user input? 
+ uint32_t n_embd = 5120; + uint32_t n_head = 40; + uint32_t n_layer = 36; + uint32_t n_rot = 32; + uint32_t use_parallel_residual = 1; // 1 = true, 0 = false + enum gptneox_ftype ftype = GPTNEOX_FTYPE_MOSTLY_F16; - bool operator!=(const llama_hparams & other) const { - return memcmp(this, &other, sizeof(llama_hparams)); + bool operator!=(const gptneox_hparams & other) const { + return memcmp(this, &other, sizeof(gptneox_hparams)); } }; -struct llama_layer { - // normalization - struct ggml_tensor * attention_norm; +struct gptneox_layer { + // input_layernorm + struct ggml_tensor * ln_attn_g; + struct ggml_tensor * ln_attn_b; + + // post_attention_layernorm + struct ggml_tensor * ln_ff_g; + struct ggml_tensor * ln_ff_b; // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; + struct ggml_tensor * c_attn_attn_w; - // normalization - struct ggml_tensor * ffn_norm; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; }; -struct llama_kv_cache { +struct gptneox_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; struct ggml_context * ctx = NULL; - llama_buffer buf; + gptneox_buffer buf; int n; // number of tokens currently in the cache - ~llama_kv_cache() { + ~gptneox_kv_cache() { if (ctx) { ggml_free(ctx); } } }; -struct llama_model { +struct gptneox_model { e_model type = MODEL_UNKNOWN; - llama_hparams hparams; + gptneox_hparams hparams; - struct ggml_tensor * tok_embeddings; + // final normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; - struct ggml_tensor * norm; - struct ggml_tensor * output; + // word embedding + struct ggml_tensor * wte; - std::vector layers; + // language model head + struct ggml_tensor * lmh_g; + + std::vector layers; // context struct ggml_context * ctx = NULL; // key + value cache for the self attention - // TODO: move to llama_state - struct llama_kv_cache kv_self; + // TODO: move to gptneox_state + struct gptneox_kv_cache kv_self; // the model memory buffer - llama_buffer buf; + gptneox_buffer buf; // model memory mapped file - std::unique_ptr mapping; + std::unique_ptr mapping; // objects representing data potentially being locked in memory - llama_mlock mlock_buf; - llama_mlock mlock_mmap; + gptneox_mlock mlock_buf; + gptneox_mlock mlock_mmap; // for quantize-stats only std::vector> tensors_by_name; - ~llama_model() { + ~gptneox_model() { if (ctx) { ggml_free(ctx); } } }; -struct llama_vocab { +struct gptneox_vocab { using id = int32_t; using token = std::string; @@ -227,7 +261,7 @@ struct llama_vocab { std::vector id_to_token; }; -struct llama_context { +struct gptneox_context { std::mt19937 rng; int64_t t_load_us = 0; @@ -242,8 +276,8 @@ struct llama_context { int32_t n_eval = 0; // number of eval calls int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - llama_model model; - llama_vocab vocab; + gptneox_model model; + gptneox_vocab vocab; size_t mem_per_token = 0; @@ -255,15 +289,15 @@ struct llama_context { std::vector embedding; // memory buffers used to evaluate the model - // TODO: move in llama_state - llama_buffer buf_compute; - llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + // TODO: move in gptneox_state + 
gptneox_buffer buf_compute; + gptneox_buffer buf_scratch[GPTNEOX_MAX_SCRATCH_BUFFERS]; int buf_last = 0; - size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; + size_t buf_max_size[GPTNEOX_MAX_SCRATCH_BUFFERS] = { 0 }; void use_buf(struct ggml_context * ctx, int i) { -#if defined(LLAMA_USE_SCRATCH) +#if defined(GPTNEOX_USE_SCRATCH) size_t last_size = 0; if (i == -1) { @@ -285,7 +319,7 @@ struct llama_context { } size_t get_buf_max_mem(int i) const { -#if defined(LLAMA_USE_SCRATCH) +#if defined(GPTNEOX_USE_SCRATCH) return buf_max_size[i]; #else (void) i; @@ -298,20 +332,20 @@ template static T checked_mul(T a, T b) { T ret = a * b; if (a != 0 && ret / a != b) { - Die("overflow multiplying %llu * %llu", - (unsigned long long) a, (unsigned long long) b); + throw format("overflow multiplying %llu * %llu", + (unsigned long long) a, (unsigned long long) b); } return ret; } static size_t checked_div(size_t a, size_t b) { if (b == 0 || a % b != 0) { - Die("error dividing %zu / %zu", a, b); + throw format("error dividing %zu / %zu", a, b); } return a / b; } -static std::string llama_format_tensor_shape(const std::vector & ne) { +static std::string gptneox_format_tensor_shape(const std::vector & ne) { char buf[256]; snprintf(buf, sizeof(buf), "%5u", ne.at(0)); for (size_t i = 1; i < ne.size(); i++) { @@ -320,7 +354,7 @@ static std::string llama_format_tensor_shape(const std::vector & ne) { return buf; } -static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { +static size_t gptneox_calc_tensor_size(const std::vector & ne, enum ggml_type type) { size_t size = ggml_type_size(type); for (uint32_t dim : ne) { size = checked_mul(size, dim); @@ -328,7 +362,7 @@ static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml return size / ggml_blck_size(type); } -struct llama_load_tensor_shard { +struct gptneox_load_tensor_shard { std::vector ne; size_t size; enum ggml_type type; @@ -336,28 +370,28 @@ struct llama_load_tensor_shard { size_t file_off; void calc_size() { - size = llama_calc_tensor_size(ne, type); + size = gptneox_calc_tensor_size(ne, type); } }; -enum llama_split_type { +enum gptneox_split_type { SPLIT_NONE, SPLIT_BY_COLUMNS, SPLIT_BY_ROWS }; -struct llama_load_tensor { - std::vector shards; +struct gptneox_load_tensor { + std::vector shards; std::string name; enum ggml_type type = GGML_TYPE_F32; - llama_split_type split_type = SPLIT_NONE; + gptneox_split_type split_type = SPLIT_NONE; std::vector ne; size_t size; struct ggml_tensor * ggml_tensor = NULL; uint8_t * data; - llama_load_tensor(const std::string & name) : name(name) {} + gptneox_load_tensor(const std::string & name) : name(name) {} void calc_all() { calc_type(); @@ -370,7 +404,7 @@ struct llama_load_tensor { const auto & first_shard = shards.at(0); for (const auto & shard : shards) { if (shard.type != first_shard.type) { - Die("inconsistent tensor shard type in '%s'", name.c_str()); + throw format("inconsistent tensor shard type in '%s'", name.c_str()); } } type = first_shard.type; @@ -393,12 +427,12 @@ struct llama_load_tensor { const auto & first_shard = shards.at(0); for (const auto & shard : shards) { if (shard.ne != first_shard.ne) { - Die("inconsistent tensor shard shape in '%s': first was %s, other was %s", - name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()); + throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s", + name.c_str(), gptneox_format_tensor_shape(first_shard.ne).c_str(), 
gptneox_format_tensor_shape(shard.ne).c_str()); } } ne = first_shard.ne; - LLAMA_ASSERT(shards.size() <= UINT32_MAX); + GPTNEOX_ASSERT(shards.size() <= UINT32_MAX); uint32_t n_shards = (uint32_t) shards.size(); switch (split_type) { case SPLIT_NONE: @@ -416,31 +450,31 @@ struct llama_load_tensor { } void calc_size() { - size = llama_calc_tensor_size(ne, type); + size = gptneox_calc_tensor_size(ne, type); } }; -struct llama_load_tensors_map { +struct gptneox_load_tensors_map { // tensors is kept in a separate vector to preserve file order - std::vector tensors; + std::vector tensors; std::unordered_map name_to_idx; }; -enum llama_file_version { - LLAMA_FILE_VERSION_GGML, - LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab - LLAMA_FILE_VERSION_GGJT_V1, // added padding +enum gptneox_file_version { + GPTNEOX_FILE_VERSION_GGML, + GPTNEOX_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + GPTNEOX_FILE_VERSION_GGJT_V1, // added padding }; -struct llama_file_loader { - llama_file file; - llama_file_version file_version; - llama_hparams hparams; - llama_vocab vocab; +struct gptneox_file_loader { + gptneox_file file; + gptneox_file_version file_version; + gptneox_hparams hparams; + gptneox_vocab vocab; - llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) + gptneox_file_loader(const char * fname, size_t file_idx, gptneox_load_tensors_map & tensors_map) : file(fname, "rb") { - // fprintf(stderr, "llama.cpp: loading model from %s\n", fname); + fprintf(stderr, "gptneox.cpp: loading model from %s\n", fname); read_magic(); read_hparams(); read_vocab(); @@ -450,29 +484,30 @@ struct llama_file_loader { uint32_t magic = file.read_u32(); uint32_t version = 0; - if (magic != READ32BE("ggml")) { + if (magic != 'ggml') { version = file.read_u32(); } - if (magic == READ32BE("ggml") && version == 0) { - file_version = LLAMA_FILE_VERSION_GGML; - } else if (magic == READ32BE("ggmf") && version == 1) { - file_version = LLAMA_FILE_VERSION_GGMF_V1; - } else if (magic == READ32BE("ggjt") && version == 1) { - file_version = LLAMA_FILE_VERSION_GGJT_V1; + if (magic == 'ggml' && version == 0) { + file_version = GPTNEOX_FILE_VERSION_GGML; + } else if (magic == 'ggmf' && version == 1) { + file_version = GPTNEOX_FILE_VERSION_GGMF_V1; + } else if (magic == 'ggjt' && version == 1) { + file_version = GPTNEOX_FILE_VERSION_GGJT_V1; } else { - Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", - magic, version); + throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + magic, version); } } void read_hparams() { hparams.n_vocab = file.read_u32(); + hparams.n_ctx = file.read_u32(); hparams.n_embd = file.read_u32(); - hparams.n_mult = file.read_u32(); hparams.n_head = file.read_u32(); hparams.n_layer = file.read_u32(); hparams.n_rot = file.read_u32(); - hparams.ftype = (enum llama_ftype) file.read_u32(); + hparams.use_parallel_residual = file.read_u32(); + hparams.ftype = (enum gptneox_ftype) file.read_u32(); } void read_vocab() { vocab.id_to_token.resize(hparams.n_vocab); @@ -482,9 +517,10 @@ struct llama_file_loader { std::string word = file.read_string(len); float score = 0.0f; - if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { + // TODO: Implement scores in gptneox + /*if (file_version >= GPTNEOX_FILE_VERSION_GGMF_V1) { file.read_raw(&score, sizeof(score)); - } + }*/ vocab.token_to_id[word] = i; @@ -493,9 +529,9 @@ struct llama_file_loader { tok_score.score = score; } } - void 
read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) { + void read_tensor_metadata(size_t file_idx, gptneox_load_tensors_map & tensors_map) { while (file.tell() < file.size) { - llama_load_tensor_shard shard; + gptneox_load_tensor_shard shard; uint32_t n_dims = file.read_u32(); uint32_t name_len = file.read_u32(); shard.type = (enum ggml_type) file.read_u32(); @@ -503,7 +539,7 @@ struct llama_file_loader { file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); std::string name = file.read_string(name_len); if (n_dims < 1 || n_dims > 2) { - Die("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); + throw format("gptneox.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); } switch (shard.type) { case GGML_TYPE_F32: @@ -516,11 +552,11 @@ struct llama_file_loader { case GGML_TYPE_Q8_0: break; default: { - Die("unrecognized tensor type %u\n", shard.type); + throw format("unrecognized tensor type %u\n", shard.type); } } - if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { + if (file_version >= GPTNEOX_FILE_VERSION_GGJT_V1) { // skip to the next multiple of 32 bytes file.seek(-file.tell() & 31, SEEK_CUR); } @@ -544,43 +580,45 @@ struct llama_file_loader { } }; -struct llama_file_saver { - llama_file file; - llama_file_loader * any_file_loader; - llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype) +struct gptneox_file_saver { + gptneox_file file; + gptneox_file_loader * any_file_loader; + gptneox_file_saver(const char * fname, gptneox_file_loader * any_file_loader, enum gptneox_ftype new_ftype) : file(fname, "wb"), any_file_loader(any_file_loader) { - // fprintf(stderr, "llama.cpp: saving model to %s\n", fname); + fprintf(stderr, "gptneox.cpp: saving model to %s\n", fname); write_magic(); write_hparams(new_ftype); write_vocab(); } void write_magic() { - file.write_u32(READ32BE("ggjt")); // magic + file.write_u32('ggjt'); // magic file.write_u32(1); // version } - void write_hparams(enum llama_ftype new_ftype) { - const llama_hparams & hparams = any_file_loader->hparams; + void write_hparams(enum gptneox_ftype new_ftype) { + const gptneox_hparams & hparams = any_file_loader->hparams; file.write_u32(hparams.n_vocab); + file.write_u32(hparams.n_ctx); file.write_u32(hparams.n_embd); - file.write_u32(hparams.n_mult); file.write_u32(hparams.n_head); file.write_u32(hparams.n_layer); file.write_u32(hparams.n_rot); + file.write_u32(hparams.use_parallel_residual); file.write_u32(new_ftype); } void write_vocab() { - if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { - fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); + if (any_file_loader->file_version == GPTNEOX_FILE_VERSION_GGML) { + fprintf(stderr, "gptneox.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); } uint32_t n_vocab = any_file_loader->hparams.n_vocab; for (uint32_t i = 0; i < n_vocab; i++) { const auto & token_score = any_file_loader->vocab.id_to_token.at(i); file.write_u32((uint32_t) token_score.tok.size()); file.write_raw(token_score.tok.data(), token_score.tok.size()); - file.write_raw(&token_score.score, sizeof(token_score.score)); + // TODO: Implement scores in gptneox? 
+ //file.write_raw(&token_score.score, sizeof(token_score.score)); } } - void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + void write_tensor(gptneox_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { switch (new_type) { case GGML_TYPE_F32: case GGML_TYPE_F16: @@ -591,7 +629,7 @@ struct llama_file_saver { case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: break; - default: LLAMA_ASSERT(false); + default: GPTNEOX_ASSERT(false); } file.write_u32((uint32_t) tensor.ne.size()); file.write_u32((uint32_t) tensor.name.size()); @@ -599,47 +637,47 @@ struct llama_file_saver { file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); file.write_raw(tensor.name.data(), tensor.name.size()); file.seek(-file.tell() & 31, SEEK_CUR); - LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); + GPTNEOX_ASSERT(new_size == gptneox_calc_tensor_size(tensor.ne, new_type)); file.write_raw(new_data, new_size); } }; -struct llama_model_loader { - std::vector> file_loaders; - llama_load_tensors_map tensors_map; +struct gptneox_model_loader { + std::vector> file_loaders; + gptneox_load_tensors_map tensors_map; bool use_mmap; size_t num_ggml_tensors_created = 0; struct ggml_context * ggml_ctx = NULL; - std::unique_ptr mapping; + std::unique_ptr mapping; - llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { - auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); + gptneox_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { + auto first_file = new gptneox_file_loader(fname_base.c_str(), 0, tensors_map); file_loaders.emplace_back(first_file); uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); for (uint32_t i = 1; i < n_parts; i++) { std::string fname = fname_base + "." 
+ std::to_string(i); - auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); + auto ith_file = new gptneox_file_loader(fname.c_str(), i, tensors_map); file_loaders.emplace_back(ith_file); if (ith_file->hparams != first_file->hparams) { - Die("llama.cpp: hparams inconsistent between files"); + throw format("gptneox.cpp: hparams inconsistent between files"); } } - if (!llama_mmap::SUPPORTED) { + if (!gptneox_mmap::SUPPORTED) { use_mmap = false; } if (use_mmap && alignment_prevents_mmap()) { - fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n"); + fprintf(stderr, "gptneox.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n"); use_mmap = false; } this->use_mmap = use_mmap; - for (llama_load_tensor & lt : tensors_map.tensors) { + for (gptneox_load_tensor & lt : tensors_map.tensors) { lt.calc_all(); } } bool alignment_prevents_mmap() { - for (const llama_load_tensor & lt : tensors_map.tensors) { - for (const llama_load_tensor_shard & shard : lt.shards) { + for (const gptneox_load_tensor & lt : tensors_map.tensors) { + for (const gptneox_load_tensor_shard & shard : lt.shards) { if (shard.file_off & 3) { return true; } @@ -649,17 +687,17 @@ struct llama_model_loader { } uint32_t guess_n_parts() const { - auto it = tensors_map.name_to_idx.find("tok_embeddings.weight"); + auto it = tensors_map.name_to_idx.find("gpt_neox.embed_in.weight"); if (it == tensors_map.name_to_idx.end()) { - Die("missing tok_embeddings.weight"); + throw std::string("missing gpt_neox.embed_in.weight"); } - const llama_load_tensor & lt = tensors_map.tensors.at(it->second); + const gptneox_load_tensor & lt = tensors_map.tensors.at(it->second); return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0); } void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { *ctx_size_p = *mmapped_size_p = 0; - for (const llama_load_tensor & lt : tensors_map.tensors) { + for (const gptneox_load_tensor & lt : tensors_map.tensors) { *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; *(use_mmap ? 
mmapped_size_p : ctx_size_p) += lt.size; } @@ -668,27 +706,26 @@ struct llama_model_loader { struct ggml_tensor * get_tensor(const std::string & name, std::vector ne) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { - Die("llama.cpp: tensor '%s' is missing from model", name.c_str()); + throw format("gptneox.cpp: tensor '%s' is missing from model", name.c_str()); } - llama_load_tensor & lt = tensors_map.tensors.at(it->second); + gptneox_load_tensor & lt = tensors_map.tensors.at(it->second); if (lt.ne != ne) { - Die("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", - name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); + throw format("gptneox.cpp: tensor '%s' has wrong shape; expected %s, got %s", + name.c_str(), gptneox_format_tensor_shape(ne).c_str(), gptneox_format_tensor_shape(lt.ne).c_str()); } return get_tensor_for(lt); } - struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { + struct ggml_tensor * get_tensor_for(gptneox_load_tensor & lt) { struct ggml_tensor * tensor; if (lt.ne.size() == 2) { tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); } else { - LLAMA_ASSERT(lt.ne.size() == 1); + GPTNEOX_ASSERT(lt.ne.size() == 1); tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); } - ggml_set_name(tensor, lt.name.c_str()); - LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + GPTNEOX_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor lt.ggml_tensor = tensor; num_ggml_tensors_created++; return tensor; @@ -696,18 +733,18 @@ struct llama_model_loader { void done_getting_tensors() { if (num_ggml_tensors_created != tensors_map.tensors.size()) { - Die("llama.cpp: file contained more tensors than expected"); + throw std::string("gptneox.cpp: file contained more tensors than expected"); } } - void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + void load_all_data(gptneox_progress_callback progress_callback, void * progress_callback_user_data, gptneox_mlock * lmlock) { size_t data_size = 0; - for (const llama_load_tensor & lt : tensors_map.tensors) { + for (const gptneox_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; } if (use_mmap) { - mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); + mapping.reset(new gptneox_mmap(&file_loaders.at(0)->file)); if (!lmlock) { // Don't call the callback since the actual loading will be lazy // and we can't measure it. 
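The MEM_REQ tables near the top of this file carry a "TODO: Modify for gptneox, how are these values actually determined?" note; for the KV cache the surrounding comment does give a formula, 2*n_embd*n_ctx*n_layer*sizeof(float16), and kv_cache_init further down sizes its buffer the same way (two tensors of n_embd*n_layer*n_ctx elements, plus a 2 MiB margin). A minimal sketch of that arithmetic, assuming the oasst 12B defaults from gptneox_hparams (n_embd = 5120, n_layer = 36) and the n_ctx == 2048 the tables say they were computed for:

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed hparams: 12B defaults from this file; n_ctx = 2048 per the
    // "computed for n_ctx == 2048" note above the MEM_REQ tables.
    const uint64_t n_embd  = 5120;
    const uint64_t n_layer = 36;
    const uint64_t n_ctx   = 2048;
    const uint64_t f16     = 2;  // bytes per f16 element
    // k and v caches together: 2 * n_embd * n_ctx * n_layer * sizeof(f16)
    const uint64_t kv_bytes = 2 * n_embd * n_ctx * n_layer * f16;
    std::printf("kv_self ~= %llu bytes (%.0f MiB)\n",
                (unsigned long long) kv_bytes, kv_bytes / (1024.0 * 1024.0));
    return 0;
}

This works out to roughly 1440 MiB at f16, a bit under the 1608 MiB MODEL_12B entry, which appears to have been carried over unchanged from the old MODEL_13B row.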
@@ -719,11 +756,11 @@ struct llama_model_loader { } size_t done_size = 0; - for (llama_load_tensor & lt : tensors_map.tensors) { + for (gptneox_load_tensor & lt : tensors_map.tensors) { if (progress_callback) { progress_callback((float) done_size / data_size, progress_callback_user_data); } - LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already + GPTNEOX_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already lt.data = (uint8_t *) lt.ggml_tensor->data; load_data_for(lt); lt.ggml_tensor->data = lt.data; @@ -737,29 +774,30 @@ struct llama_model_loader { } } - void load_data_for(llama_load_tensor & lt) { + void load_data_for(gptneox_load_tensor & lt) { if (use_mmap) { - LLAMA_ASSERT(lt.shards.size() == 1); + GPTNEOX_ASSERT(lt.shards.size() == 1); lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off; } else if (lt.split_type == SPLIT_NONE) { - llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; + gptneox_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; file.seek(lt.shards.at(0).file_off, SEEK_SET); file.read_raw(lt.data, lt.size); } else if (lt.split_type == SPLIT_BY_ROWS) { size_t offset = 0; - for (llama_load_tensor_shard & shard : lt.shards) { - llama_file & file = file_loaders.at(shard.file_idx)->file; + for (gptneox_load_tensor_shard & shard : lt.shards) { + gptneox_file & file = file_loaders.at(shard.file_idx)->file; file.seek(shard.file_off, SEEK_SET); file.read_raw(lt.data + offset, shard.size); offset += shard.size; } - LLAMA_ASSERT(offset == lt.size); + GPTNEOX_ASSERT(offset == lt.size); } else if (lt.split_type == SPLIT_BY_COLUMNS) { // Let's load the data into temporary buffers to ensure the OS performs large loads. - std::vector tmp_bufs(lt.shards.size()); + std::vector tmp_bufs; + tmp_bufs.resize(lt.shards.size()); for (size_t i = 0; i < lt.shards.size(); i++) { - llama_load_tensor_shard & shard = lt.shards.at(i); - llama_file & file = file_loaders.at(shard.file_idx)->file; + gptneox_load_tensor_shard & shard = lt.shards.at(i); + gptneox_file & file = file_loaders.at(shard.file_idx)->file; file.seek(shard.file_off, SEEK_SET); tmp_bufs.at(i).resize(shard.size); file.read_raw(tmp_bufs.at(i).addr, shard.size); @@ -769,28 +807,28 @@ struct llama_model_loader { size_t per_shard_row_size = lt.shards.at(0).size / num_rows; size_t out_offset = 0; for (size_t row = 0; row < num_rows; row++) { - for (llama_buffer & tmp_buf : tmp_bufs) { + for (gptneox_buffer & tmp_buf : tmp_bufs) { memcpy(lt.data + out_offset, tmp_buf.addr + row * per_shard_row_size, per_shard_row_size); out_offset += per_shard_row_size; } } - LLAMA_ASSERT(out_offset == lt.size); + GPTNEOX_ASSERT(out_offset == lt.size); } if (0) { print_checksum(lt); } } - static void print_checksum(llama_load_tensor & lt) { + static void print_checksum(gptneox_load_tensor & lt) { uint32_t sum = 0; for (size_t i = 0; i < lt.size; i++) { uint8_t byte = lt.data[i]; sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash } fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, - llama_format_tensor_shape(lt.ne).c_str(), lt.size); + gptneox_format_tensor_shape(lt.ne).c_str(), lt.size); } }; @@ -801,17 +839,17 @@ struct llama_model_loader { // static bool kv_cache_init( - const struct llama_hparams & hparams, - struct llama_kv_cache & cache, + const struct gptneox_hparams & hparams, + struct gptneox_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; const int n_layer = 
hparams.n_layer; - const int64_t n_mem = n_layer*n_ctx; + const int64_t n_mem = (int64_t)n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MiB); struct ggml_init_params params; params.mem_size = cache.buf.size; @@ -827,17 +865,15 @@ static bool kv_cache_init( cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - ggml_set_name(cache.k, "cache_k"); - ggml_set_name(cache.v, "cache_v"); return true; } -struct llama_context_params llama_context_default_params() { - struct llama_context_params result = { +struct gptneox_context_params gptneox_context_default_params() { + struct gptneox_context_params result = { /*.n_ctx =*/ 512, /*.n_parts =*/ -1, - /*.seed =*/ -1, + /*.seed =*/ 0, /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, @@ -851,100 +887,117 @@ struct llama_context_params llama_context_default_params() { return result; } -bool llama_mmap_supported() { - return llama_mmap::SUPPORTED; +bool gptneox_mmap_supported() { + return gptneox_mmap::SUPPORTED; } -bool llama_mlock_supported() { - return llama_mlock::SUPPORTED; +bool gptneox_mlock_supported() { + return gptneox_mlock::SUPPORTED; } // // model loading // -static const char *llama_file_version_name(llama_file_version version) { +static const char *gptneox_file_version_name(gptneox_file_version version) { switch (version) { - case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; - case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)"; - default: LLAMA_ASSERT(false); + case GPTNEOX_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; + case GPTNEOX_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; + case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)"; + default: GPTNEOX_ASSERT(false); } } -static const char *llama_ftype_name(enum llama_ftype ftype) { +static const char *gptneox_ftype_name(enum gptneox_ftype ftype) { switch (ftype) { - case LLAMA_FTYPE_ALL_F32: return "all F32"; - case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; - case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; - case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + case GPTNEOX_FTYPE_ALL_F32: return "all F32"; + case GPTNEOX_FTYPE_MOSTLY_F16: return "mostly F16"; + case GPTNEOX_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; + case GPTNEOX_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + case GPTNEOX_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; - case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; - case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; - case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + case GPTNEOX_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; + //case GPTNEOX_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3"; + case GPTNEOX_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; + case GPTNEOX_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; + case GPTNEOX_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; default: return "unknown, may not work"; } } -static const char *llama_model_type_name(e_model type) { +static const char *gptneox_model_type_name(e_model type) { switch (type) { + case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; - case MODEL_13B: 
return "13B"; - case MODEL_30B: return "30B"; - case MODEL_65B: return "65B"; - default: LLAMA_ASSERT(false); + case MODEL_12B: return "12B"; + case MODEL_20B: return "20B"; + case MODEL_UNKNOWN: return "UNKNOWN"; + default: GPTNEOX_ASSERT(false); } } -static void llama_model_load_internal( +static void gptneox_model_load_internal( const std::string & fname, - llama_context & lctx, + gptneox_context & lctx, int n_ctx, ggml_type memory_type, bool use_mmap, bool use_mlock, bool vocab_only, - llama_progress_callback progress_callback, - void * progress_callback_user_data, - int verbose) { + gptneox_progress_callback progress_callback, + void * progress_callback_user_data) { lctx.t_start_us = ggml_time_us(); - std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); + std::unique_ptr ml(new gptneox_model_loader(fname, use_mmap, vocab_only)); lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); auto & model = lctx.model; model.hparams = ml->file_loaders.at(0)->hparams; - llama_file_version file_version = ml->file_loaders.at(0)->file_version; + gptneox_file_version file_version = ml->file_loaders.at(0)->file_version; auto & hparams = model.hparams; - uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - + { switch (hparams.n_layer) { - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; + case 16: { + if (hparams.n_embd < 6144) { + model.type = e_model::MODEL_3B; + } else { + model.type = e_model::MODEL_7B; + } + break; + } + // # : we extend the model type settings for RedPajama models. + case 32:{ + if (hparams.n_embd == 2560) { + model.type = e_model::MODEL_3B; + } else if (hparams.n_embd == 4096) { + model.type = e_model::MODEL_7B; + } + else { + model.type = e_model::MODEL_UNKNOWN; + } + break; + } + case 36: model.type = e_model::MODEL_12B; break; + case 44: model.type = e_model::MODEL_20B; break; } hparams.n_ctx = n_ctx; } - if (verbose) { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + { + fprintf(stderr, "%s: format = %s\n", __func__, gptneox_file_version_name(file_version)); fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: use_parallel_residual = %d\n", __func__, hparams.use_parallel_residual); + fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, gptneox_ftype_name(hparams.ftype)); fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + fprintf(stderr, "%s: model size = %s\n", __func__, gptneox_model_type_name(model.type)); } if (vocab_only) { @@ -955,9 +1008,7 @@ static void llama_model_load_internal( size_t ctx_size, mmapped_size; ml->calc_sizes(&ctx_size, &mmapped_size); - if (verbose) { - fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", 
__func__, ctx_size/1024.0); - } + fprintf(stderr, "%s: ggml ctx size = %6.2f KiB\n", __func__, ctx_size/1024.0); // print memory requirements { @@ -971,14 +1022,12 @@ static void llama_model_load_internal( MEM_REQ_SCRATCH1().at(model.type) + MEM_REQ_EVAL().at(model.type); - // this is the memory required by one llama_state + // this is the memory required by one gptneox_state const size_t mem_required_state = scale*MEM_REQ_KV_SELF().at(model.type); - if (verbose) { - fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, - mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - } + fprintf(stderr, "%s: mem required = %7.2f MiB (+ %7.2f MiB per state)\n", __func__, + mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); } // create the ggml context @@ -997,7 +1046,7 @@ static void llama_model_load_internal( model.ctx = ggml_init(params); if (!model.ctx) { - Die("ggml_init() failed"); + throw format("ggml_init() failed"); } } @@ -1011,35 +1060,39 @@ static void llama_model_load_internal( ml->ggml_ctx = ctx; - model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); - model.norm = ml->get_tensor("norm.weight", {n_embd}); - model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); + model.wte = ml->get_tensor("gpt_neox.embed_in.weight", {n_embd, n_vocab}); + model.ln_f_g = ml->get_tensor("gpt_neox.final_layer_norm.weight", {n_embd}); + model.ln_f_b = ml->get_tensor("gpt_neox.final_layer_norm.bias", {n_embd}); + model.lmh_g = ml->get_tensor("embed_out.weight", {n_embd, n_vocab}); model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - std::string layers_i = "layers." + std::to_string(i); + std::string layers_i = "gpt_neox.layers." 
+ std::to_string(i); - layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); + layer.ln_attn_g = ml->get_tensor(layers_i + ".input_layernorm.weight", {n_embd}); + layer.ln_attn_b = ml->get_tensor(layers_i + ".input_layernorm.bias", {n_embd}); - layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); + layer.c_attn_attn_w = ml->get_tensor(layers_i + ".attention.query_key_value.weight", {n_embd, n_embd * 3}); + layer.c_attn_attn_b = ml->get_tensor(layers_i + ".attention.query_key_value.bias", {n_embd * 3}); + layer.c_attn_proj_w = ml->get_tensor(layers_i + ".attention.dense.weight", {n_embd, n_embd}); + layer.c_attn_proj_b = ml->get_tensor(layers_i + ".attention.dense.bias", {n_embd}); - layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); + layer.ln_ff_g = ml->get_tensor(layers_i + ".post_attention_layernorm.weight", {n_embd}); + layer.ln_ff_b = ml->get_tensor(layers_i + ".post_attention_layernorm.bias", {n_embd}); - layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); - layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); - layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + layer.c_mlp_fc_w = ml->get_tensor(layers_i + ".mlp.dense_h_to_4h.weight", {n_embd, n_embd * 4}); + layer.c_mlp_fc_b = ml->get_tensor(layers_i + ".mlp.dense_h_to_4h.bias", {n_embd * 4}); + layer.c_mlp_proj_w = ml->get_tensor(layers_i + ".mlp.dense_4h_to_h.weight", {n_embd * 4, n_embd}); + layer.c_mlp_proj_b = ml->get_tensor(layers_i + ".mlp.dense_4h_to_h.bias", {n_embd}); } } ml->done_getting_tensors(); // populate `tensors_by_name` - for (llama_load_tensor & lt : ml->tensors_map.tensors) { + for (gptneox_load_tensor & lt : ml->tensors_map.tensors) { model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); } @@ -1052,26 +1105,24 @@ static void llama_model_load_internal( lctx.t_load_us = ggml_time_us() - lctx.t_start_us; } -static bool llama_model_load( +static bool gptneox_model_load( const std::string & fname, - llama_context & lctx, + gptneox_context & lctx, int n_ctx, ggml_type memory_type, bool use_mmap, bool use_mlock, bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data, - int verbose) { - // try { - llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock, - vocab_only, progress_callback, progress_callback_user_data, - verbose); - return true; - // } catch (const std::string & err) { - // fprintf(stderr, "error loading model: %s\n", err.c_str()); - // return false; - // } + gptneox_progress_callback progress_callback, + void *progress_callback_user_data) { + try { + gptneox_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock, + vocab_only, progress_callback, progress_callback_user_data); + return true; + } catch (const std::string & err) { + fprintf(stderr, "error loading model: %s\n", err.c_str()); + return false; + } } // evaluate the transformer @@ -1081,19 +1132,12 @@ static bool llama_model_load( // - n_past: the context size so far // - n_threads: number of threads to use // -static bool llama_eval_internal( - llama_context & lctx, - const llama_token * tokens, +static bool gptneox_eval_internal( + gptneox_context 
& lctx, + const gptneox_token * tokens, const int n_tokens, const int n_past, const int n_threads) { - - // enforce that the first token is BOS - if (n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } - const int64_t t_start_us = ggml_time_us(); const int N = n_tokens; @@ -1103,14 +1147,14 @@ static bool llama_eval_internal( auto & kv_self = model.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); + GPTNEOX_ASSERT(!!kv_self.ctx); const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_head = hparams.n_head; const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_embd/hparams.n_head; + const int n_rot = hparams.n_rot; auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; @@ -1126,161 +1170,251 @@ static bool llama_eval_internal( // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); memcpy(embd->data, tokens, N*ggml_element_size(embd)); - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * cur; lctx.use_buf(ctx0, 0); - // norm + // input norm { - cur = ggml_rms_norm(ctx0, inpL); + cur = ggml_norm(ctx0, inpL); - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].attention_norm, cur), - cur); + // cur = ln_attn_g*cur + ln_attn_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_attn_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_attn_b, cur)); } // self-attention { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - ggml_set_name(Qcur, "Qcur"); - ggml_set_name(Kcur, "Kcur"); - - // store key and value to memory + // attn + // [3*n_embd, n_embd] - model.layers[il].c_attn_attn_w + // [3*n_embd, 1] - model.layers[il].c_attn_attn_b + // [ n_embd, N] - cur (in) + // [3*n_embd, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [3*n_embd, N] { - // compute the transposed [N, n_embd] V matrix - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_w, cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, + model.layers[il].c_attn_attn_b, cur), + cur); + } + + // Split QKV and make contiguous + struct 
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, + n_embd/n_head, + n_head, + N, + ggml_element_size(cur) * 3 * n_embd/n_head, + ggml_element_size(cur) * 3 * n_embd, + ggml_element_size(cur) * n_embd/n_head * 0); + struct ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, + n_embd/n_head, + n_head, + N, + ggml_element_size(cur) * 3 * n_embd/n_head, + ggml_element_size(cur) * 3 * n_embd, + ggml_element_size(cur) * n_embd/n_head * 1); + struct ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, + n_embd/n_head, + n_head, + N, + ggml_element_size(cur) * 3 * n_embd/n_head, + ggml_element_size(cur) * 3 * n_embd, + ggml_element_size(cur) * n_embd/n_head * 2); + // TODO: Flatten without copying, or see if non-contiguous can be used for any of QKV. + Qcur = ggml_cpy(ctx0, Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)); + Kcur = ggml_cpy(ctx0, Kcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)); + Vcur = ggml_cpy(ctx0, Vcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)); + + // MARK: gptneox RoPE Q and K, before cache + // Bit 2 for gptneox style (2) + // Bit 1 is zero for dont skip n_past +(0), use (2+1) = (3) if rope is applied to cache of k (after cache only) + Qcur = ggml_rope(ctx0, Qcur, n_past, n_rot, 2); + Kcur = ggml_rope(ctx0, Kcur, n_past, n_rot, 2); //3); + // store key and value to memory, not required if prompt if only a single token (not practical or likely) + //if (N >= 1) { + // Each entry in kv_self has byte size of (ggml_element_size * n_embd * n_ctx * n_layer) + Vcur = ggml_view_2d(ctx0, Vcur, + n_embd, + N, + ggml_element_size(Vcur) * n_embd, + 0); + Vcur = ggml_transpose(ctx0, Vcur); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, + n_embd * N, // num elements in current context (up to n_embd*n_ctx but usually less) + ggml_element_size(kv_self.k) * n_embd * (il * n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, + N, + n_embd, + ggml_element_size(kv_self.v) * n_ctx, + ggml_element_size(kv_self.v) * ((il * n_ctx * n_embd) + n_past)); + // important: storing RoPE-ed version of K in the KV cache! 
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - + //} + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) struct ggml_tensor * K = ggml_permute(ctx0, ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + ggml_view_1d(ctx0, kv_self.k, + (n_past + N) * n_embd, + ggml_element_size(kv_self.k) * il * n_ctx * n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); - ggml_set_name(K, "K"); // K * Q + // Will use internally ggml_compute_forward_mul_mat_f16_f32 because K is f16 (cache) and Q is f32 (from q4_0) + // Outputs [N, N, H, B], so it seems like this is correct for "scores" + // K is internally transposed by ggml_mul_mat struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); - ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)"); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); - + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - ggml_set_name(KQ_masked, "KQ_masked"); - // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor * V_trans = ggml_view_3d(ctx0, kv_self.v, + n_past + N, + n_embd/n_head, + n_head, + ggml_element_size(kv_self.v) * n_ctx, + ggml_element_size(kv_self.v) * n_ctx * n_embd/n_head, + ggml_element_size(kv_self.v) * il * n_ctx * n_embd); - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(kv_self.v), - n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head, - il*n_ctx*ggml_element_size(kv_self.v)*n_embd); - ggml_set_name(V, "V"); - -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? 
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merged_contiguous"); + cur = ggml_cpy(ctx0, KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); + // projection (first weight) + cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); + + // projection (then bias) + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); } lctx.use_buf(ctx0, 1); + + if (hparams.use_parallel_residual == 1) { + //printf("use_parallel_residual == 1\n"); + + // This is independent of the self-attention result, so it could be done in parallel to the self-attention + struct ggml_tensor * outAttn = cur; - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - - // feed-forward network - { - // norm + // post attention layer norm { - cur = ggml_rms_norm(ctx0, inpFF); + cur = ggml_norm(ctx0, inpL); - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), - cur); + // cur = ln_attn_g*inpFF + ln_attn_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_ff_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_ff_b, cur)); } - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); + // feed-forward network + { + // note here we pass inpFF instead of cur + cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, cur); - // SILU activation - cur = ggml_silu(ctx0, cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); - cur = ggml_mul(ctx0, cur, tmp); + // GELU activation + cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); + // projection + // cur = proj_w*inpFF + proj_b + cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + //# pseudocode: + //# x = x + attn(ln1(x)) + mlp(ln2(x)) + // inpL = inpL + outAttn + cur + cur = ggml_add(ctx0, outAttn, cur); + inpL = ggml_add(ctx0, inpL, cur); + } else if (hparams.use_parallel_residual == 0) { + //printf("use_parallel_residual == 0\n"); + + // This takes the self-attention residual output as input to Feedforward + struct ggml_tensor * outAttn = cur; + struct ggml_tensor * inpFF = ggml_add(ctx0, outAttn, inpL); + + // post attention layer norm + { + cur = ggml_norm(ctx0, inpFF); + + // inpFF = ln_attn_g*inpFF + ln_attn_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_ff_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_ff_b, cur)); + } + + // feed-forward network + { + // note here we pass inpFF instead of cur + cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, cur); + + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), cur); + + cur = ggml_gelu(ctx0, cur); + + cur = 
ggml_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur); + + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), cur); + } + + //# pseudocode: + //# x = x + attn(ln1(x)) (residual above as input to mlp) + //# x = x + mlp(ln2(x)) (residual after mlp aka inpL + cur) + //# : we fixed a small issue in the gptneox.cpp fork when setting use_parallel_residual to False; + inpL = ggml_add(ctx0, inpFF, cur); + } else { + printf("use_parallel_residual == %d\n", hparams.use_parallel_residual); + assert(0); } - - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - inpL = cur; } lctx.use_buf(ctx0, 0); @@ -1290,19 +1424,20 @@ static bool llama_eval_internal( // norm { + inpL = ggml_norm(ctx0, inpL); - inpL = ggml_rms_norm(ctx0, inpL); - - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model.norm, inpL), - inpL); + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); embeddings = inpL; } // lm_head - inpL = ggml_mul_mat(ctx0, model.output, inpL); + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); lctx.use_buf(ctx0, -1); @@ -1327,9 +1462,6 @@ static bool llama_eval_internal( //embd_w.resize(n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - // update kv token count - lctx.model.kv_self.n = n_past + N; - // extract logits { auto & logits_out = lctx.logits; @@ -1357,7 +1489,7 @@ static bool llama_eval_internal( } #if 0 - printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__, + printf("\n%s: used_mem = %.3f MiB, scratch -- %.3f MiB %.3f MiB\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0, lctx.get_buf_max_mem(0)/1024.0/1024.0, lctx.get_buf_max_mem(1)/1024.0/1024.0); @@ -1388,7 +1520,7 @@ static size_t utf8_len(char src) { return lookup[highbits]; } -struct llama_sp_symbol { +struct gptneox_sp_symbol { using index = int; index prev; index next; @@ -1396,31 +1528,31 @@ struct llama_sp_symbol { size_t n; }; -struct llama_sp_bigram { +struct gptneox_sp_bigram { struct comparator { - bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { + bool operator()(gptneox_sp_bigram & l, gptneox_sp_bigram & r) { return (l.score < r.score) || (l.score == r.score && l.left > r.left); } }; - using queue_storage = std::vector; - using queue = std::priority_queue; - llama_sp_symbol::index left; - llama_sp_symbol::index right; + using queue_storage = std::vector; + using queue = std::priority_queue; + gptneox_sp_symbol::index left; + gptneox_sp_symbol::index right; float score; size_t size; }; // original implementation: // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 -struct llama_tokenizer { - llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {} +struct gptneox_tokenizer { + gptneox_tokenizer(const gptneox_vocab & vocab): vocab_(vocab) {} - void tokenize(const std::string & text, std::vector & output) { + void tokenize(const std::string & text, std::vector & output) { // split string into utf8 chars int index = 0; size_t offs = 0; while (offs < text.size()) { - llama_sp_symbol sym; + gptneox_sp_symbol sym; size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); sym.text = text.c_str() + offs; sym.n = char_len; @@ -1474,7 +1606,7 @@ struct llama_tokenizer { if (token == vocab_.token_to_id.end()) { // output any symbols that did not form tokens as bytes. 
for (int j = 0; j < (int) symbol.n; ++j) { - llama_vocab::id token_id = static_cast(symbol.text[j]) + 3; + gptneox_vocab::id token_id = static_cast(symbol.text[j]) + 3; output.push_back(token_id); } } else { @@ -1502,7 +1634,7 @@ private: const auto &tok_score = vocab_.id_to_token[(*token).second]; - llama_sp_bigram bigram; + gptneox_sp_bigram bigram; bigram.left = left; bigram.right = right; bigram.score = tok_score.score; @@ -1510,21 +1642,21 @@ private: work_queue_.push(bigram); } - const llama_vocab & vocab_; - std::vector symbols_; - llama_sp_bigram::queue work_queue_; + const gptneox_vocab & vocab_; + std::vector symbols_; + gptneox_sp_bigram::queue work_queue_; }; -static std::vector llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) { - llama_tokenizer tokenizer(vocab); - std::vector output; +static std::vector gptneox_tokenize(const gptneox_vocab & vocab, const std::string & text, bool bos) { + gptneox_tokenizer tokenizer(vocab); + std::vector output; if (text.size() == 0) { return output; } if (bos) { - output.push_back(llama_token_bos()); + output.push_back(gptneox_token_bos()); } tokenizer.tokenize(text, output); @@ -1535,14 +1667,14 @@ static std::vector llama_tokenize(const llama_vocab & vocab, co // sampling // -void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { +void gptneox_sample_softmax(struct gptneox_context * ctx, gptneox_token_data_array * candidates) { assert(candidates->size > 0); const int64_t t_start_sample_us = ggml_time_us(); // Sort the logits in descending order if (!candidates->sorted) { - std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + std::sort(candidates->data, candidates->data + candidates->size, [](const gptneox_token_data & a, const gptneox_token_data & b) { return a.logit > b.logit; }); candidates->sorted = true; @@ -1564,7 +1696,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c } } -void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) { +void gptneox_sample_top_k(struct gptneox_context * ctx, gptneox_token_data_array * candidates, int k, size_t min_keep) { const int64_t t_start_sample_us = ggml_time_us(); k = std::max(k, (int) min_keep); @@ -1572,7 +1704,7 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can // Sort scores in descending order if (!candidates->sorted) { - auto comp = [](const llama_token_data & a, const llama_token_data & b) { + auto comp = [](const gptneox_token_data & a, const gptneox_token_data & b) { return a.logit > b.logit; }; if (k == (int) candidates->size) { @@ -1589,14 +1721,14 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can } } -void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { +void gptneox_sample_top_p(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float p, size_t min_keep) { if (p >= 1.0f) { return; } const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(ctx, candidates); + gptneox_sample_softmax(ctx, candidates); // Compute the cumulative probabilities float cum_sum = 0.0f; @@ -1620,14 +1752,14 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can } } -void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { +void 
gptneox_sample_tail_free(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float z, size_t min_keep) { if (z >= 1.0f || candidates->size <= 2) { return; } const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(nullptr, candidates); + gptneox_sample_softmax(nullptr, candidates); // Compute the first and second derivatives std::vector first_derivatives(candidates->size - 1); @@ -1672,7 +1804,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * } -void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { +void gptneox_sample_typical(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float p, size_t min_keep) { // Reference implementation: // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr if (p >= 1.0f) { @@ -1682,7 +1814,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c const int64_t t_start_sample_us = ggml_time_us(); // Compute the softmax of logits and calculate entropy - llama_sample_softmax(nullptr, candidates); + gptneox_sample_softmax(nullptr, candidates); float entropy = 0.0f; for (size_t i = 0; i < candidates->size; ++i) { @@ -1703,7 +1835,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { return shifted_scores[a] < shifted_scores[b]; }); - + // Compute the cumulative probabilities float cum_sum = 0.0f; size_t last_idx = indices.size(); @@ -1720,7 +1852,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } // Resize the output vector to keep only the locally typical tokens - std::vector new_candidates; + std::vector new_candidates; for (size_t i = 0; i < last_idx; ++i) { size_t idx = indices[i]; new_candidates.push_back(candidates->data[idx]); @@ -1735,7 +1867,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } -void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { +void gptneox_sample_temperature(struct gptneox_context * ctx, gptneox_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = ggml_time_us(); for (size_t i = 0; i < candidates_p->size; ++i) { @@ -1747,7 +1879,7 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array } } -void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { +void gptneox_sample_repetition_penalty(struct gptneox_context * ctx, gptneox_token_data_array * candidates, gptneox_token * last_tokens, size_t last_tokens_size, float penalty) { if (last_tokens_size == 0 || penalty == 1.0f) { return; } @@ -1776,7 +1908,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat } } -void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { +void gptneox_sample_frequency_and_presence_penalties(struct gptneox_context * ctx, gptneox_token_data_array * candidates, gptneox_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { 
return; } @@ -1784,7 +1916,7 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l const int64_t t_start_sample_us = ggml_time_us(); // Create a frequency map to count occurrences of each token in last_tokens - std::unordered_map token_count; + std::unordered_map token_count; for (size_t i = 0; i < last_tokens_size; ++i) { token_count[last_tokens_p[i]]++; } @@ -1808,13 +1940,13 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l } -llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { +gptneox_token gptneox_sample_token_mirostat(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float tau, float eta, int m, float * mu) { assert(ctx); - auto N = float(llama_n_vocab(ctx)); + auto N = float(gptneox_n_vocab(ctx)); int64_t t_start_sample_us; t_start_sample_us = ggml_time_us(); - llama_sample_softmax(nullptr, candidates); + gptneox_sample_softmax(nullptr, candidates); // Estimate s_hat using the most probable m tokens float s_hat = 0.0; @@ -1833,15 +1965,15 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat); // Sample the next word X using top-k sampling - llama_sample_top_k(nullptr, candidates, int(k), 1); + gptneox_sample_top_k(nullptr, candidates, int(k), 1); if (ctx) { ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } - llama_token X = llama_sample_token(ctx, candidates); + gptneox_token X = gptneox_sample_token(ctx, candidates); t_start_sample_us = ggml_time_us(); // Compute error as the difference between observed surprise and target surprise value - size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const gptneox_token_data & candidate) { return candidate.id == X; })); float observed_surprise = -log2f(candidates->data[X_idx].p); @@ -1857,30 +1989,30 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ return X; } -llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) { +gptneox_token gptneox_sample_token_mirostat_v2(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float tau, float eta, float * mu) { assert(ctx); int64_t t_start_sample_us; t_start_sample_us = ggml_time_us(); - llama_sample_softmax(ctx, candidates); + gptneox_sample_softmax(ctx, candidates); // Truncate the words with surprise values greater than mu - candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const gptneox_token_data & candidate) { return -log2f(candidate.p) > *mu; })); // Normalize the probabilities of the remaining words - llama_sample_softmax(ctx, candidates); + gptneox_sample_softmax(ctx, candidates); // Sample the next word X from the remaining words if (ctx) { ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } - llama_token X = llama_sample_token(ctx, candidates); + gptneox_token X = gptneox_sample_token(ctx, 
candidates); t_start_sample_us = ggml_time_us(); // Compute error as the difference between observed surprise and target surprise value - size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const gptneox_token_data & candidate) { return candidate.id == X; })); float observed_surprise = -log2f(candidates->data[X_idx].p); @@ -1895,15 +2027,15 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok return X; } -llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) { +gptneox_token gptneox_sample_token_greedy(struct gptneox_context * ctx, gptneox_token_data_array * candidates) { const int64_t t_start_sample_us = ggml_time_us(); // Find max element - auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const gptneox_token_data & a, const gptneox_token_data & b) { return a.logit < b.logit; }); - llama_token result = max_iter->id; + gptneox_token result = max_iter->id; if (ctx) { ctx->t_sample_us += ggml_time_us() - t_start_sample_us; ctx->n_sample++; @@ -1911,10 +2043,10 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da return result; } -llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { +gptneox_token gptneox_sample_token(struct gptneox_context * ctx, gptneox_token_data_array * candidates) { assert(ctx); const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(nullptr, candidates); + gptneox_sample_softmax(nullptr, candidates); std::vector probs; probs.reserve(candidates->size); @@ -1926,7 +2058,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra auto & rng = ctx->rng; int idx = dist(rng); - llama_token result = candidates->data[idx].id; + gptneox_token result = candidates->data[idx].id; ctx->t_sample_us += ggml_time_us() - t_start_sample_us; ctx->n_sample++; @@ -1937,25 +2069,62 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra // quantization // -static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) { +// temp - load then save model, allows for load and save to be different +static void gptneox_model_copy_internal(const std::string & fname_inp, const std::string & fname_out, enum gptneox_ftype ftype) { + std::unique_ptr model_loader(new gptneox_model_loader(fname_inp.c_str(), + /*use_mmap*/ false, + /*vocab_only*/ false)); + gptneox_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); + + size_t idx = 0; + for (gptneox_load_tensor & tensor : model_loader->tensors_map.tensors) { + gptneox_buffer read_data; + read_data.resize(tensor.size); + tensor.data = read_data.addr; + model_loader->load_data_for(tensor); + + printf("[%4zu/%4zu] %36s - %16s, type = %6s, ", + ++idx, model_loader->tensors_map.tensors.size(), + tensor.name.c_str(), gptneox_format_tensor_shape(tensor.ne).c_str(), + ggml_type_name(tensor.type)); + + file_saver.write_tensor(tensor, tensor.type, tensor.data, tensor.size); + } +} + +int gptneox_model_copy( + const 
char * fname_inp, + const char * fname_out, + enum gptneox_ftype ftype) { + try { + gptneox_model_copy_internal(fname_inp, fname_out, ftype); + return 0; + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to copy: %s\n", __func__, err.c_str()); + return 1; + } +} + + +static void gptneox_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum gptneox_ftype ftype, int nthread) { ggml_type quantized_type; switch (ftype) { - case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; - case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; - case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; - case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; - case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; - default: Die("invalid output file type %d\n", ftype); + case GPTNEOX_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; + case GPTNEOX_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; + case GPTNEOX_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; + case GPTNEOX_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; + case GPTNEOX_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; + case GPTNEOX_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; + default: throw format("invalid output file type %d\n", ftype); }; if (nthread <= 0) { nthread = std::thread::hardware_concurrency(); } - std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, + std::unique_ptr model_loader(new gptneox_model_loader(fname_inp.c_str(), /*use_mmap*/ false, /*vocab_only*/ false)); - llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); + gptneox_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); size_t total_size_org = 0; size_t total_size_new = 0; @@ -1965,15 +2134,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::mutex mutex; size_t idx = 0; - for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { - llama_buffer read_data; + for (gptneox_load_tensor & tensor : model_loader->tensors_map.tensors) { + gptneox_buffer read_data; read_data.resize(tensor.size); tensor.data = read_data.addr; model_loader->load_data_for(tensor); printf("[%4zu/%4zu] %36s - %16s, type = %6s, ", ++idx, model_loader->tensors_map.tensors.size(), - tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), + tensor.name.c_str(), gptneox_format_tensor_shape(tensor.ne).c_str(), ggml_type_name(tensor.type)); // This used to be a regex, but has an extreme cost to compile times. 
@@ -1990,18 +2159,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s enum ggml_type new_type; void * new_data; size_t new_size; - llama_buffer work; + gptneox_buffer work; if (!quantize) { new_type = tensor.type; new_data = tensor.data; new_size = tensor.size; - printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + printf("size = %8.3f MiB\n", tensor.size/1024.0/1024.0); } else { new_type = quantized_type; float * f32_data; size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); - llama_buffer f32_conv_buf; + gptneox_buffer f32_conv_buf; if (tensor.type == GGML_TYPE_F32) { f32_data = (float *) tensor.data; } else if (tensor.type == GGML_TYPE_F16) { @@ -2012,7 +2181,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data[i] = ggml_fp16_to_fp32(f16_data[i]); } } else { - Die("type %s unsupported for integer quantization", ggml_type_name(tensor.type)); + throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)); } printf("quantizing .. "); @@ -2055,7 +2224,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s for (int it = 0; it < nthread_use - 1; ++it) workers[it].join(); } - printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + printf("size = %8.2f MiB -> %8.2f MiB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; } @@ -2070,8 +2239,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s file_saver.write_tensor(tensor, new_type, new_data, new_size); } - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + printf("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0); { int64_t sum_all = 0; @@ -2091,20 +2260,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // interface implementation // -struct llama_context * llama_init_from_file( +struct gptneox_context * gptneox_init_from_file( const char * path_model, - struct llama_context_params params, - int verbose) { + struct gptneox_context_params params) { ggml_time_init(); - llama_context * ctx = new llama_context; + gptneox_context * ctx = new gptneox_context; - if (params.seed < 0) { + if (params.seed <= 0) { params.seed = time(NULL); } unsigned cur_percentage = 0; - if (verbose && params.progress_callback == NULL) { + if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage; params.progress_callback = [](float progress, void * ctx) { unsigned * cur_percentage_p = (unsigned *) ctx; @@ -2125,12 +2293,11 @@ struct llama_context * llama_init_from_file( ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type, + if (!gptneox_model_load(path_model, *ctx, params.n_ctx, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data, - verbose)) { + params.progress_callback, params.progress_callback_user_data)) { fprintf(stderr, "%s: failed to load model\n", __func__); - llama_free(ctx); + gptneox_free(ctx); return nullptr; } @@ -2138,13 +2305,13 @@ struct llama_context * llama_init_from_file( if (!params.vocab_only) { if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); - llama_free(ctx); + gptneox_free(ctx); return nullptr; } - if (verbose) { + { const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); - fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + fprintf(stderr, "%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0); } const auto & hparams = ctx->model.hparams; @@ -2160,7 +2327,7 @@ struct llama_context * llama_init_from_file( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + //ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); @@ -2169,25 +2336,25 @@ struct llama_context * llama_init_from_file( return ctx; } -void llama_free(struct llama_context * ctx) { +void gptneox_free(struct gptneox_context * ctx) { delete ctx; } -int llama_model_quantize( +int gptneox_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype, + enum gptneox_ftype ftype, int nthread) { - // try { - llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread); - return 0; - // } catch (const std::string & err) { - // fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); - // return 1; - // } + try { + gptneox_model_quantize_internal(fname_inp, fname_out, ftype, nthread); + return 0; + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); + return 1; + } } -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int gptneox_apply_lora_from_file_internal(struct gptneox_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); auto & model = ctx->model; @@ -2204,7 +2371,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != READ32BE("ggla")) { + if (magic != 'ggla') { fprintf(stderr, "%s: bad file magic\n", __func__); return 1; } @@ -2229,7 +2396,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // create a temporary ggml context to store the lora tensors // todo: calculate size from biggest possible tensor std::vector lora_buf(1024ull * 1024ull * 1024ull); - struct ggml_init_params params; + struct gml_init_params params; params.mem_size = lora_buf.size(); params.mem_buffer = lora_buf.data(); params.no_alloc = false; @@ -2245,12 +2412,12 @@ int 
llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // load base model - std::unique_ptr model_loader; + std::unique_ptr model_loader; ggml_context * base_ctx = NULL; - llama_buffer base_buf; + gptneox_buffer base_buf; if (path_base_model) { fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); - model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); + model_loader.reset(new gptneox_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); size_t ctx_size, mmapped_size; model_loader->calc_sizes(&ctx_size, &mmapped_size); @@ -2265,9 +2432,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * model_loader->ggml_ctx = base_ctx; - // maybe this should in llama_model_loader + // maybe this should in gptneox_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); + model_loader->mapping.reset(new gptneox_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); } } @@ -2355,7 +2522,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * return 1; } size_t idx = model_loader->tensors_map.name_to_idx[base_name]; - llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + gptneox_load_tensor & lt = model_loader->tensors_map.tensors[idx]; base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); lt.data = (uint8_t *) lt.ggml_tensor->data; model_loader->load_data_for(lt); @@ -2426,34 +2593,34 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * return 0; } -int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { - // try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); - // } catch (const std::string & err) { - // fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str()); - // return 1; - // } +int gptneox_apply_lora_from_file(struct gptneox_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return gptneox_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + } catch (const std::string & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str()); + return 1; + } } -int llama_get_kv_cache_token_count(const struct llama_context * ctx) { +int gptneox_get_kv_cache_token_count(struct gptneox_context * ctx) { return ctx->model.kv_self.n; } -#define LLAMA_MAX_RNG_STATE 64*1024 +#define GPTNEOX_MAX_RNG_STATE 64*1024 -void llama_set_rng_seed(struct llama_context * ctx, int seed) { - if (seed < 0) { +void gptneox_set_rng_seed(struct gptneox_context * ctx, int seed) { + if (seed <= 0) { seed = time(NULL); } ctx->rng.seed(seed); } // Returns the size of the state -size_t llama_get_state_size(const struct llama_context * ctx) { +size_t gptneox_get_state_size(struct gptneox_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. 
const size_t s_rng_size = sizeof(size_t); - const size_t s_rng = LLAMA_MAX_RNG_STATE; + const size_t s_rng = GPTNEOX_MAX_RNG_STATE; const size_t s_logits_capacity = sizeof(size_t); const size_t s_logits_size = sizeof(size_t); const size_t s_logits = ctx->logits.capacity() * sizeof(float); @@ -2480,7 +2647,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { } // Copies the state to the specified destination address -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { +size_t gptneox_copy_state_data(struct gptneox_context * ctx, uint8_t * dest) { uint8_t * out = dest; // copy rng @@ -2489,13 +2656,13 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { rng_ss << ctx->rng; const size_t rng_size = rng_ss.str().size(); - char rng_buf[LLAMA_MAX_RNG_STATE]; + char rng_buf[GPTNEOX_MAX_RNG_STATE]; - memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE); + memset(&rng_buf[0], 0, GPTNEOX_MAX_RNG_STATE); memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); - memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE; + memcpy(out, &rng_buf[0], GPTNEOX_MAX_RNG_STATE); out += GPTNEOX_MAX_RNG_STATE; } // copy logits @@ -2527,73 +2694,42 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { // copy kv cache { - const auto & kv_self = ctx->model.kv_self; - const auto & hparams = ctx->model.hparams; - const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd; - const int n_ctx = hparams.n_ctx; - - const size_t kv_size = kv_self.buf.size; - const int kv_ntok = llama_get_kv_cache_token_count(ctx); + const size_t kv_size = ctx->model.kv_self.buf.size; + const int kv_ntok = gptneox_get_kv_cache_token_count(ctx); memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size); memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok); if (kv_size) { - const size_t elt_size = ggml_element_size(kv_self.k); - llama_buffer buffer; - buffer.resize(4096); - ggml_context * cpy_ctx = ggml_init({ buffer.size, buffer.addr, /* no_alloc */ true }); - ggml_cgraph gf{}; - gf.n_threads = 1; - - ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); - kout3d->data = out; - out += ggml_nbytes(kout3d); - - ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); - vout3d->data = out; - out += ggml_nbytes(vout3d); - - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_ntok, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); - - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_ntok, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute(cpy_ctx, &gf); + memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size; } } const size_t written = out - dest; - const size_t max_size = llama_get_state_size(ctx); + const size_t expected = gptneox_get_state_size(ctx); - LLAMA_ASSERT(written <= max_size); + GPTNEOX_ASSERT(written == expected); return written; } // Sets the state reading from the specified source address -size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { +size_t gptneox_set_state_data(struct gptneox_context * ctx, const uint8_t * src) { const uint8_t * in = src; // set rng { size_t rng_size; - char rng_buf[LLAMA_MAX_RNG_STATE]; + char 
rng_buf[GPTNEOX_MAX_RNG_STATE]; memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size); - memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE; + memcpy(&rng_buf[0], in, GPTNEOX_MAX_RNG_STATE); in += GPTNEOX_MAX_RNG_STATE; std::stringstream rng_ss; rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> ctx->rng; - LLAMA_ASSERT(rng_ss.fail() == false); + GPTNEOX_ASSERT(rng_ss.fail() == false); } // set logits @@ -2604,7 +2740,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap); memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size); - LLAMA_ASSERT(ctx->logits.capacity() == logits_cap); + GPTNEOX_ASSERT(ctx->logits.capacity() == logits_cap); if (logits_size) { ctx->logits.resize(logits_size); @@ -2620,7 +2756,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size); - LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size); + GPTNEOX_ASSERT(ctx->embedding.capacity() == embedding_size); if (embedding_size) { memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float)); @@ -2630,12 +2766,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { // set kv cache { - const auto & kv_self = ctx->model.kv_self; - const auto & hparams = ctx->model.hparams; - const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd; - const int n_ctx = hparams.n_ctx; - size_t kv_size; int kv_ntok; @@ -2643,77 +2773,54 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok); if (kv_size) { - LLAMA_ASSERT(kv_self.buf.size == kv_size); + GPTNEOX_ASSERT(ctx->model.kv_self.buf.size == kv_size); - const size_t elt_size = ggml_element_size(kv_self.k); - llama_buffer buffer; - buffer.resize(4096); - ggml_context * cpy_ctx = ggml_init({ buffer.size, buffer.addr, /* no_alloc */ true }); - ggml_cgraph gf{}; - gf.n_threads = 1; + void * k_data = ctx->model.kv_self.k->data; // remember data pointers + void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy - ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); - kin3d->data = (void *) in; - in += ggml_nbytes(kin3d); + memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size; - ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); - vin3d->data = (void *) in; - in += ggml_nbytes(vin3d); + ctx->model.kv_self.k->data = k_data; // restore correct data pointers + ctx->model.kv_self.v->data = v_data; - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, - n_embd, kv_ntok, n_layer, - elt_size*n_embd, elt_size*n_embd*n_ctx, 0); - - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, - kv_ntok, n_embd, n_layer, - elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute(cpy_ctx, &gf); } ctx->model.kv_self.n = kv_ntok; } const size_t nread = in - src; - const size_t max_size = llama_get_state_size(ctx); + const size_t expected = gptneox_get_state_size(ctx); - LLAMA_ASSERT(nread <= max_size); + GPTNEOX_ASSERT(nread == expected); return nread; } -int llama_eval( - struct llama_context * ctx, - const llama_token * tokens, +int 
gptneox_eval( + struct gptneox_context * ctx, + const gptneox_token * tokens, int n_tokens, int n_past, int n_threads) { - int64_t start_eval = 0; - if (!ctx->has_evaluated_once) { - start_eval = ggml_time_us(); - } - if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) { + if (!gptneox_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } - // get a more accurate load time by measuring the first eval - // this will take into consideration any page fault slowdown + // get a more accurate load time, upon first eval if (!ctx->has_evaluated_once) { - ctx->t_load_us += ggml_time_us() - start_eval; + ctx->t_load_us = ggml_time_us() - ctx->t_start_us; ctx->has_evaluated_once = true; } return 0; } -int llama_tokenize( - struct llama_context * ctx, +int gptneox_tokenize( + struct gptneox_context * ctx, const char * text, - llama_token * tokens, + gptneox_token * tokens, int n_max_tokens, bool add_bos) { - auto res = llama_tokenize(ctx->vocab, text, add_bos); + auto res = gptneox_tokenize(ctx->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { fprintf(stderr, "%s: too many tokens\n", __func__); @@ -2727,47 +2834,53 @@ int llama_tokenize( return res.size(); } -int llama_n_vocab(const struct llama_context * ctx) { +int gptneox_n_vocab(struct gptneox_context * ctx) { return ctx->vocab.id_to_token.size(); } -int llama_n_ctx(const struct llama_context * ctx) { +int gptneox_n_ctx(struct gptneox_context * ctx) { return ctx->model.hparams.n_ctx; } -int llama_n_embd(const struct llama_context * ctx) { +int gptneox_n_embd(struct gptneox_context * ctx) { return ctx->model.hparams.n_embd; } -float * llama_get_logits(struct llama_context * ctx) { +float * gptneox_get_logits(struct gptneox_context * ctx) { return ctx->logits.data(); } -float * llama_get_embeddings(struct llama_context * ctx) { +float * gptneox_get_embeddings(struct gptneox_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { - if (token >= llama_n_vocab(ctx)) { +const char * gptneox_token_to_str(struct gptneox_context * ctx, gptneox_token token) { + if (token >= gptneox_n_vocab(ctx)) { return nullptr; } return ctx->vocab.id_to_token[token].tok.c_str(); } -llama_token llama_token_bos() { - return 1; +gptneox_token gptneox_str_to_token(struct gptneox_context * ctx, const char * str) { + return ctx->vocab.token_to_id[str]; } -llama_token llama_token_eos() { - return 2; +gptneox_token gptneox_token_bos() { + return 0; } -llama_token llama_token_nl() { +gptneox_token gptneox_token_eos() { + return 0; +} + +// Varies depending on gptneox model, use gptneox_str_to_token instead +gptneox_token gptneox_token_nl() { return 13; } -void llama_print_timings(struct llama_context * ctx) { + +void gptneox_print_timings(struct gptneox_context * ctx) { const int64_t t_end_us = ggml_time_us(); const int32_t n_sample = std::max(1, ctx->n_sample); @@ -2782,37 +2895,495 @@ void llama_print_timings(struct llama_context * ctx) { fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); } -void llama_reset_timings(struct llama_context * ctx) { +void gptneox_reset_timings(struct gptneox_context * ctx) { ctx->t_start_us = ggml_time_us(); ctx->t_sample_us = ctx->n_sample = 0; ctx->t_eval_us = ctx->n_eval = 0; ctx->t_p_eval_us = ctx->n_p_eval = 0; } -const char * llama_print_system_info(void) { - static std::string s; - - s = ""; - s += "AVX = " + 
std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - - return s.c_str(); -} // For internal test use -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { +std::vector>& gptneox_internal_get_tensor_map(struct gptneox_context * ctx) { return ctx->model.tensors_by_name; } +size_t gptneox_load_session_file(struct gptneox_context * ctx, const char * path_session, gptneox_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + // TODO leverage mmap + gptneox_file file(path_session, "rb"); + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (!(magic == 'ggsn' && version == 0)) { + fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return 0; + } + + gptneox_hparams session_hparams; + file.read_raw(&session_hparams, sizeof(gptneox_hparams)); + + // REVIEW + if (session_hparams != ctx->model.hparams) { + fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); + return 0; + } + + const uint32_t n_token_count = file.read_u32(); + GPTNEOX_ASSERT(n_token_capacity >= n_token_count); + file.read_raw(tokens_out, sizeof(gptneox_token) * n_token_count); + *n_token_count_out = n_token_count; + + const size_t n_state_size = file.size - file.tell(); + const size_t n_orig_state_size = gptneox_get_state_size(ctx); + if (n_state_size != n_orig_state_size) { + fprintf(stderr, "%s : failed to validate state size\n", __func__); + } + std::unique_ptr state_data(new uint8_t[n_state_size]); + file.read_raw(state_data.get(), n_state_size); + return gptneox_set_state_data(ctx, state_data.get()); +} + +size_t gptneox_save_session_file(struct gptneox_context * ctx, const char * path_session, const gptneox_token * tokens, size_t n_token_count) { + // TODO save temp & swap + gptneox_file file(path_session, "wb"); + + const size_t n_state_size = gptneox_get_state_size(ctx); + std::unique_ptr state_data(new uint8_t[n_state_size]); + gptneox_copy_state_data(ctx, state_data.get()); + + file.write_u32('ggsn'); // magic + file.write_u32(0); // version + file.write_raw(&ctx->model.hparams, sizeof(gptneox_hparams)); + + file.write_u32((uint32_t) n_token_count); // REVIEW + file.write_raw(tokens, sizeof(gptneox_token) * n_token_count); + + file.write_raw(state_data.get(), n_state_size); + return n_state_size; // REVIEW +} + + +static std::atomic is_interacting; +static std::atomic is_terminated; + +#define EPHEMERAL(fmt) "\r\e[K\033[1;35m" fmt " \033[0m" + +static void sigint_handler_batch(int signo) { + is_terminated = true; +} + +static void sigint_handler_interactive(int signo) { + if (!is_interacting) { + 
is_interacting = true; + } else { + is_terminated = true; + } +} + +static int CompareTime(struct timespec a, struct timespec b) { + int cmp; + if (!(cmp = (a.tv_sec > b.tv_sec) - (a.tv_sec < b.tv_sec))) { + cmp = (a.tv_nsec > b.tv_nsec) - (a.tv_nsec < b.tv_nsec); + } + return cmp; +} + +static int on_missing_feature(const char *name) { + fprintf(stderr, "%s: error: cpuid %s not detected\n", __func__, name); + fprintf(stderr, "%s: amd microprocessors made after 2017 usually work\n", __func__); + fprintf(stderr, "%s: intel microprocessors made after 2013 usually work\n", __func__); + return 1; +} + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "ggml-RedPajama-INCITE-Chat-3B-v1-q8_0.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + + + ShowCrashReports(); + setvbuf(stdin, NULL, _IONBF, 0); + setvbuf(stdout, NULL, _IONBF, 0); + setvbuf(stderr, NULL, _IONBF, 0); + + +#ifdef __x86_64__ + if (!X86_HAVE(AVX2)) return on_missing_feature("avx2"); + if (!X86_HAVE(AVX)) return on_missing_feature("avx"); + if (!X86_HAVE(FMA)) return on_missing_feature("fma"); + if (!X86_HAVE(SSE3)) return on_missing_feature("sse3"); + if (!X86_HAVE(F16C)) { + fprintf(stderr, "%s: warning: cpuid f16c not detected; inference might crash\n", __func__); + } +#endif /* __x86_64__ */ + + gptneox_context * ctx; + g_ctx = &ctx; + + // load the model + { + auto lparams = gptneox_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; + lparams.use_mlock = params.use_mlock; + + ctx = gptneox_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + if (!params.lora_adapter.empty()) { + int err = gptneox_apply_lora_from_file(ctx, + params.lora_adapter.c_str(), + params.lora_base.empty() ? 
NULL : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | \n", + params.n_threads, std::thread::hardware_concurrency()); + } + + // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters + // uncomment the "used_mem" line in llama.cpp to see the results + if (params.mem_test) { + { + const std::vector tmp(params.n_batch, 0); + gptneox_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + } + + { + const std::vector tmp = { 0, }; + gptneox_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + } + + gptneox_print_timings(ctx); + gptneox_free(ctx); + + return 0; + } + + + // Always interactive in Open-Assistant + params.interactive = true; + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + signal(SIGINT, sigint_handler); +#endif + } + fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", + params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", params.n_ctx, params.n_batch, params.n_predict, params.n_keep); + fprintf(stderr, "\n\n"); + + // TODO: replace with ring-buffer + std::vector last_n_tokens = std::vector(); + //std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + + if (params.interactive) { + printf("== Running in interactive mode. 
==\n" +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + " - Press Ctrl+C to interject at any time.\n" +#endif + " - Press Return to return control to LLaMa.\n" + " - If you want to submit another line, end your input in '\\'.\n\n"); + } + + const int32_t top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + const float repeat_penalty = params.repeat_penalty; + + // Chat loop + while (true) { + is_interacting = true; + + int n_past = 0; + + // Get input + + // potentially set color to indicate we are taking user input + +#if defined (_WIN32) + // Windows: must reactivate sigint handler after each signal + signal(SIGINT, sigint_handler); +#endif + + if (params.instruct) { + printf("\n: "); + } + + std::string buffer; + if (!params.input_prefix.empty()) { + buffer += params.input_prefix; + printf("%s", buffer.c_str()); + } + + std::string line; + bool another_line = true; + do { +#if defined(_WIN32) + std::wstring wline; + if (!std::getline(std::wcin, wline)) { + // input stream is bad or EOF received + return 0; + } + win32_utf8_encode(wline, line); +#else + if (!std::getline(std::cin, line)) { + // input stream is bad or EOF received + return 0; + } +#endif + if (line.empty() || line.back() != '\\') { + another_line = false; + } else { + line.pop_back(); // Remove the continue character + } + buffer += line; + if (another_line) { + buffer += '\n'; + } + } while (another_line); + + is_interacting = false; + + // done taking input, reset color + + // Check for input + if (buffer.length() <= 0) { + continue; // Restart loop for input + } + + // Tokenize prompt with oasst special tokens + + auto prompt_embd = ::gptneox_tokenize(ctx, buffer, false); + auto embd_inp = std::vector(); + + // Redpajama: insert special tokens for OA. (prefix) + embd_inp.push_back(gptneox_str_to_token(ctx, "<")); + embd_inp.push_back(gptneox_str_to_token(ctx, "human")); + embd_inp.push_back(gptneox_str_to_token(ctx, ">:")); + + embd_inp.insert(embd_inp.end(), prompt_embd.begin(), prompt_embd.end()); + + // Redpajama: insert special tokens for OA. (postfix) + embd_inp.push_back(gptneox_str_to_token(ctx, "<")); + embd_inp.push_back(gptneox_str_to_token(ctx, "bot")); + embd_inp.push_back(gptneox_str_to_token(ctx, ">:")); + + + // Verbose prompt + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, buffer.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], gptneox_token_to_str(ctx, embd_inp[i])); + } + /*if (params.n_keep > 0) { + fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + fprintf(stderr, "%s", gptneox_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "'\n"); + } + */ + fprintf(stderr, "\n"); + } + + // How many tokens to generate - check if theres space in context for atleast one token (or batch size tokens?) + auto inp_size = embd_inp.size(); + auto space = params.n_ctx - inp_size; + if(space <= 0) { + fprintf(stderr, "%s : input too long\n", __func__); + continue; + } + // Send batches to eval + while (n_past < inp_size) { + auto remaining = inp_size - n_past; + int n_eval = params.n_batch < remaining ? 
params.n_batch : remaining; + if (gptneox_eval(ctx, &embd_inp[n_past], n_eval, n_past, params.n_threads)) { + fprintf(stderr, ": %s : failed to eval\n", __func__); + return 1; + } + n_past += n_eval; + } + + const int n_ctx = gptneox_n_ctx(ctx); + const int n_vocab = gptneox_n_vocab(ctx); + + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? gptneox_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + + // Eval until space runs out + auto out_count = 0; + + printf(":"); + while (space > 0) { + // Get token + gptneox_token id = 0; + + { + auto logits = gptneox_get_logits(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (gptneox_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(gptneox_token_data{token_id, logits[token_id], 0.0f}); + } + + gptneox_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties + gptneox_token nl_token = gptneox_str_to_token(ctx, "\n"); + float nl_logit = logits[nl_token]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + gptneox_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + gptneox_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) { + logits[nl_token] = nl_logit; + } + + if (temp <= 0) { + // Greedy sampling + id = gptneox_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + gptneox_sample_temperature(ctx, &candidates_p, temp); + id = gptneox_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + gptneox_sample_temperature(ctx, &candidates_p, temp); + id = gptneox_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + gptneox_sample_top_k(ctx, &candidates_p, top_k, 1); + gptneox_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + gptneox_sample_typical(ctx, &candidates_p, typical_p, 1); + gptneox_sample_top_p(ctx, &candidates_p, top_p, 1); + gptneox_sample_temperature(ctx, &candidates_p, temp); + id = gptneox_sample_token(ctx, &candidates_p); + } + } + } + + // Inc out count and dec space + out_count += 1; + space -= 1; + // Repeat tokens update + last_n_tokens.push_back(id); + if (last_n_tokens.size() > params.repeat_last_n) { + last_n_tokens.erase(last_n_tokens.begin()); + } + // Redpajama: check if the interactive is done. 
+ //std::cout<<" last_n_tokens.size: "<< last_n_tokens[0] <<" "<< last_n_tokens[1] <<" "<< last_n_tokens[2] << std::endl; + if (last_n_tokens.size()==3 && last_n_tokens[0]==gptneox_str_to_token(ctx, "<") + && last_n_tokens[1]==gptneox_str_to_token(ctx, "human") && last_n_tokens[2]==gptneox_str_to_token(ctx, ">:")){ + space = 0; + continue; + } + + // Check for eos - end early - check eos before bos in case they are the same + if (id == gptneox_token_eos()) { + space = 0; + continue; + } + // Check for bos - skip callback if so + if (id == gptneox_token_bos()) { + continue; + } + // Convert token to string and display + // printf("%s(%d)", gptneox_token_to_str(ctx, id), id); + + + if (last_n_tokens[2]==gptneox_str_to_token(ctx, "<")){ + ; + } + else if (last_n_tokens[2]==gptneox_str_to_token(ctx, "human")){ + if (last_n_tokens[1]==gptneox_str_to_token(ctx, "<")){ + ; + } + else{ + printf("%s", gptneox_token_to_str(ctx, id)); + } + } + else if (last_n_tokens[1]==gptneox_str_to_token(ctx, "<")){ + printf("<"); + printf("%s", gptneox_token_to_str(ctx, id)); + } + else{ + printf("%s", gptneox_token_to_str(ctx, id)); + } + fflush(stdout); + // Check if we need to run another eval + if (space > 0) { + // Send generated token back into model for next generation + if (gptneox_eval(ctx, &id, 1, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + // Increment past count + n_past += 1; + } + // Check for user interrupt + if (is_interacting) { space = 0; } + } + printf("\n"); + //printf("\n %d", space); + fflush(stdout); + } + + gptneox_print_timings(ctx); + gptneox_free(ctx); + + + return 0; +} diff --git a/third_party/radpajama/radpajama.mk b/third_party/radpajama/radpajama.mk new file mode 100644 index 000000000..e2fa901a4 --- /dev/null +++ b/third_party/radpajama/radpajama.mk @@ -0,0 +1,127 @@ +#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐ +#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘ + +PKGS += THIRD_PARTY_REDPAJAMA + +################################################################################ +# single file machine learning framework written in c +# make -j8 o//third_party/radpajama/ggml.a + +THIRD_PARTY_REDPAJAMA_ARTIFACTS += THIRD_PARTY_REDPAJAMA_A +THIRD_PARTY_REDPAJAMA = $(THIRD_PARTY_REDPAJAMA_A_DEPS) $(THIRD_PARTY_REDPAJAMA_A) +THIRD_PARTY_REDPAJAMA_A = o/$(MODE)/third_party/radpajama/ggml.a +THIRD_PARTY_REDPAJAMA_A_HDRS = third_party/radpajama/ggml.h +THIRD_PARTY_REDPAJAMA_A_SRCS = third_party/radpajama/ggml.c +THIRD_PARTY_REDPAJAMA_A_OBJS = $(THIRD_PARTY_REDPAJAMA_A_SRCS:%.c=o/$(MODE)/%.o) +THIRD_PARTY_REDPAJAMA_A_FILES = $(THIRD_PARTY_REDPAJAMA_A_SRCS) $(THIRD_PARTY_REDPAJAMA_A_HDRS) +THIRD_PARTY_REDPAJAMA_A_CHECKS = $(THIRD_PARTY_REDPAJAMA_A).pkg $(THIRD_PARTY_REDPAJAMA_A_HDRS:%=o/$(MODE)/%.ok) + +THIRD_PARTY_REDPAJAMA_A_DIRECTDEPS = \ + LIBC_CALLS \ + LIBC_INTRIN \ + LIBC_MEM \ + LIBC_NEXGEN32E \ + LIBC_RUNTIME \ + LIBC_STDIO \ + LIBC_THREAD \ + LIBC_STR \ + LIBC_STUBS \ + LIBC_SYSV \ + LIBC_TINYMATH \ + THIRD_PARTY_COMPILER_RT + +THIRD_PARTY_REDPAJAMA_A_DEPS := \ + $(call uniq,$(foreach x,$(THIRD_PARTY_REDPAJAMA_A_DIRECTDEPS),$($(x)))) + +$(THIRD_PARTY_REDPAJAMA_A): \ + third_party/radpajama/ \ + $(THIRD_PARTY_REDPAJAMA_A).pkg \ + $(THIRD_PARTY_REDPAJAMA_A_OBJS) + +$(THIRD_PARTY_REDPAJAMA_A).pkg: \ + $(THIRD_PARTY_REDPAJAMA_A_OBJS) \ + $(foreach x,$(THIRD_PARTY_REDPAJAMA_A_DIRECTDEPS),$($(x)_A).pkg) + +$(THIRD_PARTY_REDPAJAMA_A_OBJS): private \ + OVERRIDE_CFLAGS += \ + -O3 \ + 
-ffunction-sections \ + -fdata-sections + +ifeq ($(ARCH), x86_64) +$(THIRD_PARTY_REDPAJAMA_A_OBJS): private \ + OVERRIDE_CFLAGS += \ + -msse3 \ + -mavx \ + -mavx2 \ + -mf16c \ + -mfma +endif + +################################################################################ +# command for running inference on large language models +# make -j8 o//third_party/radpajama/radpajama.com + +THIRD_PARTY_REDPAJAMA_ARTIFACTS += THIRD_PARTY_REDPAJAMA_LLAMA +THIRD_PARTY_REDPAJAMA_LLAMA = o/$(MODE)/third_party/radpajama/radpajama.com +THIRD_PARTY_REDPAJAMA_LLAMA_HDRS = third_party/radpajama/llama.h third_party/radpajama/llama_util.h third_party/radpajama/common.h +THIRD_PARTY_REDPAJAMA_LLAMA_SRCS = third_party/radpajama/radpajama.cc third_party/radpajama/common.cc +THIRD_PARTY_REDPAJAMA_LLAMA_OBJS = $(THIRD_PARTY_REDPAJAMA_LLAMA_SRCS:%.cc=o/$(MODE)/%.o) +THIRD_PARTY_REDPAJAMA_LLAMA_FILES := $(THIRD_PARTY_REDPAJAMA_LLAMA_SRCS) $(THIRD_PARTY_REDPAJAMA_LLAMA_HDRS) +THIRD_PARTY_REDPAJAMA_LLAMA_CHECKS = $(THIRD_PARTY_REDPAJAMA_LLAMA).pkg $(THIRD_PARTY_REDPAJAMA_LLAMA_HDRS:%=o/$(MODE)/%.okk) + +THIRD_PARTY_REDPAJAMA_LLAMA_DIRECTDEPS = \ + LIBC_CALLS \ + LIBC_FMT \ + LIBC_INTRIN \ + LIBC_MEM \ + LIBC_NEXGEN32E \ + LIBC_RUNTIME \ + LIBC_STDIO \ + LIBC_LOG \ + LIBC_STR \ + LIBC_STUBS \ + LIBC_SYSV \ + LIBC_THREAD \ + LIBC_TINYMATH \ + LIBC_ZIPOS \ + THIRD_PARTY_REDPAJAMA \ + THIRD_PARTY_LIBCXX + +THIRD_PARTY_REDPAJAMA_LLAMA_DEPS := \ + $(call uniq,$(foreach x,$(THIRD_PARTY_REDPAJAMA_LLAMA_DIRECTDEPS),$($(x)))) + +$(THIRD_PARTY_REDPAJAMA_LLAMA).dbg: \ + $(THIRD_PARTY_REDPAJAMA_LLAMA).pkg \ + $(THIRD_PARTY_REDPAJAMA_LLAMA_DEPS) \ + o/$(MODE)/third_party/radpajama/radpajama.txt.zip.o \ + o/$(MODE)/third_party/radpajama/common.o \ + o/$(MODE)/third_party/radpajama/llama.o \ + o/$(MODE)/third_party/radpajama/radpajama.o \ + $(CRT) \ + $(APE_NO_MODIFY_SELF) + @$(APELINK) + +$(THIRD_PARTY_REDPAJAMA_LLAMA).pkg: \ + $(THIRD_PARTY_REDPAJAMA_LLAMA_OBJS) \ + $(foreach x,$(THIRD_PARTY_REDPAJAMA_LLAMA_DIRECTDEPS),$($(x)_A).pkg) + +o/$(MODE)/third_party/radpajama/radpajama.txt.zip.o: private \ + ZIPOBJ_FLAGS += \ + -B + +################################################################################ + +THIRD_PARTY_REDPAJAMA_COMS = $(THIRD_PARTY_REDPAJAMA_LLAMA) +THIRD_PARTY_REDPAJAMA_BINS = $(THIRD_PARTY_REDPAJAMA_COMS) $(THIRD_PARTY_REDPAJAMA_COMS:%=%.dbg) +THIRD_PARTY_REDPAJAMA_LIBS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x))) +THIRD_PARTY_REDPAJAMA_SRCS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)_SRCS)) +THIRD_PARTY_REDPAJAMA_HDRS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)_HDRS)) +THIRD_PARTY_REDPAJAMA_OBJS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)_OBJS)) +THIRD_PARTY_REDPAJAMA_CHECKS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)_CHECKS)) +$(THIRD_PARTY_REDPAJAMA_OBJS): third_party/radpajama/radpajama.mk + +.PHONY: o/$(MODE)/third_party/radpajama +o/$(MODE)/third_party/radpajama: \ + $(THIRD_PARTY_REDPAJAMA_BINS) \ + $(THIRD_PARTY_REDPAJAMA_CHECKS) diff --git a/third_party/third_party.mk b/third_party/third_party.mk index 46c93029d..b6108057d 100644 --- a/third_party/third_party.mk +++ b/third_party/third_party.mk @@ -14,6 +14,7 @@ o/$(MODE)/third_party: \ o/$(MODE)/third_party/gdtoa \ o/$(MODE)/third_party/getopt \ o/$(MODE)/third_party/ggml \ +# o/$(MODE)/third_party/radpajama \ o/$(MODE)/third_party/hiredis \ o/$(MODE)/third_party/libcxx \ o/$(MODE)/third_party/linenoise \
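
Note on the Open-Assistant chat convention used by the interactive loop in radpajama.cc: each user turn is framed with the tokens for "<", "human", ">:" before the text and "<", "bot", ">:" after it, and the generation loop treats a reappearance of the "<", "human", ">:" sequence as its stop signal. Below is a condensed sketch of that framing only; the build_chat_prompt wrapper name is illustrative (not part of this patch) and the sketch assumes it sits next to main() in radpajama.cc, where the gptneox_* helpers from common.h are already in scope.

// Sketch only: mirrors the inline prompt framing done in main().
static std::vector<gptneox_token> build_chat_prompt(gptneox_context * ctx,
                                                    const std::string & user_text) {
    std::vector<gptneox_token> embd_inp;
    // "<human>:" prefix marks the user turn
    embd_inp.push_back(gptneox_str_to_token(ctx, "<"));
    embd_inp.push_back(gptneox_str_to_token(ctx, "human"));
    embd_inp.push_back(gptneox_str_to_token(ctx, ">:"));
    // user text, tokenized without a BOS token
    auto prompt_embd = ::gptneox_tokenize(ctx, user_text, /*add_bos*/ false);
    embd_inp.insert(embd_inp.end(), prompt_embd.begin(), prompt_embd.end());
    // "<bot>:" postfix cues the assistant reply
    embd_inp.push_back(gptneox_str_to_token(ctx, "<"));
    embd_inp.push_back(gptneox_str_to_token(ctx, "bot"));
    embd_inp.push_back(gptneox_str_to_token(ctx, ">:"));
    return embd_inp;
}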
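
The session files produced by gptneox_save_session_file() have a simple flat layout; since the write and read paths sit far apart in the source, here is a comment-style summary plus a hypothetical round trip. Field order is taken from the code in this patch; "chat.session" is a placeholder path, not something the patch itself uses.

// Session file layout (gptneox_save_session_file / gptneox_load_session_file):
//   uint32_t        magic    = 'ggsn'
//   uint32_t        version  = 0
//   gptneox_hparams hparams          // raw struct copy; must match the loaded model on restore
//   uint32_t        n_token_count
//   gptneox_token   tokens[n_token_count]
//   uint8_t         state[gptneox_get_state_size(ctx)]   // rng, logits, embeddings, kv cache
// The loader returns 0 on a magic/version or hparams mismatch; a state-size mismatch is
// only reported to stderr before the restore is attempted anyway.

// Hypothetical round trip:
std::vector<gptneox_token> sess_tokens(params.n_ctx);
size_t n_sess_tokens = 0;
if (gptneox_load_session_file(ctx, "chat.session", sess_tokens.data(),
                              sess_tokens.size(), &n_sess_tokens) > 0) {
    sess_tokens.resize(n_sess_tokens);   // cached prompt prefix from a previous run
}
// ... later, after evaluating a prompt ...
gptneox_save_session_file(ctx, "chat.session", sess_tokens.data(), sess_tokens.size());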
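
Finally, a minimal usage sketch for the quantization entry point, since this patch defines gptneox_model_quantize() but nothing here calls it yet. The file names are placeholders; the accepted ftype constants are the ones handled by the switch in gptneox_model_quantize_internal (Q4_0, Q4_1, Q4_2, Q5_0, Q5_1, Q8_0).

// Hypothetical one-off conversion of an f16 GPT-NeoX ggml file to q4_0.
// Passing nthread <= 0 falls back to std::thread::hardware_concurrency().
if (gptneox_model_quantize("ggml-model-f16.bin",      // placeholder input path
                           "ggml-model-q4_0.bin",     // placeholder output path
                           GPTNEOX_FTYPE_MOSTLY_Q4_0,
                           /*nthread*/ 0) != 0) {
    fprintf(stderr, "quantization failed\n");
}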