From c454f8b84843fcb064e2c4ccbea329038d46a567 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 22 Apr 2023 11:23:25 +0800
Subject: [PATCH] Gpt NeoX / Pythia integration completed

---
 README.md           |   2 +
 gpttype_adapter.cpp |  18 +++++-
 koboldcpp.py        |   2 +-
 model_adapter.cpp   |   2 +-
 otherarch/neox.cpp  | 146 --------------------------------------------
 5 files changed, 20 insertions(+), 150 deletions(-)

diff --git a/README.md b/README.md
index bc483b785..6a04a69e4 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ What does it mean? You get llama.cpp with a fancy UI, persistent stories, editin
 # Highlights
 - Now has experimental CLBlast support.
 - Now supports RWKV models WITHOUT pytorch or tokenizers! Yep, just GGML!
+- Now supports GPT-NeoX / Pythia models
 
 ## Usage
 - [Download the latest release here](https://github.com/LostRuins/koboldcpp/releases/latest) or clone the repo.
@@ -62,4 +63,5 @@ What does it mean? You get llama.cpp with a fancy UI, persistent stories, editin
   - GPT-2 (All versions, including legacy f16, newer format + quanitzed, cerebras) Supports OpenBLAS acceleration only for newer format.
   - GPT-J (All versions including legacy f16, newer format + quantized, pyg.cpp, new pygmalion, janeway etc.) Supports OpenBLAS acceleration only for newer format.
   - RWKV (f16 GGMF format), unaccelerated due to RNN properties.
+  - GPT-NeoX / Pythia
   - Basically every single current and historical GGML format that has ever existed should be supported, except for bloomz.cpp due to lack of demand.
\ No newline at end of file
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 9e73ac9eb..5393684f3 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -335,7 +335,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                             file_format == FileFormat::GGHF ||
                             file_format == FileFormat::GGJT ||
                             file_format == FileFormat::GPT2_2 ||
-                            file_format == FileFormat::GPTJ_3);
+                            file_format == FileFormat::GPTJ_3 ||
+                            file_format == FileFormat::NEOX_1);
     bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
     // bool blasmode = false;
     int original_batch = params.n_batch;
@@ -382,6 +383,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gpt2_ctx_v2.hparams.n_vocab;
     }
+    else if(file_format == FileFormat::NEOX_1)
+    {
+        n_vocab = neox_ctx.hparams.n_vocab;
+    }
     else if(file_format == FileFormat::RWKV_1)
     {
         n_vocab = vocab.id_to_token.size(); //handled seperately
@@ -443,6 +448,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
            evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
         }
+        else if(file_format==FileFormat::NEOX_1)
+        {
+            evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token);
+        }
         else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
         {
             evalres = legacy_gptj_eval(gptj_ctx_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
@@ -495,7 +504,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             else
             {
                 // set the logit of the eos token (2) to zero to avoid sampling it
-                if(logits.size()>50256)
+                if((file_format == FileFormat::GPT2_1 ||
+                file_format == FileFormat::GPT2_2 ||
+                file_format == FileFormat::GPTJ_1 ||
+                file_format == FileFormat::GPTJ_2 ||
+                file_format == FileFormat::GPTJ_3)
+                && logits.size()>50256)
                 {
                     logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);
                 }
diff --git a/koboldcpp.py b/koboldcpp.py
index 94e674763..1a65d1d0c 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -139,7 +139,7 @@ maxctx = 2048
 maxlen = 128
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.10"
+KcppVersion = "1.11"
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
diff --git a/model_adapter.cpp b/model_adapter.cpp
index 66bbda14f..f6225dc55 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -127,7 +127,7 @@ void print_tok_vec(std::vector &embd)
             fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
         }
     }
-    else if(vocabsiz < 32000 || vocabsiz > 36000)
+    else if(vocabsiz < 31998 || vocabsiz > 33000)
     {
         //anything outside the llama v1 range is assumed to be NeoX
         fileformat = FileFormat::NEOX_1;
diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp
index ff4a7ae7e..14c9f5b74 100644
--- a/otherarch/neox.cpp
+++ b/otherarch/neox.cpp
@@ -596,149 +596,3 @@ bool stablelm_eval(
 
     return true;
 }
-
-// int main(int argc, char ** argv) {
-//     ggml_time_init();
-//     const int64_t t_main_start_us = ggml_time_us();
-
-//     gpt_params params;
-//     params.model = "models/stablelm-base-alpha-3b/ggml-model-f16.bin";
-
-//     if (gpt_params_parse(argc, argv, params) == false) {
-//         return 1;
-//     }
-
-//     if (params.seed < 0) {
-//         params.seed = time(NULL);
-//     }
-
-//     printf("%s: seed = %d\n", __func__, params.seed);
-
-//     std::mt19937 rng(params.seed);
-//     if (params.prompt.empty()) {
-//         if( !isatty(STDIN_FILENO) ){
-//             std::string line;
-//             while( std::getline(std::cin, line) ){
-//                 params.prompt = params.prompt + "\n" + line;
-//             }
-//         } else {
-//             params.prompt = gpt_random_prompt(rng);
-//         }
-//     }
-
-//     int64_t t_load_us = 0;
-
-//     gpt_vocab vocab;
-//     stablelm_model model;
-
-//     // load the model
-//     {
-//         const int64_t t_start_us = ggml_time_us();
-
-//         if (!stablelm_model_load(params.model, model, vocab)) {
-//             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-//             return 1;
-//         }
-
-//         t_load_us = ggml_time_us() - t_start_us;
-//     }
-
-//     int n_past = 0;
-
-//     int64_t t_sample_us  = 0;
-//     int64_t t_predict_us = 0;
-
-//     std::vector<float> logits;
-
-//     // tokenize the prompt
-//     std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-//     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-//     printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-//     for (int i = 0; i < embd_inp.size(); i++) {
-//         printf("%s: token[%d] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-//     }
-//     printf("\n");
-
-//     std::vector<gpt_vocab::id> embd;
-
-//     // determine the required inference memory per token:
-//     size_t mem_per_token = 0;
-//     stablelm_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-//     for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-//         // predict
-//         if (embd.size() > 0) {
-//             const int64_t t_start_us = ggml_time_us();
-
-//             if (!stablelm_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-//                 printf("Failed to predict\n");
-//                 return 1;
-//             }
-
-//             t_predict_us += ggml_time_us() - t_start_us;
-//         }
-
-//         n_past += embd.size();
-//         embd.clear();
-
-//         if (i >= embd_inp.size()) {
-//             // sample next token
-//             const int   top_k = params.top_k;
-//             const float top_p = params.top_p;
-//             const float temp  = params.temp;
-
-//             const int n_vocab = model.hparams.n_vocab;
-
-//             gpt_vocab::id id = 0;
-
-//             {
-//                 const int64_t t_start_sample_us = ggml_time_us();
-
-//                 id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-//                 t_sample_us += ggml_time_us() - t_start_sample_us;
-//             }
-
-//             // add it to the context
-//             embd.push_back(id);
-//         } else {
-//             // if here, it means we are still processing the input prompt
-//             for (int k = i; k < embd_inp.size(); k++) {
-//                 embd.push_back(embd_inp[k]);
-//                 if (embd.size() > params.n_batch) {
-//                     break;
-//                 }
-//             }
-//             i += embd.size() - 1;
-//         }
-
-//         // display text
-//         for (auto id : embd) {
-//             printf("%s", vocab.id_to_token[id].c_str());
-//         }
-//         fflush(stdout);
-
-//         // end of text token
-//         if (embd.back() == 0) {
-//             break;
-//         }
-//     }
-
-//     // report timing
-//     {
-//         const int64_t t_main_end_us = ggml_time_us();
-
-//         printf("\n\n");
-//         printf("%s:  mem per token = %8zu bytes\n", __func__, mem_per_token);
-//         printf("%s:      load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-//         printf("%s:    sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-//         printf("%s:   predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-//         printf("%s:     total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-//     }
-
-//     ggml_free(model.ctx);
-
-//     return 0;
-// }
\ No newline at end of file
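
The model_adapter.cpp hunk above tightens the vocabulary-size window used to separate LLaMA-family GGML files from GPT-NeoX / Pythia ones: anything outside roughly 31998-33000 entries is now treated as NEOX_1. A minimal standalone sketch of that heuristic follows; the guess_format_from_vocab helper and the simplified FileFormat values are hypothetical stand-ins, not the project's actual detection code.

#include <cstdint>
#include <cstdio>

// Simplified stand-ins for the project's FileFormat values (illustrative only).
enum class FileFormat { LLAMA_GGML, NEOX_1 };

// Hypothetical sketch of the vocab-size heuristic adjusted by this patch:
// LLaMA v1 tokenizers have 32000 entries, so sizes inside the narrowed
// 31998..33000 window are kept as LLaMA, while anything outside it (for
// example the ~50k vocabularies used by GPT-NeoX / Pythia) is assumed NeoX.
static FileFormat guess_format_from_vocab(int32_t vocabsiz) {
    if (vocabsiz < 31998 || vocabsiz > 33000) {
        return FileFormat::NEOX_1;   // anything outside the llama v1 range
    }
    return FileFormat::LLAMA_GGML;
}

int main() {
    printf("32000 -> %s\n", guess_format_from_vocab(32000) == FileFormat::NEOX_1 ? "NEOX_1" : "LLAMA");
    printf("50432 -> %s\n", guess_format_from_vocab(50432) == FileFormat::NEOX_1 ? "NEOX_1" : "LLAMA");
    return 0;
}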
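
Similarly, the last gpttype_adapter.cpp hunk gates the end-of-text suppression on file format: id 50256 is <|endoftext|> for the GPT-2 / GPT-J vocabularies, so its logit is only clamped for those formats and left untouched for NeoX-family models, where that id need not be the end-of-text token. A small sketch of the same idea, using a hypothetical is_gpt2_or_gptj flag in place of the file_format checks:

#include <algorithm>
#include <vector>

// Clamp the GPT-2/GPT-J end-of-text logit (id 50256) to at most zero so it is
// discouraged from being sampled, but only when the vocabulary actually uses
// that id; NeoX-family formats are skipped, mirroring the guard in the patch.
static void suppress_gpt2_eos(std::vector<float> &logits, bool is_gpt2_or_gptj) {
    const size_t eos_id = 50256;
    if (is_gpt2_or_gptj && logits.size() > eos_id) {
        // equivalent to: logits[eos_id] = (logits[eos_id] < 0 ? logits[eos_id] : 0);
        logits[eos_id] = std::min(logits[eos_id], 0.0f);
    }
}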