From 2f2eff6e13b9f19f52800b915f49dc2b73596307 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 8 May 2023 20:58:00 +0800
Subject: [PATCH] the dark gods have been sated, and redpajama is integrated... but at what cost?

---
 expose.cpp          | 10 ++++++++--
 gpttype_adapter.cpp | 42 ++++++++++++++++++++++++++++--------------
 model_adapter.h     |  1 +
 otherarch/neox.cpp  | 27 ++++++++++++++++++++-------
 4 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/expose.cpp b/expose.cpp
index 38defd15e..df426d82e 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -128,16 +128,22 @@ extern "C"
                 return true;
             }
         }
-        else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
+        else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
         {
             printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             ModelLoadResult lr = gpttype_load_model(inputs, file_format);
             if (lr == ModelLoadResult::RETRY_LOAD)
+            {
+                file_format = FileFormat::NEOX_3;
+                printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+                lr = gpttype_load_model(inputs, file_format);
+            }
+            if (lr == ModelLoadResult::RETRY_LOAD)
             {
                 file_format = FileFormat::NEOX_1;
                 printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
                 lr = gpttype_load_model(inputs, file_format);
-            }
+            }
             if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
             {
                 return false;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ba7a0c2cd..f0a12bcaf 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -369,7 +369,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
     {
         ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
@@ -383,7 +383,23 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             return res;
         }
         // determine the required inference memory per token:
-        stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+        stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
+
+        if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
+        {
+            //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
+            std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
+            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, FileFormat::NEOX_3);
+            int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
+            std::string predicted = vocab.id_to_token[topid].c_str();
+            if(predicted.find("8") != std::string::npos)
+            {
+                printf("\n---\nRedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
+                ggml_free(neox_ctx.ctx);
+                return ModelLoadResult::RETRY_LOAD;
+            }
+        }
+
         return ModelLoadResult::SUCCESS;
     }
     else
@@ -514,13 +530,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     }

     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
-    bool approved_format = (file_format == FileFormat::GGML ||
-                            file_format == FileFormat::GGHF ||
-                            file_format == FileFormat::GGJT ||
-                            file_format == FileFormat::GPT2_2 ||
-                            file_format == FileFormat::GPTJ_3 ||
-                            file_format == FileFormat::NEOX_1 ||
-                            file_format == FileFormat::NEOX_2);
+    bool approved_format = !(file_format == FileFormat::BADFORMAT ||
+                             file_format == FileFormat::GPT2_1 ||
+                             file_format == FileFormat::GPTJ_1 ||
+                             file_format == FileFormat::GPTJ_2 ||
+                             file_format == FileFormat::RWKV_1);
     bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
     // bool blasmode = false;
     int original_batch = params.n_batch;
@@ -579,7 +593,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gpt2_ctx_v2.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2)
+    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
     {
         n_vocab = neox_ctx.hparams.n_vocab;
     }
@@ -614,14 +628,14 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             for (auto id : embd_inp)
             {
-                printf("'%s', ",llama_token_to_str(llama_ctx_v1, id));
+                printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v1, id),id);
             }
         }
         else
         {
             for (auto id : embd_inp)
             {
-                printf("'%s', ",vocab.id_to_token[id].c_str());
+                printf("'%s (%d)', ",vocab.id_to_token[id].c_str(),id);
             }
         }
         printf("\n");
@@ -665,9 +679,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
        {
            evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
        }
-       else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2)
+       else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
        {
-           evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token);
+           evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
        }
        else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
        {
diff --git a/model_adapter.h b/model_adapter.h
index 5c303a1fd..3e3f8f30c 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -31,6 +31,7 @@ enum FileFormat

     NEOX_1=400,
     NEOX_2=401,
+    NEOX_3=402,
 };

 enum ModelLoadResult
diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp
index 47397c309..fb6b4b996 100644
--- a/otherarch/neox.cpp
+++ b/otherarch/neox.cpp
@@ -345,7 +345,8 @@ bool stablelm_eval(
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
         std::vector<float> & embd_w,
-        size_t & mem_per_token) {
+        size_t & mem_per_token,
+        FileFormat file_format) {
     const int N = embd_inp.size();

     const auto & hparams = model.hparams;
@@ -494,6 +495,12 @@ bool stablelm_eval(
             }
         }

+        if(file_format==FileFormat::NEOX_3)
+        {
+            // layer input + Attn
+            cur = ggml_add(ctx0, cur, inpL);
+        }
+
         struct ggml_tensor * inpFF = cur;

         // feed-forward network
@@ -502,7 +509,7 @@ bool stablelm_eval(
             // post attention layer norm
             // note here we pass inpL instead of cur
             {
-                cur = ggml_norm(ctx0, inpL);
+                cur = ggml_norm(ctx0, (file_format==FileFormat::NEOX_3?cur:inpL));

                 cur = ggml_add(ctx0,
                     ggml_mul(ctx0,
@@ -533,11 +540,17 @@ bool stablelm_eval(
                     cur);
         }

-        // layer input + FF
-        cur = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpL);
+        if (file_format == FileFormat::NEOX_3)
+        {
+            // layer input + FF
+            inpL = ggml_add(ctx0, cur, inpFF);
+        }
+        else
+        {
+            cur = ggml_add(ctx0, cur, inpFF);
+            // input for next layer
+            inpL = ggml_add(ctx0, cur, inpL);
+        }
     }

     // norm
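Note on what the NEOX_3 branches above encode: GPT-NeoX-style checkpoints with use_parallel_residual=True (the existing NEOX_1/NEOX_2 path) feed the same layer input to both the attention and feed-forward branches and sum all three terms, while RedPajama-style checkpoints (use_parallel_residual=False, the new NEOX_3 path) feed the attention result into the feed-forward branch and chain the residual adds. A minimal sketch of the two wirings, using plain float vectors in place of ggml tensors; ln1, ln2, attn and ffn are placeholder callables for illustration, not functions from this codebase:

#include <functional>
#include <vector>

using Vec = std::vector<float>;
using Fn  = std::function<Vec(const Vec &)>;

// Element-wise sum of two equally sized vectors (stand-in for ggml_add).
static Vec add(const Vec &a, const Vec &b) {
    Vec out(a.size());
    for (size_t i = 0; i < a.size(); ++i) out[i] = a[i] + b[i];
    return out;
}

// use_parallel_residual = true (NEOX_1 / NEOX_2):
// both branches read the same layer input x, and their outputs are summed with it.
static Vec block_parallel(const Vec &x, Fn ln1, Fn ln2, Fn attn, Fn ffn) {
    Vec a = attn(ln1(x));
    Vec f = ffn(ln2(x));          // post-attention norm still sees x (inpL), not x + a
    return add(add(x, a), f);     // x + attn(ln1(x)) + ffn(ln2(x))
}

// use_parallel_residual = false (NEOX_3 / RedPajama):
// the feed-forward branch sees the attention result, so the residual adds are chained.
static Vec block_sequential(const Vec &x, Fn ln1, Fn ln2, Fn attn, Fn ffn) {
    Vec h = add(x, attn(ln1(x))); // "layer input + Attn"
    return add(h, ffn(ln2(h)));   // "(layer input + Attn) + FF"
}

block_sequential mirrors the NEOX_3 branches added to stablelm_eval (add inpL to the attention output, norm over cur instead of inpL, then chain inpL = cur + inpFF), and block_parallel matches the existing NEOX_1/NEOX_2 path. The detection hack in gpttype_load_model leans on exactly this difference: it re-runs the eval on the tokens of "1 2 3 4 5 6 7" with the NEOX_3 wiring and only retries as NEOX_3 if the top predicted token contains "8".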