From d4c22a8b0211552a6bf1b92a7e99078f0ae01308 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Wed, 30 Aug 2023 16:50:55 +0800
Subject: [PATCH] updated lite, added autorope config based on trained ctxlen,
 hotfix for falcon gpu broken

---
 gpttype_adapter.cpp |  29 +-
 klite.embd          | 692 ++++++++++++++++++++++++++++++++++++++++----
 model_adapter.cpp   |   5 +
 model_adapter.h     |   2 +-
 4 files changed, 654 insertions(+), 74 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 83ee387d4..25aef2fea 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -328,7 +328,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
     {
         return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
     }
-    else if( file_format == FileFormat::GGUF_LLAMA)
+    else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
     {
         return std::string(llama_token_to_str(llama_ctx_v4, id));
     }
@@ -340,7 +340,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
 
 static void TokenizeString(const std::string & str_to_tokenize, std::vector<int> & output_tokens, FileFormat file_format)
 {
-    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
     {
         if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
         {
@@ -432,7 +432,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     else
     {
         //approximate NTK aware ctx
-        rope_freq_base = (params.n_ctx <= 3072 ? 26000.0f : (params.n_ctx <= 4096 ? 32000.0f : (params.n_ctx <= 6144 ? 54000.0f : (params.n_ctx <= 8192 ? 82684.0f : (params.n_ctx <= 12288 ? 140000.0f : 200000.0f)))));
+        auto effectivenctx = params.n_ctx;
+        if((file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) && llama_ctx_v4->model.hparams.n_ctx_train>2048)
+        {
+            float factor = llama_ctx_v4->model.hparams.n_ctx_train/2048;
+            effectivenctx = effectivenctx/factor;
+        }
+        rope_freq_base = (effectivenctx <= 3072 ? 26000.0f : (effectivenctx <= 4096 ? 32000.0f : (effectivenctx <= 6144 ? 54000.0f : (effectivenctx <= 8192 ? 82684.0f : (effectivenctx <= 12288 ? 140000.0f : 200000.0f)))));
 
     }
 
@@ -585,7 +591,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::GGUF_LLAMA)
+    else if(file_format==FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
     {
         llama_context_params llama_ctx_params = llama_context_default_params();
         llama_ctx_params.n_ctx = inputs.max_context_length;
@@ -598,6 +604,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_ctx_params.use_mmap = inputs.use_mmap;
         llama_ctx_params.use_mlock = inputs.use_mlock;
         llama_ctx_params.n_gpu_layers = inputs.gpulayers;
+        if(file_format==FileFormat::GGUF_FALCON && llama_ctx_params.n_gpu_layers>0)
+        {
+            printf("\nGPU layer offload for GGUF FALCON is known to have issues, it has been set to 0.\n");
+            llama_ctx_params.n_gpu_layers = 0;
+        }
         llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
         llama_ctx_params.rope_freq_base = rope_freq_base;
         llama_ctx_params.rope_freq_scale = rope_freq_scale;
@@ -1120,7 +1131,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         //for non llama, limit to 256
         int bbs = blasbatchsize;
-        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_LLAMA)
+        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
         {
             bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
         }
@@ -1180,7 +1191,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = llama_v3_n_vocab(llama_ctx_v3);
     }
-    else if(file_format == FileFormat::GGUF_LLAMA)
+    else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
     {
         n_vocab = llama_n_vocab(llama_ctx_v4);
     }
@@ -1331,7 +1342,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
         }
-        else if(file_format == FileFormat::GGUF_LLAMA)
+        else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
        {
            evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past, params.n_threads)==0);
        }
@@ -1439,9 +1450,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             unsigned int eosID = 0;
             float * logitsPtr;
             int btsize = banned_token_ids.size();
-            if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+            if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
             {
-                if(file_format == FileFormat::GGUF_LLAMA)
+                if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
                 {
                     logitsPtr = llama_get_logits(llama_ctx_v4);
                     eosID = llama_token_eos(llama_ctx_v4);
diff --git a/klite.embd b/klite.embd
index 050677387..21bd7bc8e 100644
--- a/klite.embd
+++ b/klite.embd
[The klite.embd hunks cannot be reconstructed from this excerpt: the embedded Kobold Lite HTML/JS markup was stripped during extraction. The surviving fragments cover hunks @@ -1,6 +1,6 @@, @@ -8240,8 +8616,6 @@, @@ -8320,6 +8694,7 @@, @@ -8855,6 +9230,10 @@ and @@ -9087,8 +9466,193 @@ (all under the "Kobold Lite is under the AGPL v3.0 License for the purposes of koboldcpp" hunk context), and show a new "Unban Tokens (KAI)" checkbox added to the settings panel next to the existing "Trim Whitespace" and "Persist Session" options. The hunks for model_adapter.cpp and model_adapter.h listed in the diffstat are also missing from this excerpt.]
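
The main behavioural change in gpttype_adapter.cpp above is the auto-rope heuristic: for GGUF LLaMA and Falcon models whose trained context (n_ctx_train) exceeds 2048, the requested context is divided by trained/2048 before the approximate NTK-aware rope_freq_base lookup, so models already trained for long contexts are not over-scaled. Below is a minimal standalone sketch of that selection logic; pick_rope_freq_base() is a hypothetical helper written only for illustration, not a function from this patch or from koboldcpp.

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the auto-rope heuristic above (illustration only).
// requested_ctx is the context length the user asked for; trained_ctx is the
// model's reported n_ctx_train.
static float pick_rope_freq_base(int requested_ctx, uint32_t trained_ctx)
{
    int effective_ctx = requested_ctx;
    if (trained_ctx > 2048)
    {
        // A model trained on a longer native context needs less NTK scaling for
        // the same requested context, so shrink the value used in the lookup.
        float factor = (float)trained_ctx / 2048.0f;
        effective_ctx = (int)(requested_ctx / factor);
    }
    // Same approximate NTK-aware thresholds as in gpttype_load_model.
    return (effective_ctx <= 3072 ? 26000.0f :
           (effective_ctx <= 4096 ? 32000.0f :
           (effective_ctx <= 6144 ? 54000.0f :
           (effective_ctx <= 8192 ? 82684.0f :
           (effective_ctx <= 12288 ? 140000.0f : 200000.0f)))));
}

int main()
{
    // A model trained at 4096 asked for an 8192 context is looked up as if it
    // were a 2048-trained model asked for 4096, giving a base of 32000.0.
    printf("%.1f\n", pick_rope_freq_base(8192, 4096));
    return 0;
}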