From a012342a772435fedfcc008e6465512965a9393b Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 30 Nov 2023 14:19:40 +0800
Subject: [PATCH] updated docs, shifted kv extra space to be subtracted from
 user's ctx value instead of added on load.

---
 gpttype_adapter.cpp | 14 +++++++++++++-
 kcpp_docs.embd      | 16 +++++++++++++---
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index a2bbc8777..406a1e57e 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -98,6 +98,8 @@ static std::mutex concat_output_mtx;
 static std::string concat_output = "";
 static std::string concat_output_reader_copy = "";
 
+const size_t extra_context_handle_fragmentation = 80;
+
 inline bool IsNanCheck(float f)
 {
     const unsigned int u = *(unsigned int*)&f;
@@ -883,7 +885,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         llama_model_params model_params = llama_model_default_params();
         llama_context_params llama_ctx_params = llama_context_default_params();
-        llama_ctx_params.n_ctx = clamped_max_context_length + 64; //add some extra context to deal with KV fragmentation
+        llama_ctx_params.n_ctx = clamped_max_context_length;
         //llama_ctx_paran_parts = -1;
         llama_ctx_params.seed = -1;
         llama_ctx_params.f16_kv = inputs.f16_kv;
@@ -1421,6 +1423,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             stop_sequence.push_back(stopper);
         }
     }
+
     std::string addedmemory = inputs.memory;
     params.prompt = inputs.prompt;
     params.seed = inputs.seed;
@@ -1442,6 +1445,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     params.n_threads_batch = n_blasthreads;
     bool stream_sse = inputs.stream_sse;
 
+    if(params.n_ctx >= 256 && useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON))
+    {
+        params.n_ctx -= extra_context_handle_fragmentation; //reserve some additional buffer to handle KV fragmentation
+        if(debugmode==1)
+        {
+            printf("\nTrue max context permitted: %d\n",params.n_ctx);
+        }
+    }
+
     bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
 
     generation_finished = false; // Set current generation status
diff --git a/kcpp_docs.embd b/kcpp_docs.embd
index d34ce8cb4..517428e45 100644
--- a/kcpp_docs.embd
+++ b/kcpp_docs.embd
@@ -367,9 +367,19 @@
         "content": {
           "application/json": {
             "example": {
-              "prompt": "Niko the kobold stalked carefully down the alley, his small scaly figure obscured by a dusky cloak that fluttered lightly in the cold winter breeze.",
-              "temperature": 0.5,
-              "top_p": 0.9
+              "max_context_length": 2048,
+              "max_length": 100,
+              "prompt": "Niko the kobold stalked carefully down the alley, his small scaly figure obscured by a dusky cloak that fluttered lightly in the cold winter breeze.",
+              "quiet": false,
+              "rep_pen": 1.1,
+              "rep_pen_range": 256,
+              "rep_pen_slope": 1,
+              "temperature": 0.5,
+              "tfs": 1.0,
+              "top_a": 0,
+              "top_k": 100,
+              "top_p": 0.9,
+              "typical": 1.0
             },
             "schema": {
               "$ref": "#/components/schemas/GenerationInput"
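
Note on the behavior change, with a minimal standalone sketch. The helper name effective_generation_ctx and the simplified bool parameters below are illustrative assumptions, not koboldcpp's real types; only the 80-token constant and the 256-token threshold come from the patch itself. Before this change the fragmentation buffer was added on top of the user's requested context at load time (n_ctx + 64); after it, load allocates exactly what the user asked for, and generation carves the reserve out of that budget instead.

// Minimal sketch of the new budgeting logic (assumed, simplified model of
// the patch; not koboldcpp's actual API).
#include <cstdio>
#include <cstddef>

// Same constant the patch introduces in gpttype_adapter.cpp.
const std::size_t extra_context_handle_fragmentation = 80;

// Mirrors the new branch in gpttype_generate(): when context shifting is on
// and the model is a GGUF llama/falcon, subtract the reserve from the user's
// requested context instead of over-allocating at load time.
std::size_t effective_generation_ctx(std::size_t requested_ctx, bool use_context_shift, bool is_gguf_llama_or_falcon)
{
    if (requested_ctx >= 256 && use_context_shift && is_gguf_llama_or_falcon)
    {
        return requested_ctx - extra_context_handle_fragmentation;
    }
    return requested_ctx;
}

int main()
{
    // e.g. a user requesting 2048 tokens of context with context shifting on
    std::size_t budget = effective_generation_ctx(2048, true, true);
    printf("True max context permitted: %zu\n", budget); // prints 1968
    return 0;
}

The practical effect is that the KV cache allocation matches the user's setting exactly, at the cost of a slightly smaller usable window when context shifting is active, rather than silently allocating more than was requested.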