Updated docs; changed the extra KV-fragmentation space to be subtracted from the user's max context value at generation time instead of added on model load.
This commit is contained in:
parent
66ef4a20e2
commit
a012342a77
2 changed files with 26 additions and 4 deletions
|
@ -98,6 +98,8 @@ static std::mutex concat_output_mtx;
|
|||
static std::string concat_output = "";
|
||||
static std::string concat_output_reader_copy = "";
|
||||
|
||||
const size_t extra_context_handle_fragmentation = 80;
|
||||
|
||||
inline bool IsNanCheck(float f)
|
||||
{
|
||||
const unsigned int u = *(unsigned int*)&f;
|
||||
|
@ -883,7 +885,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
{
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
llama_context_params llama_ctx_params = llama_context_default_params();
|
||||
llama_ctx_params.n_ctx = clamped_max_context_length + 64; //add some extra context to deal with KV fragmentation
|
||||
llama_ctx_params.n_ctx = clamped_max_context_length;
|
||||
//llama_ctx_paran_parts = -1;
|
||||
llama_ctx_params.seed = -1;
|
||||
llama_ctx_params.f16_kv = inputs.f16_kv;
|
||||
|
@ -1421,6 +1423,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
stop_sequence.push_back(stopper);
|
||||
}
|
||||
}
|
||||
|
||||
std::string addedmemory = inputs.memory;
|
||||
params.prompt = inputs.prompt;
|
||||
params.seed = inputs.seed;
|
||||
|
@ -1442,6 +1445,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
params.n_threads_batch = n_blasthreads;
|
||||
bool stream_sse = inputs.stream_sse;
|
||||
|
||||
if(params.n_ctx >= 256 && useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON))
|
||||
{
|
||||
params.n_ctx -= extra_context_handle_fragmentation; //subtract a small buffer from the usable context to handle KV fragmentation
|
||||
if(debugmode==1)
|
||||
{
|
||||
printf("\nTrue max context permitted: %d\n",params.n_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
|
||||
|
||||
generation_finished = false; // Set current generation status
|
||||
|
|
|
@ -367,9 +367,19 @@
|
|||
"content": {
|
||||
"application/json": {
|
||||
"example": {
|
||||
"prompt": "Niko the kobold stalked carefully down the alley, his small scaly figure obscured by a dusky cloak that fluttered lightly in the cold winter breeze.",
|
||||
"temperature": 0.5,
|
||||
"top_p": 0.9
|
||||
"max_context_length": 2048,
|
||||
"max_length": 100,
|
||||
"prompt": "Niko the kobold stalked carefully down the alley, his small scaly figure obscured by a dusky cloak that fluttered lightly in the cold winter breeze.",
|
||||
"quiet": false,
|
||||
"rep_pen": 1.1,
|
||||
"rep_pen_range": 256,
|
||||
"rep_pen_slope": 1,
|
||||
"temperature": 0.5,
|
||||
"tfs": 1.0,
|
||||
"top_a": 0,
|
||||
"top_k": 100,
|
||||
"top_p": 0.9,
|
||||
"typical": 1.0
|
||||
},
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/GenerationInput"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue