From fb3bcac3685c30e76d4cff623dc475430bb79f99 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Tue, 7 Nov 2023 17:15:14 +0800 Subject: [PATCH] handle memory separately for kcpp --- expose.h | 5 ++-- gpttype_adapter.cpp | 46 +++++++++++++++++++++++++++++++ klite.embd | 67 +++++++++++++++++++++++++++++++++------------ koboldcpp.py | 9 ++++-- 4 files changed, 105 insertions(+), 22 deletions(-) diff --git a/expose.h b/expose.h index dee52ec6e..e3c069a66 100644 --- a/expose.h +++ b/expose.h @@ -53,7 +53,8 @@ struct load_model_inputs struct generation_inputs { const int seed; - const char *prompt; + const char * prompt; + const char * memory; const int max_context_length; const int max_length; const float temperature; @@ -79,7 +80,7 @@ struct generation_inputs struct generation_outputs { int status = -1; - char text[24576]; //24kb should be enough for any response + char text[32768]; //32kb should be enough for any response }; extern std::string executable_path; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index d7ee627cd..c3d41e3ed 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -1388,6 +1388,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o stop_sequence.push_back(stopper); } } + std::string addedmemory = inputs.memory; params.prompt = inputs.prompt; params.seed = inputs.seed; params.n_predict = inputs.max_length; @@ -1442,7 +1443,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o // tokenize the prompt std::vector<int> embd_inp; + std::vector<int> embd_inp_mem; //for storing added memory TokenizeString(params.prompt, embd_inp, file_format); + if(addedmemory!="") + { + TokenizeString(addedmemory, embd_inp_mem, file_format); + } //truncate to front of the prompt if its too long int32_t nctx = params.n_ctx; @@ -1461,6 +1467,46 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } } + //added special memory, 
overwrite if needed + if(addedmemory!="") + { + //remove bos token from prompt, it'll be taken from memory + std::vector<int> bos; + TokenizeString("", bos, file_format); + if (bos.size()>0 && !embd_inp.empty() && bos[0]==embd_inp[0]) { + embd_inp.erase(embd_inp.begin()); + } + + //shorten memory if needed + if (embd_inp_mem.size() + params.n_predict + 4 > nctx) + { + int offset = embd_inp_mem.size() - nctx + params.n_predict + 4; + embd_inp_mem = std::vector<int>(embd_inp_mem.begin() + offset, embd_inp_mem.end()); + //replace bos into front if exists + if(bos.size()>0 && embd_inp_mem.size()>0) + { + embd_inp_mem[0] = bos[0]; + } + } + + //shorten main prompt by trimming the front if needed + int addmemtokens = embd_inp_mem.size(); + int totalsize = (addmemtokens + embd_inp.size() + params.n_predict); + if(totalsize > nctx) + { + int excess = totalsize - nctx; + if (embd_inp.size() >= excess) { + embd_inp.erase(embd_inp.begin(), embd_inp.begin() + excess); + } else { + embd_inp.clear(); + } + } + + //stick memory to front of prompt + embd_inp.insert(embd_inp.begin(), embd_inp_mem.begin(), embd_inp_mem.end()); + + } + //determine how much npast we have to rewind from the current state std::vector<int> embd; diff --git a/klite.embd b/klite.embd index d6ddb56a0..0d0b9e468 100644 --- a/klite.embd +++ b/klite.embd @@ -6,7 +6,7 @@ It requires no dependencies, installation or setup. Just copy this single static HTML file anywhere and open it in a browser, or from a webserver. Please go to https://github.com/LostRuins/lite.koboldai.net for updates on Kobold Lite. Kobold Lite is under the AGPL v3.0 License unless otherwise exempted. Please do not remove this line. 
-Current version: 92 +Current version: 93 -Concedo --> @@ -4009,6 +4009,10 @@ Current version: 92 { return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.43") > 0); } + function is_using_kcpp_with_added_memory() + { + return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.48.2") > 0); + } //0 is none, 1 is pseudostreaming, 2 is true poll-streaming, 3 is sse-streaming function determine_streaming_type() @@ -7354,8 +7358,8 @@ Current version: 92 let max_allowed_characters = Math.floor(localsettings.max_context_length * 3.0)-100; let truncated_context = concat_gametext(true, ""); - let max_mem_anote_len = Math.floor(max_allowed_characters*0.9); - let truncated_memory = current_memory.substring(current_memory.length - max_mem_anote_len); + let max_mem_len = Math.floor(max_allowed_characters*0.8); + let truncated_memory = current_memory.substring(current_memory.length - max_mem_len); if (truncated_memory != null && truncated_memory != "") { truncated_memory += "\n"; } @@ -7933,6 +7937,10 @@ Current version: 92 //if there is no memory, then we can be a lot of lenient with the character counts since the backend will truncate excess anyway chars_per_token = 4.8; } + if(is_using_kcpp_with_added_memory()) //easily handle overflow + { + chars_per_token = 6; + } let max_allowed_characters = Math.max(1, Math.floor((maxctxlen-maxgenamt) * chars_per_token) - 12); //for adventure mode, inject hidden context, even more if there's nothing in memory @@ -8059,9 +8067,10 @@ Current version: 92 } //we clip the memory if its too long, taking the last x chars (not the first) - //memory or anote is allowed to be up to 0.9 times of ctx allowance - let max_mem_anote_len = Math.floor(max_allowed_characters*0.9); - let truncated_memory = substring_to_boundary(current_memory, max_mem_anote_len); + //memory is allowed to be up to 0.8 times of ctx 
allowance, anote up to 0.6 times + let max_mem_len = Math.floor(max_allowed_characters*0.8); + let max_anote_len = Math.floor(max_allowed_characters*0.6); + let truncated_memory = substring_to_boundary(current_memory, max_mem_len); if (truncated_memory != null && truncated_memory != "") { if(newlineaftermemory) { @@ -8129,23 +8138,29 @@ Current version: 92 //we clip the authors note if its too long let truncated_anote = current_anotetemplate.replace("<|>", current_anote); - truncated_anote = substring_to_boundary(truncated_anote, max_mem_anote_len); + truncated_anote = substring_to_boundary(truncated_anote, max_anote_len); if (current_anote.length == 0) { //if there's no authors note at all, don't include the template truncated_anote = ""; } + //now we resize the context such that the memory and authors note can fit inside + truncated_context = substring_to_boundary(truncated_context, max_allowed_characters); + //append memory to the start of the context, clipping excess space if needed //only do this processing if memory or anote is not blank - if (truncated_memory.length > 0 || current_anote.length > 0) { - //now we resize the context such that the memory and authors note can fit inside - truncated_context = substring_to_boundary(truncated_context, max_allowed_characters); - let augmented_len = truncated_memory.length + truncated_context.length + truncated_anote.length; - let excess_len = augmented_len - max_allowed_characters; //if > 0, then we exceeded context window - excess_len = excess_len < 0 ? 0 : excess_len; - let newlimit = (max_allowed_characters-excess_len) < 32 ? 
32 : (max_allowed_characters-excess_len); - truncated_context = substring_to_boundary(truncated_context, newlimit); //must always have at least 32 chars from main context + if (truncated_memory.length > 0 || current_anote.length > 0) + { + if(!is_using_kcpp_with_added_memory()) + { + let augmented_len = truncated_memory.length + truncated_context.length + truncated_anote.length; + let excess_len = augmented_len - max_allowed_characters; //if > 0, then we exceeded context window + excess_len = excess_len < 0 ? 0 : excess_len; + let newlimit = (max_allowed_characters-excess_len) < 32 ? 32 : (max_allowed_characters-excess_len); + truncated_context = substring_to_boundary(truncated_context, newlimit); //must always have at least 32 chars from main context + } + //insert authors note 80 tokens before the ending (320 characters). let anote_dist = anote_strength; let anote_insert_idx = truncated_context.length - anote_dist; @@ -8164,11 +8179,24 @@ Current version: 92 } anote_insert_idx = clamp(anote_insert_idx, 0, truncated_context.length); truncated_context = truncated_context.slice(0, anote_insert_idx) + truncated_anote + truncated_context.slice(anote_insert_idx); - truncated_context = truncated_memory + truncated_context; + if(!is_using_kcpp_with_added_memory()) + { + truncated_context = truncated_memory + truncated_context; + } } + + truncated_memory = replace_placeholders(truncated_memory); truncated_context = replace_placeholders(truncated_context); - last_token_budget = truncated_context.length + "/" + max_allowed_characters; + if(is_using_kcpp_with_added_memory()) + { + last_token_budget = (truncated_memory.length + truncated_context.length) + "/" + max_allowed_characters; + } + else + { + last_token_budget = truncated_context.length + "/" + max_allowed_characters; + } + let submit_payload = { "prompt": truncated_context, @@ -8190,6 +8218,11 @@ Current version: 92 "models": selected_models.map((m) => { return m.name }), }; + if(is_using_kcpp_with_added_memory()) + 
{ + submit_payload.params.memory = truncated_memory; + } + if(localsettings.sampler_seed>=1) { submit_payload.params.sampler_seed = localsettings.sampler_seed; diff --git a/koboldcpp.py b/koboldcpp.py index f0f01fb9c..98c549f62 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -49,6 +49,7 @@ class load_model_inputs(ctypes.Structure): class generation_inputs(ctypes.Structure): _fields_ = [("seed", ctypes.c_int), ("prompt", ctypes.c_char_p), + ("memory", ctypes.c_char_p), ("max_context_length", ctypes.c_int), ("max_length", ctypes.c_int), ("temperature", ctypes.c_float), @@ -73,7 +74,7 @@ class generation_inputs(ctypes.Structure): class generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), - ("text", ctypes.c_char * 24576)] + ("text", ctypes.c_char * 32768)] handle = None @@ -297,11 +298,12 @@ def load_model(model_filename): ret = handle.load_model(inputs) return ret -def generate(prompt,max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey=''): +def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey=''): global maxctx, args, currentusergenkey, totalgens inputs = generation_inputs() outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs)) inputs.prompt = prompt.encode("UTF-8") + inputs.memory = memory.encode("UTF-8") if max_length >= max_context_length: max_length = max_context_length-1 
inputs.max_context_length = max_context_length # this will resize the context buffer if changed @@ -379,7 +381,7 @@ maxhordelen = 256 modelbusy = threading.Lock() requestsinqueue = 0 defaultport = 5001 -KcppVersion = "1.48.1" +KcppVersion = "1.49" showdebug = True showsamplerwarning = True showmaxctxwarning = True @@ -474,6 +476,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): return generate( prompt=genparams.get('prompt', ""), + memory=genparams.get('memory', ""), max_context_length=genparams.get('max_context_length', maxctx), max_length=genparams.get('max_length', 80), temperature=genparams.get('temperature', 0.7),