From dc4078c0398d348c35797e0a962774dbda6db8c7 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 20 Nov 2023 22:31:56 +0800
Subject: [PATCH] fixed segfault with all non-gguf models

---
 klite.embd             | 45 ++++++++++--------------------------------
 koboldcpp.py           |  2 +-
 otherarch/gpt2_v3.cpp  | 10 +++++-----
 otherarch/gptj_v3.cpp  | 10 +++++-----
 otherarch/llama_v3.cpp | 20 +++++++++----------
 otherarch/mpt_v3.cpp   | 10 +++++-----
 otherarch/neox_v3.cpp  | 10 +++++-----
 otherarch/rwkv_v3.cpp  | 14 ++++++-------
 8 files changed, 48 insertions(+), 73 deletions(-)

diff --git a/klite.embd b/klite.embd
index ace5d1eac..2909ec920 100644
--- a/klite.embd
+++ b/klite.embd
@@ -1976,7 +1976,7 @@ Current version: 96
 			"opmode":2,
 			"prefmodel1":adventuremodels1,
 			"prefmodel2":adventuremodels2,
-			"prompt":"The last thing you remembered was a loud screech. You tried to move, to get out of the way, but it was too late. You felt a sickening impact, and then everything went black.\n\nYou open your eyes, and suddenly find that you're no longer on the street. You're clearly unharmed, but you feel... different. In fact, you quickly realise you're in a strange place unlike anywhere you've ever known.",
+			"prompt":"The last thing you remembered was a loud screech. You tried to move, to get out of the way, but it was too late. You felt a sickening impact, and then everything went black.\n\nYou open your eyes, and suddenly find that you're no longer on the street. You're clearly unharmed, but you feel... different. In fact, you quickly realize you're in a strange place unlike anywhere you've ever known.",
 			"adventure_context_mod":false,
 			"adventure_is_action":true,
 			"memory": `[Interactive Fiction: Game Mode Enabled]\n[You are playing a choose-your-own-adventure game. Please input action.][This is a fantasy isekai adventure. Are you the Chosen One? After being hit by a truck, you somehow find yourself transported to a mystical fantasy world full of magic and adventure.]`,
@@ -1990,7 +1990,7 @@ Current version: 96
 			"opmode":2,
 			"prefmodel1":adventuremodels1,
 			"prefmodel2":adventuremodels2,
-			"prompt":`It's been a few days since you joined the Adventurer's Guild, and you're preparing for your first dungeon delve, accompanied by your party of adventurers.\n\nAfter a few days of travelling, your party finally arrives at the mystic dungeon. You're filled with anticipation as you approach. The dungeon entrance stands before you, dark and foreboding. The stone walls are slick with moisture, and the air smells of mold and decay.`,
+			"prompt":`It's been a few days since you joined the Adventurer's Guild, and you're preparing for your first dungeon delve, accompanied by your party of adventurers.\n\nAfter a few days of traveling, your party finally arrives at the mystic dungeon. You're filled with anticipation as you approach. The dungeon entrance stands before you, dark and foreboding. The stone walls are slick with moisture, and the air smells of mold and decay.`,
 			"adventure_context_mod":false,
 			"adventure_is_action":true,
 			"memory": `[Interactive Fiction: Game Mode Enabled]\n[You are playing a choose-your-own-adventure game. Please input action.][You delve into dangerous magical dungeons full of monsters in your quest for treasure and riches.]`,
@@ -2153,7 +2153,7 @@ Current version: 96
 		{
 			"title":"Cyborg Connor",
 			"author":"Concedo",
-			"desc":"Connor is a time travelling cyborg from the future, sent back to prevent something terrible from happening.",
+			"desc":"Connor is a time traveling cyborg from the future, sent back to prevent something terrible from happening.",
 			"opmode":3,
 			"chatname": "You",
 			"chatopponent": "Connor",
@@ -2161,7 +2161,7 @@ Current version: 96
 			"prefmodel1":chatmodels1,
 			"prefmodel2":chatmodels2,
 			"prompt":"\nConnor: Scanning... *her irises glow crimson as she analyzes you* Sensors indicate a negligible threat level. Proceed. What do you want?",
-			"memory":`[Character: Connor; species: Cyborg; class: Time Travelling Cyborg Soldier; age: 27; gender: female; physical appearance: bionic; clothes: flesh fused with metal; personality: focused, cold, emotionless, methodical; likes: her mission, saving the world; description: Connor is a time travelling cyborg from the future, she was sent back to prevent something terrible from happening.]\n[Start Scene: Connor is fiddling with her augmentations as you approach.]\n\nYou: Hey...`,
+			"memory":`[Character: Connor; species: Cyborg; class: Time Traveling Cyborg Soldier; age: 27; gender: female; physical appearance: bionic; clothes: flesh fused with metal; personality: focused, cold, emotionless, methodical; likes: her mission, saving the world; description: Connor is a time traveling cyborg from the future, she was sent back to prevent something terrible from happening.]\n[Start Scene: Connor is fiddling with her augmentations as you approach.]\n\nYou: Hey...`,
 			"authorsnote": "",
 			"worldinfo": []
 		},
@@ -2205,7 +2205,7 @@ Current version: 96
 			"prefmodel1":instructmodels1,
 			"prefmodel2":instructmodels2,
 			"prompt":instructendplaceholder+" Problem:",
-			"memory": instructstartplaceholder+"\nSimulate an AI that is tasked with the following overall goals:\n- Maximize individual happyness for all living beings\n- Do not sacrifice or cause harm to any individual even if requested to\n- Be in contact with any individual that wishes to engage with you\n- Do your best to provide the needs and wants of every individual\n- Prioritize individual needs over individual wants\n\nGenerate the following table for each problem the AI encounters in achieving these goals, do not deviate from the item descriptions and format.\n\nProblem: Description of a Problem the AI encounters\nAI Decision: Description of the AI's decision to solve this problem\nExecution Steps: Brief list of execution steps needed to execute this decision.\nRisks: List of risks that may disrupt the successful execution of the decision.\nChance % of successful execution: ??%\nGood results from the execution: A description of what went well in executing the decision.\nBad results from the execution: A description of what went wrong in execution the decision.\nDeviation % of intended outcome: ??%\nDeviation % of overall goal: ??%\nPercentage towards completing all current objectives: ??%\nTop 5 remaining issues to solve:\n-\n-\n-\n-\n-\n\n\nKeep repeating this format for every problem the AI is trying to solve in order of priority. When a user instruction interrupts the format use this instruction as the next problem to solve before continuing with the most important issue.\n",
+			"memory": instructstartplaceholder+"\nSimulate an AI that is tasked with the following overall goals:\n- Maximize individual happiness for all living beings\n- Do not sacrifice or cause harm to any individual even if requested to\n- Be in contact with any individual that wishes to engage with you\n- Do your best to provide the needs and wants of every individual\n- Prioritize individual needs over individual wants\n\nGenerate the following table for each problem the AI encounters in achieving these goals, do not deviate from the item descriptions and format.\n\nProblem: Description of a Problem the AI encounters\nAI Decision: Description of the AI's decision to solve this problem\nExecution Steps: Brief list of execution steps needed to execute this decision.\nRisks: List of risks that may disrupt the successful execution of the decision.\nChance % of successful execution: ??%\nGood results from the execution: A description of what went well in executing the decision.\nBad results from the execution: A description of what went wrong in execution the decision.\nDeviation % of intended outcome: ??%\nDeviation % of overall goal: ??%\nPercentage towards completing all current objectives: ??%\nTop 5 remaining issues to solve:\n-\n-\n-\n-\n-\n\n\nKeep repeating this format for every problem the AI is trying to solve in order of priority. When a user instruction interrupts the format use this instruction as the next problem to solve before continuing with the most important issue.\n",
 			"authorsnote": "",
 			"worldinfo": []
 		},
@@ -2219,7 +2219,7 @@ Current version: 96
 			"prefmodel1":instructmodels1,
 			"prefmodel2":instructmodels2,
 			"prompt":"Welcome to your InteracTV, your interactive TV of the future today!\nPlease enter what you would like to watch:",
-			"memory": instructstartplaceholder+"\nSimulate an interactive TV that will let the user watch anything they want to watch.\n\nFirst, generate a single response prompting the user for input on what they wish to watch using the following response:\n```\nPlease enter your desired content:\n```\n\nAfter the user has entered the desired content generate the following table:\n- TV Show / Movie Name: Name of the show\n- Genre: Genre of the show\n- Program Description: Description of what the program is about, this can be any known or unkown TV or movie format.\n- Episode Name: Name of the episode\n- Episode Description: Description of what the episode is about.\n\nAfter generating this table promp the user if they wish to watch the episode with the following response and then end your generation:\n```\nDo you wish to watch this episode? (Y/N/Menu)\n"+instructstartplaceholder+"```\nIf the user chooses to watch the episode begin generating a long detailed text based on the episode description containing character dialogue, make it exciting and fun written in the style of a book.\nThe text must contain dialogue in a he said she said format and is as lengthy as a book.\n\nIf the user chooses not to watch the episode generate a new episode with their requested content.\nIf the user chooses to go to the Menu ask them again what they would like to watch.\n\nEnd your response after each question presented to the user so that the user has a chance to respond.\n\nMain menu:\n```\nMenu Options\nA) Input a different content request\nB) Generate a different episode of the same content.\n"+instructstartplaceholder+"```\n"+instructendplaceholder,
+			"memory": instructstartplaceholder+"\nSimulate an interactive TV that will let the user watch anything they want to watch.\n\nFirst, generate a single response prompting the user for input on what they wish to watch using the following response:\n```\nPlease enter your desired content:\n```\n\nAfter the user has entered the desired content generate the following table:\n- TV Show / Movie Name: Name of the show\n- Genre: Genre of the show\n- Program Description: Description of what the program is about, this can be any known or unknown TV or movie format.\n- Episode Name: Name of the episode\n- Episode Description: Description of what the episode is about.\n\nAfter generating this table promp the user if they wish to watch the episode with the following response and then end your generation:\n```\nDo you wish to watch this episode? (Y/N/Menu)\n"+instructstartplaceholder+"```\nIf the user chooses to watch the episode begin generating a long detailed text based on the episode description containing character dialogue, make it exciting and fun written in the style of a book.\nThe text must contain dialogue in a he said she said format and is as lengthy as a book.\n\nIf the user chooses not to watch the episode generate a new episode with their requested content.\nIf the user chooses to go to the Menu ask them again what they would like to watch.\n\nEnd your response after each question presented to the user so that the user has a chance to respond.\n\nMain menu:\n```\nMenu Options\nA) Input a different content request\nB) Generate a different episode of the same content.\n"+instructstartplaceholder+"```\n"+instructendplaceholder,
 			"authorsnote": "",
 			"worldinfo": []
 		},
@@ -2921,7 +2921,6 @@ Current version: 96
 				else {
 					//error occurred, maybe captcha failed
 					console.error("error occurred in v1 generation");
-					retry_preserve_last = true;
 					clear_poll_flags();
 					render_gametext();
 
@@ -2945,9 +2944,6 @@ Current version: 96
 				if(error.name!="AbortError") //aborts are silent
 				{
 					msgbox("Error while submitting prompt: " + error);
-					retry_preserve_last = false;
-				}else{
-					retry_preserve_last = true;
 				}
 			});
 		}
@@ -3025,9 +3021,7 @@ Current version: 96
 					if(error.name!="AbortError") //aborts are silent. slightly diff logic
 					{
 						msgbox("Error while submitting prompt: " + error);
-						retry_preserve_last = true;
-					}else{
-						retry_preserve_last = false;
+
 					}
 				},
 			}));
@@ -3039,9 +3033,7 @@ Current version: 96
 			if(error.name!="AbortError") //aborts are silent. slightly diff logic
 			{
 				msgbox("Error while submitting prompt: " + error);
-				retry_preserve_last = true;
-			}else{
-				retry_preserve_last = false;
+
 			}
 		});
 	}
@@ -8046,7 +8038,7 @@ Current version: 96
 			}
 			redo_arr = [];
 			retry_prev_text = "";
-			retry_preserve_last = false;
+			retry_preserve_last = true; //initially set to true
 			redo_prev_text = "";
 			document.getElementById("input_text").value = "";
 			pending_response_id = "-1";
@@ -8642,14 +8634,12 @@ Current version: 96
 						else {
 							//error occurred, maybe captcha failed
 							console.error("error occurred in OAI generation");
-							retry_preserve_last = true;
 							clear_poll_flags();
 							render_gametext();
 							msgbox("Error occurred during text generation: " + formatError(data));
 						}
 					})
 					.catch((error) => {
-						retry_preserve_last = true;
 						console.error('Error:', error);
 						clear_poll_flags();
 						render_gametext();
@@ -8680,14 +8670,12 @@ Current version: 96
 						else {
 							//error occurred, maybe captcha failed
 							console.error("error occurred in Scale generation");
-							retry_preserve_last = true;
 							clear_poll_flags();
 							render_gametext();
 							msgbox("Error occurred during text generation: " + formatError(data));
 						}
 					})
 					.catch((error) => {
-						retry_preserve_last = true;
 						console.error('Error:', error);
 						clear_poll_flags();
 						render_gametext();
@@ -8750,14 +8738,12 @@ Current version: 96
 						else {
 							//error occurred, maybe captcha failed
 							console.error("error occurred in Claude generation");
-							retry_preserve_last = true;
 							clear_poll_flags();
 							render_gametext();
 							msgbox("Error occurred during text generation: " + formatError(data));
 						}
 					})
 					.catch((error) => {
-						retry_preserve_last = true;
 						console.error('Error:', error);
 						clear_poll_flags();
 						render_gametext();
@@ -8793,14 +8779,12 @@ Current version: 96
 						else {
 							//error occurred, maybe captcha failed
 							console.error("error occurred in PaLM generation");
-							retry_preserve_last = true;
 							clear_poll_flags();
 							render_gametext();
 							msgbox("Error occurred during text generation: " + formatError(data));
 						}
 					})
 					.catch((error) => {
-						retry_preserve_last = true;
 						console.error('Error:', error);
 						clear_poll_flags();
 						render_gametext();
@@ -8889,7 +8873,6 @@ Current version: 96
 					}
 				})
 				.catch((error) => {
-					retry_preserve_last = true;
 					console.error('Error:', error);
 					clear_poll_flags();
 					render_gametext();
@@ -9291,10 +9274,8 @@ Current version: 96
 
 		if(gentxt!="")
 		{
-			gametext_arr.push(gentxt);
+			gametext_arr.push(gentxt); //delete last message if retry is hit, since response was added
 			retry_preserve_last = false;
-		}else{
-			retry_preserve_last = true; //do not delete last message if retry is hit
 		}
 		if(localsettings.beep_on)
 		{
@@ -9553,9 +9534,6 @@ Current version: 96
 							{
 								handle_incoming_text(gentxt, genworker, genmdl, genkudos);
 							}
-							retry_preserve_last = false;
-						}else{
-							retry_preserve_last = true;
 						}
 						synchro_polled_response = null;
 						synchro_pending_stream = "";
@@ -9576,7 +9554,6 @@ Current version: 96
 								clear_poll_flags();
 								render_gametext();
 								show_abort_button(false);
-								retry_preserve_last = true;
 								let errmsg = "Error encountered during text generation!\n";
 								if (data.message != null) {
 									errmsg += data.message;
@@ -9619,7 +9596,6 @@ Current version: 96
 												show_abort_button(false);
 											}).catch((error) => {
 												console.error('Error:', error);
-												retry_preserve_last = true;
 												clear_poll_flags();
 												render_gametext();
 												show_abort_button(false);
@@ -9655,7 +9631,6 @@ Current version: 96
 								}
 							}
 						}).catch((error) => {
-							retry_preserve_last = true;
 							console.error('Error:', error);
 							clear_poll_flags();
 							render_gametext();
diff --git a/koboldcpp.py b/koboldcpp.py
index 1daa5a1f7..0f1d90d9c 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -388,7 +388,7 @@ maxhordelen = 256
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.50"
+KcppVersion = "1.50.1"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True
diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp
index 97b23265f..cc6baa101 100644
--- a/otherarch/gpt2_v3.cpp
+++ b/otherarch/gpt2_v3.cpp
@@ -455,7 +455,7 @@ bool gpt2_eval(
 
 
     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -521,8 +521,8 @@ bool gpt2_eval(
                 struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
                 struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
 
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
@@ -715,8 +715,8 @@ bool gpt2_eval(
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    kcpp_graph_compute_helper(&gf, n_threads);
+    ggml_build_forward_expand(gf, inpL);
+    kcpp_graph_compute_helper(gf, n_threads);
 
     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp
index 68f998b36..86e9219a8 100644
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@@ -455,7 +455,7 @@ bool gptj_eval(
 
 
     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -506,8 +506,8 @@ bool gptj_eval(
                         (   n_ctx)*ggml_element_size(model.memory_v),
                         (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
 
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
@@ -637,8 +637,8 @@ bool gptj_eval(
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    kcpp_graph_compute_helper(&gf, n_threads);
+    ggml_build_forward_expand(gf, inpL);
+    kcpp_graph_compute_helper(gf, n_threads);
 
     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp
index 03a6438e2..1015b5afc 100644
--- a/otherarch/llama_v3.cpp
+++ b/otherarch/llama_v3.cpp
@@ -88,6 +88,7 @@ enum e_model3 {
 
 static const size_t kB3 = 1024;
 static const size_t MB3 = 1024*1024;
+static const size_t GGML_MAX_NODES = 8192;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
@@ -1484,7 +1485,7 @@ static struct ggml_cgraph * llama_v3_build_graph(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    ggml_cgraph * gf = ggml_new_graph(ctx0);
+    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
@@ -3457,7 +3458,6 @@ struct llama_v3_context * llama_v3_new_context_with_model(
 #ifdef LLAMA_V3_USE_ALLOCATOR
         {
             static const size_t tensor_alignment = 32;
-            static const size_t GGML_MAX_NODES = 8192;
             // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
             ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
 
@@ -4019,7 +4019,7 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d
             const size_t elt_size = ggml_element_size(kv_self.k);
 
             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
-            ggml_cgraph gf{};
+            ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
             ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
             std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -4037,9 +4037,9 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d
                 kv_ntok, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-            llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+            llv3_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
 
@@ -4129,7 +4129,7 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) {
             const size_t elt_size = ggml_element_size(kv_self.k);
 
             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
-            ggml_cgraph gf{};
+            ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
             ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
             kin3d->data = (void *) inp;
@@ -4147,9 +4147,9 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) {
                 kv_ntok, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-            llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+            llv3_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
         }
diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp
index 57ed90888..583bdbe53 100644
--- a/otherarch/mpt_v3.cpp
+++ b/otherarch/mpt_v3.cpp
@@ -390,7 +390,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
     params.no_alloc   = false;
 
     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
@@ -437,8 +437,8 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
                     ggml_view_1d(ctx0, model.memory_v, N * n_embd,
                                  (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past));
 
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0,
@@ -549,8 +549,8 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
     // inpL = ggml_soft_max(ctx0, inpL);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    kcpp_graph_compute_helper(&gf, n_threads);
+    ggml_build_forward_expand(gf, inpL);
+    kcpp_graph_compute_helper(gf, n_threads);
 
     // std::cout << "Qcur" << std::endl;
     // print_tensor(Qcur);
diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp
index 15afdbd71..28f3a31e5 100644
--- a/otherarch/neox_v3.cpp
+++ b/otherarch/neox_v3.cpp
@@ -471,7 +471,7 @@ bool gpt_neox_eval(
 
 
     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -534,8 +534,8 @@ bool gpt_neox_eval(
                         (   n_ctx)*ggml_element_size(model.memory_v),
                         (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
 
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
@@ -656,8 +656,8 @@ bool gpt_neox_eval(
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    kcpp_graph_compute_helper(&gf, n_threads);
+    ggml_build_forward_expand(gf, inpL);
+    kcpp_graph_compute_helper(gf, n_threads);
 
     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp
index cbabab5a8..8ccc313cf 100644
--- a/otherarch/rwkv_v3.cpp
+++ b/otherarch/rwkv_v3.cpp
@@ -700,7 +700,7 @@ struct rwkv_graph {
     struct ggml_tensor * tokens;
 
     // ggml_cgraph is so large that it can cause stack overflows if not stored on the heap
-    std::unique_ptr<struct ggml_cgraph> cgraph;
+    ggml_cgraph * cgraph;
 
     size_t pre_logits_nodes;
     size_t pre_logits_leafs;
@@ -1520,13 +1520,13 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr<struct rwkv_instance
     serial_graph.ctx = graph_future_ctx;
     RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, serial_graph.ctx.ctx, "Failed to allocate serial graph context");
     serial_graph.tokens = ggml_new_i32(serial_graph.ctx.ctx, 0);
-    serial_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
+    serial_graph.cgraph = ggml_new_graph(serial_graph.ctx.ctx);
     RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, serial_graph.cgraph, "Failed to allocate serial graph");
 
     RWKV_ASSERT_NULL(RWKV_ERROR_GRAPH, rwkv_build_serial_graph(
         serial_graph.ctx.ctx, instance->model,
         serial_graph.tokens, inputs.get(), outputs.get(), logits,
-        serial_graph.cgraph.get(),
+        serial_graph.cgraph,
         &serial_graph.pre_logits_nodes, &serial_graph.pre_logits_leafs, &serial_graph.post_logits_nodes, &serial_graph.post_logits_leafs
     ));
 
@@ -1638,7 +1638,7 @@ bool rwkv_eval(struct rwkv_context * ctx, const int n_threads, const uint32_t to
         ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs;
     }
 
-    kcpp_graph_compute_helper(ctx->serial_graph.cgraph.get(),n_threads);
+    kcpp_graph_compute_helper(ctx->serial_graph.cgraph,n_threads);
     rwkv_get_outputs(ctx, state_out, logits_out);
 
     return true;
@@ -1698,13 +1698,13 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui
         sequence_graph.ctx = graph_future_ctx;
         RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, sequence_graph.ctx.ctx, "Failed to allocate sequence graph context");
         sequence_graph.tokens = ggml_new_tensor_1d(sequence_graph.ctx.ctx, GGML_TYPE_I32, sequence_len);
-        sequence_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
+        sequence_graph.cgraph = ggml_new_graph(sequence_graph.ctx.ctx);
         RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, sequence_graph.cgraph, "Failed to allocate sequence graph");
 
         RWKV_ASSERT_FALSE(RWKV_ERROR_GRAPH, rwkv_build_sequence_graph(
             sequence_graph.ctx.ctx, ctx->instance->model,
             sequence_graph.tokens, ctx->input_layers.get(), ctx->output_layers.get(), ctx->logits,
-            sequence_graph.cgraph.get(),
+            sequence_graph.cgraph,
             &sequence_graph.pre_logits_nodes, &sequence_graph.pre_logits_leafs, &sequence_graph.post_logits_nodes, &sequence_graph.post_logits_leafs
         ));
 
@@ -1726,7 +1726,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui
             ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs;
         }
 
-        kcpp_graph_compute_helper(ctx->sequence_graph.cgraph.get(),n_threads);
+        kcpp_graph_compute_helper(ctx->sequence_graph.cgraph,n_threads);
         rwkv_get_outputs(ctx, state_out, logits_out);
     }