diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index d3d6d1e6c..67b8fe705 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -941,19 +941,20 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llamamodel->hparams.rope_freq_scale_train!=1.0f ||
         llamamodel->hparams.rope_scaling_type_train==2)
         {
-            // float ropemultiplier = 1.0f;
-            // if(llamamodel->hparams.rope_scaling_type_train!=2 &&
-            // llamamodel->hparams.n_ctx_train > 2048 && clamped_max_context_length > llamamodel->hparams.n_ctx_train)
-            // {
-            //     ropemultiplier = (float)llamamodel->hparams.n_ctx_train / (float)clamped_max_context_length;
-            //     llama_ctx_params.rope_freq_base = rope_freq_base = llamamodel->hparams.rope_freq_base_train;
-            //     llama_ctx_params.rope_freq_scale = rope_freq_scale = ropemultiplier * llamamodel->hparams.rope_freq_scale_train;
-            //     printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
-            // }
-            // else
-            // {
+            float ropemultiplier = 1.0f;
+            if(llamamodel->hparams.rope_scaling_type_train!=2 &&
+            llamamodel->hparams.n_ctx_train > 2048 && clamped_max_context_length > llamamodel->hparams.n_ctx_train &&
+            llamamodel->hparams.rope_freq_scale_train==1.0f)
+            {
+                ropemultiplier = (float)llamamodel->hparams.n_ctx_train / (float)clamped_max_context_length;
+                llama_ctx_params.rope_freq_base = rope_freq_base = llamamodel->hparams.rope_freq_base_train;
+                llama_ctx_params.rope_freq_scale = rope_freq_scale = ropemultiplier * llamamodel->hparams.rope_freq_scale_train;
+                printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
+            }
+            else
+            {
                 printf("Automatic RoPE Scaling: Using model internal value.\n");
-            //}
+            }
         }
         else
         {
diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp
index cc6baa101..cbaf31cf7 100644
--- a/otherarch/gpt2_v3.cpp
+++ b/otherarch/gpt2_v3.cpp
@@ -455,7 +455,7 @@ bool gpt2_eval(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 8192, false);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp
index 86e9219a8..1ce708877 100644
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@@ -455,7 +455,7 @@ bool gptj_eval(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp
index 1015b5afc..27b181c6b 100644
--- a/otherarch/llama_v3.cpp
+++ b/otherarch/llama_v3.cpp
@@ -12,6 +12,7 @@
 #include "llama_v3.h"
 
 #include "ggml.h"
+#include "otherarch.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #endif
@@ -88,7 +89,6 @@ enum e_model3 {
 
 static const size_t kB3 = 1024;
 static const size_t MB3 = 1024*1024;
-static const size_t GGML_MAX_NODES = 8192;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp
index 583bdbe53..5ba6d61b3 100644
--- a/otherarch/mpt_v3.cpp
+++ b/otherarch/mpt_v3.cpp
@@ -390,7 +390,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
     params.no_alloc = false;
 
     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp
index 28f3a31e5..d71a63849 100644
--- a/otherarch/neox_v3.cpp
+++ b/otherarch/neox_v3.cpp
@@ -471,7 +471,7 @@ bool gpt_neox_eval(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h
index f4a39b12b..47ea0d7b3 100644
--- a/otherarch/otherarch.h
+++ b/otherarch/otherarch.h
@@ -459,3 +459,4 @@ struct mpt_model {
 };
 
 const float default_norm_eps = 1e-5f;
+const size_t GGML_MAX_NODES = 8192;
\ No newline at end of file
diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp
index 8ccc313cf..ccc9f11e9 100644
--- a/otherarch/rwkv_v3.cpp
+++ b/otherarch/rwkv_v3.cpp
@@ -1520,7 +1520,7 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr
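
Note on the gpttype_adapter.cpp hunk: the re-enabled branch implements linear RoPE scaling. When the requested context exceeds the model's training context (and the model does not already ship a trained scale factor), rope_freq_scale is multiplied by n_ctx_train / clamped_max_context_length, compressing position indices back into the range the model was trained on. Below is a minimal standalone sketch of that arithmetic, not code from the patch; the hard-coded values (4096 training context, 8192 requested) are hypothetical stand-ins for the real hparams fields.

#include <cstdio>

int main() {
    // Hypothetical stand-ins for llamamodel->hparams and the user's request.
    const float n_ctx_train   = 4096.0f; // context length the model was trained at
    const float requested_ctx = 8192.0f; // clamped_max_context_length in the patch
    const float scale_train   = 1.0f;    // rope_freq_scale_train

    float ropemultiplier = 1.0f;
    // Mirrors the patch: only auto-scale when the request exceeds the
    // training context and no trained scale factor is present.
    if (requested_ctx > n_ctx_train && scale_train == 1.0f) {
        ropemultiplier = n_ctx_train / requested_ctx; // 4096/8192 = 0.5
    }
    const float rope_freq_scale = ropemultiplier * scale_train;
    printf("Automatic RoPE Scaling: Using (scale:%.3f).\n", rope_freq_scale);
    return 0;
}

The added rope_freq_scale_train==1.0f guard is the substantive fix: it keeps the automatic path from clobbering models that already carry a trained scale factor, which now fall through to the "Using model internal value" branch instead.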
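
Note on the ggml_new_graph_custom changes: ggml_new_graph() allocates a compute graph with the library's fixed default node capacity, which these legacy eval paths can overflow at larger context sizes, while ggml_new_graph_custom() takes an explicit node budget plus a flag for gradient bookkeeping. A sketch of the pattern the backends now share, assuming the ggml API used in this tree; the helper name make_eval_graph and the constant kMaxNodes are invented for illustration (kMaxNodes mirrors the GGML_MAX_NODES value this patch moves into otherarch.h).

#include "ggml.h"

// Mirrors the 8192-node budget this patch centralizes in otherarch.h.
static const size_t kMaxNodes = 8192;

struct ggml_cgraph * make_eval_graph(struct ggml_context * ctx0) {
    // grads = false: these are inference-only graphs, so no gradient
    // tensors are allocated alongside the nodes.
    return ggml_new_graph_custom(ctx0, kMaxNodes, false);
}

One asymmetry worth noting: gpt2_v3.cpp passes the literal 8192 rather than GGML_MAX_NODES, so it gets the same budget today but will not track future changes to the shared constant.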