diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index d3d6d1e6c..67b8fe705 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -941,19 +941,20 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llamamodel->hparams.rope_freq_scale_train!=1.0f ||
         llamamodel->hparams.rope_scaling_type_train==2)
         {
-            // float ropemultiplier = 1.0f;
-            // if(llamamodel->hparams.rope_scaling_type_train!=2 &&
-            // llamamodel->hparams.n_ctx_train > 2048 && clamped_max_context_length > llamamodel->hparams.n_ctx_train)
-            // {
-            //     ropemultiplier = (float)llamamodel->hparams.n_ctx_train / (float)clamped_max_context_length;
-            //     llama_ctx_params.rope_freq_base = rope_freq_base = llamamodel->hparams.rope_freq_base_train;
-            //     llama_ctx_params.rope_freq_scale = rope_freq_scale = ropemultiplier * llamamodel->hparams.rope_freq_scale_train;
-            //     printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
-            // }
-            // else
-            // {
+            float ropemultiplier = 1.0f;
+            if(llamamodel->hparams.rope_scaling_type_train!=2 &&
+            llamamodel->hparams.n_ctx_train > 2048 && clamped_max_context_length > llamamodel->hparams.n_ctx_train &&
+            llamamodel->hparams.rope_freq_scale_train==1.0f)
+            {
+                ropemultiplier = (float)llamamodel->hparams.n_ctx_train / (float)clamped_max_context_length;
+                llama_ctx_params.rope_freq_base = rope_freq_base = llamamodel->hparams.rope_freq_base_train;
+                llama_ctx_params.rope_freq_scale = rope_freq_scale = ropemultiplier * llamamodel->hparams.rope_freq_scale_train;
+                printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
+            }
+            else
+            {
                 printf("Automatic RoPE Scaling: Using model internal value.\n");
-            //}
+            }
         }
         else
         {
diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp
index cc6baa101..cbaf31cf7 100644
--- a/otherarch/gpt2_v3.cpp
+++ b/otherarch/gpt2_v3.cpp
@@ -455,7 +455,7 @@ bool gpt2_eval(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 8192, false);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp
index 86e9219a8..1ce708877 100644
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@@ -455,7 +455,7 @@ bool gptj_eval(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp
index 1015b5afc..27b181c6b 100644
--- a/otherarch/llama_v3.cpp
+++ b/otherarch/llama_v3.cpp
@@ -12,6 +12,7 @@
 #include "llama_v3.h"
 
 #include "ggml.h"
+#include "otherarch.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #endif
@@ -88,7 +89,6 @@ enum e_model3 {
 
 static const size_t kB3 = 1024;
 static const size_t MB3 = 1024*1024;
-static const size_t GGML_MAX_NODES = 8192;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp
index 583bdbe53..5ba6d61b3 100644
--- a/otherarch/mpt_v3.cpp
+++ b/otherarch/mpt_v3.cpp
@@ -390,7 +390,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
     params.no_alloc = false;
 
     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp
index 28f3a31e5..d71a63849 100644
--- a/otherarch/neox_v3.cpp
+++ b/otherarch/neox_v3.cpp
@@ -471,7 +471,7 @@ bool gpt_neox_eval(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h
index f4a39b12b..47ea0d7b3 100644
--- a/otherarch/otherarch.h
+++ b/otherarch/otherarch.h
@@ -459,3 +459,4 @@ struct mpt_model {
 };
 
 const float default_norm_eps = 1e-5f;
+const size_t GGML_MAX_NODES = 8192;
\ No newline at end of file
diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp
index 8ccc313cf..ccc9f11e9 100644
--- a/otherarch/rwkv_v3.cpp
+++ b/otherarch/rwkv_v3.cpp
@@ -1520,7 +1520,7 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr
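
Note on the gpttype_adapter.cpp hunk: the re-enabled branch implements linear RoPE scaling. When the requested context exceeds the model's training context (and the model does not already ship a trained scale factor), rope_freq_scale is multiplied by n_ctx_train / clamped_max_context_length, compressing position indices back into the range the model was trained on. Below is a minimal standalone sketch of that arithmetic, not code from the patch; the hard-coded values (4096 training context, 8192 requested) are hypothetical stand-ins for the real hparams fields.

#include <cstdio>

int main() {
    // Hypothetical stand-ins for llamamodel->hparams and the user's request.
    const float n_ctx_train   = 4096.0f; // context length the model was trained at
    const float requested_ctx = 8192.0f; // clamped_max_context_length in the patch
    const float scale_train   = 1.0f;    // rope_freq_scale_train

    float ropemultiplier = 1.0f;
    // Mirrors the patch: only auto-scale when the request exceeds the
    // training context and no trained scale factor is present.
    if (requested_ctx > n_ctx_train && scale_train == 1.0f) {
        ropemultiplier = n_ctx_train / requested_ctx; // 4096/8192 = 0.5
    }
    const float rope_freq_scale = ropemultiplier * scale_train;
    printf("Automatic RoPE Scaling: Using (scale:%.3f).\n", rope_freq_scale);
    return 0;
}

The added rope_freq_scale_train==1.0f guard is the substantive fix: it keeps the automatic path from clobbering models that already carry a trained scale factor, which now fall through to the "Using model internal value" branch instead.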
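
Note on the ggml_new_graph_custom changes: ggml_new_graph() allocates a compute graph with the library's fixed default node capacity, which these legacy eval paths can overflow at larger context sizes, while ggml_new_graph_custom() takes an explicit node budget plus a flag for gradient bookkeeping. A sketch of the pattern the backends now share, assuming the ggml API used in this tree; the helper name make_eval_graph and the constant kMaxNodes are invented for illustration (kMaxNodes mirrors the GGML_MAX_NODES value this patch moves into otherarch.h).

#include "ggml.h"

// Mirrors the 8192-node budget this patch centralizes in otherarch.h.
static const size_t kMaxNodes = 8192;

struct ggml_cgraph * make_eval_graph(struct ggml_context * ctx0) {
    // grads = false: these are inference-only graphs, so no gradient
    // tensors are allocated alongside the nodes.
    return ggml_new_graph_custom(ctx0, kMaxNodes, false);
}

One asymmetry worth noting: gpt2_v3.cpp passes the literal 8192 rather than GGML_MAX_NODES, so it gets the same budget today but will not track future changes to the shared constant.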