diff --git a/common/common.h b/common/common.h
index a1e7da128..e49db6fed 100644
--- a/common/common.h
+++ b/common/common.h
@@ -54,8 +54,7 @@ struct gpt_params {
     float   yarn_attn_factor  = 1.0f;  // YaRN magnitude scaling factor
     float   yarn_beta_fast    = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow    = 1.0f;  // YaRN high correction dim
-
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int8_t  rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
 
     // sampling parameters
     int32_t top_k             = 40;    // <= 0 to use vocab size
diff --git a/llama.cpp b/llama.cpp
index cd545b254..56c511b59 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1706,12 +1706,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const
         hparams.rope_freq_base = rope_freq_base;
     }
 
-    llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type;
+    int8_t rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         uint8_t type = LLAMA_ROPE_SCALING_LINEAR;
         GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE));
-        rope_scaling_type = llama_rope_scaling_type(type);
+        rope_scaling_type = int8_t(type);
     }
     GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE);
@@ -6234,6 +6234,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch                      =*/ 512,
         /*.n_gpu_layers                 =*/ 0,
         /*.main_gpu                     =*/ 0,
+        /*.rope_scaling_type            =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.tensor_split                 =*/ nullptr,
         /*.rope_freq_base               =*/ 0.0f,
         /*.rope_freq_scale              =*/ 0.0f,
@@ -6241,7 +6242,6 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_attn_factor             =*/ 1.0f,
         /*.yarn_beta_fast               =*/ 32.0f,
         /*.yarn_beta_slow               =*/ 1.0f,
-        /*.rope_scaling_type            =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.progress_callback            =*/ nullptr,
         /*.progress_callback_user_data  =*/ nullptr,
         /*.low_vram                     =*/ false,
diff --git a/llama.h b/llama.h
index 5d69997bf..6528254cb 100644
--- a/llama.h
+++ b/llama.h
@@ -108,7 +108,7 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
-    enum llama_rope_scaling_type: int8_t {
+    enum llama_rope_scaling_type {
         LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
         LLAMA_ROPE_SCALING_NONE        =  0,
         LLAMA_ROPE_SCALING_LINEAR      =  1,
@@ -131,23 +131,22 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;            // RNG seed, -1 for random
-        int32_t  n_ctx;           // text context
-        int32_t  n_batch;         // prompt processing batch size
-        int32_t  n_gpu_layers;    // number of layers to store in VRAM
-        int32_t  main_gpu;        // the GPU that is used for scratch and small tensors
+        uint32_t seed;              // RNG seed, -1 for random
+        int32_t  n_ctx;             // text context
+        int32_t  n_batch;           // prompt processing batch size
+        int32_t  n_gpu_layers;      // number of layers to store in VRAM
+        int32_t  main_gpu;          // the GPU that is used for scratch and small tensors
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float    rope_freq_base;   // RoPE base frequency
-        float    rope_freq_scale;  // RoPE frequency scaling factor
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor
-        float    yarn_attn_factor; // YaRN magnitude scaling factor
-        float    yarn_beta_fast;   // YaRN low correction dim
-        float    yarn_beta_slow;   // YaRN high correction dim
-
-        llama_rope_scaling_type rope_scaling_type;
+        float    rope_freq_base;    // RoPE base frequency
+        float    rope_freq_scale;   // RoPE frequency scaling factor
+        float    yarn_ext_factor;   // YaRN extrapolation mix factor
+        float    yarn_attn_factor;  // YaRN magnitude scaling factor
+        float    yarn_beta_fast;    // YaRN low correction dim
+        float    yarn_beta_slow;    // YaRN high correction dim
 
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
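For reference, a minimal sketch of how a C caller would use the relocated field after this change, assuming only the llama.h API visible in the diff above (the context-creation and model-loading calls are omitted):

```c
#include "llama.h"

int main(void) {
    // rope_scaling_type now defaults to LLAMA_ROPE_SCALING_UNSPECIFIED,
    // meaning the value stored in the GGUF metadata is used.
    struct llama_context_params params = llama_context_default_params();

    // The field is a plain int8_t, so a C translation unit can assign an
    // enum llama_rope_scaling_type value directly without a typed enum.
    params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;
    params.rope_freq_scale   = 0.5f; // e.g. linear scaling to roughly double the trained context

    // ... pass params when creating the context ...
    return 0;
}
```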