diff --git a/common/common.h b/common/common.h
index a1e7da128..e49db6fed 100644
--- a/common/common.h
+++ b/common/common.h
@@ -54,8 +54,7 @@ struct gpt_params {
     float   yarn_attn_factor  = 1.0f;  // YaRN magnitude scaling factor
     float   yarn_beta_fast    = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow    = 1.0f;  // YaRN high correction dim
-
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int8_t  rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
 
     // sampling parameters
     int32_t top_k             = 40;    // <= 0 to use vocab size
diff --git a/llama.cpp b/llama.cpp
index cd545b254..56c511b59 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1706,12 +1706,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const
         hparams.rope_freq_base = rope_freq_base;
     }
 
-    llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type;
+    int8_t rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         uint8_t type = LLAMA_ROPE_SCALING_LINEAR;
         GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE));
-        rope_scaling_type = llama_rope_scaling_type(type);
+        rope_scaling_type = int8_t(type);
     }
     GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE);
@@ -6234,6 +6234,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch                      =*/ 512,
         /*.n_gpu_layers                 =*/ 0,
         /*.main_gpu                     =*/ 0,
+        /*.rope_scaling_type            =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.tensor_split                 =*/ nullptr,
         /*.rope_freq_base               =*/ 0.0f,
         /*.rope_freq_scale              =*/ 0.0f,
@@ -6241,7 +6242,6 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_attn_factor             =*/ 1.0f,
         /*.yarn_beta_fast               =*/ 32.0f,
         /*.yarn_beta_slow               =*/ 1.0f,
-        /*.rope_scaling_type            =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.progress_callback            =*/ nullptr,
         /*.progress_callback_user_data  =*/ nullptr,
         /*.low_vram                     =*/ false,
diff --git a/llama.h b/llama.h
index 5d69997bf..6528254cb 100644
--- a/llama.h
+++ b/llama.h
@@ -108,7 +108,7 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
-    enum llama_rope_scaling_type: int8_t {
+    enum llama_rope_scaling_type {
         LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
         LLAMA_ROPE_SCALING_NONE        =  0,
         LLAMA_ROPE_SCALING_LINEAR      =  1,
@@ -131,23 +131,22 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;            // RNG seed, -1 for random
-        int32_t  n_ctx;           // text context
-        int32_t  n_batch;         // prompt processing batch size
-        int32_t  n_gpu_layers;    // number of layers to store in VRAM
-        int32_t  main_gpu;        // the GPU that is used for scratch and small tensors
+        uint32_t seed;              // RNG seed, -1 for random
+        int32_t  n_ctx;             // text context
+        int32_t  n_batch;           // prompt processing batch size
+        int32_t  n_gpu_layers;      // number of layers to store in VRAM
+        int32_t  main_gpu;          // the GPU that is used for scratch and small tensors
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float    rope_freq_base;   // RoPE base frequency
-        float    rope_freq_scale;  // RoPE frequency scaling factor
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor
-        float    yarn_attn_factor; // YaRN magnitude scaling factor
-        float    yarn_beta_fast;   // YaRN low correction dim
-        float    yarn_beta_slow;   // YaRN high correction dim
-
-        llama_rope_scaling_type rope_scaling_type;
+        float    rope_freq_base;    // RoPE base frequency
+        float    rope_freq_scale;   // RoPE frequency scaling factor
+        float    yarn_ext_factor;   // YaRN extrapolation mix factor
+        float    yarn_attn_factor;  // YaRN magnitude scaling factor
+        float    yarn_beta_fast;    // YaRN low correction dim
+        float    yarn_beta_slow;    // YaRN high correction dim
 
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
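For reference, a minimal sketch of how a C caller would use the relocated field after this change, assuming only the llama.h API visible in the diff above (the context-creation and model-loading calls are omitted):

```c
#include "llama.h"

int main(void) {
    // rope_scaling_type now defaults to LLAMA_ROPE_SCALING_UNSPECIFIED,
    // meaning the value stored in the GGUF metadata is used.
    struct llama_context_params params = llama_context_default_params();

    // The field is a plain int8_t, so a C translation unit can assign an
    // enum llama_rope_scaling_type value directly without a typed enum.
    params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;
    params.rope_freq_scale   = 0.5f; // e.g. linear scaling to roughly double the trained context

    // ... pass params when creating the context ...
    return 0;
}
```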