llama : fix C compatibility

Cebtenzzre 2023-09-20 23:29:08 -04:00
parent 56abb9a406
commit 43eaf06a2f
3 changed files with 17 additions and 19 deletions
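The fix is about keeping llama.h consumable from plain C: `enum llama_rope_scaling_type : int8_t` declares a fixed underlying type, which is C++11 syntax (C only gained it with C23), so the public enum becomes a plain enum and the value is carried in an `int8_t` field instead. Below is a minimal, self-contained sketch of that pattern; the identifiers are abbreviated stand-ins, not the library's real ones.

#include <stdint.h>
#include <stdio.h>

/* A plain enum is valid in both C and C++; the old `enum foo : int8_t { ... }`
   spelling (fixed underlying type) is what a C compiler rejects. */
enum rope_scaling {
    ROPE_SCALING_UNSPECIFIED = -1,
    ROPE_SCALING_NONE        = 0,
    ROPE_SCALING_LINEAR      = 1,
};

/* The public struct carries the value as an int8_t, as the diff does,
   and code assigns the enum constants into it. */
struct ctx_params {
    int8_t rope_scaling_type;
};

int main(void) {
    struct ctx_params p;
    p.rope_scaling_type = ROPE_SCALING_UNSPECIFIED;

    /* resolve "unspecified" to a concrete default, mirroring the llama.cpp hunk */
    if (p.rope_scaling_type == ROPE_SCALING_UNSPECIFIED) {
        p.rope_scaling_type = (int8_t) ROPE_SCALING_LINEAR;
    }

    printf("rope_scaling_type = %d\n", p.rope_scaling_type); /* prints 1 */
    return 0;
}

Spelling out `int8_t` also pins the field's width, whereas the size of a plain enum is implementation-defined, so the struct layout stays the same for C and C++ consumers.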

common/common.h

@@ -54,8 +54,7 @@ struct gpt_params {
     float yarn_attn_factor = 1.0f;  // YaRN magnitude scaling factor
     float yarn_beta_fast   = 32.0f; // YaRN low correction dim
     float yarn_beta_slow   = 1.0f;  // YaRN high correction dim
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
 
     // sampling parameters
     int32_t top_k = 40; // <= 0 to use vocab size

llama.cpp

@@ -1706,12 +1706,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const
         hparams.rope_freq_base = rope_freq_base;
     }
 
-    llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type;
+    int8_t rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         uint8_t type = LLAMA_ROPE_SCALING_LINEAR;
         GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE));
-        rope_scaling_type = llama_rope_scaling_type(type);
+        rope_scaling_type = int8_t(type);
     }
 
     GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE);
@@ -6234,6 +6234,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch =*/ 512,
         /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
+        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.tensor_split =*/ nullptr,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
@@ -6241,7 +6242,6 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_attn_factor =*/ 1.0f,
         /*.yarn_beta_fast =*/ 32.0f,
         /*.yarn_beta_slow =*/ 1.0f,
-        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,

llama.h

@@ -108,7 +108,7 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
-    enum llama_rope_scaling_type: int8_t {
+    enum llama_rope_scaling_type {
        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
        LLAMA_ROPE_SCALING_NONE = 0,
        LLAMA_ROPE_SCALING_LINEAR = 1,
@@ -131,23 +131,22 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed; // RNG seed, -1 for random
-        int32_t n_ctx; // text context
-        int32_t n_batch; // prompt processing batch size
-        int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu; // the GPU that is used for scratch and small tensors
+        uint32_t seed;              // RNG seed, -1 for random
+        int32_t  n_ctx;             // text context
+        int32_t  n_batch;           // prompt processing batch size
+        int32_t  n_gpu_layers;      // number of layers to store in VRAM
+        int32_t  main_gpu;          // the GPU that is used for scratch and small tensors
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base; // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
-        float yarn_ext_factor; // YaRN extrapolation mix factor
-        float yarn_attn_factor; // YaRN magnitude scaling factor
-        float yarn_beta_fast; // YaRN low correction dim
-        float yarn_beta_slow; // YaRN high correction dim
-        llama_rope_scaling_type rope_scaling_type;
+        float    rope_freq_base;   // RoPE base frequency
+        float    rope_freq_scale;  // RoPE frequency scaling factor
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
 
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
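As a usage illustration (not part of the commit), a translation unit compiled as C can now include llama.h and work with these declarations directly. The sketch below only relies on names visible in the hunks above (`llama_context_default_params`, the `rope_scaling_type` field, and the `LLAMA_ROPE_SCALING_*` constants) and assumes the llama.cpp headers and library are available for compiling and linking.

#include <stdio.h>
#include "llama.h"

int main(void) {
    /* defaults; rope_scaling_type starts out as LLAMA_ROPE_SCALING_UNSPECIFIED */
    struct llama_context_params params = llama_context_default_params();

    /* rope_scaling_type is a plain int8_t holding a LLAMA_ROPE_SCALING_* value */
    params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;

    printf("rope scaling type: %d\n", params.rope_scaling_type);
    return 0;
}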