Implement customizable RoPE

The original RoPE has pre-defined parameters

theta_i = 10000^(−2(i−1)/d), for i in [1, 2, ..., d/2]

Our customizable RoPE, ggml_rope_custom_inplace, uses

theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]

where the defaults match the original:

scale = 1.0
base = 10000
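
To make the formula concrete, here is a small illustrative C helper (not part of this commit) that computes the rotation angle applied to dimension pair i at position p; with scale = 1.0 and base = 10000 it reproduces the original RoPE:

    #include <math.h>

    // Illustrative only: rotation angle for dimension pair i (1-based, i <= d/2)
    // at position p, i.e. p * theta_i with theta_i = scale * base^(-2(i-1)/d).
    static float rope_angle(int p, int i, int d, float base, float scale) {
        return scale * (float)p * powf(base, -2.0f*(i - 1)/(float)d);
    }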

The new command line arguments
--rope-freq-base
--rope-freq-scale
set the two new RoPE parameters.

Recent research shows that changing these two parameters extends the context limit with minimal loss in quality:

1. Extending Context to 8K
   kaiokendev
   https://kaiokendev.github.io/til#extending-context-to-8k

2. Extending Context Window of Large Language Models via Positional Interpolation
   Shouyuan Chen, Sherman Wong, Liangjian Chen, Yuandong Tian
   https://arxiv.org/abs/2306.15595

3. NTK-Aware Scaled RoPE allows LLaMA models to have extended (8k+) context size without any fine-tuning and minimal perplexity degradation.
   https://www.reddit.com/user/bloc97
   https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/

For the bold, try adding the following command line parameters to your favorite model:
-c 16384 --rope-freq-base 80000 --rope-freq-scale 0.5
Xiao-Yong Jin 2023-06-29 23:16:04 -05:00
parent dfd9fce6d6
commit dc0d0eb6a9
10 changed files with 131 additions and 28 deletions

examples/common.cpp

@@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--rope-freq-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_base = std::stof(argv[i]);
+        } else if (arg == "--rope-freq-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {
@@ -469,6 +481,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
     fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --rope_freq_base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, "  --rope_freq_scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
@@ -549,6 +563,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(
     lparams.use_mlock   = params.use_mlock;
     lparams.logits_all  = params.perplexity;
     lparams.embedding   = params.embedding;
+    lparams.rope_freq_base  = params.rope_freq_base;
+    lparams.rope_freq_scale = params.rope_freq_scale;
 
     llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
     if (model == NULL) {

examples/common.h

@@ -32,6 +32,8 @@ struct gpt_params {
     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
+    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
+    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

examples/main/main.cpp

@@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
         return 0;
     }
 
+    if (params.rope_freq_base != 10000.0) {
+        fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 1.0) {
+        fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+    }
+
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
+                " you are on your own\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;

examples/server/server.cpp

@@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
@@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_ctx = std::stoi(argv[i]);
        }
+        else if (arg == "--rope-freq-base")
+        {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_base = std::stof(argv[i]);
+        }
+        else if (arg == "--rope-freq-scale")
+        {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = std::stof(argv[i]);
+        }
        else if (arg == "--memory-f32" || arg == "--memory_f32")
        {
            params.memory_f16 = false;

ggml-metal.m

@@ -874,6 +874,10 @@ void ggml_metal_graph_compute(
                            const int n_past = ((int32_t *)(src1->data))[0];
 
+                            float freq_base, freq_scale;
+                            memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+                            memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
                            [encoder setComputePipelineState:ctx->pipeline_rope];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -896,6 +900,8 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&n_past     length:sizeof( int) atIndex:18];
                            [encoder setBytes:&n_dims     length:sizeof( int) atIndex:19];
                            [encoder setBytes:&mode       length:sizeof( int) atIndex:20];
+                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
 
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;

ggml-metal.metal

@@ -615,17 +615,19 @@ kernel void kernel_rope(
        constant       int & n_past,
        constant       int & n_dims,
        constant       int & mode,
+        constant     float & freq_base,
+        constant     float & freq_scale,
        uint3 tpig[[thread_position_in_grid]]) {
    const int64_t i3 = tpig[2];
    const int64_t i2 = tpig[1];
    const int64_t i1 = tpig[0];
 
    const bool is_neox = mode & 2;
-    const float theta_scale = pow(10000.0, -2.0f/n_dims);
+    const float theta_scale = pow(freq_base, -2.0f/n_dims);
 
    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = (float)p;
+    float theta = freq_scale * (float)p;
 
    if (!is_neox) {
        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
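
For reference, here is a hedged CPU-style sketch (not taken from this commit) of the non-NeoX rotation that the CPU and Metal kernels perform with the two new parameters; x points to one row of n_dims floats at position p:

    #include <math.h>

    // Sketch only: apply the custom RoPE rotation in place to adjacent pairs.
    static void rope_row(float * x, int n_dims, int p, float freq_base, float freq_scale) {
        const float theta_scale = powf(freq_base, -2.0f/n_dims); // ratio between consecutive pairs
        float theta = freq_scale * (float)p;                     // angle for the first pair
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const float cos_theta = cosf(theta);
            const float sin_theta = sinf(theta);
            const float x0 = x[i0];
            const float x1 = x[i0 + 1];
            x[i0]     = x0*cos_theta - x1*sin_theta; // standard 2-D rotation
            x[i0 + 1] = x0*sin_theta + x1*cos_theta;
            theta *= theta_scale;                    // advance to the next pair
        }
    }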

ggml.c

@@ -6943,6 +6943,8 @@ struct ggml_tensor * ggml_rope_impl(
        int                   n_past,
        int                   n_dims,
        int                   mode,
+        float                 freq_base,
+        float                 freq_scale,
        int                   n_ctx,
        bool                  inplace) {
    GGML_ASSERT(n_past >= 0);
@@ -6956,12 +6958,14 @@ struct ggml_tensor * ggml_rope_impl(
    ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
 
    ((int32_t *) b->data)[0] = n_past;
    ((int32_t *) b->data)[1] = n_dims;
    ((int32_t *) b->data)[2] = mode;
    ((int32_t *) b->data)[3] = n_ctx;
+    memcpy((int32_t *) b->data + 4, &freq_base,  sizeof(float));
+    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
 
    ggml_scratch_load(ctx);
@@ -6980,7 +6984,7 @@ struct ggml_tensor * ggml_rope(
        int                   n_dims,
        int                   mode,
        int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6990,7 +6994,19 @@ struct ggml_tensor * ggml_rope_inplace(
        int                   n_dims,
        int                   mode,
        int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 freq_base,
+        float                 freq_scale,
+        int                   n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
 }
 
 // ggml_rope_back
@@ -11948,7 +11964,7 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
 
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
@@ -11958,6 +11974,9 @@ static void ggml_compute_forward_rope_f32(
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
+    float freq_base, freq_scale;
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 
    assert(n_past >= 0);
@@ -11986,7 +12005,7 @@ static void ggml_compute_forward_rope_f32(
    // row index used to determine which thread to use
    int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
@@ -11998,7 +12017,7 @@ static void ggml_compute_forward_rope_f32(
            if (ir++ < ir0) continue;
            if (ir   > ir1) break;
 
-            float theta = (float)p;
+            float theta = freq_scale * (float)p;
 
            if (is_glm) {
                theta = MIN(p, n_ctx - 2);
@@ -12075,7 +12094,7 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
 
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
@@ -12085,6 +12104,9 @@ static void ggml_compute_forward_rope_f16(
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
+    float freq_base, freq_scale;
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 
    assert(n_past >= 0);
@@ -12113,7 +12135,7 @@ static void ggml_compute_forward_rope_f16(
    // row index used to determine which thread to use
    int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
@@ -12125,7 +12147,7 @@ static void ggml_compute_forward_rope_f16(
            if (ir++ < ir0) continue;
            if (ir   > ir1) break;
 
-            float theta = (float)p;
+            float theta = freq_scale * (float)p;
 
            if (is_glm) {
                theta = MIN(p, n_ctx - 2);
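
A side note on the mechanism above: the op parameters travel in a small GGML_TYPE_I32 tensor, so the two floats are bit-copied into int32 slots with memcpy and copied back out in the compute functions. An illustrative standalone sketch of that round trip (not part of the commit):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        int32_t op_params[6] = {0};              // stands in for the I32 "b" tensor
        float freq_base = 80000.0f, freq_scale = 0.5f;

        memcpy(op_params + 4, &freq_base,  sizeof(float));  // pack, as in ggml_rope_impl
        memcpy(op_params + 5, &freq_scale, sizeof(float));

        float base_out, scale_out;
        memcpy(&base_out,  op_params + 4, sizeof(float));   // unpack, as in the forward pass
        memcpy(&scale_out, op_params + 5, sizeof(float));

        assert(base_out == freq_base && scale_out == freq_scale);
        return 0;
    }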

ggml.h

@@ -1107,6 +1107,17 @@ extern "C" {
            int                   mode,
            int                   n_ctx);
 
+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 freq_base,
+            float                 freq_scale,
+            int                   n_ctx);
+
    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
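
A hedged usage sketch of the new API outside of llama.cpp, assuming the ggml graph interface of this period (ggml_init, ggml_build_forward, ggml_graph_compute); the tensor shape and values are made up for illustration:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        // 64 dims per head, 4 heads, 8 positions
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 4, 8);
        ggml_set_f32(a, 1.0f);

        // mode 0 = original RoPE; n_ctx is only used by the GLM variant, so 0 here
        struct ggml_tensor * r = ggml_rope_custom_inplace(ctx, a, /*n_past=*/0, /*n_dims=*/64, /*mode=*/0,
                                                          /*freq_base=*/80000.0f, /*freq_scale=*/0.5f, /*n_ctx=*/0);

        struct ggml_cgraph gf = ggml_build_forward(r);
        ggml_graph_compute(ctx, &gf);

        ggml_free(ctx);
        return 0;
    }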

llama.cpp

@@ -79,14 +79,15 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,    256ull * MB },
-        { MODEL_7B,    512ull * MB },
-        { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   512ull * MB },
-        { MODEL_65B,  1024ull * MB },
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
     };
     return k_sizes;
 }
@@ -167,6 +168,8 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
+    float    rope_freq_base  = 10000.0f;
+    float    rope_freq_scale = 1.0f;
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -619,7 +622,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
 
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }
@@ -812,6 +815,8 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.seed            =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx           =*/ 512,
+        /*.rope_freq_base  =*/ 10000.0f,
+        /*.rope_freq_scale =*/ 1.0f,
         /*.n_batch         =*/ 512,
         /*.gpu_layers      =*/ 0,
         /*.main_gpu        =*/ 0,
@@ -925,6 +930,8 @@ static void llama_model_load_internal(
         llama_model & model,
         llama_vocab & vocab,
         int n_ctx,
+        float rope_freq_base,
+        float rope_freq_scale,
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
@@ -963,6 +970,8 @@ static void llama_model_load_internal(
         }
 
         hparams.n_ctx = n_ctx;
+        hparams.rope_freq_base  = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
@@ -976,6 +985,8 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_head     = %u\n",   __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
         fprintf(stderr, "%s: model size = %s\n",   __func__, llama_model_type_name(model.type));
@@ -1127,7 +1138,7 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at    (model.type);
@@ -1229,6 +1240,8 @@ static bool llama_model_load(
         llama_model & model,
         llama_vocab & vocab,
         int n_ctx,
+        float rope_freq_base,
+        float rope_freq_scale,
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
@@ -1241,7 +1254,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, rope_freq_base, rope_freq_scale, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1293,6 +1306,8 @@ static bool llama_eval_internal(
     const int n_head  = hparams.n_head;
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_embd/hparams.n_head;
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
     const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
@@ -1384,11 +1399,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
@@ -2559,9 +2574,10 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-        params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-        params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.rope_freq_base, params.rope_freq_scale,
+        params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.low_vram, memory_type,
+        params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+        params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2638,7 +2654,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

llama.h

@@ -85,6 +85,8 @@ extern "C" {
    struct llama_context_params {
        uint32_t seed;         // RNG seed, -1 for random
        int32_t  n_ctx;        // text context
+        float    rope_freq_base;  // RoPE base frequency
+        float    rope_freq_scale; // RoPE frequency scaling factor
        int32_t  n_batch;      // prompt processing batch size
        int32_t  n_gpu_layers; // number of layers to store in VRAM
        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
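
Finally, a hedged sketch of setting the new fields through the public API; the model path is hypothetical and backend/threading setup is omitted:

    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        struct llama_context_params lparams = llama_context_default_params();
        lparams.n_ctx           = 16384;     // extended context, as in the example above
        lparams.rope_freq_base  = 80000.0f;  // equivalent of --rope-freq-base 80000
        lparams.rope_freq_scale = 0.5f;      // equivalent of --rope-freq-scale 0.5

        struct llama_model * model = llama_load_model_from_file("models/7B/ggml-model.bin", lparams);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        struct llama_context * lctx = llama_new_context_with_model(model, lparams);
        if (lctx == NULL) {
            llama_free_model(model);
            return 1;
        }

        // ... tokenize and evaluate as usual ...

        llama_free(lctx);
        llama_free_model(model);
        return 0;
    }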