diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 1f38d4b98..08edd4bb9 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -176,8 +176,6 @@ struct my_llama_hparams {
     uint32_t n_layer = 32;
     uint32_t n_rot = 64;
 
-    float f_rms_norm_eps = 1e-5f;
-
     bool operator!=(const my_llama_hparams& other) const {
         return memcmp(this, &other, sizeof(other));
     }
@@ -229,6 +227,12 @@ struct my_llama_lora_hparams {
     uint32_t n_rank_norm = 1;
     uint32_t n_rank_output = 4;
 
+    // float f_norm_eps = 1e-5f; // falcon
+    float f_norm_rms_eps = 1e-5f; // llama
+
+    float rope_freq_base = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     bool operator!=(const my_llama_lora_hparams& other) const {
         return memcmp(this, &other, sizeof(other));
     }
@@ -769,8 +773,9 @@ struct ggml_tensor * llama_build_lora_finetune_graphs(
     const int n_head = hparams.n_head;
     const int n_rot = hparams.n_rot;
     const int n_ff = hparams.n_ff;
-    const float rms_norm_eps = hparams.f_rms_norm_eps;
-    const int rope_mode = 0;
+    const float rms_norm_eps = lora->hparams.f_norm_rms_eps;
+    const float rope_freq_base = lora->hparams.rope_freq_base;
+    const float rope_freq_scale = lora->hparams.rope_freq_scale;
 
     GGML_ASSERT(n_layer == lora->layers.size());
 
@@ -781,6 +786,18 @@ struct ggml_tensor * llama_build_lora_finetune_graphs(
         }
     };
 
+    // rope has so many parameters that we make a custom function for it
+    auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
+                (struct ggml_tensor * t) -> struct ggml_tensor * {
+        // not capturing these, to silence warnings
+        const int n_past = 0;
+        const int rope_mode = 0;
+
+        return ggml_rope_custom(ctx,
+            t, n_past, n_rot, rope_mode, n_ctx,
+            rope_freq_base, rope_freq_scale);
+    };
+
     set_name(tokens_input, "tokens_input");
     set_name(targets, "targets");
 
@@ -834,10 +851,10 @@ struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch);
         struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch);
         struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
+        struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
         struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch);
         struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
+        struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
 
         struct ggml_tensor * t11;
         if (ggml_is_quantized(wv->type)) {
@@ -1631,6 +1648,10 @@ struct train_params {
    int n_batch;
    int n_examples;
 
+   float f_norm_rms_eps;
+   float rope_freq_base;
+   float rope_freq_scale;
+
    int32_t lora_r;
    int32_t lora_alpha;
 
@@ -1701,6 +1722,10 @@ struct train_params get_default_train_params() {
    params.n_batch = 8;
    params.n_examples = 1;
 
+   params.f_norm_rms_eps = 1e-5f;
+   params.rope_freq_base = 10000.0f;
+   params.rope_freq_scale = 1.0f;
+
    params.lora_alpha = 4;
    params.lora_r = 4;
 
@@ -1771,6 +1796,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
    fprintf(stderr, "  -t N, --threads N          Number of threads (default %d)\n", params->n_threads);
    fprintf(stderr, "  -b N, --batch N            Parallel batch size (default %d)\n", params->n_batch);
    fprintf(stderr, "  -n N, --examples N         Number of examples to train (default %d)\n", params->n_examples);
+   fprintf(stderr, "  --norm-rms-eps F           RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
+   fprintf(stderr, "  --rope-freq-base F         Frequency base for ROPE (default %f)\n", params->rope_freq_base);
+   fprintf(stderr, "  --rope-freq-scale F        Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
    fprintf(stderr, "  --lora-alpha N             LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha);
    fprintf(stderr, "  --lora-r N                 LORA r : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_r);
    fprintf(stderr, "  --rank-att-norm N          LORA rank for attention norm tensor (default %d)\n", params->n_rank_attention_norm);
@@ -1910,6 +1938,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
                break;
            }
            params->n_examples = std::stoi(argv[i]);
+       } else if (arg == "--norm-rms-eps") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           params->f_norm_rms_eps = std::stof(argv[i]);
+       } else if (arg == "--rope-freq-base") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           params->rope_freq_base = std::stof(argv[i]);
+       } else if (arg == "--rope-freq-scale") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           params->rope_freq_scale = std::stof(argv[i]);
        } else if (arg == "--lora-alpha") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2290,6 +2336,9 @@ int main(int argc, char ** argv) {
    init_model(lmodel, &model, params.n_ctx);
 
    struct my_llama_lora lora;
+   lora.hparams.f_norm_rms_eps = params.f_norm_rms_eps;
+   lora.hparams.rope_freq_base = params.rope_freq_base;
+   lora.hparams.rope_freq_scale = params.rope_freq_scale;
    lora.hparams.lora_r = params.lora_r;
    lora.hparams.lora_alpha = params.lora_alpha;
    lora.hparams.n_rank_attention_norm = params.n_rank_attention_norm;
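
For reference (not part of the patch): with the defaults above, `ggml_rope_custom(..., 10000.0f, 1.0f)` behaves like the `ggml_rope_inplace` call it replaces; the two new hyperparameters only change the rotation angles of the rotary embedding. Below is a minimal standalone sketch of that angle computation, assuming the standard RoPE formulation theta_i = freq_scale * pos * freq_base^(-2i/n_rot); the helper `rope_theta` is a made-up name for illustration, not a ggml API.

```cpp
// Sketch only: how rope_freq_base and rope_freq_scale enter the per-pair
// rotation angle applied by RoPE (mode 0). rope_theta is a hypothetical helper.
#include <cmath>
#include <cstdio>

// theta_i = freq_scale * pos * freq_base^(-2*i / n_rot)
static float rope_theta(int pos, int i, int n_rot, float freq_base, float freq_scale) {
    return freq_scale * (float) pos * (float) std::pow(freq_base, -2.0 * i / n_rot);
}

int main() {
    const int   n_rot           = 64;       // as in my_llama_hparams.n_rot
    const float rope_freq_base  = 10000.0f; // default added by this patch
    const float rope_freq_scale = 0.5f;     // example: 2x linear context scaling

    const int positions[] = { 1, 512, 2048 };
    for (int pos : positions) {
        // compare the first rotation pair (i = 0) with and without scaling
        std::printf("pos %5d  theta(default) = %9.2f  theta(scaled) = %9.2f\n",
                    pos,
                    rope_theta(pos, 0, n_rot, rope_freq_base, 1.0f),
                    rope_theta(pos, 0, n_rot, rope_freq_base, rope_freq_scale));
    }
    return 0;
}
```

Lowering `--rope-freq-scale` (or raising `--rope-freq-base`) compresses these angles, which is what lets a fine-tune match a base model whose RoPE was scaled for a longer context.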