From ac2089c3780cfcfbe8d3f91bd83537d84f33e6a2 Mon Sep 17 00:00:00 2001
From: HimariO
Date: Sun, 8 Dec 2024 00:47:48 +0800
Subject: [PATCH] add mrope unit test, fix a few compiler warnings

---
 examples/llava/clip.cpp        |  1 -
 examples/llava/qwen2vl-cli.cpp | 21 +++++-----
 ggml/src/ggml.c                |  1 -
 src/llama.cpp                  |  2 +
 tests/test-rope.cpp            | 81 +++++++++++++++++++++++++---------
 5 files changed, 74 insertions(+), 32 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 050b04ce2..863d86ea4 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2488,7 +2488,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size    = hparams.patch_size;
     const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
-    const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 3 : num_positions;
     if (ctx->load_image_size == nullptr) {
         ctx->load_image_size = clip_image_size_init();
     }
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index cb4ce7f0e..99394a980 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -24,7 +24,9 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
     const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
     const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
     auto img_tokens = image_embed->n_image_pos;
-    llama_pos mrope_pos[img_tokens * 4];
+    // llama_pos mrope_pos[img_tokens * 4];
+    std::vector<llama_pos> mrope_pos;
+    mrope_pos.resize(img_tokens * 4);

     for (int y = 0; y < ph; y++) {
@@ -350,7 +352,7 @@ static void llava_free(struct llava_context * ctx_llava) {

 #ifndef NDEBUG

-static void tmp_test_rope(struct llava_context * ctx_llava, common_params * params) {
+static void tmp_test_rope() {
     int n_threads = 1;

     static size_t buf_size = 512u*1024*1024;
@@ -415,13 +417,13 @@
     }
 }

-static void tmp_dump_img_embed(struct llava_context * ctx_llava, common_params * params) {
-    // auto * image_embed = load_image(ctx_llava, params, "/home/ron/Downloads/gguf/dog.jpeg");
+static void tmp_dump_img_embed(struct llava_context * ctx_llava) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
-    // int ne = n_embd * image_embed->n_image_pos;
     int ne = n_embd * 4;
     float vals[56 * 56 * 3];
-    float embd[ne];
+    // float embd[ne];
+    std::vector<float> embd;
+    embd.resize(ne);

     for (int i = 0; i < 56*56; i++) {
@@ -429,12 +431,11 @@ static void tmp_dump_img_embed(struct llava_context * ctx_llava, common_params *
         for (int c = 0; c < 3; c++)
             vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
     }

-    // auto param = &ctx_llava->ctx_clip->vision_model.hparams;
-    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd);
+    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());

     std::ofstream outFile("img_embed.bin", std::ios::binary);
     if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char *>(embd), ne * sizeof(float));
+        outFile.write(reinterpret_cast<const char *>(embd.data()), ne * sizeof(float));

         outFile.close();
         std::cout << "Data successfully written to img_embed.bin" << std::endl;
@@ -484,7 +485,7 @@ int main(int argc, char ** argv) {
     } else if (params.image[0].empty()) {
         auto ctx_llava = llava_init_context(&params, model);

-        tmp_dump_img_embed(ctx_llava, &params);
+        tmp_dump_img_embed(ctx_llava);

        llama_perf_context_print(ctx_llava->ctx_llama);
        ctx_llava->model = NULL;
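Note: the qwen2vl-cli.cpp changes above swap C-style variable-length arrays (a GCC/Clang extension that is not standard C++ and does not compile under MSVC) for std::vector, which is the source of the compiler warnings mentioned in the subject. For context, the replaced mrope_pos buffer holds four position streams per image token. A minimal sketch of how such a buffer can be filled for a ph x pw patch grid follows; the helper name fill_mrope_pos and the start position st_pos_id are illustrative, and the stream-major layout is assumed from the surrounding loop, which the hunk truncates:

    #include <cstdint>
    #include <vector>

    typedef int32_t llama_pos;

    // Sketch: fill the 4 M-RoPE position streams for an image of ph x pw
    // patches. Layout is assumed stream-major: [temporal | height | width |
    // unused], each stream of length img_tokens, like mrope_pos above.
    static std::vector<llama_pos> fill_mrope_pos(int ph, int pw, llama_pos st_pos_id) {
        const int img_tokens = ph * pw;
        std::vector<llama_pos> pos(img_tokens * 4);
        for (int y = 0; y < ph; y++) {
            for (int x = 0; x < pw; x++) {
                const int i = y * pw + x;
                pos[i                 ] = st_pos_id;     // temporal: constant within one image
                pos[i +     img_tokens] = st_pos_id + y; // height coordinate of the patch
                pos[i + 2 * img_tokens] = st_pos_id + x; // width coordinate of the patch
                pos[i + 3 * img_tokens] = 0;             // fourth stream assumed unused here
            }
        }
        return pos;
    }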
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index c3726163b..008022441 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3585,7 +3585,6 @@ struct ggml_tensor * ggml_mrope_ext(
     memcpy(params +  9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
     memcpy(&params[11], sections, sizeof(int)*4);
-    // memcpy(params + 11, sections, sizeof(int)*3);

     ggml_set_op_params(result, params, sizeof(params));

     result->op = GGML_OP_ROPE;
diff --git a/src/llama.cpp b/src/llama.cpp
index 15052006b..d7deaffe0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3337,6 +3337,7 @@ struct llama_context {
     // whether we are computing encoder output or decoder output
     bool is_encoding = false;

+    // TODO: find a better way to accommodate multi-dimensional position encoding methods
     // number of position ids each token gets; 1 per token in most cases.
     // when using m-rope, there are 3 position ids per token, representing a 3-dimensional coordinate.
     int n_pos_per_token = 1;
@@ -5719,6 +5720,7 @@ static void llm_load_hparams(
                 std::fill(hparams.rope_mrope_sections.begin(), hparams.rope_mrope_sections.end(), 0);
                 ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_mrope_sections, 4, true);
             }
+            // fall through
         case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
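Note: the llama.cpp hunks are bookkeeping for multi-dimensional positions: n_pos_per_token becomes 3 for m-rope models, so the position buffer handed to the decoder is conceptually a [n_pos_per_token][n_tokens] array rather than a flat [n_tokens]. The added "// fall through" comment documents that LLM_ARCH_QWEN2VL intentionally continues into the LLM_ARCH_QWEN2 case to reuse its hparams loading, and quiets implicit-fallthrough warnings. A minimal sketch of the indexing this layout implies; mrope_pos_at is an illustrative helper, not an API added by the patch:

    #include <cstdint>

    typedef int32_t llama_pos;

    // Sketch: read the d-th coordinate (0 = temporal, 1 = height, 2 = width)
    // of token i from a stream-major position buffer with n_tokens entries
    // per stream, as implied by n_pos_per_token == 3.
    static llama_pos mrope_pos_at(const llama_pos * pos, int n_tokens, int i, int d) {
        return pos[i + n_tokens * d];
    }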
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 4656b30f0..b138ffb25 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
     struct ggml_tensor * x;

     // rope f32
-    for (int m = 0; m < 3; ++m) {
+    for (int m = 0; m < 5; ++m) {
         const int ndims = 4;

         const int64_t n_rot = 128;
@@ -147,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
         const int n_past_0 = 100;
         const int n_past_2 = 33;

-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-
-        for (int i = 0; i < ne[2]; ++i) {
-            ((int32_t *) p0->data)[i] = n_past_0 + i;
-            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
-            ((int32_t *) p2->data)[i] = n_past_2 + i;
-        }
-
-        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
-        const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
-
+        struct ggml_tensor * r0;
+        struct ggml_tensor * r1;
+        struct ggml_tensor * r2;
         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+        int mode = -1;

-        // 100, 101, 102, ..., 172
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
-        // -67, -67, -67, ..., -67
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+        if (m < 3) {
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);

-        // 33, 34, 35, ..., 105
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+            for (int i = 0; i < ne[2]; ++i) {
+                ((int32_t *) p0->data)[i] = n_past_0 + i;
+                ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+                ((int32_t *) p2->data)[i] = n_past_2 + i;
+            }
+            // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
+            mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+
+            // 100, 101, 102, ..., 172
+            r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
+            // -67, -67, -67, ..., -67
+            r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+
+            // 33, 34, 35, ..., 105
+            r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+        } else {
+            // testing multi-dimensional rope position embedding modes
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+
+            int sections[4] = {16, 24, 24, 0};
+            mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
+
+            for (int i = 0; i < ne[2]; ++i) {
+                for (int j = 0; j < 4; ++j) {
+                    ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+                    ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+                    ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+                }
+            }
+
+            // [[100, 101, 102, ..., 172],
+            //  [101, 102, 103, ..., 173],
+            //  [102, 103, 104, ..., 174]]
+            r0 = ggml_mrope_ext(
+                ctx0, x, p0, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[-67, -67, -67, ..., -67],
+            //  [-67, -67, -67, ..., -67],
+            //  [-67, -67, -67, ..., -67]]
+            r1 = ggml_mrope_ext(
+                ctx0, r0, p1, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+
+            // [[33, 34, 35, ..., 105],
+            //  [34, 35, 36, ..., 106],
+            //  [35, 36, 37, ..., 107]]
+            r2 = ggml_mrope_ext(
+                ctx0, x, p2, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+        }

         ggml_cgraph * gf = ggml_new_graph(ctx0);
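Note: the extended test reuses the invariant from the existing 1-D RoPE cases: rotations compose additively, so rotating by p0 and then by the constant offset p1 = n_past_2 - n_past_0 (= -67) must equal a single rotation by p2, independently in each position stream and each section. That is why r1 is expected to match r2 element-wise for the m-rope and vision modes as well. A quick standalone check of the position bookkeeping, assuming the same constants as the test (ne2 is a stand-in for the test's ne[2]):

    #include <cassert>

    int main() {
        const int n_past_0 = 100;
        const int n_past_2 = 33;
        const int ne2 = 73; // stand-in for the test's sequence length ne[2]

        // For every token i and every position stream j, p0 + p1 == p2,
        // which is why rope(rope(x, p0), p1) must equal rope(x, p2).
        for (int i = 0; i < ne2; ++i) {
            for (int j = 0; j < 4; ++j) {
                assert((n_past_0 + i + j) + (n_past_2 - n_past_0) == n_past_2 + i + j);
            }
        }
        return 0;
    }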