From 9d389a051be3675196a9128bed51145db8386449 Mon Sep 17 00:00:00 2001
From: HimariO
Date: Mon, 30 Sep 2024 22:30:02 +0800
Subject: [PATCH] Add vl-rope/2d-rope support for qwen2vl ViT

---
 examples/llava/qwen2vl-cli.cpp | 70 +++++++++++++++++++++++++++++++++-
 ggml/include/ggml.h            |  1 +
 ggml/src/ggml.c                | 49 ++++++++++++++++++------
 src/llama.cpp                  |  6 +--
 4 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index 181678f14..82583e750 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -463,9 +463,10 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
     for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
     memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
 
+    int sections[3] = {16, 24, 24};
     auto encode = ggml_mrope_ext(
         ctx0, inp_raw, pos, nullptr,
-        128, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
+        128, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
         0, 1, 32, 1);
 
     ggml_build_forward_expand(gf, encode);
@@ -490,6 +491,70 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
     }
 }
 
+
+static void tmp_test_mrope_2d(struct llava_context * ctx_llava, gpt_params * params) {
+
+    int n_threads = 1;
+    static size_t buf_size = 512u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    struct ggml_init_params init_params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(init_params);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 128, 12, 30);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    std::vector<float> dummy_q;
+    dummy_q.resize(128 * 12 * 30);
+    std::fill(dummy_q.begin(), dummy_q.end(), 0.1f);
+    memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
+
+    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 30 * 3);
+    ggml_set_name(pos, "pos");
+    ggml_set_input(pos);
+
+    std::vector<int> pos_id;
+    pos_id.resize(90);
+    for (int i = 0;  i < 30; i ++) pos_id[i] = i;
+    for (int i = 30; i < 60; i ++) pos_id[i] = i - 30;
+    for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
+    memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
+
+    int sections[3] = {32, 32, 0};
+    auto encode = ggml_mrope_ext(
+        ctx0, inp_raw, pos, nullptr,
+        128/2, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
+        0, 1, 32, 1);
+
+    ggml_build_forward_expand(gf, encode);
+    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+
+    std::vector<float> embd;
+    embd.resize(128 * 12 * 30);
+    memcpy(
+        embd.data(),
+        (float *) ggml_get_data(encode),
+        sizeof(float) * 128 * 12 * 30);
+    ggml_free(ctx0);
+
+    std::ofstream outFile("mrope_2d.bin", std::ios::binary);
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char *>(embd.data()), embd.size() * sizeof(float));
+
+        outFile.close();
+        std::cout << "Data successfully written to mrope_2d.bin" << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+}
+
 /* ----------------------------------------------------------------------------------------------------------------- */
@@ -542,7 +607,8 @@ int main(int argc, char ** argv) {
 
         // process the prompt
         // tmp_test_conv2d_reshape(ctx_llava, &params);
         // tmp_test_rope(ctx_llava, &params);
-        tmp_test_mrope(ctx_llava, &params);
+        // tmp_test_mrope(ctx_llava, &params);
+        tmp_test_mrope_2d(ctx_llava, &params);
         // process_prompt(ctx_llava, nullptr, &params, params.prompt);
 
         llama_print_timings(ctx_llava->ctx_llama);
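[Illustrative note, not part of the patch] tmp_test_mrope_2d packs three position ids per token into one flat I32 tensor: pos[0..29] holds the temporal component, pos[30..59] the height component, and pos[60..89] the width component; the test simply fills them with ramps. For a real image the h/w ids would follow the patch grid. A minimal standalone sketch of that layout, assuming a hypothetical 6x5 patch grid for the 30 tokens:

    // Standalone sketch: how t/h/w position ids could be laid out for a real
    // image. The 6x5 grid is a made-up example; the patch's own test uses
    // simple ramps instead.
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_h = 6, n_w = 5, n_tok = n_h * n_w;
        std::vector<int> pos(3 * n_tok);
        for (int y = 0; y < n_h; y++) {
            for (int x = 0; x < n_w; x++) {
                const int i = y * n_w + x;     // token index, row-major over patches
                pos[i]             = 0;        // temporal id: one image, one timestep
                pos[i + n_tok]     = y;        // height id: patch row
                pos[i + 2 * n_tok] = x;        // width id: patch column
            }
        }
        for (int s = 0; s < 3; s++) {          // print the t, h, w sections in turn
            for (int i = 0; i < n_tok; i++) printf("%2d ", pos[s * n_tok + i]);
            printf("\n");
        }
        return 0;
    }

Each section has one entry per token, which is why ggml_compute_forward_rope_f32 below can read p_t, p_h and p_w for token i2 at pos[i2], pos[i2 + ne2] and pos[i2 + ne2 * 2].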
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 07d66af20..ff833a4fc 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1451,6 +1451,7 @@ extern "C" {
             struct ggml_tensor * b,
             struct ggml_tensor * c,
             int                  n_dims,
+            int                  sections[3],
             int                  mode,
             int                  n_ctx_orig,
             float                freq_base,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index a9518bf96..98a1110d7 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3559,6 +3559,7 @@ struct ggml_tensor * ggml_mrope_ext(
         struct ggml_tensor * b,
         struct ggml_tensor * c,
         int                  n_dims,
+        int                  sections[3],
         int                  mode,
         int                  n_ctx_orig,
         float                freq_base,
@@ -3568,8 +3569,6 @@ struct ggml_tensor * ggml_mrope_ext(
         float                beta_fast,
         float                beta_slow) {
 
-    int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
-
     GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
 
     GGML_ASSERT(ggml_is_vector(b));
@@ -3596,7 +3595,8 @@ struct ggml_tensor * ggml_mrope_ext(
     memcpy(params +  8, &attn_factor, sizeof(float));
     memcpy(params +  9, &beta_fast,   sizeof(float));
     memcpy(params + 10, &beta_slow,   sizeof(float));
-    memcpy(params + 11, &sections,    sizeof(int) * 3);
+    memcpy(&params[11], sections,     sizeof(int) * 3);
 
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
@@ -11238,7 +11238,7 @@ static void ggml_rope_cache_init(
 }
 
 static void ggml_mrope_cache_init(
-    float theta_base_t, float theta_base_h, float theta_base_w, int sections[3],
+    float theta_base_t, float theta_base_h, float theta_base_w, int sections[3], bool indep_sects,
     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
     float * cache, float sin_sign, float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -11246,12 +11246,25 @@ static void ggml_mrope_cache_init(
     float theta_h = theta_base_h;
     float theta_w = theta_base_w;
     int sect_dims = sections[0] + sections[1] + sections[2];
+    int prev_sector = -1;
 
     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
         const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
-        float theta = theta_t;
-        int sector = (i0 / 2) % sect_dims;
+        int sector = (i0 / 2) % sect_dims;
+        if (indep_sects) {
+            if (sector == 0) {
+                theta_t = theta_base_t;
+            } else if (sector == sections[0]) {
+                theta_h = theta_base_h;
+            } else if (sector == sections[0] + sections[1]) {
+                theta_w = theta_base_w;
+            }
+        }
+
+        float theta = theta_t;
         if (sector < sections[1] + sections[0] && sector >= sections[0]) {
             theta = theta_h;
         }
@@ -11267,6 +11280,7 @@ static void ggml_mrope_cache_init(
         theta_t *= theta_scale;
         theta_w *= theta_scale;
         theta_h *= theta_scale;
+        prev_sector = sector;
     }
 }
 
@@ -11366,7 +11380,7 @@ static void ggml_compute_forward_rope_f32(
                 const int64_t p_h = pos[i2 + ne2];
                 const int64_t p_w = pos[i2 + ne2 * 2];
                 ggml_mrope_cache_init(
-                    p_t, p_h, p_w, sections,
+                    p_t, p_h, p_w, sections, sections[2] == 0,
                     freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
             }
@@ -11406,12 +11420,23 @@ static void ggml_compute_forward_rope_f32(
                     }
                 }
 
-                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                    float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                if (is_mrope) {
+                    // fill the remaining channels by repeating channels 0..n_dims
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 ++) {
+                        float * dst_data_0 = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+                        float * dst_data   = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                        dst_data[0] = dst_data_0[i0 % n_dims];
+                    }
+                } else {
+                    // fill the remaining channels with data from the src tensor
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
-                    dst_data[0] = src[0];
-                    dst_data[1] = src[1];
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
+                    }
                 }
             }
         }
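[Illustrative note, not part of the patch] The indep_sects flag, enabled above whenever sections[2] == 0 (the 2D vision case), makes ggml_mrope_cache_init restart the frequency ladder at each section boundary, so each section sweeps the full range of rotation frequencies from its own base theta instead of continuing where the previous section left off. A standalone sketch of that behavior under the {32, 32, 0} split from tmp_test_mrope_2d; the position ids p_t and p_h are made-up examples:

    // Standalone sketch of the indep_sects reset, mirroring the logic the
    // patch adds to ggml_mrope_cache_init. With {32, 32, 0} the kernel's
    // first section rotates by theta_t and the second by theta_h.
    #include <cmath>
    #include <cstdio>

    int main() {
        const int sections[3] = {32, 32, 0};
        const int sect_dims = sections[0] + sections[1] + sections[2]; // 64 sectors
        const int n_dims = 64;                                   // 128/2, as passed by the test
        const int ne0 = 128;                                     // channels per head in the test
        const float theta_scale = std::pow(1000000.0f, -2.0f / n_dims);
        const float p_t = 4.0f, p_h = 7.0f;                      // example position ids for one token
        float theta_t = p_t, theta_h = p_h;
        for (int i0 = 0; i0 < ne0; i0 += 2) {
            const int sector = (i0 / 2) % sect_dims;
            if (sector == 0)           theta_t = p_t;            // restart the t ladder
            if (sector == sections[0]) theta_h = p_h;            // restart the h ladder
            const float theta = sector < sections[0] ? theta_t : theta_h;
            printf("pair %2d uses %s, theta = %.6f\n", i0 / 2,
                   sector < sections[0] ? "t" : "h", theta);
            theta_t *= theta_scale;
            theta_h *= theta_scale;
        }
        return 0;
    }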
diff --git a/src/llama.cpp b/src/llama.cpp
index 995d6c8d6..2e42d206d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -12510,7 +12510,6 @@ struct llm_build_context {
 
     struct ggml_cgraph * build_qwen2vl() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -12529,6 +12528,7 @@ struct llm_build_context {
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -12560,14 +12560,14 @@ struct llm_build_context {
                 Qcur = ggml_mrope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_mrope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
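[Illustrative note, not part of the patch] The sections[3] array threaded through ggml_mrope_ext describes how the rotary channel pairs are divided among the three position components: pair i0/2 falls into sector (i0/2) % (sections[0]+sections[1]+sections[2]); sectors below sections[0] rotate by the temporal position, the next sections[1] sectors by height, and the rest by width. A standalone sketch of that mapping for the {16, 24, 24} split the patch hardcodes for the qwen2vl language model:

    // Standalone sketch: which position component (t/h/w) drives each rotary
    // channel pair under the {16, 24, 24} split used in build_qwen2vl above.
    #include <cstdio>

    int main() {
        const int sections[3] = {16, 24, 24};
        const int sect_dims = sections[0] + sections[1] + sections[2]; // 64 sectors
        const int n_dims = 128;                                        // head dim, as in the test
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const int sector = (i0 / 2) % sect_dims;
            const char * comp = sector < sections[0]               ? "t"
                              : sector < sections[0] + sections[1] ? "h"
                              :                                      "w";
            printf("pair %2d -> %s\n", i0 / 2, comp);
        }
        return 0;
    }

With n_dims equal to the section total times two, every sector is visited exactly once per head: pairs 0..15 rotate by the temporal id, 16..39 by the height id, and 40..63 by the width id.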