Add vl-rope/2d-rope support for qwen2vl ViT

HimariO 2024-09-30 22:30:02 +08:00
parent 35411963d2
commit 9d389a051b
4 changed files with 109 additions and 17 deletions
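For orientation: M-RoPE splits the rotary channel pairs into three sections that rotate with the temporal (t), height (h) and width (w) position components. This commit turns the section sizes into an explicit parameter so the qwen2vl vision encoder can use a 2D layout ({32, 32, 0}) instead of the LLM's {16, 24, 24}. A minimal sketch of the section-to-axis mapping, with an illustrative helper that is not part of ggml:

#include <array>

// Illustrative only: which position axis drives the rotation of a given channel
// pair for a 3-entry section table. Mirrors the sector logic that
// ggml_mrope_cache_init uses further down in this diff.
enum class PosAxis { T, H, W };

static PosAxis mrope_axis_for_pair(int pair_idx, const std::array<int, 3> & sections) {
    const int sect_dims = sections[0] + sections[1] + sections[2];
    const int sector    = pair_idx % sect_dims;
    if (sector < sections[0])               return PosAxis::T;  // temporal pairs
    if (sector < sections[0] + sections[1]) return PosAxis::H;  // height pairs
    return PosAxis::W;                                          // width pairs
}

With {16, 24, 24} this assigns pairs 0-15 to t, 16-39 to h and 40-63 to w; with {32, 32, 0} the first 32 pairs follow the first position component and the next 32 the second.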

View file

@@ -463,9 +463,10 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
int sections[3] = {16, 24, 24};
auto encode = ggml_mrope_ext(
ctx0, inp_raw, pos, nullptr,
128, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
128, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
0, 1, 32, 1);
ggml_build_forward_expand(gf, encode);
@@ -490,6 +491,70 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
}
}
static void tmp_test_mrope_2d(struct llava_context * ctx_llava, gpt_params * params) {
int n_threads = 1;
static size_t buf_size = 512u*1024*1024;
static void * buf = malloc(buf_size);
struct ggml_init_params init_params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(init_params);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 128, 12, 30);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
std::vector<float> dummy_q;
dummy_q.resize(128 * 12 * 30);
std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
struct ggml_tensor * pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 30 * 3);
ggml_set_name(pos, "pos");
ggml_set_input(pos);
std::vector<int> pos_id;
pos_id.resize(90);
for (int i = 0; i < 30; i ++) pos_id[i] = i;
for (int i = 30; i < 60; i ++) pos_id[i] = i - 30;
for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
int sections[3] = {32, 32, 0};
auto encode = ggml_mrope_ext(
ctx0, inp_raw, pos, nullptr,
128/2, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
0, 1, 32, 1);
ggml_build_forward_expand(gf, encode);
ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
std::vector<float> embd;
embd.resize(128 * 12 * 30);
memcpy(
embd.data(),
(float *) ggml_get_data(encode),
sizeof(float) * 128 * 12 * 30);
ggml_free(ctx0);
std::ofstream outFile("mrope_2d.bin", std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(embd.data()), embd.size() * sizeof(float));
outFile.close();
std::cout << "Data successfully written to mrope_2d.bin" << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
}
/*
-----------------------------------------------------------------------------------------------------------------
*/
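tmp_test_mrope_2d dumps the rotated tensor so it can be diffed against a reference implementation offline. A small stand-alone reader for that dump, assuming the same 128 x 12 x 30 float layout used above (a sketch, not part of the commit):

#include <cstdio>
#include <fstream>
#include <vector>

int main() {
    // mrope_2d.bin is written by tmp_test_mrope_2d as raw floats
    std::ifstream in("mrope_2d.bin", std::ios::binary);
    if (!in) { std::fprintf(stderr, "cannot open mrope_2d.bin\n"); return 1; }

    std::vector<float> embd(128 * 12 * 30);
    in.read(reinterpret_cast<char *>(embd.data()), embd.size() * sizeof(float));

    std::printf("read %zu floats, first value %f\n", embd.size(), (double) embd[0]);
    return 0;
}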
@@ -542,7 +607,8 @@ int main(int argc, char ** argv) {
// process the prompt
// tmp_test_conv2d_reshape(ctx_llava, &params);
// tmp_test_rope(ctx_llava, &params);
tmp_test_mrope(ctx_llava, &params);
// tmp_test_mrope(ctx_llava, &params);
tmp_test_mrope_2d(ctx_llava, &params);
// process_prompt(ctx_llava, nullptr, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);

View file

@@ -1451,6 +1451,7 @@ extern "C" {
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[3],
int mode,
int n_ctx_orig,
float freq_base,

View file

@@ -3559,6 +3559,7 @@ struct ggml_tensor * ggml_mrope_ext(
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[3],
int mode,
int n_ctx_orig,
float freq_base,
@@ -3568,8 +3569,6 @@ struct ggml_tensor * ggml_mrope_ext(
float beta_fast,
float beta_slow) {
int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
GGML_ASSERT(ggml_is_vector(b));
@@ -3596,7 +3595,8 @@ struct ggml_tensor * ggml_mrope_ext(
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
memcpy(params + 11, &sections, sizeof(int) * 3);
memcpy(&params[11], sections, sizeof(int)*3);
// memcpy(params + 11, sections, sizeof(int)*3);
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE;
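The section table is packed into the op's parameter block at 32-bit slot 11, right after the eleven scalars the regular rope path already stores in slots 0-10. The matching read on the compute side is not shown in this diff; it would presumably look something like the helper below (an assumption, not the committed code):

#include <cstdint>
#include <cstring>

// Presumed read-back of the section table from a GGML_OP_ROPE node's op_params,
// mirroring the memcpy into params + 11 above.
static void rope_get_sections(const int32_t * op_params, int sections[3]) {
    std::memcpy(sections, op_params + 11, sizeof(int) * 3);
}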
@@ -11238,7 +11238,7 @@ static void ggml_rope_cache_init(
}
static void ggml_mrope_cache_init(
float theta_base_t, float theta_base_h, float theta_base_w, int sections[3],
float theta_base_t, float theta_base_h, float theta_base_w, int sections[3], bool indep_sects,
float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
float * cache, float sin_sign, float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -11246,12 +11246,25 @@ static void ggml_mrope_cache_init(
float theta_h = theta_base_h;
float theta_w = theta_base_w;
int sect_dims = sections[0] + sections[1] + sections[2];
int prev_sector = -1;
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
float theta = theta_t;
int sector = (i0 / 2) % sect_dims;
int sector = (i0 / 2) % sect_dims;
if (indep_sects) {
if (sector == 0) {
theta_t = theta_base_t;
}
else if (sector == sections[0]) {
theta_h = theta_base_h;
}
else if (sector == sections[1]) {
theta_w = theta_base_w;
}
}
float theta = theta_t;
if (sector < sections[1] + sections[0] && sector >= sections[0]) {
theta = theta_h;
}
@@ -11267,6 +11280,7 @@ static void ggml_mrope_cache_init(
theta_t *= theta_scale;
theta_w *= theta_scale;
theta_h *= theta_scale;
prev_sector = sector;
}
}
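The new indep_sects path (enabled below when sections[2] == 0, i.e. the ViT's 2D layout) restarts the running angle at each section boundary, so the h channels start from theta_base_h rather than from an angle that has already been decayed sections[0] times. A toy walk over the loop above with made-up values (not the ggml code):

#include <cstdio>

int main() {
    const int   sections[3]  = {2, 2, 0};  // tiny 2D layout: 2 t-pairs then 2 h-pairs
    const int   sect_dims    = sections[0] + sections[1] + sections[2];
    const float theta_scale  = 0.5f;       // illustrative per-pair decay
    const bool  indep_sects  = sections[2] == 0;
    const float theta_base_t = 7.0f;       // pretend temporal position 7
    const float theta_base_h = 3.0f;       // pretend height position 3

    float theta_t = theta_base_t;
    float theta_h = theta_base_h;
    for (int sector = 0; sector < sect_dims; ++sector) {
        if (indep_sects && sector == sections[0]) {
            theta_h = theta_base_h;        // restart the h ladder at its base angle
        }
        const float theta = sector < sections[0] ? theta_t : theta_h;
        std::printf("sector %d -> theta %.3f\n", sector, theta);
        theta_t *= theta_scale;            // both ladders decay every pair,
        theta_h *= theta_scale;            // which is why the reset matters
    }
    // prints 7.000, 3.500, 3.000, 1.500; without the reset the last two
    // values would be 0.750 and 0.375
    return 0;
}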
@@ -11366,7 +11380,7 @@ static void ggml_compute_forward_rope_f32(
const int64_t p_h = pos[i2 + ne2];
const int64_t p_w = pos[i2 + ne2 * 2];
ggml_mrope_cache_init(
p_t, p_h, p_w, sections,
p_t, p_h, p_w, sections, sections[2] == 0,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
@@ -11406,12 +11420,23 @@ static void ggml_compute_forward_rope_f32(
}
}
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
if (is_mrope) {
// fill the remaining channels by repeating channels 0..n_dims-1
for (int64_t i0 = n_dims; i0 < ne0; i0 ++) {
float * dst_data_0 = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = dst_data_0[i0 % n_dims];
}
}
else {
// fill the remaining channels with data from the src tensor
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
}
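For the mrope case the channels past n_dims are no longer copied from the source tensor; they repeat the already-rotated channels 0..n_dims-1. A stand-alone illustration of that indexing on a single row, with toy sizes:

#include <cassert>
#include <vector>

int main() {
    const int n_dims = 4, ne0 = 8;                      // toy sizes
    std::vector<float> row = {0.f, 1.f, 2.f, 3.f,       // "rotated" channels
                              -1.f, -1.f, -1.f, -1.f};  // tail to be filled
    for (int i0 = n_dims; i0 < ne0; ++i0) {
        row[i0] = row[i0 % n_dims];                     // repeat channels 0..n_dims-1
    }
    assert(row[4] == 0.f && row[7] == 3.f);
    return 0;
}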

View file

@@ -12510,7 +12510,6 @@ struct llm_build_context {
struct ggml_cgraph * build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -12529,6 +12528,7 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -12560,14 +12560,14 @@ struct llm_build_context {
Qcur = ggml_mrope_ext(
ctx0,
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_mrope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
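build_qwen2vl now feeds inp_pos with three position values per token, which the kernel reads as pos[i2], pos[i2 + ne2] and pos[i2 + ne2*2]. How a caller fills that buffer is outside this commit; the sketch below shows one plausible layout, with text tokens sharing a single index across all three axes and image patches getting a fixed t plus their grid row and column (names and offsets are illustrative assumptions, not taken from the diff):

#include <cstdint>
#include <vector>

// Illustrative only: build a [t..., h..., w...] position buffer of 3 * n_tokens
// entries for n_text text tokens followed by a grid_h x grid_w image.
static std::vector<int32_t> build_mrope_positions(int n_text, int grid_h, int grid_w) {
    const int n_tokens = n_text + grid_h * grid_w;
    std::vector<int32_t> pos(3 * n_tokens);
    for (int i = 0; i < n_text; ++i) {
        pos[i] = pos[i + n_tokens] = pos[i + 2 * n_tokens] = i;   // text: t = h = w
    }
    for (int r = 0; r < grid_h; ++r) {
        for (int c = 0; c < grid_w; ++c) {
            const int i = n_text + r * grid_w + c;
            pos[i]                = n_text;       // t: one step for the whole image
            pos[i + n_tokens]     = n_text + r;   // h: patch row
            pos[i + 2 * n_tokens] = n_text + c;   // w: patch column
        }
    }
    return pos;
}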