From 9d389a051be3675196a9128bed51145db8386449 Mon Sep 17 00:00:00 2001
From: HimariO
Date: Mon, 30 Sep 2024 22:30:02 +0800
Subject: [PATCH] Add vl-rope/2d-rope support for qwen2vl ViT

---
 examples/llava/qwen2vl-cli.cpp | 70 +++++++++++++++++++++++++++++++++-
 ggml/include/ggml.h            |  1 +
 ggml/src/ggml.c                | 49 ++++++++++++++++++------
 src/llama.cpp                  |  6 +--
 4 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index 181678f14..82583e750 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -463,9 +463,10 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
     for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
     memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
 
+    int sections[3] = {16, 24, 24};
     auto encode = ggml_mrope_ext(
         ctx0, inp_raw, pos, nullptr,
-        128, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
+        128, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
         0, 1, 32, 1);
 
     ggml_build_forward_expand(gf, encode);
@@ -490,6 +491,70 @@ static void tmp_test_mrope(struct llava_context * ctx_llava, gpt_params * params
     }
 }
 
+
+static void tmp_test_mrope_2d(struct llava_context * ctx_llava, gpt_params * params) {
+
+    int n_threads = 1;
+    static size_t buf_size = 512u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    struct ggml_init_params init_params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(init_params);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 128, 12, 30);
+    ggml_set_name(inp_raw, "inp_raw");
+    ggml_set_input(inp_raw);
+
+    std::vector<float> dummy_q;
+    dummy_q.resize(128 * 12 * 30);
+    std::fill(dummy_q.begin(), dummy_q.end(), 0.1f);
+    memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
+
+    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 30 * 3);
+    ggml_set_name(pos, "pos");
+    ggml_set_input(pos);
+
+    std::vector<int> pos_id;
+    pos_id.resize(90);
+    for (int i = 0;  i < 30; i ++) pos_id[i] = i;
+    for (int i = 30; i < 60; i ++) pos_id[i] = i - 30;
+    for (int i = 60; i < 90; i ++) pos_id[i] = i - 0;
+    memcpy(pos->data, pos_id.data(), 90 * ggml_element_size(pos));
+
+    int sections[3] = {32, 32, 0};
+    auto encode = ggml_mrope_ext(
+        ctx0, inp_raw, pos, nullptr,
+        128/2, sections, LLAMA_ROPE_TYPE_NEOX, 32768, 1000000, 1,
+        0, 1, 32, 1);
+
+    ggml_build_forward_expand(gf, encode);
+    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+
+    std::vector<float> embd;
+    embd.resize(128 * 12 * 30);
+    memcpy(
+        embd.data(),
+        (float *) ggml_get_data(encode),
+        sizeof(float) * 128 * 12 * 30);
+    ggml_free(ctx0);
+
+    std::ofstream outFile("mrope_2d.bin", std::ios::binary);
+    if (outFile.is_open()) {
+        outFile.write(reinterpret_cast<const char *>(embd.data()), embd.size() * sizeof(float));
+
+        outFile.close();
+        std::cout << "Data successfully written to mrope_2d.bin" << std::endl;
+    } else {
+        std::cerr << "Error opening file!" << std::endl;
+    }
+}
+
 /* ----------------------------------------------------------------------------------------------------------------- */
@@ -542,7 +607,8 @@ int main(int argc, char ** argv) {
 
         // process the prompt
         // tmp_test_conv2d_reshape(ctx_llava, &params);
         // tmp_test_rope(ctx_llava, &params);
-        tmp_test_mrope(ctx_llava, &params);
+        // tmp_test_mrope(ctx_llava, &params);
+        tmp_test_mrope_2d(ctx_llava, &params);
         // process_prompt(ctx_llava, nullptr, &params, params.prompt);
 
         llama_print_timings(ctx_llava->ctx_llama);
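[Illustrative note, not part of the patch] tmp_test_mrope_2d packs three position ids per token into one flat I32 tensor: pos[0..29] holds the temporal component, pos[30..59] the height component, and pos[60..89] the width component; the test simply fills them with ramps. For a real image the h/w ids would follow the patch grid. A minimal standalone sketch of that layout, assuming a hypothetical 6x5 patch grid for the 30 tokens:

    // Standalone sketch: how t/h/w position ids could be laid out for a real
    // image. The 6x5 grid is a made-up example; the patch's own test uses
    // simple ramps instead.
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_h = 6, n_w = 5, n_tok = n_h * n_w;
        std::vector<int> pos(3 * n_tok);
        for (int y = 0; y < n_h; y++) {
            for (int x = 0; x < n_w; x++) {
                const int i = y * n_w + x;     // token index, row-major over patches
                pos[i]             = 0;        // temporal id: one image, one timestep
                pos[i + n_tok]     = y;        // height id: patch row
                pos[i + 2 * n_tok] = x;        // width id: patch column
            }
        }
        for (int s = 0; s < 3; s++) {          // print the t, h, w sections in turn
            for (int i = 0; i < n_tok; i++) printf("%2d ", pos[s * n_tok + i]);
            printf("\n");
        }
        return 0;
    }

Each section has one entry per token, which is why ggml_compute_forward_rope_f32 below can read p_t, p_h and p_w for token i2 at pos[i2], pos[i2 + ne2] and pos[i2 + ne2 * 2].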
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 07d66af20..ff833a4fc 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1451,6 +1451,7 @@ extern "C" {
             struct ggml_tensor * b,
             struct ggml_tensor * c,
             int                  n_dims,
+            int                  sections[3],
             int                  mode,
             int                  n_ctx_orig,
             float                freq_base,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index a9518bf96..98a1110d7 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3559,6 +3559,7 @@ struct ggml_tensor * ggml_mrope_ext(
         struct ggml_tensor * b,
         struct ggml_tensor * c,
         int                  n_dims,
+        int                  sections[3],
         int                  mode,
         int                  n_ctx_orig,
         float                freq_base,
@@ -3568,8 +3569,6 @@ struct ggml_tensor * ggml_mrope_ext(
         float                beta_fast,
         float                beta_slow) {
 
-    int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
-
     GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
 
     GGML_ASSERT(ggml_is_vector(b));
@@ -3596,7 +3595,8 @@ struct ggml_tensor * ggml_mrope_ext(
     memcpy(params +  8, &attn_factor, sizeof(float));
     memcpy(params +  9, &beta_fast,   sizeof(float));
     memcpy(params + 10, &beta_slow,   sizeof(float));
-    memcpy(params + 11, &sections,    sizeof(int) * 3);
+    memcpy(&params[11], sections,     sizeof(int) * 3);
 
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
@@ -11238,7 +11238,7 @@ static void ggml_rope_cache_init(
 }
 
 static void ggml_mrope_cache_init(
-    float theta_base_t, float theta_base_h, float theta_base_w, int sections[3],
+    float theta_base_t, float theta_base_h, float theta_base_w, int sections[3], bool indep_sects,
     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
     float * cache, float sin_sign, float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -11246,12 +11246,25 @@ static void ggml_mrope_cache_init(
     float theta_h = theta_base_h;
     float theta_w = theta_base_w;
     int sect_dims = sections[0] + sections[1] + sections[2];
+    int prev_sector = -1;
 
     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
         const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
-        float theta = theta_t;
-        int sector = (i0 / 2) % sect_dims;
+        int sector = (i0 / 2) % sect_dims;
+        if (indep_sects) {
+            if (sector == 0) {
+                theta_t = theta_base_t;
+            } else if (sector == sections[0]) {
+                theta_h = theta_base_h;
+            } else if (sector == sections[0] + sections[1]) {
+                theta_w = theta_base_w;
+            }
+        }
+
+        float theta = theta_t;
         if (sector < sections[1] + sections[0] && sector >= sections[0]) {
             theta = theta_h;
         }
@@ -11267,6 +11280,7 @@ static void ggml_mrope_cache_init(
         theta_t *= theta_scale;
         theta_w *= theta_scale;
         theta_h *= theta_scale;
+        prev_sector = sector;
     }
 }
 
@@ -11366,7 +11380,7 @@ static void ggml_compute_forward_rope_f32(
                 const int64_t p_h = pos[i2 + ne2];
                 const int64_t p_w = pos[i2 + ne2 * 2];
                 ggml_mrope_cache_init(
-                    p_t, p_h, p_w, sections,
+                    p_t, p_h, p_w, sections, sections[2] == 0,
                     freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
             }
@@ -11406,12 +11420,23 @@ static void ggml_compute_forward_rope_f32(
                     }
                 }
 
-                for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
-                    const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                    float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                if (is_mrope) {
+                    // fill the remaining channels by repeating channels 0..n_dims
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 ++) {
+                        float * dst_data_0 = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+                        float * dst_data   = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                        dst_data[0] = dst_data_0[i0 % n_dims];
+                    }
+                } else {
+                    // fill the remaining channels with data from the src tensor
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
-                    dst_data[0] = src[0];
-                    dst_data[1] = src[1];
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
+                    }
                 }
             }
         }
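[Illustrative note, not part of the patch] The indep_sects flag, enabled above whenever sections[2] == 0 (the 2D vision case), makes ggml_mrope_cache_init restart the frequency ladder at each section boundary, so each section sweeps the full range of rotation frequencies from its own base theta instead of continuing where the previous section left off. A standalone sketch of that behavior under the {32, 32, 0} split from tmp_test_mrope_2d; the position ids p_t and p_h are made-up examples:

    // Standalone sketch of the indep_sects reset, mirroring the logic the
    // patch adds to ggml_mrope_cache_init. With {32, 32, 0} the kernel's
    // first section rotates by theta_t and the second by theta_h.
    #include <cmath>
    #include <cstdio>

    int main() {
        const int sections[3] = {32, 32, 0};
        const int sect_dims = sections[0] + sections[1] + sections[2]; // 64 sectors
        const int n_dims = 64;                                   // 128/2, as passed by the test
        const int ne0 = 128;                                     // channels per head in the test
        const float theta_scale = std::pow(1000000.0f, -2.0f / n_dims);
        const float p_t = 4.0f, p_h = 7.0f;                      // example position ids for one token
        float theta_t = p_t, theta_h = p_h;
        for (int i0 = 0; i0 < ne0; i0 += 2) {
            const int sector = (i0 / 2) % sect_dims;
            if (sector == 0)           theta_t = p_t;            // restart the t ladder
            if (sector == sections[0]) theta_h = p_h;            // restart the h ladder
            const float theta = sector < sections[0] ? theta_t : theta_h;
            printf("pair %2d uses %s, theta = %.6f\n", i0 / 2,
                   sector < sections[0] ? "t" : "h", theta);
            theta_t *= theta_scale;
            theta_h *= theta_scale;
        }
        return 0;
    }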
diff --git a/src/llama.cpp b/src/llama.cpp
index 995d6c8d6..2e42d206d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -12510,7 +12510,6 @@ struct llm_build_context {
 
     struct ggml_cgraph * build_qwen2vl() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -12529,6 +12528,7 @@ struct llm_build_context {
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        int sections[3] = {16, 24, 24}; // TODO: move this into gguf model file.
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -12560,14 +12560,14 @@ struct llm_build_context {
                 Qcur = ggml_mrope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_mrope_ext(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
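[Illustrative note, not part of the patch] The sections[3] array threaded through ggml_mrope_ext describes how the rotary channel pairs are divided among the three position components: pair i0/2 falls into sector (i0/2) % (sections[0]+sections[1]+sections[2]); sectors below sections[0] rotate by the temporal position, the next sections[1] sectors by height, and the rest by width. A standalone sketch of that mapping for the {16, 24, 24} split the patch hardcodes for the qwen2vl language model:

    // Standalone sketch: which position component (t/h/w) drives each rotary
    // channel pair under the {16, 24, 24} split used in build_qwen2vl above.
    #include <cstdio>

    int main() {
        const int sections[3] = {16, 24, 24};
        const int sect_dims = sections[0] + sections[1] + sections[2]; // 64 sectors
        const int n_dims = 128;                                        // head dim, as in the test
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const int sector = (i0 / 2) % sect_dims;
            const char * comp = sector < sections[0]               ? "t"
                              : sector < sections[0] + sections[1] ? "h"
                              :                                      "w";
            printf("pair %2d -> %s\n", i0 / 2, comp);
        }
        return 0;
    }

With n_dims equal to the section total times two, every sector is visited exactly once per head: pairs 0..15 rotate by the temporal id, 16..39 by the height id, and 40..63 by the width id.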