diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e1df2aa5e..ba28c07c6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -659,19 +659,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_qwen2vl_merger) {
         GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
         GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
-        
+
         auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
         inp = ggml_add(ctx0, inp, inp_1);
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
         inp = ggml_reshape_4d(
-            ctx0, inp, 
+            ctx0, inp,
             hidden_size * 2, patches_w / 2, patches_h, batch_size);
         inp = ggml_reshape_4d(
-            ctx0, inp, 
+            ctx0, inp,
             hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
         inp = ggml_reshape_3d(
-            ctx0, inp, 
+            ctx0, inp,
             hidden_size, patches_w * patches_h, batch_size);
     }
     else {
@@ -756,7 +756,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
             if (ctx->has_qwen2vl_merger) {
                 Q = ggml_rope_multi(
-                    ctx0, Q, positions, nullptr,
+                    ctx0, Q, positions, nullptr,
                     d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
             }
             Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -769,7 +769,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
             if (ctx->has_qwen2vl_merger) {
                 K = ggml_rope_multi(
-                    ctx0, K, positions, nullptr,
+                    ctx0, K, positions, nullptr,
                     d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
             }
             K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
@@ -1286,7 +1286,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

         idx = get_key_idx(ctx, KEY_USE_GELU);
         new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
-        
+
         try {
             idx = get_key_idx(ctx, KEY_USE_SILU);
             new_clip->use_silu = gguf_get_val_bool(ctx, idx);
@@ -2079,14 +2079,14 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             }
         }
         return true;
-    } 
+    }
     else if (ctx->has_qwen2vl_merger) {
         clip_image_u8 * resized = clip_image_u8_init();
         auto patch_size = clip_patch_size(ctx) * 2;
         int nx = ceil((float)img->nx / patch_size) * patch_size;
         int ny = ceil((float)img->ny / patch_size) * patch_size;
         bicubic_resize(*img, *resized, nx, ny);
-        
+
         res_imgs->data = new clip_image_f32[1];
         // clip_image_f32 * res = clip_image_f32_init();
         normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
@@ -2573,7 +2573,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             const int pw = image_size_width / patch_size;
             const int ph = image_size_height / patch_size;
             int* positions_data = (int*)malloc(ggml_nbytes(positions));
-            
+
             int ptr = 0;
             for (int y = 0; y < ph; y+=2)
             {
@@ -2590,7 +2590,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                     }
                 }
             }
-            
+
             ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
             free(positions_data);
         }
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 80fba1c51..16f30c56c 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -263,7 +263,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         std::vector<float *> image_embd_v;
         image_embd_v.resize(img_res_v.size);
         struct clip_image_size * load_image_size = clip_image_size_init();
-        
+
         for (size_t i = 0; i < img_res_v.size; i++) {
             const int64_t t_img_enc_step_start_us = ggml_time_us();
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
@@ -271,7 +271,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             load_image_size->width = img_res_v.data[i].nx;
             load_image_size->height = img_res_v.data[i].ny;
             clip_add_load_image_size(ctx_clip, load_image_size);
-            
+
             bool encoded = false;
             if (clip_is_qwen2vl(ctx_clip)) {
                 encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
@@ -285,7 +285,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                     encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
                 }
             }
-            
+
             if (!encoded) {
                 LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
@@ -299,8 +299,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         int n_img_pos_out = 0;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
             std::memcpy(
-                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), 
-                image_embd_v[i], 
+                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
+                image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
             n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
         }
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index a65687a8a..e86a60280 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -25,7 +25,7 @@
 #include

-static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, 
+static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
     int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));
     const int patch_size = 14 * 2;
@@ -35,7 +35,7 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
     // llama_pos mrope_pos[img_tokens * 4];
     std::vector<llama_pos> mrope_pos;
     mrope_pos.resize(img_tokens * 4);
-    
+
     for (int y = 0; y < ph; y++)
     {
         for (int x = 0; x < pw; x++)
@@ -45,14 +45,14 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
             mrope_pos[i + img_tokens] = *st_pos_id + y;
             mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
             mrope_pos[i + img_tokens * 3] = 0;
-        } 
+        }
     }
     *st_pos_id += std::max(pw, ph);

     int processed = 0;
     std::vector<llama_pos> batch_mrope_pos;
     batch_mrope_pos.resize(img_tokens * 4);
-    
+
     for (int i = 0; i < img_tokens; i += n_batch) {
         int n_eval = img_tokens - i;
         if (n_eval > n_batch) {
@@ -65,7 +65,7 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
         memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
         memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
         memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
-        
+
         llama_batch batch = {
             int32_t(n_eval),                // n_tokens
             nullptr,                        // token
@@ -75,7 +75,7 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
             nullptr,                        // seq_id
             nullptr,                        // logits
         };
-        
+
         if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
@@ -103,7 +103,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector
    outFile.write(reinterpret_cast<const char *>(result_data), ggml_nbytes(result));
    outFile.close();
@@ -493,7 +493,7 @@ static void debug_dump_img_embed(struct llava_context * ctx_llava) {
        for (int c = 0; c < 3; c++)
            vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
    }
-    
+
    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());

    std::ofstream outFile("img_embed.bin", std::ios::binary);
@@ -547,7 +547,7 @@ int main(int argc, char ** argv) {
 #ifndef NDEBUG
    } else if (params.image[0].empty()) {
        auto ctx_llava = llava_init_context(&params, model);
-        
+
        debug_test_mrope_2d();
        debug_dump_img_embed(ctx_llava);

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index fd414027c..67e67a089 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -9146,10 +9146,10 @@ static void ggml_mrope_cache_init(
    int sec_w = sections[1] + sections[0];
    int sec_e = sections[2] + sec_w;
    GGML_ASSERT(sect_dims <= ne0);
-    
+
    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
-        
+
        int sector = (i0 / 2) % sect_dims;
        if (indep_sects) {
            // compute theta independently for each dim sections
@@ -9171,14 +9171,14 @@ static void ggml_mrope_cache_init(
        float theta = theta_t;
        if (sector >= sections[0] && sector < sec_w) {
            theta = theta_h;
-        } 
+        }
        else if (sector >= sec_w && sector < sec_w + sections[2]) {
            theta = theta_w;
        }
        else if (sector >= sec_w + sections[2]) {
            theta = theta_e;
        }
-        
+
        rope_yarn(
            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
        );
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index e1a5361b5..fc3cabfb2 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -147,7 +147,7 @@ static __global__ void rope_multi(
    }
    else if (sector >= sections.v[0] && sector < sec_w) {
        theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
-    } 
+    }
    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
        theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
    }
@@ -196,7 +196,7 @@ static __global__ void rope_vision(
    else if (sector >= sections.v[0] && sector < sec_w) {
        const int p = sector - sections.v[0];
        theta_base = pos[i2 + ne2]*powf(theta_scale, p);
-    } 
+    }

    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
diff --git a/src/llama.cpp b/src/llama.cpp
index 8a3c4e605..116272814 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3400,7 +3400,7 @@ struct llama_context {

    // whether we are computing encoder output or decoder output
    bool is_encoding = false;
-    
+
    // TODO: find a better way to accommodate mutli-dimension position encoding methods
    // number of position id each token get, 1 for each token in most cases.
    // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
@@ -12592,7 +12592,7 @@ struct llm_build_context {

        return gf;
    }
-    
+
    struct ggml_cgraph * build_qwen2vl() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
        const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12644,7 +12644,7 @@ struct llm_build_context {
                cb(Vcur, "Vcur", il);

                Qcur = ggml_rope_multi(
-                    ctx0, 
+                    ctx0,
                    ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
@@ -20168,7 +20168,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_MINICPM3:
            return LLAMA_ROPE_TYPE_NEOX;
-        
+
        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index b454cd917..b9454ba59 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2200,10 +2200,10 @@ struct test_rope : public test_case {
            ggml_set_param(ctx, a);
            ggml_set_name(a, "a");
        }
-        
+
        const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
        const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
-        
+
        ggml_tensor * pos;
        if (is_mrope || is_vision) {
            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4);
@@ -3834,7 +3834,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
                test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
            }
-            
+
            if (all) {
                test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 2B)
                test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 7B)
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index b54e3b21e..322b8bb99 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -178,7 +178,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
-            
+
            int sections[4] = {16, 24, 24, 0};
            mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;

@@ -189,7 +189,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
                    ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
                }
            }
-            
+
            // [[100, 101, 102, ..., 172],
            //  [101, 102, 103, ..., 173],
            //  [102, 103, 104, ..., 174]]
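
For reference, not part of the patch: a minimal standalone sketch of the M-RoPE position layout that qwen2vl_eval_image_embed builds in the qwen2vl-cli.cpp hunk at line 45 above, i.e. one flat buffer of img_tokens * 4 position ids holding the temporal, height, width, and unused sections back to back. The grid size, starting position id, and the printf are made up for illustration.

// Sketch only (not from the patch): M-RoPE position layout for a pw x ph grid of
// merged image patches, mirroring qwen2vl_eval_image_embed. Example sizes are hypothetical.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int pw = 4, ph = 3;                // merged patch grid (hypothetical)
    const int img_tokens = pw * ph;
    int st_pos_id = 10;                      // next position id after the prompt text

    // Four sections of img_tokens entries each: temporal, height, width, unused.
    std::vector<int> mrope_pos(img_tokens * 4);
    for (int y = 0; y < ph; y++) {
        for (int x = 0; x < pw; x++) {
            const int i = y * pw + x;
            mrope_pos[i                 ] = st_pos_id;      // temporal: constant for a still image
            mrope_pos[i + img_tokens    ] = st_pos_id + y;  // height coordinate
            mrope_pos[i + img_tokens * 2] = st_pos_id + x;  // width coordinate
            mrope_pos[i + img_tokens * 3] = 0;              // fourth section stays zero
        }
    }

    // Text that follows the image resumes after the larger of the two extents.
    st_pos_id += std::max(pw, ph);
    std::printf("image tokens: %d, next text position id: %d\n", img_tokens, st_pos_id);
    return 0;
}

This is the same per-dimension split that the batch positions expose to the kernels: the CUDA rope_multi change above reads the t/h/w components as pos[i2 + ne2 * 0..2], and sections[] decides which component drives theta for a given rotary dimension.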