From a02a190756aa8e1a1f3db4a2e1d45ab0d3caad67 Mon Sep 17 00:00:00 2001
From: HimariO
Date: Fri, 13 Dec 2024 21:31:51 +0800
Subject: [PATCH] minor updates

---
 README.md                          |  1 +
 examples/llava/qwen2_vl_surgery.py |  9 +++++++--
 ggml/src/ggml-cuda/rope.cu         |  1 -
 ggml/src/ggml.c                    |  2 +-
 src/llama.cpp                      | 11 ++++++-----
 5 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 6fdd8d9ee..ddb932ffc 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py
index 56d933fde..464ab80d3 100644
--- a/examples/llava/qwen2_vl_surgery.py
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -50,7 +50,6 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
             tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
             tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
             tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
-            # breakpoint()
         elif 'merger' in name:
             if name.endswith("ln_q.weight"):
                 tensor_map['v.post_ln.weight'] = ten
@@ -97,7 +96,12 @@ def main(args):
     cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
     vcfg = cfg.vision_config
 
-    fname_out = "qwen2vl-vision.gguf"
+    if os.path.isdir(model_name):
+        if model_name.endswith(os.sep):
+            model_name = model_name[:-1]
+        model_name = os.path.basename(model_name)
+    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
+
     fout = GGUFWriter(path=fname_out, arch="clip")
 
     fout.add_description("image encoder for Qwen2VL")
@@ -143,6 +147,7 @@ def main(args):
     fout.write_kv_data_to_file()
     fout.write_tensors_to_file()
     fout.close()
+    print("save model as: ", fname_out)
 
 
 if __name__ == "__main__":
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index fc3cabfb2..2c84778d2 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -397,7 +397,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int mode = ((int32_t *) dst->op_params)[2];
     //const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
-    // int sections[4];
     mrope_sections sections;
 
     // RoPE alteration for extended context
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index aa4c952c9..51cc85662 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3528,7 +3528,7 @@ static struct ggml_tensor * ggml_rope_impl(
     memcpy(params + 8, &attn_factor, sizeof(float));
     memcpy(params + 9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
-    memcpy(params + 11, &sections, sizeof(int) * 4);
+    memcpy(params + 11, &sections, sizeof(int)*4);
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
diff --git a/src/llama.cpp b/src/llama.cpp
index 6349b8019..cb61d4802 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2558,6 +2558,9 @@ struct llama_hparams {
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
         if (this->rope_sections != other.rope_sections) return true;
+        if (!std::equal(std::begin(this->rope_sections),
+                        std::end(this->rope_sections),
+                        std::begin(other.rope_sections))) return true;
 
         if (this->ssm_d_conv != other.ssm_d_conv) return true;
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -3421,8 +3424,6 @@ struct llama_context {
     struct ggml_tensor * inp_tokens;      // I32 [n_batch]
     struct ggml_tensor * inp_embd;        // F32 [n_embd, n_batch]
    struct ggml_tensor * inp_pos;         // I32 [n_batch]
-    struct ggml_tensor * inp_pos_w;       // I32 [n_batch] second-dimension of m-rope position index
-    struct ggml_tensor * inp_pos_h;       // I32 [n_batch] third-dimension of m-rope position index
     struct ggml_tensor * inp_out_ids;     // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask;     // F32 [kv_size, n_batch]
     struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
@@ -12606,7 +12607,6 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        // struct ggml_tensor * inp_pos = build_inp_pos();
         lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
         cb(lctx.inp_pos, "inp_pos", -1);
         ggml_set_input(lctx.inp_pos);
@@ -12646,14 +12646,15 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_multi(
                     ctx0,
-                    ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, 
+                    ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_multi(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
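
Reviewer note (not part of the patch): the hunks above route a 4-entry sections array into ggml_rope_multi and allocate inp_pos with n_tokens * 4 I32 slots, i.e. four position streams per token for the multimodal rotary embedding (M-RoPE). The standalone C++ sketch below illustrates one plausible reading of that layout, assuming positions are stored stream-major (all temporal positions, then height, then width, then a spare stream) and that rotary dimension pairs are assigned to the four sections contiguously. The helper name section_of_dim_pair and the example section split are illustrative assumptions, not ggml API.

// mrope_layout_sketch.cpp - standalone illustration, not ggml code.
// Assumes stream-major storage of the n_tokens*4 position buffer and a
// contiguous split of rotary dimension pairs across the four sections
// in the order {temporal, height, width, extra}.
#include <cstdio>

// Return which section (0..3) a rotary dimension pair falls into,
// given the per-section pair counts.
static int section_of_dim_pair(int pair, const int sections[4]) {
    int bound = 0;
    for (int s = 0; s < 4; ++s) {
        bound += sections[s];
        if (pair < bound) {
            return s;
        }
    }
    return 3; // pairs past the declared sections reuse the last stream
}

int main() {
    const int sections[4] = {16, 24, 24, 0}; // hypothetical split of 64 pairs
    const int n_tokens    = 2;

    // Mirrors the patch's inp_pos of n_tokens*4 I32 values:
    // stream 0 = temporal, 1 = height, 2 = width, 3 = unused here.
    const int inp_pos[4 * 2] = {
        0, 1, // temporal positions of tokens 0 and 1
        0, 0, // height positions
        0, 1, // width positions
        0, 0, // extra stream
    };

    const int probes[3] = {0, 20, 50}; // a few dimension pairs to inspect
    for (int tok = 0; tok < n_tokens; ++tok) {
        for (int pair : probes) {
            const int s = section_of_dim_pair(pair, sections);
            printf("token %d, dim pair %2d -> section %d, position %d\n",
                   tok, pair, s, inp_pos[s * n_tokens + tok]);
        }
    }
    return 0;
}

The example split of {16, 24, 24, 0} is chosen to match the [16, 24, 24] mrope_section entry published in Qwen2-VL's Hugging Face config (64 pairs for a head dimension of 128), which would leave the fourth stream unused in practice.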