From fd2c58286aaeb4ed51d6b963344a6d2584e25ab5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E4=B8=BA?= <liwei@liweideMac-Studio.local>
Date: Thu, 21 Nov 2024 20:10:27 +0800
Subject: [PATCH 1/5] replace reference parameters with pointers in extern C
 API of qwen2-audio examples

---
 examples/qwen2-audio/qwen2-cli.cpp | 12 ++++++---
 examples/qwen2-audio/qwen2.cpp     | 40 ++++++++++++++++++++++--------
 examples/qwen2-audio/qwen2.h       | 29 +++++-----------------
 3 files changed, 44 insertions(+), 37 deletions(-)
diff --git a/examples/qwen2-audio/qwen2-cli.cpp b/examples/qwen2-audio/qwen2-cli.cpp
index 9221780a9..1d139c5a5 100644
--- a/examples/qwen2-audio/qwen2-cli.cpp
+++ b/examples/qwen2-audio/qwen2-cli.cpp
@@ -1,9 +1,13 @@
 #include "qwen2.h"
+#include <iostream>
+
+using std::cout;
+using std::endl;
 
 int main(int argc, char **argv)
 {
 
-    omni_context_params ctx_params = omni_context_default_params();
+    omni_context_params * ctx_params = omni_context_default_params();
     if (!omni_context_params_parse(argc, argv, ctx_params))
     {
         return 1;
@@ -11,9 +15,11 @@ int main(int argc, char **argv)
 
     omni_context *ctx_omni = omni_init_context(ctx_params);
 
-    omni_process_full(ctx_omni, ctx_params);
+    auto* ret_str  = omni_process_full(ctx_omni, ctx_params);
+    cout << "RET: " << ret_str << endl;
+
 
     omni_free(ctx_omni);
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/examples/qwen2-audio/qwen2.cpp b/examples/qwen2-audio/qwen2.cpp
index 8a08a7ac6..02d3c94d8 100644
--- a/examples/qwen2-audio/qwen2.cpp
+++ b/examples/qwen2-audio/qwen2.cpp
@@ -27,10 +27,26 @@ void* internal_chars = nullptr;
 
 static const char *AUDIO_TOKEN = "<|AUDIO|>";
 
+struct omni_context_params
+{
+    const char *model;
+    const char *mmproj;
+    const char *file;
+    const char *prompt;
+    int32_t n_gpu_layers;
+};
+
+struct omni_context
+{
+    struct whisper_context *ctx_whisper;
+    struct audio_projector *projector;
+    struct llama_context *ctx_llama;
+    struct llama_model *model;
+};
+
 //
 // Whisper
 //
-
 struct whisper_params
 {
     int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
@@ -476,8 +492,9 @@ static void omni_print_usage(int, char **argv)
     LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
-bool omni_context_params_parse(int argc, char **argv, omni_context_params &params)
+bool omni_context_params_parse(int argc, char **argv, omni_context_params * in_params)
 {
+    auto& params = *in_params;
     for (int i = 1; i < argc; i++)
     {
         std::string arg = argv[i];
@@ -523,15 +540,15 @@ bool omni_context_params_parse(int argc, char **argv, omni_context_params &param
     return true;
 }
 
-omni_context_params omni_context_default_params()
+omni_context_params * omni_context_default_params()
 {
-    omni_context_params params;
+    static omni_context_params params;
     params.model = "";
     params.mmproj = "";
     params.file = "";
     params.prompt = "this conversation talks about";
     params.n_gpu_layers = -1;
-    return params;
+    return &params;
 }
 
 struct omni_params
@@ -540,8 +557,9 @@ struct omni_params
     whisper_params whisper;
 };
 
-bool omni_params_parse(int argc, char **argv, omni_params &params)
+bool omni_params_parse(int argc, char **argv, omni_params * in_params)
 {
+    auto& params = *in_params;
     if (!gpt_params_parse(argc, argv, params.gpt))
     {
         return false;
@@ -564,8 +582,9 @@ bool omni_params_parse(int argc, char **argv, omni_params &params)
     return true;
 }
 
-static omni_params get_omni_params_from_context_params(omni_context_params &params)
+static omni_params get_omni_params_from_context_params(omni_context_params * in_params)
 {
+    auto& params = *in_params;
     omni_params all_params;
 
     // Initialize gpt params
@@ -639,10 +658,9 @@ static size_t find_audio_token(const std::string &prompt)
     return prompt.find(AUDIO_TOKEN);
 }
 
-struct omni_context *omni_init_context(omni_context_params &params)
+struct omni_context *omni_init_context(omni_context_params * in_params)
 {
-
-    omni_params all_params = get_omni_params_from_context_params(params);
+    omni_params all_params = get_omni_params_from_context_params(in_params);
 
     // llama
     LLAMA_LOG_INFO("------- llama --------\n");
@@ -877,7 +895,7 @@ const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audi
     return (const char*)(internal_chars);
 }
 
-const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
+const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params * params)
 {
     omni_params all_params = get_omni_params_from_context_params(params);
 
diff --git a/examples/qwen2-audio/qwen2.h b/examples/qwen2-audio/qwen2.h
index dcadb4288..61538b837 100644
--- a/examples/qwen2-audio/qwen2.h
+++ b/examples/qwen2-audio/qwen2.h
@@ -29,34 +29,17 @@
 extern "C" {
 #endif
 
-struct omni_context_params
-{
-    const char *model;
-    const char *mmproj;
-    const char *file;
-    const char *prompt;
-    int32_t n_gpu_layers;
-};
+OMNI_AUDIO_API bool omni_context_params_parse(int argc, char **argv, struct omni_context_params * params);
 
-struct omni_context
-{
-    struct whisper_context *ctx_whisper;
-    struct audio_projector *projector;
-    struct llama_context *ctx_llama;
-    struct llama_model *model;
-};
+OMNI_AUDIO_API struct omni_context_params * omni_context_default_params();
 
-OMNI_AUDIO_API bool omni_context_params_parse(int argc, char **argv, omni_context_params &params);
+OMNI_AUDIO_API struct omni_context * omni_init_context(struct omni_context_params * params);
 
-OMNI_AUDIO_API omni_context_params omni_context_default_params();
-
-OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &params);
-
-OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);
+OMNI_AUDIO_API void omni_free(struct omni_context * ctx_omni);
 
 OMNI_AUDIO_API const char* omni_process_full(
-    struct omni_context *ctx_omni,
-    omni_context_params &params
+    struct omni_context * ctx_omni,
+    struct omni_context_params * params
 );
 
 #ifdef __cplusplus

From 7589158595091a88b7844c83569f68c780469d5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E4=B8=BA?= <liwei@liweideMac-Studio.local>
Date: Thu, 21 Nov 2024 20:44:49 +0800
Subject: [PATCH 2/5] expose omni_context_params struct

---
 examples/qwen2-audio/qwen2.cpp | 8 --------
 examples/qwen2-audio/qwen2.h   | 9 +++++++++
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/qwen2-audio/qwen2.cpp b/examples/qwen2-audio/qwen2.cpp
index 02d3c94d8..d172ce087 100644
--- a/examples/qwen2-audio/qwen2.cpp
+++ b/examples/qwen2-audio/qwen2.cpp
@@ -27,14 +27,6 @@ void* internal_chars = nullptr;
 
 static const char *AUDIO_TOKEN = "<|AUDIO|>";
 
-struct omni_context_params
-{
-    const char *model;
-    const char *mmproj;
-    const char *file;
-    const char *prompt;
-    int32_t n_gpu_layers;
-};
 
 struct omni_context
 {
diff --git a/examples/qwen2-audio/qwen2.h b/examples/qwen2-audio/qwen2.h
index 61538b837..c0894cb1a 100644
--- a/examples/qwen2-audio/qwen2.h
+++ b/examples/qwen2-audio/qwen2.h
@@ -29,6 +29,15 @@
 extern "C" {
 #endif
 
+struct omni_context_params
+{
+    const char *model;
+    const char *mmproj;
+    const char *file;
+    const char *prompt;
+    int32_t n_gpu_layers;
+};
+
 OMNI_AUDIO_API bool omni_context_params_parse(int argc, char **argv, struct omni_context_params * params);
 
 OMNI_AUDIO_API struct omni_context_params * omni_context_default_params();

From be54cb02ff14354ac78dd8ec8a9efa170475b00d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E4=B8=BA?= <liwei@liweideMac-mini.local>
Date: Tue, 3 Dec 2024 11:47:28 +0800
Subject: [PATCH 3/5] fix double gguf_free(ctx_gguf) in
 load_hparams_and_tensors_from_gguf()

---
 common/common-nexa.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common/common-nexa.cpp b/common/common-nexa.cpp
index cda4706b7..e774fc505 100644
--- a/common/common-nexa.cpp
+++ b/common/common-nexa.cpp
@@ -152,7 +152,6 @@ bool load_hparams_and_tensors_from_gguf(const std::string &fname, NexaBaseModel
 
     ggml_free(meta);
     gguf_free(ctx_gguf);
-    gguf_free(ctx_gguf);
     return true;
 }
 

From ca7e8ef19e1e3ca1558d64e184218e83294ebb5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E4=B8=BA?= <liwei@liweideMac-mini.local>
Date: Tue, 3 Dec 2024 14:54:52 +0800
Subject: [PATCH 4/5] fix clip_n_patches() allocation size error for 81-series
 omni-vlm models

---
 examples/omni-vlm/clip.cpp | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/examples/omni-vlm/clip.cpp b/examples/omni-vlm/clip.cpp
index 73abe85a8..ad2d8c102 100644
--- a/examples/omni-vlm/clip.cpp
+++ b/examples/omni-vlm/clip.cpp
@@ -39,12 +39,12 @@
 #include <sstream>
 #include <cinttypes>
 #include <limits>
-// #include <iostream>
+#include <iostream>
 // #include <fstream>
 // #include <fstream>
 
-// using std::cout;
-// using std::endl;
+using std::cout;
+using std::endl;
 
 #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
 #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
@@ -1927,16 +1927,20 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
-        n_patches /= 4;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
-            n_patches = 96;
-        }
-        else if (ctx->minicpmv_version == 3) {
-            n_patches = 64;
-        }
+    if(ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_OCR 
+        || ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_INSTRUCT) {
+        n_patches /= 9;
     }
+    // if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+    //     n_patches /= 4;
+    // } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+    //     if (ctx->minicpmv_version == 2) {
+    //         n_patches = 96;
+    //     }
+    //     else if (ctx->minicpmv_version == 3) {
+    //         n_patches = 64;
+    //     }
+    // }
 
     return n_patches;
 }

From b86cdedb7e5d0b9b2fe61404c39010a149da99be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E4=B8=BA?= <liwei@liweideMac-mini.local>
Date: Tue, 3 Dec 2024 15:03:55 +0800
Subject: [PATCH 5/5] comment out iostream header and cout/endl
 using-declarations

---
 examples/omni-vlm/clip.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/omni-vlm/clip.cpp b/examples/omni-vlm/clip.cpp
index ad2d8c102..2a4f37cf4 100644
--- a/examples/omni-vlm/clip.cpp
+++ b/examples/omni-vlm/clip.cpp
@@ -39,12 +39,12 @@
 #include <sstream>
 #include <cinttypes>
 #include <limits>
-#include <iostream>
+// #include <iostream>
 // #include <fstream>
 // #include <fstream>
 
-using std::cout;
-using std::endl;
+// using std::cout;
+// using std::endl;
 
 #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
 #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)