diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..c9d0fbaef --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +@zhiyuan8 @alexchen4ai diff --git a/common/common-nexa.cpp b/common/common-nexa.cpp index e8a54ba04..c41f91384 100644 --- a/common/common-nexa.cpp +++ b/common/common-nexa.cpp @@ -150,6 +150,7 @@ bool load_hparams_and_tensors_from_gguf(const std::string &fname, NexaBaseModel } ggml_free(meta); + gguf_free(ctx_gguf); return true; } @@ -314,4 +315,4 @@ struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) { GGML_ASSERT(i < cgraph->n_nodes); return cgraph->nodes[i]; -} \ No newline at end of file +} diff --git a/common/common.cpp b/common/common.cpp index 715adf946..e85c498c9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1442,6 +1442,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS + if (arg == "--omni-vlm-version") { + CHECK_ARG + params.omni_vlm_version = argv[i]; + return true; + } return false; } @@ -1688,6 +1693,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "layer range to apply the control vector(s) to, start and end inclusive" }); options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); + options.push_back({ "*", " --omni-vlm-version VERSION_STRING", "omni vlm string version(one of 'vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n" "(default: 'vlm-81-ocr')"}); options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); diff --git a/common/common.h b/common/common.h index f603ba2be..73dab55ca 100644 --- a/common/common.h +++ b/common/common.h @@ -265,6 +265,8 @@ struct gpt_params { bool spm_infill = false; // suffix/prefix/middle pattern for infill std::string lora_outfile = "ggml-lora-merged-f16.gguf"; + + std::string omni_vlm_version = "vlm-81-ocr"; }; void gpt_params_parse_from_env(gpt_params & params); diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt index 9da04f7d3..0b9539e56 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -72,19 +72,14 @@ class MainActivity( val models = listOf( Downloadable( - "Phi-2 7B (Q4_0, 1.6 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), - File(extFilesDir, "phi-2-q4_0.gguf"), + "Llama3.2-1B-Instruct (Q4_0, 735 MB)", + Uri.parse("https://public-storage.nexa4ai.com/Llama3.2-1B-Instruct/q4_0.gguf"), + File(extFilesDir, "Llama3.2-1B-Instruct-q4_0.gguf"), ), Downloadable( - "TinyLlama 1.1B (f16, 2.2 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), - File(extFilesDir, "tinyllama-1.1-f16.gguf"), - ), - Downloadable( - "Phi 2 DPO (Q3_K_M, 1.48 GiB)", - Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), - File(extFilesDir, 
"phi-2-dpo.Q3_K_M.gguf") + "octopus", + Uri.parse("https://public-storage.nexa4ai.com/Octopus-v2/q4_0.gguf"), + File(extFilesDir, "octopus-q4_0.gguf") ), ) diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt index 2de496574..9b1a436c8 100644 --- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt @@ -33,6 +33,7 @@ project("llama-android") #load local llama.cpp add_subdirectory(../../../../../../ build-llama) +add_subdirectory(../../../../../../examples/llava build-llava) # In order to load a library into your app from Java/Kotlin, you must call # System.loadLibrary() and pass the name of the library defined here; @@ -50,4 +51,5 @@ target_link_libraries(${CMAKE_PROJECT_NAME} llama common android - log) + log + llava) diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2aafe2316..297583c13 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -6,6 +6,7 @@ #include #include "llama.h" #include "common.h" +#include "llava.h" // Write C++ code here. // diff --git a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt index 6c63e54e0..866cbaf89 100644 --- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt +++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt @@ -36,7 +36,7 @@ class LLamaAndroid { } }.asCoroutineDispatcher() - private val nlen: Int = 64 + private val nlen: Int = 256 private external fun log_to_android() private external fun load_model(filename: String): Long diff --git a/examples/nexa-omni-audio/omni.cpp b/examples/nexa-omni-audio/omni.cpp index f55dc3d5c..b236fae57 100644 --- a/examples/nexa-omni-audio/omni.cpp +++ b/examples/nexa-omni-audio/omni.cpp @@ -23,6 +23,8 @@ // Constants // +void* internal_chars = nullptr; + static const char *AUDIO_TOKEN = "<|AUDIO|>"; // @@ -570,7 +572,7 @@ static omni_params get_omni_params_from_context_params(omni_context_params ¶ all_params.gpt.n_gpu_layers = params.n_gpu_layers; all_params.gpt.model = params.model; all_params.gpt.prompt = params.prompt; - + // Initialize whisper params all_params.whisper.model = params.mmproj; all_params.whisper.fname_inp = {params.file}; @@ -703,6 +705,10 @@ struct omni_context *omni_init_context(omni_context_params ¶ms) void omni_free(struct omni_context *ctx_omni) { + if(internal_chars != nullptr) + { + free(internal_chars); + } if (ctx_omni->ctx_whisper) { whisper_free(ctx_omni->ctx_whisper); @@ -792,7 +798,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params ¶ return embed_proj; } -void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params ¶ms, const std::string &prompt) +const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params ¶ms, const std::string &prompt) { int n_past = 0; @@ -833,12 +839,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past); - response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + // printf("%s", tmp); 
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_start|>")) @@ -847,16 +852,22 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed break; // mistral llava-1.6 fflush(stdout); + response += tmp; } llama_sampling_free(ctx_sampling); printf("\n"); + if(internal_chars != nullptr) { free(internal_chars); } + internal_chars = malloc(sizeof(char)*(response.size()+1)); + strncpy((char*)(internal_chars), response.c_str(), response.size()); + ((char*)(internal_chars))[response.size()] = '\0'; + return (const char*)(internal_chars); } -void omni_process_full(struct omni_context *ctx_omni, omni_context_params ¶ms) +const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params ¶ms) { omni_params all_params = get_omni_params_from_context_params(params); ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params); - omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt); -} \ No newline at end of file + return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt); +} diff --git a/examples/nexa-omni-audio/omni.h b/examples/nexa-omni-audio/omni.h index 5cbbd52ed..dcadb4288 100644 --- a/examples/nexa-omni-audio/omni.h +++ b/examples/nexa-omni-audio/omni.h @@ -54,11 +54,11 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params ¶m OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni); -OMNI_AUDIO_API void omni_process_full( +OMNI_AUDIO_API const char* omni_process_full( struct omni_context *ctx_omni, omni_context_params ¶ms ); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/examples/omni-vlm/README.md b/examples/omni-vlm/README.md index d6cfc7f37..07bbb3423 100644 --- a/examples/omni-vlm/README.md +++ b/examples/omni-vlm/README.md @@ -1,22 +1,30 @@ # omni-vlm -Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants, +Currently this implementation supports: -After API is confirmed, more models will be supported / uploaded. +* [nano-vlm-instruct](https://huggingface.co/NexaAIDev/nano-vlm-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/nano-vlm-instruct-gguf/tree/main)) +* [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-ocr-gguf/tree/main)) +* [vlm-81-instruct](https://huggingface.co/NexaAIDev/vlm-81-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-instruct-gguf/tree/main)) + +After API is stable, more models will be supported. ## Usage -Build with cmake in the `llama-cpp-experiments` folder: -```bash + +Build with cmake in the `llama.cpp` folder: + +```console cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo cmake --build build --verbose -j ``` + After building, run: `./omni-vlm-cli` to see the usage. For example: -```bash +```console ./omni-vlm-cli \ - -m Nano-Llm-494M-F16.gguf \ - --mmproj mmproj-omni-vlm-f16.gguf \ - --image example/omni-vlm/cat.png + -m \ + --mmproj \ + --image example/omni-vlm/cat.png \ + --omni-vlm-version ``` See next section to convert gguf files from original safetensors. @@ -27,6 +35,7 @@ See next section to convert gguf files from original safetensors. 
) ## Omni-vlm gguf conversion + 1) First clone omni-vlm model: ```console git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct @@ -34,7 +43,7 @@ git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct 2) Install the required Python packages: -```sh +```console pip install -r examples/omni-vlm/requirements.txt ``` @@ -104,6 +113,5 @@ After successfully compiling omni_vlm_wrapper_shared dynamic library, run: python omni_vlm_demo.py \ --model /Nano-Llm-494M-F16.gguf \ --mmproj /mmproj-omni-vlm-f16.gguf \ - --prompt="Describe this image for me" \ - --image-path cat.png + --omni-vlm-version ``` diff --git a/examples/omni-vlm/clip.cpp b/examples/omni-vlm/clip.cpp index 45764f9f3..618067aba 100644 --- a/examples/omni-vlm/clip.cpp +++ b/examples/omni-vlm/clip.cpp @@ -6,6 +6,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include "common.h" #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -167,7 +168,11 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_RESAMPLER, "resampler"}, }; - +enum omni_vlm_version_type { + VLM_81_OCR, + VLM_81_INSTRUCT, + NANO_VLM_INSTRUCT, +}; // // utilities to get data from a gguf file // @@ -294,115 +299,6 @@ static projector_type clip_projector_type_from_string(const std::string & name) return PROJECTOR_TYPE_UNKNOWN; } -#ifdef CLIP_DEBUG_FUNCTIONS -static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); - return; - } - - // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; - - // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { - // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); - } - - file.close(); -} - -static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); - return; - } - - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data - int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; - int paddingAmount = (4 - (widthInBytes % 4)) % 4; - int stride = widthInBytes + paddingAmount; - - // Bitmap file header - unsigned char fileHeader[14] = { - 'B','M', // Signature - 0,0,0,0, // Image file size in bytes - 0,0,0,0, // Reserved - 54,0,0,0 // Start of pixel array - }; - - // Total file size - fileSize = 54 + (stride * img.ny); - fileHeader[2] = (unsigned char)(fileSize); - fileHeader[3] = (unsigned char)(fileSize >> 8); - fileHeader[4] = (unsigned char)(fileSize >> 16); - fileHeader[5] = (unsigned char)(fileSize >> 24); - - // Bitmap information header (BITMAPINFOHEADER) - unsigned char infoHeader[40] = { - 40,0,0,0, // Size of this header (40 bytes) - 0,0,0,0, // Image width - 0,0,0,0, // Image height - 1,0, // Number of color planes - 24,0, // Bits per pixel - 0,0,0,0, // No compression - 0,0,0,0, // Image size (can be 0 for no compression) - 0,0,0,0, // X pixels per meter (not specified) - 0,0,0,0, // Y pixels per meter (not specified) - 0,0,0,0, // Total colors (color table not used) - 0,0,0,0 // Important colors (all are important) - }; - - // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned 
char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); - - // Write file headers - file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); - file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); - - // Pixel data - std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { - // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; - unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] - }; - file.write(reinterpret_cast(pixel), 3); - } - // Write padding for the row - file.write(reinterpret_cast(padding.data()), paddingAmount); - } - - file.close(); -} - -// debug function to convert f32 to u8 -static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); - } -} -#endif - - // // clip layers // @@ -564,6 +460,7 @@ struct clip_ctx { struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; + omni_vlm_version_type omni_vlm_ver_type; float image_mean[3]; float image_std[3]; @@ -785,6 +682,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); } + if(ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_OCR || ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_INSTRUCT) { + embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*9, embeddings->ne[1]/9, 1); + } + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); @@ -800,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // read and create ggml_context containing the tensors and their data -struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { +struct clip_ctx * clip_model_load(const char * fname, const char * omni_vlm_version, const int verbosity = 1) { struct ggml_context * meta = NULL; struct gguf_init_params params = { @@ -895,6 +796,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } clip_ctx * new_clip = new clip_ctx{}; + if (std::string(omni_vlm_version) == "vlm-81-ocr") { + new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR; + } else if (std::string(omni_vlm_version) == "vlm-81-instruct") { + new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT; + } else if (std::string(omni_vlm_version) == "nano-vlm-instruct") { + new_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT; + } else { + throw std::runtime_error(std::string("error vlm version info: ") + omni_vlm_version); + } // update projector type { @@ -1308,6 +1218,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { return new_clip; } +// void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) { +// if (params->omni_vlm_version == 
"vlm-81-ocr") { +// ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR; +// } else if (params->omni_vlm_version == "vlm-81-instruct") { +// ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT; +// } else if (params->omni_vlm_version == "nano-vlm-instruct") { +// ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT; +// } else { +// throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version); +// } +// } + void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { ctx_clip->load_image_size = load_image_size; } @@ -2294,13 +2216,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return true; } -bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype, const char* omni_vlm_version) { ggml_type type = GGML_TYPE_Q4_1; assert(itype < GGML_TYPE_COUNT); type = static_cast(itype); - auto * ctx_clip = clip_model_load(fname_inp, 2); + auto * ctx_clip = clip_model_load(fname_inp, omni_vlm_version, 2); const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; diff --git a/examples/omni-vlm/clip.h b/examples/omni-vlm/clip.h index 78588bdf1..cd4007a9e 100644 --- a/examples/omni-vlm/clip.h +++ b/examples/omni-vlm/clip.h @@ -39,9 +39,12 @@ struct clip_image_f32_batch { size_t size; }; -CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); +CLIP_API struct clip_ctx * clip_model_load (const char * fname, const char * omni_vlm_version, int verbosity); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); +// struct gpt_params; +// CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params); + CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); @@ -83,7 +86,7 @@ CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ct CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); -CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); +CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype, const char * omni_vlm_version); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); diff --git a/examples/omni-vlm/latex.png b/examples/omni-vlm/latex.png new file mode 100644 index 000000000..b97318fc0 Binary files /dev/null and b/examples/omni-vlm/latex.png differ diff --git a/examples/omni-vlm/omni-vlm-cli.cpp b/examples/omni-vlm/omni-vlm-cli.cpp index 68e833182..d24634fe8 100644 --- a/examples/omni-vlm/omni-vlm-cli.cpp +++ b/examples/omni-vlm/omni-vlm-cli.cpp @@ -12,6 +12,10 @@ #include #include #include +// #include +// +// using std::cout; +// using std::endl; static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { int N = (int) tokens.size(); @@ -149,7 +153,7 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str()); } } - LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); + // LOG_TEE("user_prompt: 
%s\n", user_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { @@ -165,6 +169,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima LOG("\n"); + params->sparams.temp = 0.0f; + params->sparams.top_k = 1; + params->sparams.top_p = 1.0f; struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); if (!ctx_sampling) { LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__); @@ -177,8 +184,8 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima response += tmp; if (strcmp(tmp, "<|im_end|>") == 0) break; if (strcmp(tmp, "") == 0) break; - // if (strstr(tmp, "###")) break; // Yi-VL behavior printf("%s", tmp); + // LOG("%s", tmp); // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 @@ -212,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_ prompt = "describe the image in detail."; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10); - + auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0); + // clip_set_omni_vlm_version(ctx_clip, params); llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings @@ -249,9 +256,6 @@ int main(int argc, char ** argv) { gpt_params params; - // if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { - // return 1; - // } if (!gpt_params_parse(argc, argv, params)) { print_usage(argc, argv, params); return 1; @@ -261,8 +265,21 @@ int main(int argc, char ** argv) { print_usage(argc, argv, {}); return 1; } + if (params.omni_vlm_version != "vlm-81-ocr" && params.prompt.empty()) { + LOG_TEE("%s : prompt is empty.\n", __func__); + print_usage(argc, argv, {}); + return 1; + } - params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe this image for me\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + if (params.omni_vlm_version == "vlm-81-ocr") { + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") { + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + } else { + LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str()); + print_usage(argc, argv, {}); + return 1; + } auto * model = omnivlm_init(¶ms); if (model == NULL) { @@ -271,8 +288,8 @@ int main(int argc, char ** argv) { } - auto * ctx_omnivlm = omnivlm_init_context(¶ms, model); for (auto & image : params.image) { + auto * ctx_omnivlm = omnivlm_init_context(¶ms, model); auto * image_embed = load_image(ctx_omnivlm, ¶ms, image); if (!image_embed) { LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); @@ -283,9 +300,9 @@ int main(int argc, char ** argv) { llama_print_timings(ctx_omnivlm->ctx_llama); omnivlm_image_embed_free(image_embed); + ctx_omnivlm->model = NULL; + omnivlm_free(ctx_omnivlm); } - ctx_omnivlm->model = NULL; - omnivlm_free(ctx_omnivlm); llama_free_model(model); diff --git a/examples/omni-vlm/omni-vlm-wrapper-cli.cpp b/examples/omni-vlm/omni-vlm-wrapper-cli.cpp index 731b7791e..6a65b7643 100644 --- a/examples/omni-vlm/omni-vlm-wrapper-cli.cpp +++ b/examples/omni-vlm/omni-vlm-wrapper-cli.cpp @@ -1,15 +1,24 @@ // WARNING: this .cpp file is only for debugging. do not user directly. #include "omni-vlm-wrapper.h" +#include + + +using std::cout; +using std::endl; int main(int argc, char ** argv) { - const char* llm_model = ""; - const char* mmproj_model = ""; - const char* image_path = ""; + const char* llm_model = ""; + const char* mmproj_model = ""; + const char* image_path = ""; const char* prompt = ""; - omnivlm_init(llm_model, mmproj_model); - omnivlm_inference(prompt, image_path); - omnivlm_inference(prompt, image_path); + omnivlm_init(llm_model, mmproj_model, "vlm-81-ocr"); + + const char* res; + res = omnivlm_inference(prompt, image_path); + cout << "RES: " << res << endl; + res = omnivlm_inference(prompt, image_path); + cout << "RES: " << res << endl; omnivlm_free(); return 0; diff --git a/examples/omni-vlm/omni-vlm-wrapper.cpp b/examples/omni-vlm/omni-vlm-wrapper.cpp index 81178205e..ba0749d06 100644 --- a/examples/omni-vlm/omni-vlm-wrapper.cpp +++ b/examples/omni-vlm/omni-vlm-wrapper.cpp @@ -24,6 +24,8 @@ struct omnivlm_context { struct llama_model * model = NULL; }; +void* internal_chars = nullptr; + static struct gpt_params params; static struct llama_model* model; static struct omnivlm_context* ctx_omnivlm; @@ -63,7 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_ prompt = "describe the image in detail."; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10); + auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0); + // clip_set_omni_vlm_version(ctx_clip, params); llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); @@ -128,19 +131,19 @@ static const char * sample(struct llama_sampling_context * ctx_sampling, return ret.c_str(); } -static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) { +static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) { int n_past = 0; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; - std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \ - + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; - size_t image_pos = full_prompt.find("<|image_pad|>"); + // std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \ + // + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + size_t image_pos = params->prompt.find("<|image_pad|>"); std::string system_prompt, user_prompt; // new templating mode: Provide the full prompt including system message and use as a placeholder for the image - system_prompt = full_prompt.substr(0, image_pos); - user_prompt = full_prompt.substr(image_pos + std::string("<|image_pad|>").length()); + system_prompt = params->prompt.substr(0, image_pos); + user_prompt = params->prompt.substr(image_pos + std::string("<|image_pad|>").length()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { @@ -155,6 +158,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima } } + params->sparams.top_k = 1; + params->sparams.top_p = 1.0f; + eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true); omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past); eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); @@ -172,11 +178,11 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past); - response += tmp; if (strcmp(tmp, "<|im_end|>") == 0) break; if (strcmp(tmp, "") == 0) break; // if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + // printf("%s", tmp); + response += tmp; // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 @@ -186,6 +192,13 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima llama_sampling_free(ctx_sampling); printf("\n"); + + // const char* ret_char_ptr = (const char*)(malloc(sizeof(char)*response.size())); + if(internal_chars != nullptr) { free(internal_chars); } + internal_chars = malloc(sizeof(char)*(response.size()+1)); + strncpy((char*)(internal_chars), response.c_str(), response.size()); + ((char*)(internal_chars))[response.size()] = '\0'; + return (const char*)(internal_chars); } static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) { @@ -208,8 +221,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { } // inference interface definition -void omnivlm_init(const char* llm_model_path, const char* projector_model_path) { - const char* argv = "hello-omni-vlm-wrapper-cli"; +void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) { + const char* argv = "omni-wrapper-py"; char* nc_argv = const_cast(argv); if (!gpt_params_parse(1, &nc_argv, params)) { print_usage(1, &nc_argv, {}); @@ -217,31 +230,60 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path) } params.model = llm_model_path; params.mmproj = projector_model_path; + 
params.omni_vlm_version = omni_vlm_version; + + std::string omni_vlm_ver = params.omni_vlm_version; + if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") { + fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version); + fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__); + throw std::runtime_error("You set wrong vlm_version info strings."); + } + model = omnivlm_init(¶ms); if (model == nullptr) { fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__); throw std::runtime_error("Failed to init omnivlm model"); } - ctx_omnivlm = omnivlm_init_context(¶ms, model); } -void omnivlm_inference(const char *prompt, const char *imag_path) { +const char* omnivlm_inference(const char *prompt, const char *imag_path) { + ctx_omnivlm = omnivlm_init_context(¶ms, model); + std::string image = imag_path; params.prompt = prompt; + + if (params.omni_vlm_version == "vlm-81-ocr") { + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>"; + } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") { + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + } else { + LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str()); + throw std::runtime_error("You set wrong vlm_version info strings."); + } + auto * image_embed = load_image(ctx_omnivlm, ¶ms, image); if (!image_embed) { LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); throw std::runtime_error("failed to load image " + image); } // process the prompt - process_prompt(ctx_omnivlm, image_embed, ¶ms, params.prompt); + const char* ret_chars = process_prompt(ctx_omnivlm, image_embed, ¶ms, params.prompt); // llama_perf_print(ctx_omnivlm->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); omnivlm_image_embed_free(image_embed); + ctx_omnivlm->model = nullptr; + omnivlm_free(ctx_omnivlm); + ctx_omnivlm = nullptr; + + return ret_chars; } void omnivlm_free() { - ctx_omnivlm->model = NULL; - omnivlm_free(ctx_omnivlm); + if(internal_chars != nullptr) { free(internal_chars); } + if(ctx_omnivlm != nullptr) { + // this snipet should never be run! 
+ ctx_omnivlm->model = nullptr; + omnivlm_free(ctx_omnivlm); + } llama_free_model(model); } diff --git a/examples/omni-vlm/omni-vlm-wrapper.h b/examples/omni-vlm/omni-vlm-wrapper.h index 4ab2c234c..22cc40533 100644 --- a/examples/omni-vlm/omni-vlm-wrapper.h +++ b/examples/omni-vlm/omni-vlm-wrapper.h @@ -20,9 +20,9 @@ extern "C" { #endif -OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path); +OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version); -OMNIVLM_API void omnivlm_inference(const char* prompt, const char* imag_path); +OMNIVLM_API const char* omnivlm_inference(const char* prompt, const char* imag_path); OMNIVLM_API void omnivlm_free(); diff --git a/examples/omni-vlm/omni-vlm.cpp b/examples/omni-vlm/omni-vlm.cpp index 539b300bf..339b6ffbe 100644 --- a/examples/omni-vlm/omni-vlm.cpp +++ b/examples/omni-vlm/omni-vlm.cpp @@ -258,111 +258,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); - // cout << "\t\t A NICE START" << endl; - // cout << "\t\t" << *n_img_pos << endl; - /* - if (clip_is_minicpmv(ctx_clip)) { - std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - struct clip_image_size * load_image_size = clip_image_size_init(); - for (size_t i = 0; i < img_res_v.size; i++) { - const int64_t t_img_enc_step_start_us = ggml_time_us(); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); - int patch_size=14; - load_image_size->width = img_res_v.data[i].nx; - load_image_size->height = img_res_v.data[i].ny; - clip_add_load_image_size(ctx_clip, load_image_size); - bool encoded = false; - int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); - if (has_minicpmv_projector == 2) { - encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); - } - else if (has_minicpmv_projector == 3) { - encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); - } - if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); - return false; - } - const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - int n_img_pos_out = 0; - for (size_t i = 0; i < image_embd_v.size(); i++) { - std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); - n_img_pos_out += clip_n_patches(ctx_clip); - } - *n_img_pos = n_img_pos_out; - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - load_image_size->width = img->nx; - load_image_size->height = img->ny; - clip_add_load_image_size(ctx_clip, load_image_size); - LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); - } - else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { - // flat / default llava-1.5 type embedding - *n_img_pos = clip_n_patches(ctx_clip); - bool encoded = clip_image_encode(ctx_clip, n_threads, 
&img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 - delete[] img_res_v.data; - if (!encoded) { - LOG_ERR("Unable to encode image\n"); - - return false; - } - } - else { - // spatial_unpad llava-1.6 type embedding - // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working - std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - for (size_t i = 0; i < img_res_v.size; i++) { - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside - if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); - return false; - } - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - const int32_t * image_grid = clip_image_grid(ctx_clip); - - std::vector> grid_pinpoints; - for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) { - grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); - } - - // free all img_res_v - not needed anymore - delete[] img_res_v.data; - img_res_v.size = 0; - img_res_v.data = nullptr; - - const int32_t image_size = clip_image_size(ctx_clip); - - struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); - - int n_img_pos_out; - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); - *n_img_pos = n_img_pos_out; - - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - - // debug image/segment/normalization content: - // clip_image_u8 * tmp = clip_image_u8_init(); - // clip_image_convert_f32_to_u8(*image_feature, *tmp); - // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); - } - */ LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); diff --git a/examples/omni-vlm/omni_vlm_cpp.py b/examples/omni-vlm/omni_vlm_cpp.py index 6f23f7c4c..81edb6f1d 100644 --- a/examples/omni-vlm/omni_vlm_cpp.py +++ b/examples/omni-vlm/omni_vlm_cpp.py @@ -60,11 +60,11 @@ _lib = _load_shared_library(_lib_base_name, base_path) omni_char_p = ctypes.c_char_p -def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p): - return _lib.omnivlm_init(llm_model_path, mmproj_model_path) +def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p): + return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version) -_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p] +_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p] _lib.omnivlm_init.restype = None @@ -73,7 +73,7 @@ def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p): _lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p] -_lib.omnivlm_inference.restype = None +_lib.omnivlm_inference.restype = omni_char_p def omnivlm_free(): diff --git a/examples/omni-vlm/omni_vlm_demo.py b/examples/omni-vlm/omni_vlm_demo.py index 4f8c5998f..fbed2758f 100644 --- a/examples/omni-vlm/omni_vlm_demo.py +++ b/examples/omni-vlm/omni_vlm_demo.py @@ -11,16 +11,17 @@ class NexaOmniVlmInference: A class used for vision language model inference. 
""" - def __init__(self, llm_model_path: str, mmproj_model_path: str): + def __init__(self, llm_model_path: str, mmproj_model_path: str, omni_vlm_version: str): self.llm_model = ctypes.c_char_p(llm_model_path.encode("utf-8")) self.mmproj_model = ctypes.c_char_p(mmproj_model_path.encode("utf-8")) + self.omni_vlm_version = ctypes.c_char_p(omni_vlm_version.encode("utf-8")) - omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model) + omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model, self.omni_vlm_version) def inference(self, prompt: str, image_path: str): prompt = ctypes.c_char_p(prompt.encode("utf-8")) image_path = ctypes.c_char_p(image_path.encode("utf-8")) - omni_vlm_cpp.omnivlm_inference(prompt, image_path) + return omni_vlm_cpp.omnivlm_inference(prompt, image_path) def __del__(self): omni_vlm_cpp.omnivlm_free() @@ -34,22 +35,30 @@ if __name__ == "__main__": ) parser.add_argument("--model", type=str, help="Path to the llm model file") parser.add_argument("--mmproj", type=str, help="Path to the mmproj file") + parser.add_argument("--omni-vlm-version", type=str, help="omni-vlm-version info ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')") # parser.add_argument("--prompt", type=str, help="prompt string.") # parser.add_argument("--image-path", type=str, help="Path to the image.") args = parser.parse_args() - omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj) + print("DEBUG") + print(args.omni_vlm_version) + omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj, args.omni_vlm_version) # omni_vlm_obj.inference(args.prompt, args.image_path) while True: - print("Input your prompt:") - prompt = input() - if prompt == "": - print("ERROR: you input an empty prompt, try again.") - continue + if args.omni_vlm_version != "vlm-81-ocr": + print("Input your prompt:") + prompt = input() + if prompt == "": + print("ERROR: you input an empty prompt, try again.") + continue + else: + prompt = "" print("Input your image path:") image_path = input() while not os.path.exists(image_path): print("ERROR: can not find image in your input path, please check and input agian.") image_path = input() - omni_vlm_obj.inference(prompt, image_path) + response = omni_vlm_obj.inference(prompt, image_path) + print("\tresponse:") + print(response.decode('utf-8')) diff --git a/examples/qwen2-audio/qwen2.cpp b/examples/qwen2-audio/qwen2.cpp index be7d74d6d..8a08a7ac6 100644 --- a/examples/qwen2-audio/qwen2.cpp +++ b/examples/qwen2-audio/qwen2.cpp @@ -18,10 +18,12 @@ #include #include #include +#include // // Constants // +void* internal_chars = nullptr; static const char *AUDIO_TOKEN = "<|AUDIO|>"; @@ -565,16 +567,16 @@ bool omni_params_parse(int argc, char **argv, omni_params ¶ms) static omni_params get_omni_params_from_context_params(omni_context_params ¶ms) { omni_params all_params; - + // Initialize gpt params all_params.gpt.n_gpu_layers = params.n_gpu_layers; all_params.gpt.model = params.model; all_params.gpt.prompt = params.prompt; - + // Initialize whisper params all_params.whisper.model = params.mmproj; all_params.whisper.fname_inp = {params.file}; - + if (all_params.gpt.n_threads <= 0) { all_params.gpt.n_threads = std::thread::hardware_concurrency(); @@ -703,6 +705,12 @@ struct omni_context *omni_init_context(omni_context_params ¶ms) void omni_free(struct omni_context *ctx_omni) { + + if(internal_chars != nullptr) + { + free(internal_chars); + internal_chars = nullptr; + } if (ctx_omni->ctx_whisper) { whisper_free(ctx_omni->ctx_whisper); @@ -710,12 +718,13 @@ void omni_free(struct 
omni_context *ctx_omni) } if (ctx_omni->projector) { - ctx_omni->projector->free(); + delete ctx_omni->projector; } llama_free(ctx_omni->ctx_llama); llama_free_model(ctx_omni->model); llama_backend_free(); + free(ctx_omni); } static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_embed, int n_batch, int *n_past) @@ -755,6 +764,7 @@ static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_e } *n_past += n_eval; } + free(audio_embed_data); return true; } @@ -792,7 +802,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params ¶ return embed_proj; } -void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params ¶ms, const std::string &prompt) +const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params ¶ms, const std::string &prompt) { int n_past = 0; @@ -841,12 +851,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past); - response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + // printf("%s", tmp); if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_start|>")) @@ -855,16 +864,23 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed break; // mistral llava-1.6 fflush(stdout); + response += tmp; } llama_sampling_free(ctx_sampling); printf("\n"); + + if(internal_chars != nullptr) { free(internal_chars); } + internal_chars = malloc(sizeof(char)*(response.size()+1)); + strncpy((char*)(internal_chars), response.c_str(), response.size()); + ((char*)(internal_chars))[response.size()] = '\0'; + return (const char*)(internal_chars); } -void omni_process_full(struct omni_context *ctx_omni, omni_context_params ¶ms) +const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params ¶ms) { omni_params all_params = get_omni_params_from_context_params(params); ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params); - omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt); -} \ No newline at end of file + return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt); +} diff --git a/examples/qwen2-audio/qwen2.h b/examples/qwen2-audio/qwen2.h index 5cbbd52ed..dcadb4288 100644 --- a/examples/qwen2-audio/qwen2.h +++ b/examples/qwen2-audio/qwen2.h @@ -54,11 +54,11 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params ¶m OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni); -OMNI_AUDIO_API void omni_process_full( +OMNI_AUDIO_API const char* omni_process_full( struct omni_context *ctx_omni, omni_context_params ¶ms ); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/examples/qwen2-audio/whisper.cpp b/examples/qwen2-audio/whisper.cpp index 6da9d268d..b2ce58475 100644 --- a/examples/qwen2-audio/whisper.cpp +++ b/examples/qwen2-audio/whisper.cpp @@ -9467,6 +9467,8 @@ static bool whisper_encoder_load(struct whisper_model_loader *loader, whisper_co wctx.t_load_us = ggml_time_us() - t_start_us; + gguf_free(gguf_ctx); + return true; } diff --git a/ggml_llama/src/vulkan-shaders/CMakeLists.txt b/ggml_llama/src/vulkan-shaders/CMakeLists.txt index bdb6038a1..10075db33 100644 --- 
a/ggml_llama/src/vulkan-shaders/CMakeLists.txt +++ b/ggml_llama/src/vulkan-shaders/CMakeLists.txt @@ -1,6 +1,6 @@ find_package (Threads REQUIRED) -set(TARGET llama-vulkan-shaders-gen) +set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_11)
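
Since the patch changes the omni-vlm wrapper API (`omnivlm_init` gains an `omni_vlm_version` argument and `omnivlm_inference` now returns the generated text instead of printing it), a short caller sketch may help reviewers. This is not part of the patch: the gguf paths, image path, and prompt below are placeholders, and only the three functions declared in `omni-vlm-wrapper.h` are used.

```cpp
// Sketch only: exercising the updated omni-vlm wrapper API from this patch.
// Paths and prompts are placeholders.
#include <cstdio>
#include <string>

#include "omni-vlm-wrapper.h"

int main() {
    // The version string must be one of "vlm-81-ocr", "vlm-81-instruct",
    // "nano-vlm-instruct"; an unknown value makes omnivlm_init() throw.
    omnivlm_init("nano-vlm-instruct.gguf", "mmproj.gguf", "nano-vlm-instruct");

    // The returned pointer aliases an internal buffer that is reused on the
    // next call and released by omnivlm_free(), so copy the text if you need
    // to keep it across calls.
    std::string first  = omnivlm_inference("Describe this image for me", "cat.png");
    std::string second = omnivlm_inference("What colors do you see?", "cat.png");

    printf("%s\n%s\n", first.c_str(), second.c_str());

    omnivlm_free();
    return 0;
}
```

For the "vlm-81-ocr" version the wrapper substitutes its own OCR prompt template, so the `prompt` argument is effectively ignored in that mode.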
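
The audio examples change in the same way: `omni_process_full` in `examples/nexa-omni-audio/omni.h` and `examples/qwen2-audio/qwen2.h` now returns a `const char*` instead of printing. The sketch below is an assumption-laden illustration, not patch code: the field names come from `get_omni_params_from_context_params` in this diff, the values are placeholders, and `omni_context_params` may require additional initialization not shown in the hunks.

```cpp
// Sketch only: consuming the new const char* return value of
// omni_process_full(). Field values are placeholders; the struct may have
// more members than the ones set here.
#include <cstdio>

#include "omni.h"  // qwen2.h exposes the same API in this patch

int main() {
    omni_context_params params = {};
    params.model        = "omni-audio-llm.gguf";      // placeholder path
    params.mmproj       = "omni-audio-encoder.gguf";  // placeholder path
    params.file         = "sample.wav";               // placeholder path
    params.prompt       = "Transcribe this audio.";
    params.n_gpu_layers = 0;

    omni_context *ctx = omni_init_context(params);
    if (!ctx) return 1;

    // The string lives in a buffer owned by the library; it is reused on the
    // next generation and released inside omni_free().
    const char *response = omni_process_full(ctx, params);
    printf("%s\n", response ? response : "(null)");

    omni_free(ctx);
    return 0;
}
```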