From 3dfac7817f8f40562b559e877b50b104d697bcf8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E4=B8=BA?=
Date: Thu, 7 Nov 2024 11:19:50 +0800
Subject: [PATCH] add returned string type (const char*) for nexa-omni-audio

---
 examples/CMakeLists.txt                       |    1 +
 examples/omni-vlm-v2/CMakeLists.txt           |   50 +
 examples/omni-vlm-v2/README.md                |  110 +
 examples/omni-vlm-v2/clip-v2.cpp              | 2255 +++++++++++++++++
 examples/omni-vlm-v2/clip-v2.h                |   94 +
 .../convert_image_encoder_to_gguf.py          |  208 ++
 examples/omni-vlm-v2/latex.png                |  Bin 0 -> 5931 bytes
 examples/omni-vlm-v2/omni-vlm-v2-cli.cpp      |  299 +++
 .../omni-vlm-v2/omni-vlm-v2-wrapper-cli.cpp   |   16 +
 examples/omni-vlm-v2/omni-vlm-v2-wrapper.cpp  |  259 ++
 examples/omni-vlm-v2/omni-vlm-v2-wrapper.h    |   33 +
 examples/omni-vlm-v2/omni-vlm-v2.cpp          |  434 ++++
 examples/omni-vlm-v2/omni-vlm-v2.h            |   46 +
 examples/omni-vlm-v2/omni_vlm_v2_cpp.py       |   84 +
 examples/omni-vlm-v2/omni_vlm_v2_demo.py      |   57 +
 examples/omni-vlm-v2/omni_vlm_v2_surgery.py   |  161 ++
 examples/omni-vlm-v2/requirements.txt         |    5 +
 17 files changed, 4112 insertions(+)
 create mode 100644 examples/omni-vlm-v2/CMakeLists.txt
 create mode 100644 examples/omni-vlm-v2/README.md
 create mode 100644 examples/omni-vlm-v2/clip-v2.cpp
 create mode 100644 examples/omni-vlm-v2/clip-v2.h
 create mode 100644 examples/omni-vlm-v2/convert_image_encoder_to_gguf.py
 create mode 100644 examples/omni-vlm-v2/latex.png
 create mode 100644 examples/omni-vlm-v2/omni-vlm-v2-cli.cpp
 create mode 100644 examples/omni-vlm-v2/omni-vlm-v2-wrapper-cli.cpp
 create mode 100644 examples/omni-vlm-v2/omni-vlm-v2-wrapper.cpp
 create mode 100644 examples/omni-vlm-v2/omni-vlm-v2-wrapper.h
 create mode 100644 examples/omni-vlm-v2/omni-vlm-v2.cpp
 create mode 100644 examples/omni-vlm-v2/omni-vlm-v2.h
 create mode 100644 examples/omni-vlm-v2/omni_vlm_v2_cpp.py
 create mode 100644 examples/omni-vlm-v2/omni_vlm_v2_demo.py
 create mode 100644 examples/omni-vlm-v2/omni_vlm_v2_surgery.py
 create mode 100644 examples/omni-vlm-v2/requirements.txt

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 922c293ef..784d6e9de 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -53,6 +53,7 @@ else()
     # add_subdirectory(speculative)
     # add_subdirectory(tokenize)
     add_subdirectory(omni-vlm)
+    add_subdirectory(omni-vlm-v2)
     add_subdirectory(nexa-omni-audio)
     add_subdirectory(qwen2-audio)
 endif()
diff --git a/examples/omni-vlm-v2/CMakeLists.txt b/examples/omni-vlm-v2/CMakeLists.txt
new file mode 100644
index 000000000..b6cdd9e7e
--- /dev/null
+++ b/examples/omni-vlm-v2/CMakeLists.txt
@@ -0,0 +1,50 @@
+add_library(omni_vlm_v2 OBJECT
+            omni-vlm-v2.cpp
+            omni-vlm-v2.h
+            clip-v2.cpp
+            clip-v2.h
+            )
+
+target_link_libraries(omni_vlm_v2 PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(omni_vlm_v2 PUBLIC .)
+target_include_directories(omni_vlm_v2 PUBLIC ../..)
+target_include_directories(omni_vlm_v2 PUBLIC ../../common)
+
+target_compile_features(omni_vlm_v2 PRIVATE cxx_std_11)
+
+add_library(omni_vlm_v2_static STATIC $<TARGET_OBJECTS:omni_vlm_v2>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(omni_vlm_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(omni_vlm_v2 PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(omni_vlm_v2_shared SHARED $<TARGET_OBJECTS:omni_vlm_v2>)
+    target_link_libraries(omni_vlm_v2_shared PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS omni_vlm_v2_shared LIBRARY)
+endif()
+
+if (NOT MSVC)
+    target_compile_options(omni_vlm_v2 PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+
+if(TARGET BUILD_INFO)
+    add_dependencies(omni_vlm_v2 BUILD_INFO)
+endif()
+
+set(TARGET omni-vlm-v2-cli)
+add_executable(${TARGET} omni-vlm-v2-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-v2-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common omni_vlm_v2 ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+#=== for omni-vlm-wrapper
+add_library(omni_vlm_v2_wrapper_shared SHARED omni-vlm-v2-wrapper.cpp $<TARGET_OBJECTS:omni_vlm_v2>)
+target_link_libraries(omni_vlm_v2_wrapper_shared PRIVATE common ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
+install(TARGETS omni_vlm_v2_wrapper_shared LIBRARY)
+
+# set(TARGET omni-vlm-wrapper-cli)
+# add_executable(${TARGET} omni-vlm-wrapper-cli.cpp)
+# set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-wrapper-cli)
+# install(TARGETS ${TARGET} RUNTIME)
+# target_link_libraries(${TARGET} PRIVATE omni_vlm_v2_wrapper_shared ${CMAKE_THREAD_LIBS_INIT})
+# target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/omni-vlm-v2/README.md b/examples/omni-vlm-v2/README.md
new file mode 100644
index 000000000..cd2d40cb4
--- /dev/null
+++ b/examples/omni-vlm-v2/README.md
@@ -0,0 +1,110 @@
+# omni-vlm
+
+Currently this implementation supports [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr) variants.
+
+Once the API is finalized, more models will be supported and uploaded.
+
+## Usage
+Build with cmake in the `llama.cpp` folder:
+```bash
+cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
+cmake --build build --verbose -j
+```
+After building, run `./omni-vlm-v2-cli` to see the usage. For example:
+
+```bash
+./omni-vlm-v2-cli \
+    -m Nano-Llm-v2-494M-F16.gguf \
+    --mmproj mmproj-omni-vlm-v2-f16.gguf \
+    --image examples/omni-vlm-v2/latex.png \
+    --prompt "Describe this image for me"
+```
+
+See the next section for how to convert the original safetensors files to gguf.
+
+[comment]: # (TODO:
+**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
+**note**: For GPU offloading ensure to use the `-ngl` flag just like usual
+)
+
+## Omni-vlm gguf conversion
+1) First clone the omni-vlm model:
+```console
+git clone https://huggingface.co/NexaAIDev/vlm-81-ocr
+```
+
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/omni-vlm-v2/requirements.txt
+```
+
+3) Run `omni_vlm_v2_surgery.py`:
+```console
+python omni_vlm_v2_surgery.py \
+    --clean-vision-tower \
+    --model <path-to-vlm-81-ocr>
+```
+- You will find an `omni_vlm.projector` and an `omni_vlm.clip` file in the `vlm-81-ocr/` directory.
+
+4) Create a soft link `pytorch_model.bin` to `omni_vlm.clip`:
+```bash
+# in the vlm-81-ocr/ folder
+ln -s omni_vlm.clip pytorch_model.bin
+```
+5) Go back to the `llama.cpp` project folder and create the visual gguf model:
+
+First, clone the `nano-vlm-processor` model directory (you may need to obtain authorization to access the NexaAIDev space).
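+If the space is gated, it may help to authenticate with your Hugging Face account first (this assumes the standard `huggingface-cli` tool is installed; adapt to your own setup):
+```bash
+huggingface-cli login
+```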
+```console +git clone https://huggingface.co/NexaAIDev/nano-vlm-processor +``` + +```console +python ./examples/omni-vlm/convert_image_encoder_to_gguf.py \ + -m \ + --output-dir \ + -p +``` +- You will get a pure vision model part of CLIP named `/mmproj-omni-vlm-f16.gguf`. + +6) Then convert the LLM portion to gguf format: +* Run python snippet below to extract LLM portion from original omni-vlm model. +```python +from safetensors import safe_open +from safetensors.torch import save_file + +tensors = {} +with safe_open("/model.safetensors", framework="pt", device=0) as f: + for k in f.keys(): + if k.startswith('language_model'): + k2 = k.replace('language_model.', '') + tensors[k2] = f.get_tensor(k) + + save_file(tensors, "/model.safetensors") +``` + +```console +python convert_hf_to_gguf.py +``` +Finally we will get LLM GGUF model: `/Nano-Llm-494M-F16.ggu` + +7) And finally we can run the omni-vlm demo of C++ version: +```console +./build/bin/omni-vlm-cli \ + -m /Nano-Llm-494M-F16.gguf \ + --mmproj /mmproj-omni-vlm-v2-f16.gguf \ + --image example/omni-vlm/cat.png +``` +The results will print on the screen: +> The image depicts a grey and white cat with its head pressed against the camera, appearing as if it is staring directly into the lens. The cat is surrounded by black and white stripes, adding a unique touch to its appearance. The black background creates a strong contrast and highlights the cat's features, making it a captivating scene. + +8) Python interface: + +After successfully compiling omni_vlm_wrapper_shared dynamic library, run: +```console +python omni_vlm_v2_demo.py \ + --model /Nano-Llm-494M-F16.gguf \ + --mmproj /mmproj-omni-vlm-f16.gguf \ + --prompt="Describe this image for me" \ + --image-path latex.png +``` diff --git a/examples/omni-vlm-v2/clip-v2.cpp b/examples/omni-vlm-v2/clip-v2.cpp new file mode 100644 index 000000000..2205a9164 --- /dev/null +++ b/examples/omni-vlm-v2/clip-v2.cpp @@ -0,0 +1,2255 @@ +#include "clip-v2.h" +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + +#ifdef GGML_USE_VULKAN +#include "ggml-vulkan.h" +#endif + +#define STB_IMAGE_IMPLEMENTATION +#include "stb_image.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::endl; + +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) + +//#define CLIP_DEBUG_FUNCTIONS + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + +static std::string format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), buf.size()); +} + +// +// key constants +// + +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +// #define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +// #define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +// #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" +// #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" +// #define KEY_MINICPMV_VERSION "clip.minicpmv_version" +#define KEY_USE_GELU "siglip.use_gelu" +#define KEY_N_EMBD "siglip.%s.embedding_length" +#define KEY_N_FF "siglip.%s.feed_forward_length" +#define KEY_N_BLOCK "siglip.%s.block_count" +#define KEY_N_HEAD "siglip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "siglip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "siglip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "siglip.vision.image_size" +#define KEY_PATCH_SIZE "siglip.vision.patch_size" +#define KEY_IMAGE_MEAN "siglip.vision.image_mean" +#define KEY_IMAGE_STD "siglip.vision.image_std" +#define KEY_PROJ_TYPE "siglip.projector_type" + +#define KEY_MM_PATCH_MERGE_TYPE "siglip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "siglip.vision.image_grid_pinpoints" +#define KEY_IMAGE_CROP_RESOLUTION "siglip.vision.image_crop_resolution" + + +// +// tensor name constants +// + +#define TN_TOKEN_EMBD "%s.token_embd.weight" +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_PATCH_BIAS "v.patch_embd.bias" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" +#define TN_LN_2 "%s.blk.%d.ln2.%s" +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_TEXT_PROJ "text_projection.weight" +#define TN_VIS_PROJ "visual_projection.weight" +#define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" +#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" +#define TN_IMAGE_NEWLINE "model.image_newline" + +#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" +#define TN_MINICPMV_QUERY "resampler.query" +#define TN_MINICPMV_PROJ "resampler.proj.weight" +#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" +#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" +#define TN_MINICPMV_LN "resampler.ln_%s.%s" + + +enum projector_type { + PROJECTOR_TYPE_MLP, + PROJECTOR_TYPE_MLP_NORM, + PROJECTOR_TYPE_LDP, + PROJECTOR_TYPE_LDPV2, + PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_UNKNOWN, +}; + +static std::map PROJECTOR_TYPE_NAMES = { + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_RESAMPLER, "resampler"}, +}; + + +// +// utilities to get data from a gguf file +// + +static int get_key_idx(const gguf_context * ctx, const char * key) { + int i = gguf_find_key(ctx, key); + if (i == -1) { 
+ LOG_ERR("key %s not found in file\n", key); + throw std::runtime_error(format("Missing required key: %s", key)); + } + + return i; +} + +static uint32_t get_u32(const gguf_context * ctx, const std::string & key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_u32(ctx, i); +} + +static float get_f32(const gguf_context * ctx, const std::string & key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_f32(ctx, i); +} + +static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); + if (!cur) { + throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str())); + } + + return cur; +} + +static std::string get_ftype(int ftype) { + return ggml_type_name(static_cast(ftype)); +} + +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { + switch (type) { + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return format("unknown type %d", type); + } +} + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; + } + std::string builder; + builder.reserve(s.length()); + size_t pos = 0; + size_t last_pos = 0; + while ((pos = s.find(search, last_pos)) != std::string::npos) { + builder.append(s, last_pos, pos - last_pos); + builder.append(replace); + last_pos = pos + search.length(); + } + builder.append(s, last_pos, std::string::npos); + s = std::move(builder); +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { + size_t tensor_size = ggml_nbytes(tensor); + LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" 
PRId64 ", %" PRId64 "], type = %s\n", + prefix, ggml_n_dims(tensor), tensor->name, tensor_size, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); +} + +static projector_type clip_projector_type_from_string(const std::string & name) { + for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } + return PROJECTOR_TYPE_UNKNOWN; +} + +#ifdef CLIP_DEBUG_FUNCTIONS +static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} + +static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + 
file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// debug function to convert f32 to u8 +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} +#endif + + +// +// clip layers +// + +struct clip_hparams { + int32_t image_size; + int32_t patch_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + + float eps; + + char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) + + int32_t image_grid_pinpoints[32]; + int32_t image_crop_resolution; +}; + +struct clip_layer { + // attention + struct ggml_tensor * k_w; + struct ggml_tensor * k_b; + struct ggml_tensor * q_w; + struct ggml_tensor * q_b; + struct ggml_tensor * v_w; + struct ggml_tensor * v_b; + + struct ggml_tensor * o_w; + struct ggml_tensor * o_b; + + // layernorm 1 + struct ggml_tensor * ln_1_w; + struct ggml_tensor * ln_1_b; + + // ff + struct ggml_tensor * ff_i_w; + struct ggml_tensor * ff_i_b; + + struct ggml_tensor * ff_o_w; + struct ggml_tensor * ff_o_b; + + // layernorm 2 + struct ggml_tensor * ln_2_w; + struct ggml_tensor * ln_2_b; +}; + +struct clip_vision_model { + struct clip_hparams hparams; + + // embeddings + struct ggml_tensor * class_embedding; + struct ggml_tensor * patch_embeddings; + struct ggml_tensor * patch_bias; + struct ggml_tensor * position_embeddings; + + struct ggml_tensor * pre_ln_w; + struct ggml_tensor * pre_ln_b; + + std::vector layers; + + struct ggml_tensor * post_ln_w; + struct ggml_tensor * post_ln_b; + + struct ggml_tensor * projection; + + // LLaVA projection + struct ggml_tensor * mm_0_w = NULL; + struct ggml_tensor * mm_0_b = NULL; + struct ggml_tensor * mm_2_w = NULL; + struct ggml_tensor * mm_2_b = NULL; + + struct ggml_tensor * image_newline = NULL; + + // Yi type models with mlp+normalization projection + struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 + struct ggml_tensor * mm_1_b = NULL; + struct ggml_tensor * mm_3_w = NULL; + struct ggml_tensor * mm_3_b = NULL; + struct ggml_tensor * mm_4_w = NULL; + struct ggml_tensor * mm_4_b = NULL; + + // MobileVLM projection + struct ggml_tensor * mm_model_mlp_1_w; + struct ggml_tensor * mm_model_mlp_1_b; + struct ggml_tensor * mm_model_mlp_3_w; + struct ggml_tensor * mm_model_mlp_3_b; + struct ggml_tensor * mm_model_block_1_block_0_0_w; + struct ggml_tensor * mm_model_block_1_block_0_1_w; + struct ggml_tensor * mm_model_block_1_block_0_1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc1_w; + struct ggml_tensor * mm_model_block_1_block_1_fc1_b; + struct ggml_tensor * mm_model_block_1_block_1_fc2_w; + struct ggml_tensor * mm_model_block_1_block_1_fc2_b; + struct ggml_tensor * mm_model_block_1_block_2_0_w; + struct ggml_tensor * mm_model_block_1_block_2_1_w; + struct ggml_tensor * mm_model_block_1_block_2_1_b; + struct ggml_tensor * mm_model_block_2_block_0_0_w; + struct ggml_tensor * mm_model_block_2_block_0_1_w; + struct ggml_tensor * mm_model_block_2_block_0_1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc1_w; + struct ggml_tensor * mm_model_block_2_block_1_fc1_b; + struct ggml_tensor * mm_model_block_2_block_1_fc2_w; + struct ggml_tensor * mm_model_block_2_block_1_fc2_b; + 
struct ggml_tensor * mm_model_block_2_block_2_0_w; + struct ggml_tensor * mm_model_block_2_block_2_1_w; + struct ggml_tensor * mm_model_block_2_block_2_1_b; + + // MobileVLM_V2 projection + struct ggml_tensor * mm_model_mlp_0_w; + struct ggml_tensor * mm_model_mlp_0_b; + struct ggml_tensor * mm_model_mlp_2_w; + struct ggml_tensor * mm_model_mlp_2_b; + struct ggml_tensor * mm_model_peg_0_w; + struct ggml_tensor * mm_model_peg_0_b; + + // omni-vlm projection + struct ggml_tensor * mm_head_attn_in_w; // shape is [hidden_dim, 3*hidden_dim] + struct ggml_tensor * mm_head_attn_in_b; + struct ggml_tensor * mm_head_attn_out_w; + struct ggml_tensor * mm_head_attn_out_b; + struct ggml_tensor * mm_head_ln_w; + struct ggml_tensor * mm_head_ln_b; + struct ggml_tensor * mm_head_ffd_w; + struct ggml_tensor * mm_head_ffd_b; + struct ggml_tensor * mm_head_ffu_w; + struct ggml_tensor * mm_head_ffu_b; + struct ggml_tensor * mm_head_prob; + + // MINICPMV projection + struct ggml_tensor * mm_model_pos_embed_k; + struct ggml_tensor * mm_model_query; + struct ggml_tensor * mm_model_proj; + struct ggml_tensor * mm_model_kv_proj; + struct ggml_tensor * mm_model_attn_q_w; + struct ggml_tensor * mm_model_attn_q_b; + struct ggml_tensor * mm_model_attn_k_w; + struct ggml_tensor * mm_model_attn_k_b; + struct ggml_tensor * mm_model_attn_v_w; + struct ggml_tensor * mm_model_attn_v_b; + struct ggml_tensor * mm_model_attn_o_w; + struct ggml_tensor * mm_model_attn_o_b; + struct ggml_tensor * mm_model_ln_q_w; + struct ggml_tensor * mm_model_ln_q_b; + struct ggml_tensor * mm_model_ln_kv_w; + struct ggml_tensor * mm_model_ln_kv_b; + struct ggml_tensor * mm_model_ln_post_w; + struct ggml_tensor * mm_model_ln_post_b; +}; + +struct clip_ctx { + bool has_text_encoder = false; + bool has_vision_encoder = false; + bool has_llava_projector = false; + bool has_minicpmv_projector = false; + int minicpmv_version = 2; + + struct clip_vision_model vision_model; + projector_type proj_type = PROJECTOR_TYPE_MLP; + + float image_mean[3]; + float image_std[3]; + bool use_gelu = false; + int32_t ftype = 1; + + bool has_class_embedding = true; + bool has_pre_norm = true; + bool has_post_norm = false; + bool has_patch_bias = false; + + struct gguf_context * ctx_gguf; + struct ggml_context * ctx_data; + + std::vector buf_compute_meta; + + // memory buffers to evaluate the model + ggml_backend_buffer_t params_buffer = NULL; + + ggml_backend_t backend = NULL; + ggml_gallocr_t compute_alloc = NULL; + + struct clip_image_size * load_image_size; +}; + +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { + if (!ctx->has_vision_encoder) { + LOG_ERR("This gguf file seems to have no vision encoder\n"); + return nullptr; + } + + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + if (ctx->has_minicpmv_projector) { + if (load_image_size == nullptr) { + load_image_size = clip_image_size_init(); + } + LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); + image_size_width = load_image_size->width; + image_size_height = load_image_size->height; + if (is_inf) { + image_size_width = imgs->data->nx; + image_size_height = imgs->data->ny; + } + } + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / 
patch_size)); + const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + const int batch_size = imgs->size; + + if (ctx->has_llava_projector || ctx->has_minicpmv_projector) { + GGML_ASSERT(batch_size == 1); + } + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + if (ctx->has_patch_bias) { + // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); + inp = ggml_add(ctx0, inp, model.patch_bias); + } + struct ggml_tensor * embeddings = inp; + struct ggml_tensor * pos_embed = nullptr; + + if (ctx->has_llava_projector) { + // concat class_embeddings and patch_embeddings + if (ctx->has_class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + } + } + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + embeddings = + ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + if (ctx->has_minicpmv_projector) { + int pos_w = image_size_width/patch_size; + int pos_h = image_size_height/patch_size; + if (ctx->minicpmv_version == 2) { + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); + } + else if (ctx->minicpmv_version == 3) { + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); + } + ggml_set_name(pos_embed, "pos_embed"); + ggml_set_input(pos_embed); + } + + // pre-layernorm + if (ctx->has_pre_norm) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "pre_ln"); + + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); + } + + // loop over layers + // if (ctx->has_minicpmv_projector) { + // n_layer += 1; + // } + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + //const size_t nb_q_w = model.layers[il].q_w->nb[0]; + + // layernorm1 + { + cur = ggml_norm(ctx0, cur, eps); + + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), + model.layers[il].ln_1_b); + } + + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); + + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + Q = ggml_reshape_4d(ctx0, Q, d_head, 
n_head, num_positions, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * K = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + // KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrt((float)d_head)); + KQ = ggml_soft_max_inplace(ctx0, KQ); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = ggml_norm(ctx0, cur, eps); + + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); + } + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + + if (ctx->use_gelu) { + cur = ggml_gelu_inplace(ctx0, cur); + } else { + cur = ggml_gelu_quick_inplace(ctx0, cur); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // post-layernorm + if (ctx->has_post_norm) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); + } + + embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*9, embeddings->ne[1]/9, 1); + + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu_inplace(ctx0, embeddings); + + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + + ggml_build_forward_expand(gf, embeddings); + ggml_free(ctx0); + + return gf; +} + +// read and create ggml_context containing the tensors and their data +struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { + struct ggml_context * meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname, params); + if (!ctx) { + throw std::runtime_error(format("%s: failed to load CLIP model from %s. 
Does this file exist?\n", __func__, fname)); + } + + if (verbosity >= 1) { + const int n_tensors = gguf_get_n_tensors(ctx); + const int n_kv = gguf_get_n_kv(ctx); + const int ftype = get_u32(ctx, KEY_FTYPE); + const std::string ftype_str = get_ftype(ftype); + const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION); + const std::string description = gguf_get_val_str(ctx, idx_desc); + const int idx_name = gguf_find_key(ctx, KEY_NAME); + if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug + const std::string name = gguf_get_val_str(ctx, idx_name); + LOG_INF("%s: model name: %s\n", __func__, name.c_str()); + } + LOG_INF("%s: description: %s\n", __func__, description.c_str()); + LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_INF("%s: n_kv: %d\n", __func__, n_kv); + LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str()); + LOG_INF("\n"); + } + const int n_tensors = gguf_get_n_tensors(ctx); + + // kv + const int n_kv = gguf_get_n_kv(ctx); + LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + __func__, n_kv, n_tensors, fname); + { + std::map n_type; + + for (int i = 0; i < n_tensors; i++) { + enum ggml_type type = gguf_get_tensor_type(ctx, i); + + n_type[type]++; + } + + LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { + const char * name = gguf_get_key(ctx, i); + const enum gguf_type type = gguf_get_kv_type(ctx, i); + const std::string type_name = + type == GGUF_TYPE_ARRAY + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i)) + : gguf_type_name(type); + + std::string value = gguf_kv_to_str(ctx, i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); + + LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + } + + // print type counts + for (auto & kv : n_type) { + if (kv.second == 0) { + continue; + } + + LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + } + } + + // data + size_t model_size = 0; + { + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + const size_t offset = gguf_get_tensor_offset(ctx, i); + enum ggml_type type = gguf_get_tensor_type(ctx, i); + struct ggml_tensor * cur = ggml_get_tensor(meta, name); + size_t tensor_size = ggml_nbytes(cur); + model_size += tensor_size; + if (verbosity >= 3) { + LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); + } + } + } + + clip_ctx * new_clip = new clip_ctx{}; + + // update projector type + { + int idx = gguf_find_key(ctx, KEY_PROJ_TYPE); + if (idx != -1) { + const std::string proj_type = gguf_get_val_str(ctx, idx); + new_clip->proj_type = clip_projector_type_from_string(proj_type); + } else { + new_clip->proj_type = PROJECTOR_TYPE_MLP; + } + + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { + if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { + 
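+            // an mm.3.weight tensor is only present in the Yi-style MLP projector that adds normalization layers, so switch the projector type accordingly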
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; + } + } + } + +#ifdef GGML_USE_CUDA + new_clip->backend = ggml_backend_cuda_init(0); + LOG_INF("%s: CLIP using CUDA backend\n", __func__); +#endif + +#ifdef GGML_USE_METAL + new_clip->backend = ggml_backend_metal_init(); + LOG_INF("%s: CLIP using Metal backend\n", __func__); +#endif + +#ifdef GGML_USE_CANN + new_clip->backend = ggml_backend_cann_init(0); + LOG_INF("%s: CLIP using CANN backend\n", __func__); +#endif + +#ifdef GGML_USE_VULKAN + new_clip->backend = ggml_backend_vk_init(0); + LOG_INF("%s: CLIP using Vulkan backend\n", __func__); +#endif + + if (!new_clip->backend) { + new_clip->backend = ggml_backend_cpu_init(); + LOG_INF("%s: CLIP using CPU backend\n", __func__); + } + + // model size and capabilities + { + int idx; + // int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); + // new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx); + new_clip->has_text_encoder = false; + + // idx = get_key_idx(ctx, KEY_HAS_VIS_ENC); + // new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx); + new_clip->has_vision_encoder = true; + + // idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ); + // if (idx != -1) { + // new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); + // } + new_clip->has_llava_projector = true; + + // idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ); + // if (idx != -1) { + // new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); + // } + new_clip->has_minicpmv_projector = false; + + // idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION); + // if (idx != -1) { + // new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx); + // } + + // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + + // TODO: adjust for omni + new_clip->has_class_embedding = false; + + GGML_ASSERT(new_clip->has_vision_encoder); + GGML_ASSERT(!new_clip->has_text_encoder); + + // idx = get_key_idx(ctx, KEY_USE_GELU); + // new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + new_clip->use_gelu = true; // TODO: to be confirmed + + if (verbosity >= 1) { + LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + } + } + + LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); + + // load tensors + { + std::vector read_buf; + struct ggml_init_params params = { + /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + new_clip->ctx_data = ggml_init(params); + if (!new_clip->ctx_data) { + LOG_ERR("%s: ggml_init() failed\n", __func__); + clip_free(new_clip); + gguf_free(ctx); + return nullptr; + } + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + LOG_ERR("cannot open model file for loading tensors\n"); + clip_free(new_clip); + gguf_free(ctx); + return nullptr; + } + + // add tensors to context + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * t = ggml_get_tensor(meta, name); + struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t); + 
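+            // ggml_dup_tensor only creates the tensor metadata in ctx_data (no_alloc is true); the weight data is read from the file into the backend buffer in the loop below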
ggml_set_name(cur, name); + } + + // alloc memory and offload data + new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name); + const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); + fin.seekg(offset, std::ios::beg); + if (!fin) { + LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name); + clip_free(new_clip); + gguf_free(ctx); + return nullptr; + } + int num_bytes = ggml_nbytes(cur); + if (ggml_backend_buffer_is_host(new_clip->params_buffer)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + fin.close(); + } + + // vision model + if (new_clip->has_vision_encoder) { + // load vision model + auto & vision_model = new_clip->vision_model; + auto & hparams = vision_model.hparams; + hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision")); + hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision")); + hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision")); + hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision")); + hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE); + hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); + hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); + // hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + hparams.eps = 1e-06f; + + /* + try { + int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); + int n = gguf_get_arr_n(ctx, idx); + const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); + for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) { + hparams.image_grid_pinpoints[i] = pinpoints[i]; + } + if (n < 32) + hparams.image_grid_pinpoints[n] = 0; + } catch (std::runtime_error & ) { + hparams.image_grid_pinpoints[0]=0; + } + + try { + int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); + strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); + } catch (std::runtime_error & ) { + strcpy(hparams.mm_patch_merge_type, "flat"); + } + + try { + hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 + } catch(const std::exception& ) { + hparams.image_crop_resolution = hparams.image_size; + } + */ + + int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); + int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); + const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); + + for (int i = 0; i < 3; ++i) { + new_clip->image_mean[i] = mean_data[i]; + new_clip->image_std[i] = std_data[i]; + } + + if (verbosity >= 2) { + LOG_INF("\n%s: vision model hparams\n", __func__); + LOG_INF("image_size %d\n", hparams.image_size); + LOG_INF("patch_size %d\n", hparams.patch_size); + LOG_INF("v_hidden_size %d\n", hparams.hidden_size); + LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_INF("v_projection_dim %d\n", hparams.projection_dim); + LOG_INF("v_n_head %d\n", hparams.n_head); + LOG_INF("v_n_layer %d\n", hparams.n_layer); + LOG_INF("v_eps %f\n", hparams.eps); + LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], 
new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_INF("v_image_grid_pinpoints: "); + for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { + LOG_INF("%d ", hparams.image_grid_pinpoints[i]); + } + LOG_INF("\n"); + LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + + } + + try { + vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); + new_clip->has_class_embedding = true; + } catch (const std::exception& /*e*/) { + new_clip->has_class_embedding = false; + } + + try { + vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + new_clip->has_pre_norm = true; + } catch (std::exception & /*e*/) { + new_clip->has_pre_norm = false; + } + + try { + vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); + vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); + new_clip->has_post_norm = true; + } catch (std::exception & /*e*/) { + new_clip->has_post_norm = false; + } + + try { + vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS); + new_clip->has_patch_bias = true; + } catch (std::exception & /*e*/) { + new_clip->has_patch_bias = false; + } + + try { + vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); + } catch(const std::exception& /*e*/) { + LOG_ERR("%s: failed to load vision model tensors\n", __func__); + } + + // LLaVA projection + if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { + vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); + try { + // Yi-type llava + vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); + vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); + } catch (std::runtime_error & /*e*/) { } + try { + // missing in Yi-type llava + vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + } catch (std::runtime_error & /*e*/) { } + try { + // Yi-type llava + vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); + vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); + } catch (std::runtime_error & /*e*/) { } + try { + // Yi-type llava + vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); + vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); + } catch (std::runtime_error & /*e*/) { } + try { + vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); + // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__); + } catch (std::runtime_error & /*e*/) { } + // TODO: omni-vlm patch code here: + vision_model.mm_head_attn_in_w = get_tensor(new_clip->ctx_data, "v.head.attention.in_weight"); + vision_model.mm_head_attn_in_b = get_tensor(new_clip->ctx_data, "v.head.attention.in_bias"); + vision_model.mm_head_attn_out_w = 
get_tensor(new_clip->ctx_data, "v.head.attention.out.weight"); + vision_model.mm_head_attn_out_b = get_tensor(new_clip->ctx_data, "v.head.attention.out.bias"); + vision_model.mm_head_ln_w = get_tensor(new_clip->ctx_data, "v.head.ln.weight"); + vision_model.mm_head_ln_b = get_tensor(new_clip->ctx_data, "v.head.ln.bias"); + vision_model.mm_head_ffd_w = get_tensor(new_clip->ctx_data, "v.head.ffn_down.weight"); + vision_model.mm_head_ffd_b = get_tensor(new_clip->ctx_data, "v.head.ffn_down.bias"); + vision_model.mm_head_ffu_w = get_tensor(new_clip->ctx_data, "v.head.ffn_up.weight"); + vision_model.mm_head_ffu_b = get_tensor(new_clip->ctx_data, "v.head.ffn_up.bias"); + vision_model.mm_head_prob = get_tensor(new_clip->ctx_data, "v.head.probe"); + + } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projection + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = 
get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } + else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2) + { + // MobilVLM_V2 projection + vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight")); + vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias")); + vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight")); + vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias")); + vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight")); + vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias")); + } + else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) { + // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K); + vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY); + vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ); + vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ); + vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight")); + vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight")); + vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight")); + vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias")); + vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias")); + vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias")); + vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight")); + vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias")); + vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight")); + vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias")); + vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight")); + vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias")); + vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); + vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + } + + vision_model.layers.resize(hparams.n_layer); + + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = vision_model.layers[il]; + layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight")); + layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight")); + layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight")); + layer.o_w = 
get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight")); + layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight")); + layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight")); + layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight")); + layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight")); + layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias")); + layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias")); + layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias")); + layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias")); + layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias")); + layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias")); + layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias")); + layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias")); + } + } + + ggml_free(meta); + + new_clip->ctx_gguf = ctx; + + // measure mem requirement and allocate + { + new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); + new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); + clip_image_f32_batch batch; + batch.size = 1; + ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); + ggml_gallocr_reserve(new_clip->compute_alloc, gf); + size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); + LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + } + + return new_clip; +} + +void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { + ctx_clip->load_image_size = load_image_size; +} + +struct clip_image_size * clip_image_size_init() { + struct clip_image_size * load_image_size = new struct clip_image_size(); + load_image_size->width = 448; + load_image_size->height = 448; + return load_image_size; +} + +struct clip_image_u8 * clip_image_u8_init() { + return new clip_image_u8(); +} + +struct clip_image_f32 * clip_image_f32_init() { + return new clip_image_f32(); +} + +void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } +void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; + } +} +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; + } +} + +static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { + img->nx = nx; + img->ny = ny; + img->buf.resize(3 * nx * ny); + memcpy(img->buf.data(), data, img->buf.size()); +} + +bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { + int nx, ny, nc; + auto * data = stbi_load(fname, &nx, &ny, &nc, 3); + if (!data) { + LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); + return false; + } + build_clip_img_from_data(data, nx, ny, img); + stbi_image_free(data); + return true; +} + +bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { + int nx, ny, nc; + auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, 
&nc, 3); + if (!data) { + LOG_ERR("%s: failed to decode image bytes\n", __func__); + return false; + } + build_clip_img_from_data(data, nx, ny, img); + stbi_image_free(data); + return true; +} + +// Linear interpolation between two points +inline float clip_lerp(float s, float e, float t) { + return s + (e - s) * t; +} +// Bilinear resize function +static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast(src.nx - 1) / target_width; + float y_ratio = static_cast(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast(px); + int y_floor = static_cast(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = clip_lerp( + static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = clip_lerp( + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast(clip_lerp(top, bottom, y_lerp)); + } + } + } +} + +// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not +static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { + dst->nx = src->nx; + dst->ny = src->ny; + dst->buf.resize(src->buf.size()); + + for (size_t i = 0; i < src->buf.size(); ++i) { + int c = i % 3; // rgb + dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; + } +} + +inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); +} + +static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + 
k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; +} + +// llava-1.6 type of resize_and_pad (black) +static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { + int target_width = target_resolution.first; + int target_height = target_resolution.second; + + float scale_w = static_cast(target_width) / image.nx; + float scale_h = static_cast(target_height) / image.ny; + + int new_width, new_height; + + if (scale_w < scale_h) { + new_width = target_width; + new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); + } else { + new_height = target_height; + new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); + } + + clip_image_u8 resized_image; + // bilinear_resize(image, resized_image, new_width, new_height); + bicubic_resize(image, resized_image, new_width, new_height); + + clip_image_u8 padded_image; + padded_image.nx = target_width; + padded_image.ny = target_height; + padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black + + // Calculate padding offsets + int pad_x = (target_width - new_width) / 2; + int pad_y = (target_height - new_height) / 2; + + // Copy the resized image into the center of the padded buffer + for (int y = 0; y < new_height; ++y) { + for (int x = 0; x < new_width; ++x) { + for (int c = 0; c < 3; ++c) { + padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; + } + } + } + image_output = std::move(padded_image); +} + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
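+ * The best fit maximizes the effective (downscaled) resolution; ties are broken by preferring the candidate that wastes the least area.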
+ */ +static std::pair select_best_resolution(const std::pair & original_size, const std::vector> & possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + std::pair best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); + int downscaled_width = static_cast(original_width * scale); + int downscaled_height = static_cast(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} + +static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { + std::vector patches; + int width = image.nx; + int height = image.ny; + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + clip_image_u8 *patch = clip_image_u8_init(); + patch->nx = std::min(patch_size, width - j); + patch->ny = std::min(patch_size, height - i); + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = 0; y < patch->ny; ++y) { + for (int x = 0; x < patch->nx; ++x) { + for (int c = 0; c < 3; ++c) { + patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c]; + } + } + } + patches.push_back(patch); + } + } + return patches; +} + +static int ensure_divide(int length, int patch_size) { + return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); +} + +static std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.first; + int height = original_size.second; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + int best_width = ensure_divide(width, patch_size); + int best_height = ensure_divide(height, patch_size); + return std::make_pair(best_width, best_height); +} + +static std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width, height; + std::tie(width, height) = original_size; + int grid_x, grid_y; + std::tie(grid_x, grid_y) = grid; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + int grid_width = refine_width / grid_x; + int grid_height = refine_height / grid_y; + + // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) + auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, 
patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair + int best_grid_width, best_grid_height; + std::tie(best_grid_width, best_grid_height) = best_grid_size; + + // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) + std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) + return refine_size; +} + +static std::pair uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + std::vector> candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.emplace_back(m, split_grids_nums / m); + } + ++m; + } + } + + std::pair best_grid{1, 1}; + float min_error = std::numeric_limits::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; +} + +// inspired from LLaVA-UHD: +// -> https://arxiv.org/pdf/2403.11703 +// -> https://github.com/thunlp/LLaVA-UHD +// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 +static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { + const std::pair original_size={img->nx,img->ny}; + const int original_width = img->nx; + const int original_height = img->ny; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + std::vector> images; + LOG_INF("%s: multiple %d\n", __func__, multiple); + images.push_back(std::vector()); + + if (multiple <= 1) { + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); + clip_image_u8 * source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.resize(best_size, Image.Resampling.BICUBIC) + images[images.size()-1].push_back(source_image); + } + else if (multiple > 1) { + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); + clip_image_u8 * source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); + images[images.size()-1].push_back(source_image); + + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + + auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); + clip_image_u8 * refine_image = clip_image_u8_init(); + bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); + + LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, 
refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); + + // split_to_patches + int width = refine_image->nx; + int height = refine_image->ny; + int grid_x = int(width / best_grid.first); + int grid_y = int(height / best_grid.second); + for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ + images.push_back(std::vector()); + for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ + clip_image_u8 * patch = clip_image_u8_init(); + patch->nx = grid_x; + patch->ny = grid_y; + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = patches_i; y < patches_i + grid_y; ++y) { + for (int x = patches_j; x < patches_j + grid_x; ++x) { + const int i = 3 * (y * refine_image->nx + x); + const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j)); + patch->buf[j] = refine_image->buf[i]; + patch->buf[j+1] = refine_image->buf[i+1]; + patch->buf[j+2] = refine_image->buf[i+2]; + } + } + images[images.size()-1].push_back(patch); + } + } + } + return images; +} + +int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { + const int max_slice_nums=9; + const int scale_resolution=448; + const int original_width = ctx_clip->load_image_size->width; + const int original_height = ctx_clip->load_image_size->height; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + return best_grid.first; +} + +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { + // develop for omni-vlm, WIP + // TODO: a better structure + res_imgs->data = new clip_image_f32[1]; + res_imgs->size = 1; + + clip_image_u8 sampled_img; + bool ret = bicubic_resize(*img, sampled_img, ctx->vision_model.hparams.image_size, ctx->vision_model.hparams.image_size); + + clip_image_f32 img_f32; + normalize_image_u8_to_f32(&sampled_img, &img_f32, ctx->image_mean, ctx->image_std); + + res_imgs->data[0] = img_f32; + + return true; + +} + +ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + +void clip_free(clip_ctx * ctx) { + ggml_free(ctx->ctx_data); + gguf_free(ctx->ctx_gguf); + + ggml_backend_buffer_free(ctx->params_buffer); + ggml_backend_free(ctx->backend); + ggml_gallocr_free(ctx->compute_alloc); + delete ctx; +} + +size_t clip_embd_nbytes(const struct clip_ctx * ctx) { + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + +int32_t clip_image_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_size; +} + +int32_t clip_patch_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.patch_size; +} + +int32_t clip_hidden_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.hidden_size; +} + +const char * clip_patch_merge_type(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.mm_patch_merge_type; +} + +const int32_t * clip_image_grid(const struct clip_ctx * ctx) { + return 
ctx->vision_model.hparams.image_grid_pinpoints; +} + +int clip_n_patches(const struct clip_ctx * ctx) { + const auto & params = ctx->vision_model.hparams; + + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + + if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { + n_patches /= 4; + } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + if (ctx->minicpmv_version == 2) { + n_patches = 96; + } + else if (ctx->minicpmv_version == 3) { + n_patches = 64; + } + } + + return n_patches; +} + +static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { + assert(embed_dim % 2 == 0); + int H = pos.size(); + int W = pos[0].size(); + + std::vector omega(embed_dim / 2); + for (int i = 0; i < embed_dim / 2; ++i) { + omega[i] = 1.0 / pow(10000.0, static_cast(i) / (embed_dim / 2)); + } + + std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + float out_value = pos[h][w] * omega[d]; + emb[h][w][d] = sin(out_value); + emb[h][w][d + embed_dim / 2] = cos(out_value); + } + } + } + + return emb; +} + +static std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>> & grid) { + assert(embed_dim % 2 == 0); + std::vector>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) + std::vector>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) + + int H = emb_h.size(); + int W = emb_h[0].size(); + std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); + + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + emb[h][w][d] = emb_h[h][w][d]; + emb[h][w][d + embed_dim / 2] = emb_w[h][w][d]; + } + } + } + return emb; +} + +static std::vector> get_2d_sincos_pos_embed(int embed_dim, const std::pair image_size) { + int grid_h_size = image_size.first; + int grid_w_size = image_size.second; + + std::vector grid_h(grid_h_size); + std::vector grid_w(grid_w_size); + + for (int i = 0; i < grid_h_size; ++i) { + grid_h[i] = static_cast(i); + } + for (int i = 0; i < grid_w_size; ++i) { + grid_w[i] = static_cast(i); + } + + std::vector> grid(grid_h_size, std::vector(grid_w_size)); + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid[h][w] = grid_w[w]; + } + } + std::vector>> grid_2d = {grid, grid}; + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid_2d[0][h][w] = grid_h[h]; + grid_2d[1][h][w] = grid_w[w]; + } + } + + std::vector>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d); + + int H = image_size.first; + int W = image_size.second; + std::vector> pos_embed_2d(H * W, std::vector(embed_dim)); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; + } + } + + return pos_embed_2d; +} + +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { + if (!ctx->has_vision_encoder) { + LOG_ERR("This gguf file seems to have no vision encoder\n"); + return false; + } + + clip_image_f32_batch imgs{}; + imgs.size = 1; + imgs.data = img; + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); +} + +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { + if (!ctx->has_vision_encoder) { + 
LOG_ERR("This gguf file seems to have no vision encoder\n"); + return false; + } + + int batch_size = imgs->size; + if (ctx->has_llava_projector) { + GGML_ASSERT(batch_size == 1); // TODO: support multiple images + } + if (ctx->has_minicpmv_projector) { + GGML_ASSERT(batch_size == 1); + } + + // build the inference graph + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); + ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); + + // set inputs + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + if (ctx->has_minicpmv_projector) { + image_size_width = imgs->data[0].nx; + image_size_height = imgs->data[0].ny; + } + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); + if(ctx->load_image_size==nullptr){ + ctx->load_image_size= clip_image_size_init(); + } + const int pos_w = ctx->load_image_size->width/patch_size; + const int pos_h = ctx->load_image_size->height/patch_size; + + { + struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); + float * data = (float *)malloc(ggml_nbytes(inp_raw)); + + for (size_t i = 0; i < imgs->size; i++) { + const int nx = imgs->data[i].nx; + const int ny = imgs->data[i].ny; + if (!ctx->has_minicpmv_projector) { + GGML_ASSERT(nx == image_size && ny == image_size); + } + + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + for (int k = 0; k < 3; k++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k]; + } + } + } + } + } + + // std::ifstream ifs("/Users/liwei/repo/llama-cpp-experiments/huggingface/pixel_values"); + // ifs.read(reinterpret_cast(data), 384*384*3*sizeof(float)); + // cout << "\t\tGGGET" << endl; + // ifs.close(); + + ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); + free(data); + } + if (ctx->has_minicpmv_projector) { + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + int bucket_coords_h[70]; + int bucket_coords_w[70]; + for (int i = 0; i < pos_h; i++){ + bucket_coords_h[i] = std::floor(70.0*i/pos_h); + } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + + { + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); + int embed_dim = 4096; + if (ctx->minicpmv_version == 2) { + embed_dim = 4096; + } + else if (ctx->minicpmv_version == 3) { + embed_dim = 3584; + } + auto pos_embed_t = 
get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + + float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); + for(int i=0;ihas_class_embedding) { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + + void* zero_mem = malloc(ggml_nbytes(embeddings)); + memset(zero_mem, 0, ggml_nbytes(embeddings)); + ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); + free(zero_mem); + } + } + + { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = i; + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + + /* + { + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { + patches_data[i] = i + 1; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); + } + */ + + } + + if (ggml_backend_is_cpu(ctx->backend)) { + ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); + } + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(ctx->backend)) { + ggml_backend_metal_set_n_cb(ctx->backend, n_threads); + } +#endif + + ggml_backend_graph_compute(ctx->backend, gf); + + // the last node is the embedding tensor + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + + // copy the embeddings to the location passed by the user + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + + return true; +} + +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { + ggml_type type = GGML_TYPE_Q4_1; + + assert(itype < GGML_TYPE_COUNT); + type = static_cast(itype); + + auto * ctx_clip = clip_model_load(fname_inp, 2); + + const auto & ctx_src = ctx_clip->ctx_gguf; + const auto & ctx_data = ctx_clip->ctx_data; + + auto * ctx_out = gguf_init_empty(); + gguf_set_kv(ctx_out, ctx_src); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", itype); + + auto fout = std::ofstream(fname_out, std::ios::binary); + + const int n_tensors = gguf_get_n_tensors(ctx_src); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + gguf_add_tensor(ctx_out, cur); + } + + const size_t meta_size = gguf_get_meta_size(ctx_out); + for (size_t i = 0; i < meta_size; ++i) { + fout.put(0); + } + + // regexes of tensor names to be quantized + const std::vector k_names = { + ".*weight", + }; + + std::vector work(512); + std::vector conv_buf(512); + size_t total_size_org = 0; + size_t total_size_new = 0; + + for (int i = 0; i < n_tensors; ++i) { + const std::string name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + + bool quantize = false; + for (const auto & s : k_names) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // quantize only 2D tensors + quantize &= (ggml_n_dims(cur) == 2); + + if (quantize) { + new_type = type; + if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { + new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type + // LOG_ERR("%s: quantizing %s to %s\n", __func__, 
name.c_str(), ggml_type_name(new_type)); + } + const size_t n_elms = ggml_nelements(cur); + float * f32_data; + + switch (cur->type) { + case GGML_TYPE_F32: + f32_data = (float *)cur->data; + break; + case GGML_TYPE_F16: + if (conv_buf.size() < n_elms) { + conv_buf.resize(n_elms); + } + for (size_t j = 0; j < n_elms; ++j) { + conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]); + } + f32_data = (float *)conv_buf.data(); + break; + default: + LOG_ERR("Please use an input file in f32 or f16\n"); + gguf_free(ctx_out); + return false; + } + + if (work.size() < n_elms * 4) { + work.resize(n_elms * 4); + } + new_data = work.data(); + + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr); + } else { + new_type = cur->type; + new_data = cur->data; + new_size = ggml_nbytes(cur); + } + const size_t orig_size = ggml_nbytes(cur); + total_size_org += orig_size; + total_size_new += new_size; + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + fout.write((const char *)new_data, new_size); + size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; + for (size_t j = 0; j < pad; ++j) { + fout.put(0); + } + + LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + } + + // go back to beginning of file and write the updated metadata + fout.seekp(0, std::ios::beg); + std::vector meta(meta_size); + gguf_get_meta_data(ctx_out, meta.data()); + fout.write((const char *)meta.data(), meta_size); + + fout.close(); + + clip_free(ctx_clip); + gguf_free(ctx_out); + + { + LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + } + + return true; +} + +int clip_n_mmproj_embd(const struct clip_ctx * ctx) { + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + return ctx->vision_model.mm_2_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + return ctx->vision_model.mm_3_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + if (ctx->minicpmv_version == 2) { + return 4096; + } + else if (ctx->minicpmv_version == 3) { + return 3584; + } + } + + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); +} + +int clip_is_minicpmv(const struct clip_ctx * ctx) { + if (ctx->has_minicpmv_projector) { + return ctx->minicpmv_version; + } + return 0; +} diff --git a/examples/omni-vlm-v2/clip-v2.h b/examples/omni-vlm-v2/clip-v2.h new file mode 100644 index 000000000..78588bdf1 --- /dev/null +++ b/examples/omni-vlm-v2/clip-v2.h @@ -0,0 +1,94 @@ +#ifndef CLIP_H +#define CLIP_H + +#include +#include + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define CLIP_API __declspec(dllexport) +# else +# define CLIP_API __declspec(dllimport) +# endif +# else +# define CLIP_API __attribute__ ((visibility ("default"))) +# endif +#else +# define CLIP_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +struct clip_ctx; + +struct clip_image_size { + int width; + 
int height; +}; + +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; +}; + +CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); +CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); + +CLIP_API void clip_free(struct clip_ctx * ctx); + +CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); + +CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); + +// TODO: should be enum, not string +CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); + +CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); + +CLIP_API int clip_n_patches (const struct clip_ctx * ctx); +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); + +CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); +CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); + +CLIP_API struct clip_image_size * clip_image_size_init(); +CLIP_API struct clip_image_u8 * clip_image_u8_init (); +CLIP_API struct clip_image_f32 * clip_image_f32_init(); + +CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); +CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); + +CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); + +/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ +CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + +/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); + +CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); + +CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); +CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); + +CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); + +CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); + +#ifdef __cplusplus +} +#endif + +#endif // CLIP_H diff --git a/examples/omni-vlm-v2/convert_image_encoder_to_gguf.py b/examples/omni-vlm-v2/convert_image_encoder_to_gguf.py new file mode 100644 index 000000000..bdc0d68ab --- /dev/null +++ b/examples/omni-vlm-v2/convert_image_encoder_to_gguf.py @@ -0,0 +1,208 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * +# from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel + +VISION = "siglip.vision" + + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_omni_vlm: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + # if 
name.startswith("vision_model.post_layernorm") or name.startswith("vision_model.head"): + # return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "multi_modal_projector" in name: + name = name.replace("multi_modal_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("-p", "--processor-dir", help="Path to vlm-processor directory cloned from HF Hub", required=True) +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +# TODO: whether update this info? +# default_image_mean = [0.48145466, 0.4578275, 0.40821073] +# default_image_std = [0.26862954, 0.26130258, 0.27577711] +default_image_mean = [0.5, 0.5, 0.5] +default_image_std = [0.5, 0.5, 0.5] + +# with proper +args = ap.parse_args() + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir +dir_processor = args.processor_dir + +with open(dir_processor + "/preprocessor_config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +has_omni_vlm_projector = True +fname_middle = "mmproj-" +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) + +fname_out = os.path.join(output_dir, f"{fname_middle}omni-vlm-v2-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="siglip") + +fout.add_bool("siglip.has_omni_vlm_projector", has_omni_vlm_projector) +fout.add_file_type(ftype) +fout.add_name("omni-vlm") +fout.add_description("image encoder for omni-vlm") + + +fout.add_uint32("siglip.vision.image_size", 384) +fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), 1152) +fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), 16) #TODO: to be confirmed +fout.add_uint32("siglip.vision.patch_size", 14) +# block_count = (27 - 1) +block_count = 27 +fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + +fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 4304) +fout.add_uint32("siglip.vision.projection_dim", 4096) + # fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + # if "image_grid_pinpoints" in v_hparams: + # # flatten it + # image_grid_pinpoints = [] + # for pinpoint in v_hparams["image_grid_pinpoints"]: + # for p in pinpoint: + # image_grid_pinpoints.append(p) +# 
fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) +# if "image_crop_resolution" in v_hparams: +# fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) +# if "image_aspect_ratio" in v_hparams: +# fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) +# if "image_split_resolution" in v_hparams: +# fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) +# if "mm_patch_merge_type" in v_hparams: +# fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) +# if "mm_projector_type" in v_hparams: +# fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) +# +# +# if processor is not None: +# image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue] +# image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue] +# else: +# image_mean = args.image_mean if args.image_mean is not None else default_image_mean +# image_std = args.image_std if args.image_std is not None else default_image_std +# fout.add_array("clip.vision.image_mean", image_mean) +# fout.add_array("clip.vision.image_std", image_std) +# +fout.add_array("siglip.vision.image_mean", default_image_mean) +fout.add_array("siglip.vision.image_std", default_image_std) + +# use_gelu = v_hparams["hidden_act"] == "gelu" +# fout.add_bool("clip.use_gelu", use_gelu) + +model = torch.load(os.path.join(dir_model, "omni_vlm.clip"), map_location='cpu') + # model.vision_model.encoder.layers.pop(-1) +projector = torch.load(os.path.join(dir_model, "omni_vlm.projector"), map_location='cpu') +for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().cpu().numpy().astype(np.float16) + else: + data = data.squeeze().cpu().numpy().astype(np.float32) + + fout.add_tensor(name, data) + +print("Projector tensors added\n") + + +# state_dict = model.state_dict() +state_dict = dict(model) +for name, data in state_dict.items(): + if should_skip_tensor(name, False, True, True): + # we don't need this + print(f"skipping parameter: {name}") + continue + + # if name.startswith(f"vision_model.encoder.layers.{block_count}"): + # continue + + name = get_tensor_name(name) + # data = data.astype(np.float16) + # print(data) + data = data.squeeze().float().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. 
Output file: " + fname_out) diff --git a/examples/omni-vlm-v2/latex.png b/examples/omni-vlm-v2/latex.png new file mode 100644 index 0000000000000000000000000000000000000000..b97318fc049a3fd4822604db547ac71520a104e3 GIT binary patch literal 5931 zcmZWtcQ{;q(^iAkMeij#yLvCNTJ+wcmnFnny$hoE8U$grXhC$*SKS&diyad*;mjJLjKC(AQNb2G9dAFffQU9;q1K>DD`O!M}HRSF6BI z!N9<@a8*{;*HBhw)%S)uyShU#FnBWj(i^qFUX(G4<75+6C8bJ(Z21C1jF0Dro?|r* zxrOvP9td zIEiXV?O~4IuP;oBcgZDfVz|}r~A?bDy zROdozq1g4<2WlnYzGF-GP3lG_(^3v=~P9Chb|O-p6L<3sGHLRmbUI$a&1NBfV!^`lC@AXhn3duM&<3*$Y%bE zm*{C1CSsSMFunyRJ)Gd3oKrE9dilTxi}~FQNDs^Sbx4Pp0>&!Zaq^lV&)|-0Q;4~S zvyKi1-<^$*ff?tDfpceJ-UzX^J~hfn)s>ZG~h1W4ruGR5DW5(74k^ zj@}T6htCt3Z;Dml#9gU*S7UQua~*9NM;H`n?*wyz00W_(e;^ogfiia{6yj^o8VGgw z@R13WXa6fh=Fa}(2C=jLmE!wUp50tWpH&&=4Pli43IT=K6#%TPta9E?&N7B7YJbD; zp5)n|`1*RvfItBO0l)xJAk5nZBrGj04H6OoiHHc^WeEBNdHC7~3VQf({HMtO)=`1@ zIC{H!`ntkASpU?ucYyi%%CocoY4p$apZA0Wy8gG5htJ<*-3jr>>S{~}F%Al}L_=v_-+ zh5w$+-{5}-{|3r|{!IQaCjLY7U*0>O6##Oef6hz+u;+D@je&7rLqkQ$I1qEsn$XJF zRfHZ0Rnsdym-J;S{f3-SW*#v>4zP&ZGvSbvJxahl1)>yBPGw84b&OdS`}u z2M_ly^F-3nh>M)Kgu(Pgc2;ysqkg!0O zDY`Nq8>SsH?$MSh#2I=1aqECPFoE?WGdg~|wj@w;ftbmXZI<}@bzc&Gxb)b0{cz@i zx4o`T>7b{?*!z3mPeM0?( zvjx1yRa0;6SncxGF-(NSx*o=f=^G2Kc7LQ;GKef-XIJA|c6CxHO5uRxMb;k{&_*J^ z1($3bS zjWXW2hC&qNUM#G}LAeIV?XfQplJRzAMn6bnIjgU!DP(fkBD5@-hpsNBgO> z4fnyi>D}YetmV_6|4cKjLp;K-cdjIv(x4y5uimziWMb)A+Hz0akCfeR(0C$G3O_Nm z?iGQ(!H3n?swH+~-tPF`&^^W6FLrtwG|A$Q=VKasRX@ZAft@W3#VSK38>GjLCk5wh z2Z7jTvpJhcxc-k{PT2F)hf0E%iXpY!r8k}A`__e?%BOl0^h~jD2M-P%zmWMtO&elo z*2x!+7Z|&JR&kwvbj|#Db7+(@c75Bg^eM=_1)Utnd-wg=;DO;g_(+}5lpIJ-X6HML z9k6EpmQK37Uadd(>|)lup+7V#x-*blkbaXbxrn^GjHlySb~Qwv<&g(F$U zoa@hJhddLdZ-y`fmR7%jjW=d!$OA(@@DikVz#DeMD87^@c^YbsjM;Nm2hx}xFeb*_ z9=h)xOSGeJUFCKUmhxsX&zo;Sq@Pn*I&sz!s?!5?qZPe0m>Jb@)Yi+o^MWjNuzCB3 zd^fNCIKhsqtIO@$?~yQrth)E8Nx23MU%K@)g;y;Teqi-;p*Kq-3hhpfr?$FomhBOf zy9W^=zjX$#Y)IREWm|$@+&-U32A@q6PRp$mZ^gGEM~zQBY_l`%*O!~GZMLRGX+UWvoj5qZ zv#0(bpO?c1>RpuPesyk}bYtaiwOIFkJ2>X!X*VtHoPB-q0Ziy=C`(H^=|ZxOZVbDq z77YKr0~D}sVltaBYaHdg`%8#QdT8KkrxlEhGWltV41-^a&pJ{?<3#$;USq6a((>%+ zt@b1L5*C&b$nFnIkExQcw#p?aq()GtY5dh%Lg){KnAou#0t%Py7Yu%tN5}J<#Lj01 zV74tiGZw_~WFI2rzWm~x;OcOBy^_5fW6rI4l?AJvMd{n>HJI7Pq7lTjYP=~Yui->% z;ZleFEO+&>1F`XHgL>iw4cLj~eU#2v`3*#{zE2z*@d`-`${*)Tka*z7>3x4y{eo{W zOF0~*a0oNS-&Z=YC*|V};Ae}%Q=7cOFV=RD2bydwoxQj?Cbco| zT<0GG(&B!edf+beW{)VZ%P=ptVda`kQJy&gRkB<+;~z^t5HFu7&NKFCa7Vl#-Am(e zpm3exgwMHGfl)K#v}I%GEwfCJu+3FDe~w2%T8U}bY_)y$m*d5?W(n$MyrrfT|n_~ud`D{EBCE*%b)ccSV9PI)TfQPktaiw%0vn^~T;Z1&hM}v<#Dr|Ut?o%;> z4D!ie{fUkvUE5Scq3*B_mn6~a&jsGqiFFU%%^kM&5T$1Z73G5-hCy53#8x7_|d=>UBcK7{AIt| z)0WOLlia;hRfHU`&>8;R4n>o1=Wq6k2?jL}5VyXKbT+7>v5MkBao6d z7fGGDTAq~9koCq3vqM0t=qLJ%I$ec2Zl?R1gJ!L}(L(7~Eg{8|=Sy*~9~42~@vM}# z{v51}jQE15%7;!9EQnEEWu}!rWZTkeJZ(u!RleYXUErv$9aHbRBA4{Hri+#nryZso zeiV+<&YE;Z8)(n}bk&uQI-FYc4!zD5zxbYLj-foh+yC?}8P;rlVKAT8vjR$RgFtvx zCQx#*oudwe%LH7V5kGLa*(dUlsn4}RO? 
zWMw@OOWD|{Enqw~b=M6w_vs6#34fP?%ff|_gN1KXnB=ILyqC}8G0!C`WTjJ(O=Zwi zSoBz#>qTSsr19xN%>KPWP8A|_`?{lcFAsS{(gMMCe5cI|{OBd}_Y1Eu$w7^`%8XoruF*EmCyGqj1*|Sa(6w9>Y=okn6|tbP zk62AeLNF~hq;PVy&F zAE|gyCaycV#D_v9c%nl%BT2MQ@u%~dR99DLp>N|%%H}>Ce3>s+_wSt1V~`wnZ!wh- zMY{~KzS=sxPp$MYT+g}&tXs4u8#vIAJJ`|X?b3#eir08#@KlgVxGKbKsqR*U#$=rN zzRq`o#jc)U4MFX+0~#sffC~i&Cp-6Vct`1Ldb!n}Hi3Zq%TGb%T!gAbHK2DmDOq#P zc**7w3)(?vG6`jijn_`q5Ou|2`GGUiHrROy9gpd4b==2Oo+-sBTih3vJu~A-lzq8u zC%E1T+xeGYRge9cQ646Fl*2AruO|_6I(}v64t*l7D3v#Y=4oryCZ8@B5IM0e|Afj9 z9#cpVZe^>|y~^5gnj^CX16%v&n@mrh$xJyz#Klr!wt=XV_#M$+1h#%KpeK|KkW{Ws zZ!YVgFRwXOt3O=CknI7uud&R;QRctN7lI=9IH3MjczLhE3%u)CSkx~w$$>ie897Mq zUG$6MPddbw5XyV8I-d`*R3WNS%h)mVKROe-d8Y&#x+-rapIN@1x9JC|$5g9uRE`t~ zykn@h30p|E=Jhr)Lm!Gm$%U)1en~%n#U&>x_IWJ41rNtJ^LP7quK=R&l|rGx-J_i( z>A!H+2rPJYStbT?dk|?7@w5+!Ecaaw&W~AQ2bh=ipl%YfHzk`&1}Q&9EoP$ej4uWb zGD5UP-hVGq3Un^#m)INaPxwhw^6{NPKOH7{_=vw5o5KvIK*+VN4tW&yh`1Wj;}*=t z%f@&m1JGLa7J1Z4!a%K>3Q=h{;t^yQLyiLvQTv@!p6lhf=psGlnGlEflTQz$oa_n8 zMWj{KiN7NCoP@#mjGWvO9)-1Ox-|rU29SO~t*QOV3w0!#43SR{HILx%mwj!(wk`&8 z=GdFy<+2>^on3#Hy`H=2^7(WtF=Lu&P+3j!fg9+vt|x;$iR0HjcM#nFpgdgYMrr}& z6n=BiBNV9sb!-F-7AEl?2RXPgY33sRVDj8D0w& z$KuONI(?yTi`4~S($s~*uTa7H3}Wd^YLjI{R%}CM`EIjiqs4+x=Z7am({$11q0C}z zYz<7&zxgU!R5=z@CE_wu08@_4#y@JZ(OHxBK690VPQJ= z#aDi$LwJ(zbHAaa_KuF@Sf9^oMMR$>9KXM$)90rS$O3XtvrYG#%UIO*otxO|mbwB; z+MQW`QEKwM_2AJT!sTP|pxRWd=s}wFLETd(;ojGs47g8}KN`URsj{w!s!AH3Ab*Us zU^jh2K!^@DkCcMhpmke(rpqs}bb3HA`&pg|qO)Xw0~1YqtuPsO3<1!wsB9t>OQTN+ zJ7J)Em~31V8%CAknvRtBd4QIWRh~8)ChI*-*4sPc;-ofau$|b;X_k4x{sMeZ%UXOy z$jhr&Y-FJ2vdJx$nV$mJ(JfCh9nu=~@YZ^QVCKo(n&}rVHVl3zeh!n2)jtH3Kwef* z1T$(acvM1aD0oynx#<-XEk?_yrR%gTRq7uH9q?$jJBaSm&NiP2X}j=5DfV}!f5?*+ zWUj2ieIpP)GbbOP-$uUd=K2!E@OAc&2(Jt*B>SKm?UN4Zv7?PGpXqmtrk`j54Zbtl7?ekZ_~bJh56mjuiLJc z%ZL;JnMJLRIRmN+KfZ-|uYgqF!|7*CR&~8a#-$OK!0LPsi_kJsi{XOCiOG;=;s-5uZFs@F zUFLJoblqwPp(8J@f5Sp4Ub;UdU4Bwqi?>8IZ{0JsOe(uYXJ*Zaxnh*GIcT&;i>d1- z>mgU)DRsAB+UPhmmQ}l3Eu}J~)dExRmQhkbxw%yGweq&j;mw*Pjzh)kpk(9sXxWWF P%TWzgU6p#U-Shte5t8c8 literal 0 HcmV?d00001 diff --git a/examples/omni-vlm-v2/omni-vlm-v2-cli.cpp b/examples/omni-vlm-v2/omni-vlm-v2-cli.cpp new file mode 100644 index 000000000..f789aff0a --- /dev/null +++ b/examples/omni-vlm-v2/omni-vlm-v2-cli.cpp @@ -0,0 +1,299 @@ +// #include "arg.h" +#include "base64.hpp" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "clip-v2.h" +#include "omni-vlm-v2.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include +#include + +static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { + int N = (int) tokens.size(); + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + LOG_TEE("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + } + return true; +} + +static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past); +} + +static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ + std::string str2 = str; + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + eval_tokens(ctx_llama, embd_inp, n_batch, n_past); + return true; +} + +static const char * sample(struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_llama, + int * n_past) { + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = ""; + } else { + ret = llama_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past); + return ret.c_str(); +} + +static const std::string IMG_PAD = "<|image_pad|>"; + +static void find_image_tag_in_prompt(const std::string& prompt, size_t& idx) { + // begin_out = prompt.find(IMG_BASE64_TAG_BEGIN); + // end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out); + idx = prompt.find(IMG_PAD); +} + +static bool prompt_contains_image(const std::string& prompt) { + size_t begin; + find_image_tag_in_prompt(prompt, begin); + return (begin != std::string::npos); +} + +// replaces the base64 image tag in the prompt with `replacement` +static omni_image_embed * omnivlm_image_embed_make_with_prompt(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) { + size_t idx; + find_image_tag_in_prompt(prompt, idx); + if (idx == std::string::npos) { + LOG_TEE("%s: invalid base64 image tag. 
must be %s\n", __func__, IMG_PAD.c_str()); + return NULL; + } + + auto base64_str = prompt.substr(idx, IMG_PAD.size()); + + auto required_bytes = base64::required_encode_size(base64_str.size()); + auto img_bytes = std::vector(required_bytes); + base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); + + auto embed = omnivlm_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); + if (!embed) { + LOG_TEE("%s: could not load image from base64 string.\n", __func__); + return NULL; + } + + return embed; +} + +static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") { + size_t begin; + find_image_tag_in_prompt(prompt, begin); + if (begin == std::string::npos) { + return prompt; + } + auto pre = prompt.substr(0, begin); + auto post = prompt.substr(begin + IMG_PAD.size()); + return pre + replacement + post; +} + +struct omnivlm_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\n example usage:\n"); + LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +static struct omni_image_embed * load_image(omnivlm_context * ctx_omnivlm, gpt_params * params, const std::string & fname) { + + // load and preprocess the image + omni_image_embed * embed = NULL; + embed = omnivlm_image_embed_make_with_filename(ctx_omnivlm->ctx_clip, params->n_threads, fname.c_str()); + if (!embed) { + fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); + return NULL; + } + + return embed; +} + +static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) { + int n_past = 0; + + const int max_tgt_len = params->n_predict < 0 ? 
256 : params->n_predict; + + std::string system_prompt, user_prompt; + size_t image_pos = prompt.find("<|image_pad|>"); + // new templating mode: Provide the full prompt including system message and use as a placeholder for the image + system_prompt = prompt.substr(0, image_pos); + user_prompt = prompt.substr(image_pos + std::string("<|image_pad|>").length()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str()); + } + } + LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str()); + } + } + + eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true); + omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past); + eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); + + // generate the response + + LOG("\n"); + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); + if (!ctx_sampling) { + LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } + + std::string response = ""; + for (int i = 0; i < max_tgt_len; i++) { + const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past); + response += tmp; + if (strcmp(tmp, "<|im_end|>") == 0) break; + if (strcmp(tmp, "") == 0) break; + // if (strstr(tmp, "###")) break; // Yi-VL behavior + printf("%s", tmp); + // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) + // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 + // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 + + fflush(stdout); + } + + llama_sampling_free(ctx_sampling); + printf("\n"); +} + +static struct llama_model * omnivlm_init(gpt_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = llama_model_params_from_gpt_params(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_TEE("%s: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_model * model) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10); + + + llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + ctx_params.n_ctx = params->n_ctx < 2048 ? 
2048 : params->n_ctx; // we need a longer context size to process image embeddings + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_TEE("%s: failed to create the llama_context\n" , __func__); + return NULL; + } + + auto * ctx_omnivlm = (struct omnivlm_context *)malloc(sizeof(omnivlm_context)); + + ctx_omnivlm->ctx_llama = ctx_llama; + ctx_omnivlm->ctx_clip = ctx_clip; + ctx_omnivlm->model = model; + return ctx_omnivlm; +} + +static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) { + if (ctx_omnivlm->ctx_clip) { + clip_free(ctx_omnivlm->ctx_clip); + ctx_omnivlm->ctx_clip = NULL; + } + + llama_free(ctx_omnivlm->ctx_llama); + llama_free_model(ctx_omnivlm->model); + llama_backend_free(); +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + gpt_params params; + + // if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { + // return 1; + // } + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; + } + + if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { + print_usage(argc, argv, {}); + return 1; + } + + if(params.prompt.empty()) { + LOG_TEE("%s: prompt is empty. Terminating\n\n", __func__); + print_usage(argc, argv, {}); + return 1; + } + + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + + auto * model = omnivlm_init(¶ms); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__); + return 1; + } + + + auto * ctx_omnivlm = omnivlm_init_context(¶ms, model); + for (auto & image : params.image) { + auto * image_embed = load_image(ctx_omnivlm, ¶ms, image); + if (!image_embed) { + LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); + return 1; + } + // process the prompt + process_prompt(ctx_omnivlm, image_embed, ¶ms, params.prompt); + + llama_print_timings(ctx_omnivlm->ctx_llama); + omnivlm_image_embed_free(image_embed); + } + ctx_omnivlm->model = NULL; + omnivlm_free(ctx_omnivlm); + + llama_free_model(model); + + return 0; +} diff --git a/examples/omni-vlm-v2/omni-vlm-v2-wrapper-cli.cpp b/examples/omni-vlm-v2/omni-vlm-v2-wrapper-cli.cpp new file mode 100644 index 000000000..f30661e08 --- /dev/null +++ b/examples/omni-vlm-v2/omni-vlm-v2-wrapper-cli.cpp @@ -0,0 +1,16 @@ +// WARNING: this .cpp file is only for debugging. do not user directly. 
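+// NOTE: the llm_model, mmproj_model, image_path and prompt strings in main() below are empty placeholders; point them at real files before building this debug driver.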
+#include "omni-vlm-v2-wrapper.h" + +int main(int argc, char ** argv) { + const char* llm_model = ""; + const char* mmproj_model = ""; + const char* image_path = ""; + const char* prompt = ""; + + omnivlm_init(llm_model, mmproj_model); + omnivlm_inference(prompt, image_path); + omnivlm_inference(prompt, image_path); + omnivlm_free(); + + return 0; +} diff --git a/examples/omni-vlm-v2/omni-vlm-v2-wrapper.cpp b/examples/omni-vlm-v2/omni-vlm-v2-wrapper.cpp new file mode 100644 index 000000000..b8786372c --- /dev/null +++ b/examples/omni-vlm-v2/omni-vlm-v2-wrapper.cpp @@ -0,0 +1,259 @@ +#include "base64.hpp" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "clip-v2.h" +#include "omni-vlm-v2.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "omni-vlm-v2-wrapper.h" + + +struct omnivlm_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +void* internal_chars = nullptr; + +static struct gpt_params params; +static struct llama_model* model; +static struct omnivlm_context* ctx_omnivlm; + +static struct omni_image_embed * load_image(omnivlm_context * ctx_omnivlm, gpt_params * params, const std::string & fname) { + + // load and preprocess the image + omni_image_embed * embed = NULL; + embed = omnivlm_image_embed_make_with_filename(ctx_omnivlm->ctx_clip, params->n_threads, fname.c_str()); + if (!embed) { + fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); + return NULL; + } + + return embed; +} + +static struct llama_model * omnivlm_init(gpt_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = llama_model_params_from_gpt_params(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_TEE("%s: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_model * model) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10); + + + llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_TEE("%s: failed to create the llama_context\n" , __func__); + return NULL; + } + + ctx_omnivlm = (struct omnivlm_context *)malloc(sizeof(omnivlm_context)); + + ctx_omnivlm->ctx_llama = ctx_llama; + ctx_omnivlm->ctx_clip = ctx_clip; + ctx_omnivlm->model = model; + return ctx_omnivlm; +} + +static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { + int N = (int) tokens.size(); + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + LOG_TEE("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + } + return true; +} + +static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past); +} + +static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ + std::string str2 = str; + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + eval_tokens(ctx_llama, embd_inp, n_batch, n_past); + return true; +} + +static const char * sample(struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_llama, + int * n_past) { + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = ""; + } else { + ret = llama_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past); + return ret.c_str(); +} + +static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) { + int n_past = 0; + + const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; + + std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \ + + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + size_t image_pos = full_prompt.find("<|image_pad|>"); + std::string system_prompt, user_prompt; + + // new templating mode: Provide the full prompt including system message and use as a placeholder for the image + system_prompt = full_prompt.substr(0, image_pos); + user_prompt = full_prompt.substr(image_pos + std::string("<|image_pad|>").length()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str()); + } + } + // LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); + if (params->verbose_prompt) { + auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str()); + } + } + + eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true); + omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past); + eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); + + // generate the response + + LOG("\n"); + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); + if (!ctx_sampling) { + LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } + + std::string response = ""; + for (int i = 0; i < max_tgt_len; i++) { + const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past); + if (strcmp(tmp, "<|im_end|>") == 0) break; + if (strcmp(tmp, "") == 0) break; + // if (strstr(tmp, "###")) break; // Yi-VL behavior + // printf("%s", tmp); + response += tmp; + // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) + // if 
(strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 + // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 + + fflush(stdout); + } + + llama_sampling_free(ctx_sampling); + printf("\n"); + + // const char* ret_char_ptr = (const char*)(malloc(sizeof(char)*response.size())); + if(internal_chars != nullptr) { free(internal_chars); } + internal_chars = malloc(sizeof(char)*(response.size()+1)); + strncpy((char*)(internal_chars), response.c_str(), response.size()); + ((char*)(internal_chars))[response.size()] = '\0'; + return (const char*)(internal_chars); +} + +static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) { + if (ctx_omnivlm->ctx_clip) { + clip_free(ctx_omnivlm->ctx_clip); + ctx_omnivlm->ctx_clip = NULL; + } + + llama_free(ctx_omnivlm->ctx_llama); + llama_free_model(ctx_omnivlm->model); + llama_backend_free(); +} + +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\n example usage:\n"); + LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +// inference interface definition +void omnivlm_init(const char* llm_model_path, const char* projector_model_path) { + const char* argv = "hello-omni-vlm-wrapper-cli"; + char* nc_argv = const_cast(argv); + if (!gpt_params_parse(1, &nc_argv, params)) { + print_usage(1, &nc_argv, {}); + throw std::runtime_error("init params error."); + } + params.model = llm_model_path; + params.mmproj = projector_model_path; + model = omnivlm_init(¶ms); + if (model == nullptr) { + fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__); + throw std::runtime_error("Failed to init omnivlm model"); + } + ctx_omnivlm = omnivlm_init_context(¶ms, model); +} + +const char* omnivlm_inference(const char *prompt, const char *imag_path) { + std::string image = imag_path; + params.prompt = prompt; + auto * image_embed = load_image(ctx_omnivlm, ¶ms, image); + if (!image_embed) { + LOG_TEE("%s: failed to load image %s. 
Terminating\n\n", __func__, image.c_str()); + throw std::runtime_error("failed to load image " + image); + } + // process the prompt + const char* ret_chars = process_prompt(ctx_omnivlm, image_embed, ¶ms, params.prompt); + + // llama_perf_print(ctx_omnivlm->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + omnivlm_image_embed_free(image_embed); + + return ret_chars; +} + +void omnivlm_free() { + if(internal_chars != nullptr) { free(internal_chars); } + ctx_omnivlm->model = NULL; + omnivlm_free(ctx_omnivlm); + llama_free_model(model); +} diff --git a/examples/omni-vlm-v2/omni-vlm-v2-wrapper.h b/examples/omni-vlm-v2/omni-vlm-v2-wrapper.h new file mode 100644 index 000000000..ff37a2550 --- /dev/null +++ b/examples/omni-vlm-v2/omni-vlm-v2-wrapper.h @@ -0,0 +1,33 @@ + +#ifndef OMNIVLMWRAPPER_H +#define OMNIVLMWRAPPER_H + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define OMNIVLM_API __declspec(dllexport) +# else +# define OMNIVLM_API __declspec(dllimport) +# endif +# else +# define OMNIVLM_API __attribute__ ((visibility ("default"))) +# endif +#else +# define OMNIVLM_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path); + +OMNIVLM_API const char* omnivlm_inference(const char* prompt, const char* imag_path); + +OMNIVLM_API void omnivlm_free(); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/examples/omni-vlm-v2/omni-vlm-v2.cpp b/examples/omni-vlm-v2/omni-vlm-v2.cpp new file mode 100644 index 000000000..24806d958 --- /dev/null +++ b/examples/omni-vlm-v2/omni-vlm-v2.cpp @@ -0,0 +1,434 @@ +#include "clip-v2.h" +#include "omni-vlm-v2.h" + +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) +#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) + +// #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +// #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +// #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +// #define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + +struct clip_image_grid_shape { + int first; + int second; +}; + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
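+ *
+ * Example with hypothetical values: for an original size of (800, 600) and
+ * possible resolutions [(672, 672), (336, 672)], the (672, 672) candidate keeps
+ * the larger effective resolution after downscaling and is returned.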
+ */ +static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + + std::pair best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); + int downscaled_width = static_cast(original_width * scale); + int downscaled_height = static_cast(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} + +/** + * @brief Get the anyres image grid shape object + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return + */ +static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair & image_size, const std::vector> & grid_pinpoints, int image_patch_size) { + /** + Conversion from gguf flat array to vector: + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + */ + auto best_resolution = select_best_resolution(image_size, grid_pinpoints); + return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; +} + +// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) +static bool clip_omnivlm_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { + struct { + struct ggml_context * ctx; + } model; + + const int32_t image_size = clip_image_size(ctx_clip); + const int32_t patch_size = clip_patch_size(ctx_clip); + + int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) + + int num_patches_width = grid_shape.first; // grid 1-4 + int num_patches_height = grid_shape.second; // grid 1-4 + + const size_t num_images = num_patches_width * num_patches_height + 1; + + // TODO: size calculation is not calculated - it's only tens of MB + size_t ctx_size = 0; + + { + ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features + ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); + } + + struct ggml_init_params params { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API + }; + + // Python reference code for full unpad: + /* + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + 
image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) + ), dim=-1) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + */ + // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval. + // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet. + // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. + // Once all images are processed to prepended the base_image_features without any changes. + + // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) + /* + image_feature = image_feature.view(2, 2, 24, 24, 4096) + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.view(2, 24, 2, 24, 4096) + image_feature = image_feature.flatten(0, 3) + + // Reshape to 4D tensor by merging the last two dimensions + image_feature = image_feature.view(2, 2, 24, 24*4096) + image_feature = image_feature.permute(0, 2, 1, 3).contiguous() + image_feature = image_feature.view(-1, 4096) + */ + + model.ctx = ggml_init(params); + + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 + // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); + // fill it with the image embeddings, ignoring the base + for (size_t i = 1; i < num_images; i++) { + size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); + memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); + } + + struct ggml_cgraph * gf = ggml_new_graph(model.ctx); + size_t size_ele = ggml_type_size(GGML_TYPE_F32); + + struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, + num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + num_patches_per_side, + num_patches_width, + num_patches_height, + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side, + size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); + // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); + struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + /** + At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) + ), dim=-1) + * + */ + + // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); + struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); + // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); + ggml_build_forward_expand(gf, flatten); + 
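+    // computing the graph below materializes the permuted patch embeddings; the
+    // last node ('flatten') is what gets copied out after the base image embedding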
ggml_graph_compute_with_ctx(model.ctx, gf, 1); + struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + + memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context + // append without newline tokens (default behavior in llava_arch when not using unpad ): + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches + *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); + + // Debug: Test single segments + // Current findings: sending base image, sending a segment embedding all works similar to python + // However, permuted embeddings do not work yet (stride issue?) + // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context + // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context + // *n_img_pos_out=576; + + ggml_free(model.ctx); + return true; +} + +static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) { + int width = image->nx; + int height = image->ny; + int num_patches = (height / patch_size) * (width / patch_size); + clip_image_f32 * patch = clip_image_f32_init(); + patch->nx = patch_size * num_patches; + patch->ny = patch_size; + patch->buf.resize(3 * patch->nx * patch->ny); + + int patch_index = 0; + + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + for (int pi = 0; pi < patch_size; ++pi) { + for (int pj = 0; pj < patch_size; ++pj) { + int input_index = ((i + pi) * width + (j + pj)) * 3; + int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3; + patch->buf[output_index] = image->buf[input_index]; + patch->buf[output_index+1] = image->buf[input_index+1]; + patch->buf[output_index+2] = image->buf[input_index+2]; + } + } + patch_index++; + } + } + return patch; +} + +static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { + // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 + clip_image_f32_batch img_res_v; + img_res_v.size = 0; + img_res_v.data = nullptr; + if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { + LOG_TEE("%s: unable to preprocess image\n", __func__); + delete[] img_res_v.data; + return false; + } + + const int64_t t_img_enc_start_us = ggml_time_us(); + + const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); + + *n_img_pos = clip_n_patches(ctx_clip); + bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); + + LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + + const int64_t t_img_enc_end_us = ggml_time_us(); + float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; + + LOG("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); + + return true; +} + +bool omnivlm_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { + // make sure that the correct mmproj was used, i.e., compare apples to apples + int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); + auto n_image_embd = clip_n_mmproj_embd(ctx_clip); + if (n_image_embd != n_llama_embd) { + LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that 
of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + return false; + } + return true; +} + +bool omnivlm_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { + int num_max_patches = 6; + if (clip_is_minicpmv(ctx_clip)) { + num_max_patches = 10; + } + + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model + if (!image_embd) { + LOG_TEE("Unable to allocate memory for image embeddings\n"); + return false; + } + + int n_img_pos; + if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { + LOG_TEE("%s: cannot encode image, aborting\n", __func__); + free(image_embd); + return false; + } + *image_embd_out = image_embd; + *n_img_pos_out = n_img_pos; + + return true; +} + +struct omnivlm_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + omnivlm_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + +bool omnivlm_eval_image_embed(llama_context * ctx_llama, const struct omni_image_embed * image_embed, int n_batch, int * n_past) { + int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + + for (int i = 0; i < image_embed->n_image_pos; i += n_batch) { + int n_eval = image_embed->n_image_pos - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + float * embd = image_embed->embed+i*n_embd; + llama_batch batch = {int32_t(n_eval), nullptr, embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; + if (llama_decode(ctx_llama, batch)) { + LOG_TEE("%s : failed to eval\n", __func__); + return false; + } + *n_past += n_eval; + } + return true; +} + +struct omni_image_embed * omnivlm_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { + clip_image_u8 * img = clip_image_u8_init(); + if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { + clip_image_u8_free(img); + LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); + return NULL; + } + + float* image_embed = NULL; + int n_image_pos = 0; + bool image_embed_result = omnivlm_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); + if (!image_embed_result) { + clip_image_u8_free(img); + LOG_TEE("%s: couldn't embed the image\n", __func__); + return NULL; + } + + clip_image_u8_free(img); + auto result = (omni_image_embed*)malloc(sizeof(omni_image_embed)); + result->embed = image_embed; + result->n_image_pos = n_image_pos; + return result; +} + +static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { + auto* file = fopen(path, "rb"); + if (file == NULL) { + LOG_TEE("%s: can't read file %s\n", __func__, path); + return false; + } + + fseek(file, 0, SEEK_END); + 
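+    // the file size is obtained by seeking to the end and reading the offset back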
auto fileSize = ftell(file); + fseek(file, 0, SEEK_SET); + + auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data + if (buffer == NULL) { + LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + perror("Memory allocation error"); + fclose(file); + return false; + } + errno = 0; + size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer + if (ferror(file)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != (size_t) fileSize) { + die("unexpectedly reached end of file"); + } + fclose(file); // Close the file + + *bytesOut = buffer; + *sizeOut = fileSize; + return true; +} + +struct omni_image_embed * omnivlm_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { + unsigned char* image_bytes; + long image_bytes_length; + auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); + if (!loaded) { + LOG_TEE("%s: failed to load %s\n", __func__, image_path); + return NULL; + } + + omni_image_embed *embed = omnivlm_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); + free(image_bytes); + + return embed; +} + +void omnivlm_image_embed_free(struct omni_image_embed * embed) { + free(embed->embed); + free(embed); +} diff --git a/examples/omni-vlm-v2/omni-vlm-v2.h b/examples/omni-vlm-v2/omni-vlm-v2.h new file mode 100644 index 000000000..90b20c651 --- /dev/null +++ b/examples/omni-vlm-v2/omni-vlm-v2.h @@ -0,0 +1,46 @@ +#ifndef OMNIVLM_H +#define OMNIVLM_H + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define OMNIVLM_API __declspec(dllexport) +# else +# define OMNIVLM_API __declspec(dllimport) +# endif +# else +# define OMNIVLM_API __attribute__ ((visibility ("default"))) +# endif +#else +# define OMNIVLM_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +struct clip_ctx; +struct omni_image_embed { + float * embed; + int n_image_pos; +}; + +OMNIVLM_API bool omnivlm_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip); + +OMNIVLM_API bool omnivlm_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); + +/** build an image embed from image file bytes */ +OMNIVLM_API struct omni_image_embed * omnivlm_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); +/** build an image embed from a path to an image filename */ +OMNIVLM_API struct omni_image_embed * omnivlm_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); +/** free an embedding made with OMNIVLM_image_embed_make_* */ +OMNIVLM_API void omnivlm_image_embed_free(struct omni_image_embed * embed); + +/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
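+ *
+ * Typical call order (sketch, mirroring process_prompt in this example): evaluate
+ * the text before the image placeholder, call omnivlm_eval_image_embed with the
+ * same n_past counter, then evaluate the remaining text.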
*/ +OMNIVLM_API bool omnivlm_eval_image_embed(struct llama_context * ctx_llama, const struct omni_image_embed * embed, int n_batch, int * n_past); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/examples/omni-vlm-v2/omni_vlm_v2_cpp.py b/examples/omni-vlm-v2/omni_vlm_v2_cpp.py new file mode 100644 index 000000000..efe04be6c --- /dev/null +++ b/examples/omni-vlm-v2/omni_vlm_v2_cpp.py @@ -0,0 +1,84 @@ +import ctypes +import os +import sys +from pathlib import Path + + +# Load the library +def _load_shared_library(lib_base_name: str, base_path: Path = None): + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + lib_ext = ".so" + elif sys.platform == "darwin": + lib_ext = ".dylib" + elif sys.platform == "win32": + lib_ext = ".dll" + else: + raise RuntimeError("Unsupported platform") + + # Construct the paths to the possible shared library names + if base_path is None: + _base_path = Path(__file__).parent.parent.resolve() + else: + print(f"Using base path: {base_path}") + _base_path = base_path + _lib_paths = [ + _base_path / f"lib{lib_base_name}{lib_ext}", + _base_path / f"{lib_base_name}{lib_ext}", + ] + + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + print(f"Trying to load shared library '{_lib_path}'") + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path)) + except Exception as e: + print(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + +# Specify the base name of the shared library to load +_lib_base_name = "omni_vlm_v2_wrapper_shared" +base_path = ( + Path(__file__).parent.parent.parent.resolve() + / "build" + / "examples" + / "omni-vlm-v2" +) + +# Load the library +_lib = _load_shared_library(_lib_base_name, base_path) + +omni_char_p = ctypes.c_char_p + + +def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p): + return _lib.omnivlm_init(llm_model_path, mmproj_model_path) + + +_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p] +_lib.omnivlm_init.restype = None + + +def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p): + return _lib.omnivlm_inference(prompt, image_path) + + +_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p] +_lib.omnivlm_inference.restype = omni_char_p + + +def omnivlm_free(): + return _lib.omnivlm_free() + + +_lib.omnivlm_free.argtypes = [] +_lib.omnivlm_free.restype = None diff --git a/examples/omni-vlm-v2/omni_vlm_v2_demo.py b/examples/omni-vlm-v2/omni_vlm_v2_demo.py new file mode 100644 index 000000000..0384631e0 --- /dev/null +++ b/examples/omni-vlm-v2/omni_vlm_v2_demo.py @@ -0,0 +1,57 @@ + +import ctypes +import logging +import os + +import omni_vlm_cpp + + +class NexaOmniVlmInference: + """ + A class used for vision language model inference. 
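+
+    Example (model and image paths are placeholders):
+        vlm = NexaOmniVlmInference("path/to/llm.gguf", "path/to/mmproj.gguf")
+        print(vlm.inference("Describe this image for me", "path/to/image.png").decode("utf-8"))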
+ """ + + def __init__(self, llm_model_path: str, mmproj_model_path: str): + self.llm_model = ctypes.c_char_p(llm_model_path.encode("utf-8")) + self.mmproj_model = ctypes.c_char_p(mmproj_model_path.encode("utf-8")) + + omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model) + + def inference(self, prompt: str, image_path: str): + prompt = ctypes.c_char_p(prompt.encode("utf-8")) + image_path = ctypes.c_char_p(image_path.encode("utf-8")) + return omni_vlm_cpp.omnivlm_inference(prompt, image_path) + + def __del__(self): + omni_vlm_cpp.omnivlm_free() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Run omni vision language model generation" + ) + parser.add_argument("--model", type=str, help="Path to the llm model file") + parser.add_argument("--mmproj", type=str, help="Path to the mmproj file") + # parser.add_argument("--prompt", type=str, help="prompt string.") + # parser.add_argument("--image-path", type=str, help="Path to the image.") + + args = parser.parse_args() + + omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj) + # omni_vlm_obj.inference(args.prompt, args.image_path) + while True: + print("Input your prompt:") + prompt = input() + if prompt == "": + print("ERROR: you input an empty prompt, try again.") + continue + print("Input your image path:") + image_path = input() + while not os.path.exists(image_path): + print("ERROR: can not find image in your input path, please check and input agian.") + image_path = input() + response = omni_vlm_obj.inference(prompt, image_path) + print("\tresponse:") + print(response.decode('utf-8')) diff --git a/examples/omni-vlm-v2/omni_vlm_v2_surgery.py b/examples/omni-vlm-v2/omni_vlm_v2_surgery.py new file mode 100644 index 000000000..a0cac5616 --- /dev/null +++ b/examples/omni-vlm-v2/omni_vlm_v2_surgery.py @@ -0,0 +1,161 @@ +import argparse +import glob +import os +import torch +from safetensors import safe_open +from safetensors.torch import save_file +from typing import Any, ContextManager, cast + +# Function to determine if file is a SafeTensor file +def is_safetensor_file(file_path): + return file_path.endswith('.safetensors') + + +# Unified loading function +def load_model(file_path): + if is_safetensor_file(file_path): + tensors = {} + with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key).clone() + # output shape + print(f"{key} : {tensors[key].shape}") + return tensors, 'safetensor' + else: + return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' + + +# Unified saving function +def save_model(model, file_path, file_type): + if file_type == 'safetensor': + # safe_save(model, file_path) + save_file(model, file_path) + else: + torch.save(model, file_path) + + +# Adapted function to clean vision tower from checkpoint +def clean_vision_tower_from_checkpoint(checkpoint_path): + checkpoint, file_type = load_model(checkpoint_path) + # file_type = 'pytorch' + model_path = os.path.dirname(checkpoint_path) + print(f"Searching for vision tower tensors in {checkpoint_path}") + clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision_tower.vision_model")] + + if len(clip_tensors) > 0: + print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") + # Adapted for file type + clip_path = os.path.join(model_path, "omni_vlm.clip") + + if os.path.exists(clip_path): + print(f"Loading existing omni_vlm.clip from {clip_path}") + existing_clip, _ = 
load_model(clip_path) + else: + print(f"Creating new omni_vlm.clip at {clip_path}") + existing_clip = {} + # Update existing_clip with new tensors, avoid duplicates + for name in clip_tensors: + simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name + print(f"Adding {simple_name} to omni_vlm.clip") + if simple_name not in existing_clip: + existing_clip[simple_name] = checkpoint[name] + + # Save the updated clip tensors back to omni_vlm.clip + save_model(existing_clip, clip_path, 'pytorch') + + # Remove the tensors from the original checkpoint + for name in clip_tensors: + del checkpoint[name] + + checkpoint_path = checkpoint_path + return True + return False + +def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): + newline_checkpoint_path = None + projector_checkpoint_path = None + + for path in checkpoint_paths: + checkpoint, _ = load_model(path) + if newline_criteria(checkpoint) and newline_checkpoint_path is None: + newline_checkpoint_path = path + if projector(checkpoint): + projector_checkpoint_path = path + + return newline_checkpoint_path, projector_checkpoint_path + +def newline_criteria(checkpoint): + return any(k.startswith("model.image_newline") for k in checkpoint.keys()) + +def proj_criteria(checkpoint): + # return any(k.startswith("multi_modal_projector.") or k.startswith("vision_proj.") for k in checkpoint.keys()) + return any(k.startswith("multi_modal_projector.") for k in checkpoint.keys()) + + +# Command-line interface setup +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", required=True, help="Path to omni-vlm model") +ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +args = ap.parse_args() + +if args.clean_vision_tower: + # Generalized to handle both PyTorch and SafeTensors models + model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) + # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] + checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] + for projector_checkpoint_path in checkpoint_paths: + print(f"Cleaning {projector_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): + print(f"No vision tower found in {projector_checkpoint_path}") + # we break once none is found, so far all models append them at the end + # break + print("Done! 
All vision tower tensors are removed from the model files and stored in omni_vlm.clip file.")
+
+# Now we look for the projector in the last checkpoint
+model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
+checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
+# last_checkpoint_path = checkpoint_paths[0]
+# first_checkpoint_path = checkpoint_paths[-1]
+newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
+
+print(f"Taking projector from {projector_checkpoint_path}")
+first_mm_tensors = []
+first_checkpoint = None
+if newline_checkpoint_path is not None:
+    print(f"Taking newline from {newline_checkpoint_path}")
+    first_checkpoint, file_type = load_model(newline_checkpoint_path)
+    first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
+
+# Load the checkpoint
+mm_tensors = []
+last_checkpoint = None
+if projector_checkpoint_path is not None:
+    last_checkpoint, file_type = load_model(projector_checkpoint_path)
+    # mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("multi_modal_projector.") or k.startswith("vision_proj.")]
+    mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("multi_modal_projector.")]
+
+if len(mm_tensors) == 0:
+    if last_checkpoint is not None:
+        for k, v in last_checkpoint.items():
+            print(k)
+    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
+    print("No tensors found. Is this an omni-vlm model?")
+    exit()
+
+print(f"Found {len(mm_tensors)} tensors to extract.")
+print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
+# projector = {name: checkpoint.[name].float() for name in mm_tensors}
+projector = {}
+for name in mm_tensors:
+    assert last_checkpoint is not None
+    projector[name] = last_checkpoint[name].float()
+for name in first_mm_tensors:
+    assert first_checkpoint is not None
+    projector[name] = first_checkpoint[name].float()
+
+if len(projector) > 0:
+    save_model(projector, f"{args.model}/omni_vlm.projector", 'pytorch')
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/omni_vlm.projector to prepare an omni-vlm-encoder.gguf file.")
diff --git a/examples/omni-vlm-v2/requirements.txt b/examples/omni-vlm-v2/requirements.txt
new file mode 100644
index 000000000..40266eddf
--- /dev/null
+++ b/examples/omni-vlm-v2/requirements.txt
@@ -0,0 +1,5 @@
+-r ../../requirements/requirements-convert_legacy_llama.txt
+# --extra-index-url https://download.pytorch.org/whl/cpu
+pillow
+torch
+torchvision