Merge remote-tracking branch 'origin' into zack/vlm
Commit bbf1aaa7ed
25 changed files with 258 additions and 316 deletions
.github/CODEOWNERS (vendored, new file, 1 addition)
@@ -0,0 +1 @@
+@zhiyuan8 @alexchen4ai
@@ -150,6 +150,7 @@ bool load_hparams_and_tensors_from_gguf(const std::string &fname, NexaBaseModel
    }

    ggml_free(meta);
+   gguf_free(ctx_gguf);
    return true;
}
@@ -1442,6 +1442,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    // End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS

+   if (arg == "--omni-vlm-version") {
+       CHECK_ARG
+       params.omni_vlm_version = argv[i];
+       return true;
+   }
    return false;
}
@@ -1688,6 +1693,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
        "layer range to apply the control vector(s) to, start and end inclusive" });
    options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
        "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
+   options.push_back({ "*", " --omni-vlm-version VERSION_STRING", "omni vlm string version(one of 'vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n" "(default: 'vlm-81-ocr')"});
    options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
    options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
    options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
@@ -265,6 +265,8 @@ struct gpt_params {
    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+   std::string omni_vlm_version = "vlm-81-ocr";
};

void gpt_params_parse_from_env(gpt_params & params);
@@ -72,19 +72,14 @@ class MainActivity(

    val models = listOf(
        Downloadable(
-           "Phi-2 7B (Q4_0, 1.6 GiB)",
-           Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
-           File(extFilesDir, "phi-2-q4_0.gguf"),
+           "Llama3.2-1B-Instruct (Q4_0, 735 MB)",
+           Uri.parse("https://public-storage.nexa4ai.com/Llama3.2-1B-Instruct/q4_0.gguf"),
+           File(extFilesDir, "Llama3.2-1B-Instruct-q4_0.gguf"),
        ),
-       Downloadable(
-           "TinyLlama 1.1B (f16, 2.2 GiB)",
-           Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
-           File(extFilesDir, "tinyllama-1.1-f16.gguf"),
-       ),
        Downloadable(
-           "Phi 2 DPO (Q3_K_M, 1.48 GiB)",
-           Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
-           File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
+           "octopus",
+           Uri.parse("https://public-storage.nexa4ai.com/Octopus-v2/q4_0.gguf"),
+           File(extFilesDir, "octopus-q4_0.gguf")
        ),
    )
@@ -33,6 +33,7 @@ project("llama-android")

#load local llama.cpp
add_subdirectory(../../../../../../ build-llama)
+add_subdirectory(../../../../../../examples/llava build-llava)

# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
@@ -50,4 +51,5 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
        llama
        common
        android
-       log)
+       log
+       llava)
@@ -6,6 +6,7 @@
#include <unistd.h>
#include "llama.h"
#include "common.h"
+#include "llava.h"

// Write C++ code here.
//
@@ -36,7 +36,7 @@ class LLamaAndroid {
        }
    }.asCoroutineDispatcher()

-   private val nlen: Int = 64
+   private val nlen: Int = 256

    private external fun log_to_android()
    private external fun load_model(filename: String): Long
@@ -23,6 +23,8 @@
// Constants
//

+void* internal_chars = nullptr;
+
static const char *AUDIO_TOKEN = "<|AUDIO|>";

//
@@ -703,6 +705,10 @@ struct omni_context *omni_init_context(omni_context_params &params)

void omni_free(struct omni_context *ctx_omni)
{
+   if(internal_chars != nullptr)
+   {
+       free(internal_chars);
+   }
    if (ctx_omni->ctx_whisper)
    {
        whisper_free(ctx_omni->ctx_whisper);
@@ -792,7 +798,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params &params)
    return embed_proj;
}

-void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
+const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
{
    int n_past = 0;
@@ -833,12 +839,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
    for (int i = 0; i < max_tgt_len; i++)
    {
        const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past);
-       response += tmp;
        if (strcmp(tmp, "</s>") == 0)
            break;
        if (strstr(tmp, "###"))
            break; // Yi-VL behavior
-       printf("%s", tmp);
+       // printf("%s", tmp);
        if (strstr(response.c_str(), "<|im_end|>"))
            break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        if (strstr(response.c_str(), "<|im_start|>"))
@@ -847,16 +852,22 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
            break; // mistral llava-1.6

        fflush(stdout);
+       response += tmp;
    }

    llama_sampling_free(ctx_sampling);
    printf("\n");
+   if(internal_chars != nullptr) { free(internal_chars); }
+   internal_chars = malloc(sizeof(char)*(response.size()+1));
+   strncpy((char*)(internal_chars), response.c_str(), response.size());
+   ((char*)(internal_chars))[response.size()] = '\0';
+   return (const char*)(internal_chars);
}

-void omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
+const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
{
    omni_params all_params = get_omni_params_from_context_params(params);

    ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params);
-   omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
+   return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
@@ -54,7 +54,7 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &params);

OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);

-OMNI_AUDIO_API void omni_process_full(
+OMNI_AUDIO_API const char* omni_process_full(
    struct omni_context *ctx_omni,
    omni_context_params &params
);
@@ -1,22 +1,30 @@
# omni-vlm

-Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants,
+Currently this implementation supports:

-After API is confirmed, more models will be supported / uploaded.
+* [nano-vlm-instruct](https://huggingface.co/NexaAIDev/nano-vlm-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/nano-vlm-instruct-gguf/tree/main))
+* [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-ocr-gguf/tree/main))
+* [vlm-81-instruct](https://huggingface.co/NexaAIDev/vlm-81-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-instruct-gguf/tree/main))
+
+After API is stable, more models will be supported.

## Usage
-Build with cmake in the `llama-cpp-experiments` folder:
-```bash
+
+Build with cmake in the `llama.cpp` folder:
+
+```console
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```

After building, run: `./omni-vlm-cli` to see the usage. For example:

-```bash
+```console
./omni-vlm-cli \
-    -m Nano-Llm-494M-F16.gguf \
-    --mmproj mmproj-omni-vlm-f16.gguf \
-    --image example/omni-vlm/cat.png
+    -m <llm-F16.gguf> \
+    --mmproj <mmproj-F16.gguf> \
+    --image example/omni-vlm/cat.png \
+    --omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```

See next section to convert gguf files from original safetensors.
@@ -27,6 +35,7 @@ See next section to convert gguf files from original safetensors.
)

## Omni-vlm gguf conversion

1) First clone omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct
@@ -34,7 +43,7 @@ git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct

2) Install the required Python packages:

-```sh
+```console
pip install -r examples/omni-vlm/requirements.txt
```
@@ -104,6 +113,5 @@ After successfully compiling omni_vlm_wrapper_shared dynamic library, run:
python omni_vlm_demo.py \
    --model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
    --mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
-    --prompt="Describe this image for me" \
-    --image-path cat.png
+    --omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```
@@ -6,6 +6,7 @@
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
+#include "common.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
@@ -167,7 +168,11 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
};

+enum omni_vlm_version_type {
+    VLM_81_OCR,
+    VLM_81_INSTRUCT,
+    NANO_VLM_INSTRUCT,
+};
//
// utilities to get data from a gguf file
//
@@ -294,115 +299,6 @@ static projector_type clip_projector_type_from_string(const std::string & name)
    return PROJECTOR_TYPE_UNKNOWN;
}

-#ifdef CLIP_DEBUG_FUNCTIONS
-static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
-    std::ofstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
-        return;
-    }
-
-    // PPM header: P6 format, width, height, and max color value
-    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
-
-    // Write pixel data
-    for (size_t i = 0; i < img.buf.size(); i += 3) {
-        // PPM expects binary data in RGB format, which matches our image buffer
-        file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
-    }
-
-    file.close();
-}
-
-static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
-    std::ofstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
-        return;
-    }
-
-    int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
-    int bytesPerPixel = 3;
-    int widthInBytes = img.nx * bytesPerPixel;
-    int paddingAmount = (4 - (widthInBytes % 4)) % 4;
-    int stride = widthInBytes + paddingAmount;
-
-    // Bitmap file header
-    unsigned char fileHeader[14] = {
-        'B','M',     // Signature
-        0,0,0,0,     // Image file size in bytes
-        0,0,0,0,     // Reserved
-        54,0,0,0     // Start of pixel array
-    };
-
-    // Total file size
-    fileSize = 54 + (stride * img.ny);
-    fileHeader[2] = (unsigned char)(fileSize);
-    fileHeader[3] = (unsigned char)(fileSize >> 8);
-    fileHeader[4] = (unsigned char)(fileSize >> 16);
-    fileHeader[5] = (unsigned char)(fileSize >> 24);
-
-    // Bitmap information header (BITMAPINFOHEADER)
-    unsigned char infoHeader[40] = {
-        40,0,0,0,    // Size of this header (40 bytes)
-        0,0,0,0,     // Image width
-        0,0,0,0,     // Image height
-        1,0,         // Number of color planes
-        24,0,        // Bits per pixel
-        0,0,0,0,     // No compression
-        0,0,0,0,     // Image size (can be 0 for no compression)
-        0,0,0,0,     // X pixels per meter (not specified)
-        0,0,0,0,     // Y pixels per meter (not specified)
-        0,0,0,0,     // Total colors (color table not used)
-        0,0,0,0      // Important colors (all are important)
-    };
-
-    // Width and height in the information header
-    infoHeader[4] = (unsigned char)(img.nx);
-    infoHeader[5] = (unsigned char)(img.nx >> 8);
-    infoHeader[6] = (unsigned char)(img.nx >> 16);
-    infoHeader[7] = (unsigned char)(img.nx >> 24);
-    infoHeader[8] = (unsigned char)(img.ny);
-    infoHeader[9] = (unsigned char)(img.ny >> 8);
-    infoHeader[10] = (unsigned char)(img.ny >> 16);
-    infoHeader[11] = (unsigned char)(img.ny >> 24);
-
-    // Write file headers
-    file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
-    file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
-
-    // Pixel data
-    std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
-    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
-        for (int x = 0; x < img.nx; ++x) {
-            // Each pixel
-            size_t pixelIndex = (y * img.nx + x) * 3;
-            unsigned char pixel[3] = {
-                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
-                img.buf[pixelIndex + 1],
-                img.buf[pixelIndex]
-            };
-            file.write(reinterpret_cast<char*>(pixel), 3);
-        }
-        // Write padding for the row
-        file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
-    }
-
-    file.close();
-}
-
-// debug function to convert f32 to u8
-static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(3 * src.nx * src.ny);
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
-    }
-}
-#endif

//
// clip layers
//
@@ -564,6 +460,7 @@ struct clip_ctx {

    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
+   omni_vlm_version_type omni_vlm_ver_type;

    float image_mean[3];
    float image_std[3];
@@ -785,6 +682,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
    }

+   if(ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_OCR || ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_INSTRUCT) {
+       embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*9, embeddings->ne[1]/9, 1);
+   }
+
    embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
    embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

@@ -800,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}

// read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+struct clip_ctx * clip_model_load(const char * fname, const char * omni_vlm_version, const int verbosity = 1) {
    struct ggml_context * meta = NULL;

    struct gguf_init_params params = {
@@ -895,6 +796,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    }

    clip_ctx * new_clip = new clip_ctx{};
+   if (std::string(omni_vlm_version) == "vlm-81-ocr") {
+       new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+   } else if (std::string(omni_vlm_version) == "vlm-81-instruct") {
+       new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+   } else if (std::string(omni_vlm_version) == "nano-vlm-instruct") {
+       new_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+   } else {
+       throw std::runtime_error(std::string("error vlm version info: ") + omni_vlm_version);
+   }

    // update projector type
    {
@@ -1308,6 +1218,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    return new_clip;
}

+// void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
+//     if (params->omni_vlm_version == "vlm-81-ocr") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+//     } else if (params->omni_vlm_version == "vlm-81-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+//     } else if (params->omni_vlm_version == "nano-vlm-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+//     } else {
+//         throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
+//     }
+// }
+
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
    ctx_clip->load_image_size = load_image_size;
}
@@ -2294,13 +2216,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    return true;
}

-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
+bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype, const char* omni_vlm_version) {
    ggml_type type = GGML_TYPE_Q4_1;

    assert(itype < GGML_TYPE_COUNT);
    type = static_cast<ggml_type>(itype);

-   auto * ctx_clip = clip_model_load(fname_inp, 2);
+   auto * ctx_clip = clip_model_load(fname_inp, omni_vlm_version, 2);

    const auto & ctx_src = ctx_clip->ctx_gguf;
    const auto & ctx_data = ctx_clip->ctx_data;
@@ -39,9 +39,12 @@ struct clip_image_f32_batch {
    size_t size;
};

-CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load (const char * fname, const char * omni_vlm_version, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);

+// struct gpt_params;
+// CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
+
CLIP_API void clip_free(struct clip_ctx * ctx);

CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);

@@ -83,7 +86,7 @@ CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ct
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

-CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype, const char * omni_vlm_version);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
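A minimal sketch (not taken from the diff itself) of calling the updated `clip_model_load` signature, mirroring the call sites changed elsewhere in this commit; the mmproj path below is a placeholder:

```cpp
// Sketch: load the CLIP/mmproj model for a given omni-vlm version.
// "mmproj-F16.gguf" is a placeholder path; valid version strings are
// "vlm-81-ocr", "vlm-81-instruct" and "nano-vlm-instruct" (anything else throws).
struct clip_ctx * ctx_clip = clip_model_load("mmproj-F16.gguf", "nano-vlm-instruct", /*verbosity=*/1);
// ... use ctx_clip ...
clip_free(ctx_clip);
```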
examples/omni-vlm/latex.png (new binary file, not shown; 5.8 KiB)
@@ -12,6 +12,10 @@
#include <cstdlib>
#include <cstring>
#include <vector>
+// #include <iostream>
+//
+// using std::cout;
+// using std::endl;

static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
@@ -149,7 +153,7 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
            LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
        }
    }
-   LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+   // LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
    if (params->verbose_prompt) {
        auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true);
        for (int i = 0; i < (int) tmp.size(); i++) {
@@ -165,6 +169,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima

    LOG("\n");

+   params->sparams.temp = 0.0f;
+   params->sparams.top_k = 1;
+   params->sparams.top_p = 1.0f;
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
    if (!ctx_sampling) {
        LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__);
@@ -177,8 +184,8 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
        response += tmp;
        if (strcmp(tmp, "<|im_end|>") == 0) break;
        if (strcmp(tmp, "</s>") == 0) break;
        // if (strstr(tmp, "###")) break; // Yi-VL behavior
-       printf("%s", tmp);
+       // LOG("%s", tmp);
        // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
        // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -212,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
        prompt = "describe the image in detail.";
    }

-   auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
-
+   auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+   // clip_set_omni_vlm_version(ctx_clip, params);

    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -249,9 +256,6 @@ int main(int argc, char ** argv) {

    gpt_params params;

-   // if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
-   //     return 1;
-   // }
    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv, params);
        return 1;
@@ -261,8 +265,21 @@ int main(int argc, char ** argv) {
        print_usage(argc, argv, {});
        return 1;
    }
+   if (params.omni_vlm_version != "vlm-81-ocr" && params.prompt.empty()) {
+       LOG_TEE("%s : prompt is empty.\n", __func__);
+       print_usage(argc, argv, {});
+       return 1;
+   }

-   params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe this image for me\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   if (params.omni_vlm_version == "vlm-81-ocr") {
+       params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
+       params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   } else {
+       LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str());
+       print_usage(argc, argv, {});
+       return 1;
+   }

    auto * model = omnivlm_init(&params);
    if (model == NULL) {
@@ -271,8 +288,8 @@ int main(int argc, char ** argv) {
    }


-   auto * ctx_omnivlm = omnivlm_init_context(&params, model);
    for (auto & image : params.image) {
+       auto * ctx_omnivlm = omnivlm_init_context(&params, model);
        auto * image_embed = load_image(ctx_omnivlm, &params, image);
        if (!image_embed) {
            LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
@@ -283,9 +300,9 @@ int main(int argc, char ** argv) {

        llama_print_timings(ctx_omnivlm->ctx_llama);
        omnivlm_image_embed_free(image_embed);
-   }
        ctx_omnivlm->model = NULL;
        omnivlm_free(ctx_omnivlm);
+   }

    llama_free_model(model);
@@ -1,15 +1,24 @@
// WARNING: this .cpp file is only for debugging. do not user directly.
#include "omni-vlm-wrapper.h"
+#include <iostream>

+using std::cout;
+using std::endl;

int main(int argc, char ** argv) {
-   const char* llm_model = "<path to llm gguf.>";
-   const char* mmproj_model = "<path to mm projector gguf>";
-   const char* image_path = "<path where image is located.>";
+   const char* llm_model = "";
+   const char* mmproj_model = "";
+   const char* image_path = "";
    const char* prompt = "";

-   omnivlm_init(llm_model, mmproj_model);
-   omnivlm_inference(prompt, image_path);
-   omnivlm_inference(prompt, image_path);
+   omnivlm_init(llm_model, mmproj_model, "vlm-81-ocr");
+
+   const char* res;
+   res = omnivlm_inference(prompt, image_path);
+   cout << "RES: " << res << endl;
+   res = omnivlm_inference(prompt, image_path);
+   cout << "RES: " << res << endl;
    omnivlm_free();

    return 0;
@@ -24,6 +24,8 @@ struct omnivlm_context {
    struct llama_model * model = NULL;
};

+void* internal_chars = nullptr;
+
static struct gpt_params params;
static struct llama_model* model;
static struct omnivlm_context* ctx_omnivlm;
@@ -63,7 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
        prompt = "describe the image in detail.";
    }

-   auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
+   auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+   // clip_set_omni_vlm_version(ctx_clip, params);


    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
@@ -128,19 +131,19 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
    return ret.c_str();
}

-static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
+static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
    int n_past = 0;

    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;

-   std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
-       + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
-   size_t image_pos = full_prompt.find("<|image_pad|>");
+   // std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
+   //     + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   size_t image_pos = params->prompt.find("<|image_pad|>");
    std::string system_prompt, user_prompt;

    // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-   system_prompt = full_prompt.substr(0, image_pos);
-   user_prompt = full_prompt.substr(image_pos + std::string("<|image_pad|>").length());
+   system_prompt = params->prompt.substr(0, image_pos);
+   user_prompt = params->prompt.substr(image_pos + std::string("<|image_pad|>").length());
    if (params->verbose_prompt) {
        auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true);
        for (int i = 0; i < (int) tmp.size(); i++) {
@@ -155,6 +158,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
        }
    }

+   params->sparams.top_k = 1;
+   params->sparams.top_p = 1.0f;
+
    eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
    omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past);
    eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -172,11 +178,11 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past);
-       response += tmp;
        if (strcmp(tmp, "<|im_end|>") == 0) break;
        if (strcmp(tmp, "</s>") == 0) break;
        // if (strstr(tmp, "###")) break; // Yi-VL behavior
-       printf("%s", tmp);
+       // printf("%s", tmp);
+       response += tmp;
        // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
        // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -186,6 +192,13 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima

    llama_sampling_free(ctx_sampling);
    printf("\n");
+
+   // const char* ret_char_ptr = (const char*)(malloc(sizeof(char)*response.size()));
+   if(internal_chars != nullptr) { free(internal_chars); }
+   internal_chars = malloc(sizeof(char)*(response.size()+1));
+   strncpy((char*)(internal_chars), response.c_str(), response.size());
+   ((char*)(internal_chars))[response.size()] = '\0';
+   return (const char*)(internal_chars);
}

static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) {
@@ -208,8 +221,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
}

// inference interface definition
-void omnivlm_init(const char* llm_model_path, const char* projector_model_path) {
-   const char* argv = "hello-omni-vlm-wrapper-cli";
+void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) {
+   const char* argv = "omni-wrapper-py";
    char* nc_argv = const_cast<char*>(argv);
    if (!gpt_params_parse(1, &nc_argv, params)) {
        print_usage(1, &nc_argv, {});
@@ -217,31 +230,60 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path)
    }
    params.model = llm_model_path;
    params.mmproj = projector_model_path;
+   params.omni_vlm_version = omni_vlm_version;
+
+   std::string omni_vlm_ver = params.omni_vlm_version;
+   if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") {
+       fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version);
+       fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__);
+       throw std::runtime_error("You set wrong vlm_version info strings.");
+   }
+
    model = omnivlm_init(&params);
    if (model == nullptr) {
        fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__);
        throw std::runtime_error("Failed to init omnivlm model");
    }
-   ctx_omnivlm = omnivlm_init_context(&params, model);
}

-void omnivlm_inference(const char *prompt, const char *imag_path) {
+const char* omnivlm_inference(const char *prompt, const char *imag_path) {
+   ctx_omnivlm = omnivlm_init_context(&params, model);
+
    std::string image = imag_path;
    params.prompt = prompt;

+   if (params.omni_vlm_version == "vlm-81-ocr") {
+       params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>";
+   } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
+       params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   } else {
+       LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str());
+       throw std::runtime_error("You set wrong vlm_version info strings.");
+   }
+
    auto * image_embed = load_image(ctx_omnivlm, &params, image);
    if (!image_embed) {
        LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
        throw std::runtime_error("failed to load image " + image);
    }
    // process the prompt
-   process_prompt(ctx_omnivlm, image_embed, &params, params.prompt);
+   const char* ret_chars = process_prompt(ctx_omnivlm, image_embed, &params, params.prompt);

    // llama_perf_print(ctx_omnivlm->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
    omnivlm_image_embed_free(image_embed);
+   ctx_omnivlm->model = nullptr;
+   omnivlm_free(ctx_omnivlm);
+   ctx_omnivlm = nullptr;
+
+   return ret_chars;
}

void omnivlm_free() {
-   ctx_omnivlm->model = NULL;
+   if(internal_chars != nullptr) { free(internal_chars); }
+   if(ctx_omnivlm != nullptr) {
+       // this snipet should never be run!
+       ctx_omnivlm->model = nullptr;
+       omnivlm_free(ctx_omnivlm);
+   }
    llama_free_model(model);
}
@@ -20,9 +20,9 @@
extern "C" {
#endif

-OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path);
+OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version);

-OMNIVLM_API void omnivlm_inference(const char* prompt, const char* imag_path);
+OMNIVLM_API const char* omnivlm_inference(const char* prompt, const char* imag_path);

OMNIVLM_API void omnivlm_free();
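A minimal usage sketch of the updated wrapper API, mirroring the debugging CLI earlier in this diff (assumptions: the model paths are placeholders; the returned string is owned by the wrapper and is released on the next inference call or on `omnivlm_free`, so copy it if it must persist):

```cpp
#include <cstdio>
#include "omni-vlm-wrapper.h"

int main() {
    // Placeholder paths; the version must be "vlm-81-ocr", "vlm-81-instruct" or "nano-vlm-instruct".
    omnivlm_init("<llm-F16.gguf>", "<mmproj-F16.gguf>", "nano-vlm-instruct");

    // omnivlm_inference() now returns the generated text instead of only printing it.
    const char* res = omnivlm_inference("Describe this image for me", "cat.png");
    printf("RES: %s\n", res);

    omnivlm_free();
    return 0;
}
```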
@@ -258,111 +258,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

    *n_img_pos = clip_n_patches(ctx_clip);
    bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
-   // cout << "\t\t A NICE START" << endl;
-   // cout << "\t\t" << *n_img_pos << endl;
-   /*
-   if (clip_is_minicpmv(ctx_clip)) {
-       std::vector<float *> image_embd_v;
-       image_embd_v.resize(img_res_v.size);
-       struct clip_image_size * load_image_size = clip_image_size_init();
-       for (size_t i = 0; i < img_res_v.size; i++) {
-           const int64_t t_img_enc_step_start_us = ggml_time_us();
-           image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
-           int patch_size=14;
-           load_image_size->width = img_res_v.data[i].nx;
-           load_image_size->height = img_res_v.data[i].ny;
-           clip_add_load_image_size(ctx_clip, load_image_size);
-           bool encoded = false;
-           int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-           if (has_minicpmv_projector == 2) {
-               encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-           }
-           else if (has_minicpmv_projector == 3) {
-               encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-           }
-           if (!encoded) {
-               LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
-               return false;
-           }
-           const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-           LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
-       }
-       const int64_t t_img_enc_batch_us = ggml_time_us();
-       LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
-
-       int n_img_pos_out = 0;
-       for (size_t i = 0; i < image_embd_v.size(); i++) {
-           std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
-           n_img_pos_out += clip_n_patches(ctx_clip);
-       }
-       *n_img_pos = n_img_pos_out;
-       for (size_t i = 0; i < image_embd_v.size(); i++) {
-           free(image_embd_v[i]);
-       }
-       image_embd_v.clear();
-       load_image_size->width = img->nx;
-       load_image_size->height = img->ny;
-       clip_add_load_image_size(ctx_clip, load_image_size);
-       LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
-   }
-   else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
-       // flat / default llava-1.5 type embedding
-       *n_img_pos = clip_n_patches(ctx_clip);
-       bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
-       delete[] img_res_v.data;
-       if (!encoded) {
-           LOG_ERR("Unable to encode image\n");
-
-           return false;
-       }
-   }
-   else {
-       // spatial_unpad llava-1.6 type embedding
-       // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
-       std::vector<float *> image_embd_v;
-       image_embd_v.resize(img_res_v.size);
-       for (size_t i = 0; i < img_res_v.size; i++) {
-           image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
-           const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
-           if (!encoded) {
-               LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
-               return false;
-           }
-       }
-       const int64_t t_img_enc_batch_us = ggml_time_us();
-       LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
-
-       const int32_t * image_grid = clip_image_grid(ctx_clip);
-
-       std::vector<std::pair<int, int>> grid_pinpoints;
-       for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
-           grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
-       }
-
-       // free all img_res_v - not needed anymore
-       delete[] img_res_v.data;
-       img_res_v.size = 0;
-       img_res_v.data = nullptr;
-
-       const int32_t image_size = clip_image_size(ctx_clip);
-
-       struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
-
-       int n_img_pos_out;
-       clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
-       *n_img_pos = n_img_pos_out;
-
-       for (size_t i = 0; i < image_embd_v.size(); i++) {
-           free(image_embd_v[i]);
-       }
-       image_embd_v.clear();
-
-       // debug image/segment/normalization content:
-       // clip_image_u8 * tmp = clip_image_u8_init();
-       // clip_image_convert_f32_to_u8(*image_feature, *tmp);
-       // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
-   }
-   */

    LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
@@ -60,11 +60,11 @@ _lib = _load_shared_library(_lib_base_name, base_path)
omni_char_p = ctypes.c_char_p


-def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p):
-    return _lib.omnivlm_init(llm_model_path, mmproj_model_path)
+def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p):
+    return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version)


-_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p]
+_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p]
_lib.omnivlm_init.restype = None


@@ -73,7 +73,7 @@ def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p):


_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p]
-_lib.omnivlm_inference.restype = None
+_lib.omnivlm_inference.restype = omni_char_p


def omnivlm_free():
@@ -11,16 +11,17 @@ class NexaOmniVlmInference:
    A class used for vision language model inference.
    """

-   def __init__(self, llm_model_path: str, mmproj_model_path: str):
+   def __init__(self, llm_model_path: str, mmproj_model_path: str, omni_vlm_version: str):
        self.llm_model = ctypes.c_char_p(llm_model_path.encode("utf-8"))
        self.mmproj_model = ctypes.c_char_p(mmproj_model_path.encode("utf-8"))
+       self.omni_vlm_version = ctypes.c_char_p(omni_vlm_version.encode("utf-8"))

-       omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model)
+       omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model, self.omni_vlm_version)

    def inference(self, prompt: str, image_path: str):
        prompt = ctypes.c_char_p(prompt.encode("utf-8"))
        image_path = ctypes.c_char_p(image_path.encode("utf-8"))
-       omni_vlm_cpp.omnivlm_inference(prompt, image_path)
+       return omni_vlm_cpp.omnivlm_inference(prompt, image_path)

    def __del__(self):
        omni_vlm_cpp.omnivlm_free()

@@ -34,22 +35,30 @@ if __name__ == "__main__":
    )
    parser.add_argument("--model", type=str, help="Path to the llm model file")
    parser.add_argument("--mmproj", type=str, help="Path to the mmproj file")
+   parser.add_argument("--omni-vlm-version", type=str, help="omni-vlm-version info ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')")
    # parser.add_argument("--prompt", type=str, help="prompt string.")
    # parser.add_argument("--image-path", type=str, help="Path to the image.")

    args = parser.parse_args()

-   omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj)
+   print("DEBUG")
+   print(args.omni_vlm_version)
+   omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj, args.omni_vlm_version)
    # omni_vlm_obj.inference(args.prompt, args.image_path)
    while True:
+       if args.omni_vlm_version != "vlm-81-ocr":
            print("Input your prompt:")
            prompt = input()
            if prompt == "":
                print("ERROR: you input an empty prompt, try again.")
                continue
+       else:
+           prompt = ""
        print("Input your image path:")
        image_path = input()
        while not os.path.exists(image_path):
            print("ERROR: can not find image in your input path, please check and input agian.")
            image_path = input()
-       omni_vlm_obj.inference(prompt, image_path)
+       response = omni_vlm_obj.inference(prompt, image_path)
+       print("\tresponse:")
+       print(response.decode('utf-8'))
@@ -18,10 +18,12 @@
#include <thread>
#include <vector>
#include <cstring>
+#include <iostream>

//
// Constants
//
+void* internal_chars = nullptr;

static const char *AUDIO_TOKEN = "<|AUDIO|>";

@@ -703,6 +705,12 @@ struct omni_context *omni_init_context(omni_context_params &params)

void omni_free(struct omni_context *ctx_omni)
{
+
+   if(internal_chars != nullptr)
+   {
+       free(internal_chars);
+       internal_chars = nullptr;
+   }
    if (ctx_omni->ctx_whisper)
    {
        whisper_free(ctx_omni->ctx_whisper);
@@ -710,12 +718,13 @@ void omni_free(struct omni_context *ctx_omni)
    }
    if (ctx_omni->projector)
    {
        ctx_omni->projector->free();
+       delete ctx_omni->projector;
    }

    llama_free(ctx_omni->ctx_llama);
    llama_free_model(ctx_omni->model);
    llama_backend_free();
    free(ctx_omni);
}

static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_embed, int n_batch, int *n_past)
@@ -755,6 +764,7 @@ static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_e
        }
        *n_past += n_eval;
    }
+   free(audio_embed_data);
    return true;
}
@@ -792,7 +802,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params &params)
    return embed_proj;
}

-void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
+const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
{
    int n_past = 0;
@@ -841,12 +851,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
    for (int i = 0; i < max_tgt_len; i++)
    {
        const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past);
-       response += tmp;
        if (strcmp(tmp, "</s>") == 0)
            break;
        if (strstr(tmp, "###"))
            break; // Yi-VL behavior
-       printf("%s", tmp);
+       // printf("%s", tmp);
        if (strstr(response.c_str(), "<|im_end|>"))
            break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        if (strstr(response.c_str(), "<|im_start|>"))
@@ -855,16 +864,23 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
            break; // mistral llava-1.6

        fflush(stdout);
+       response += tmp;
    }

    llama_sampling_free(ctx_sampling);
    printf("\n");
+
+   if(internal_chars != nullptr) { free(internal_chars); }
+   internal_chars = malloc(sizeof(char)*(response.size()+1));
+   strncpy((char*)(internal_chars), response.c_str(), response.size());
+   ((char*)(internal_chars))[response.size()] = '\0';
+   return (const char*)(internal_chars);
}

-void omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
+const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
{
    omni_params all_params = get_omni_params_from_context_params(params);

    ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params);
-   omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
+   return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
@@ -54,7 +54,7 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &params);

OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);

-OMNI_AUDIO_API void omni_process_full(
+OMNI_AUDIO_API const char* omni_process_full(
    struct omni_context *ctx_omni,
    omni_context_params &params
);
@@ -9467,6 +9467,8 @@ static bool whisper_encoder_load(struct whisper_model_loader *loader, whisper_co

    wctx.t_load_us = ggml_time_us() - t_start_us;

+   gguf_free(gguf_ctx);
+
    return true;
}
@@ -1,6 +1,6 @@
find_package (Threads REQUIRED)

-set(TARGET llama-vulkan-shaders-gen)
+set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_compile_features(${TARGET} PRIVATE cxx_std_11)