Remove the redundant omni-vlm-v2/ folder; all omni-vlm examples will be added to the omni-vlm/ folder.
Parent: 3dfac7817f
Commit: 16c22471e8

12 changed files with 124 additions and 144 deletions
@@ -1442,6 +1442,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
 
+    if (arg == "--omni-vlm-version") {
+        CHECK_ARG
+        params.omni_vlm_version = argv[i];
+        return true;
+    }
     return false;
 }
 
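For context, the new flag is consumed the same way as the other string options in gpt_params_find_arg: CHECK_ARG advances the argument index and flags a parse error when the value is missing. The sketch below illustrates that flow; the macro body is an approximation rather than the exact definition in common.cpp, and the struct and function names are stand-ins.

```cpp
// Sketch only: how "--omni-vlm-version <value>" is intended to be parsed.
// The CHECK_ARG expansion here is approximated, not copied from common.cpp.
#include <cstdio>
#include <string>

struct gpt_params_sketch {
    std::string omni_vlm_version = "vlm-81-ocr"; // default matches the diff
};

static bool find_arg(int argc, char ** argv, int & i, const std::string & arg,
                     gpt_params_sketch & params, bool & invalid_param) {
    if (arg == "--omni-vlm-version") {
        if (++i >= argc) { invalid_param = true; return true; } // ~CHECK_ARG
        params.omni_vlm_version = argv[i];
        return true;
    }
    return false;
}

int main(int argc, char ** argv) {
    gpt_params_sketch params;
    bool invalid = false;
    for (int i = 1; i < argc; ++i) {
        find_arg(argc, argv, i, argv[i], params, invalid);
    }
    std::printf("omni_vlm_version = %s\n", params.omni_vlm_version.c_str());
    return invalid ? 1 : 0;
}
```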
@@ -1688,6 +1693,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "layer range to apply the control vector(s) to, start and end inclusive" });
     options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
         "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
+    options.push_back({ "*", "    --omni-vlm-version VERSION_STRING", "omni vlm string version(one of 'vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n" "(default: 'vlm-81-ocr')"});
     options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
     options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
     options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
@@ -265,6 +265,8 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    std::string omni_vlm_version = "vlm-81-ocr";
 };
 
 void gpt_params_parse_from_env(gpt_params & params);
@@ -53,7 +53,6 @@ else()
     # add_subdirectory(speculative)
     # add_subdirectory(tokenize)
     add_subdirectory(omni-vlm)
-    add_subdirectory(omni-vlm-v2)
    add_subdirectory(nexa-omni-audio)
    add_subdirectory(qwen2-audio)
 endif()
@@ -6,6 +6,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "common.h"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -167,7 +168,11 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
 };
 
+enum omni_vlm_version_type {
+    VLM_81_OCR,
+    VLM_81_INSTRUCT,
+    NANO_VLM_INSTRUCT,
+};
 //
 // utilities to get data from a gguf file
 //
@@ -294,115 +299,6 @@ static projector_type clip_projector_type_from_string(const std::string & name)
     return PROJECTOR_TYPE_UNKNOWN;
 }
 
-#ifdef CLIP_DEBUG_FUNCTIONS
-static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
-    std::ofstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
-        return;
-    }
-
-    // PPM header: P6 format, width, height, and max color value
-    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
-
-    // Write pixel data
-    for (size_t i = 0; i < img.buf.size(); i += 3) {
-        // PPM expects binary data in RGB format, which matches our image buffer
-        file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
-    }
-
-    file.close();
-}
-
-static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
-    std::ofstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
-        return;
-    }
-
-    int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
-    int bytesPerPixel = 3;
-    int widthInBytes = img.nx * bytesPerPixel;
-    int paddingAmount = (4 - (widthInBytes % 4)) % 4;
-    int stride = widthInBytes + paddingAmount;
-
-    // Bitmap file header
-    unsigned char fileHeader[14] = {
-        'B','M',     // Signature
-        0,0,0,0,     // Image file size in bytes
-        0,0,0,0,     // Reserved
-        54,0,0,0     // Start of pixel array
-    };
-
-    // Total file size
-    fileSize = 54 + (stride * img.ny);
-    fileHeader[2] = (unsigned char)(fileSize);
-    fileHeader[3] = (unsigned char)(fileSize >> 8);
-    fileHeader[4] = (unsigned char)(fileSize >> 16);
-    fileHeader[5] = (unsigned char)(fileSize >> 24);
-
-    // Bitmap information header (BITMAPINFOHEADER)
-    unsigned char infoHeader[40] = {
-        40,0,0,0,    // Size of this header (40 bytes)
-        0,0,0,0,     // Image width
-        0,0,0,0,     // Image height
-        1,0,         // Number of color planes
-        24,0,        // Bits per pixel
-        0,0,0,0,     // No compression
-        0,0,0,0,     // Image size (can be 0 for no compression)
-        0,0,0,0,     // X pixels per meter (not specified)
-        0,0,0,0,     // Y pixels per meter (not specified)
-        0,0,0,0,     // Total colors (color table not used)
-        0,0,0,0      // Important colors (all are important)
-    };
-
-    // Width and height in the information header
-    infoHeader[4] = (unsigned char)(img.nx);
-    infoHeader[5] = (unsigned char)(img.nx >> 8);
-    infoHeader[6] = (unsigned char)(img.nx >> 16);
-    infoHeader[7] = (unsigned char)(img.nx >> 24);
-    infoHeader[8] = (unsigned char)(img.ny);
-    infoHeader[9] = (unsigned char)(img.ny >> 8);
-    infoHeader[10] = (unsigned char)(img.ny >> 16);
-    infoHeader[11] = (unsigned char)(img.ny >> 24);
-
-    // Write file headers
-    file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
-    file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
-
-    // Pixel data
-    std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
-    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
-        for (int x = 0; x < img.nx; ++x) {
-            // Each pixel
-            size_t pixelIndex = (y * img.nx + x) * 3;
-            unsigned char pixel[3] = {
-                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
-                img.buf[pixelIndex + 1],
-                img.buf[pixelIndex]
-            };
-            file.write(reinterpret_cast<char*>(pixel), 3);
-        }
-        // Write padding for the row
-        file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
-    }
-
-    file.close();
-}
-
-// debug function to convert f32 to u8
-static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(3 * src.nx * src.ny);
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
-    }
-}
-#endif
-
-
 //
 // clip layers
 //
@@ -564,6 +460,7 @@ struct clip_ctx {
 
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
+    omni_vlm_version_type omni_vlm_ver_type;
 
     float image_mean[3];
     float image_std[3];
@@ -785,6 +682,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
     }
 
+    if(ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_OCR || ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_INSTRUCT) {
+        embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*9, embeddings->ne[1]/9, 1);
+    }
+
     embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
     embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
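The reshape keeps the embedding buffer unchanged and only reinterprets its dimensions: every 9 consecutive patch vectors are viewed as one wider vector before the mm projector. The arithmetic below is illustrative only; the 729-patch / 1152-wide encoder shape is an assumption, not a value stated in the diff, though it is consistent with the "81" in the version names.

```cpp
// Illustrative only: what ggml_reshape_3d(ctx0, embeddings, ne[0]*9, ne[1]/9, 1)
// does to the image-token count for the vlm-81 variants. The concrete sizes
// (729 patches, 1152-dim vision embeddings) are assumptions for the example.
#include <cstdio>

int main() {
    const int n_embd    = 1152; // assumed vision hidden size (ne[0])
    const int n_patches = 729;  // assumed patch count before grouping (ne[1])

    const int grouped_width = n_embd * 9;    // 10368-wide vectors fed to mm_0
    const int grouped_count = n_patches / 9; // 81 image tokens -> "vlm-81"

    std::printf("%d x %d patches  ->  %d x %d grouped image tokens\n",
                n_embd, n_patches, grouped_width, grouped_count);
    return 0;
}
```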
@@ -1308,6 +1209,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }
 
+void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
+    if (params->omni_vlm_version == "vlm-81-ocr") {
+        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+    } else if (params->omni_vlm_version == "vlm-81-instruct") {
+        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+    } else if (params->omni_vlm_version == "nano-vlm-instruct") {
+        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+    } else {
+        throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
+    }
+}
+
 void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
     ctx_clip->load_image_size = load_image_size;
 }
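The intended call order, mirrored from omnivlm_init_context later in this commit, is to set the version on the clip context right after loading the projector and before any image is encoded, since clip_image_build_graph reads omni_vlm_ver_type. A minimal sketch, assuming placeholder file paths and omitting error handling:

```cpp
// Usage sketch, not part of the patch: wire the version string into the clip
// context before encoding images. "mmproj.gguf" is a placeholder path.
#include "clip.h"
#include "common.h"

void init_vision(gpt_params & params) {
    params.omni_vlm_version = "vlm-81-instruct"; // or "vlm-81-ocr" / "nano-vlm-instruct"

    clip_ctx * ctx_clip = clip_model_load("mmproj.gguf", /*verbosity=*/ 0);
    clip_set_omni_vlm_version(ctx_clip, &params); // map the string to the internal enum

    // ... encode images with ctx_clip, then release it when done.
    clip_free(ctx_clip);
}
```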
@@ -42,6 +42,9 @@ struct clip_image_f32_batch {
 CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
 CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
 
+struct gpt_params;
+CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
+
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
BIN examples/omni-vlm/latex.png (new file, 5.8 KiB; binary file not shown)
@@ -212,8 +212,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }
 
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
+    clip_set_omni_vlm_version(ctx_clip, params);
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -249,9 +249,6 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    // if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
-    //     return 1;
-    // }
     if (!gpt_params_parse(argc, argv, params)) {
         print_usage(argc, argv, params);
         return 1;
@@ -261,8 +258,21 @@ int main(int argc, char ** argv) {
         print_usage(argc, argv, {});
         return 1;
     }
+    if (params.omni_vlm_version != "vlm-81-ocr" && params.prompt.empty()) {
+        LOG_TEE("%s : prompt is empty.\n", __func__);
+        print_usage(argc, argv, {});
+        return 1;
+    }
 
-    params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe this image for me\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+    if (params.omni_vlm_version == "vlm-81-ocr") {
+        params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>";
+    } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
+        params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+    } else {
+        LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str());
+        print_usage(argc, argv, {});
+        return 1;
+    }
 
     auto * model = omnivlm_init(&params);
     if (model == NULL) {
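The same template selection is duplicated later in omnivlm_inference. A small helper like the sketch below (hypothetical, not part of the patch) captures the rule: OCR mode ignores the user prompt and wraps a fixed <|ocr_start|>...<|ocr_end|> block, while the instruct variants splice the user prompt between the system header and the <|vision_start|><|image_pad|><|vision_end|> block.

```cpp
// Hypothetical helper, not in the patch: builds the chat template that the
// CLI and the wrapper both construct inline, given an omni_vlm_version string.
#include <stdexcept>
#include <string>

static std::string build_omni_vlm_prompt(const std::string & version, const std::string & user_prompt) {
    const std::string sys =
        "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. "
        "You are a helpful assistant.<|im_end|>\n<|im_start|>user\n";
    if (version == "vlm-81-ocr") {
        // OCR mode: fixed prompt, the user text is ignored.
        return sys + " <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>";
    }
    if (version == "vlm-81-instruct" || version == "nano-vlm-instruct") {
        // Instruct modes: user prompt followed by the image placeholder.
        return sys + user_prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
    }
    throw std::runtime_error("unknown omni_vlm_version: " + version);
}
```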
@@ -270,8 +280,12 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
     auto * ctx_omnivlm = omnivlm_init_context(&params, model);
 
+    // temporarily set to greedy decoding.
+    params.sparams.top_k = 1;
+    params.sparams.top_p = 1.0f;
+
     for (auto & image : params.image) {
         auto * image_embed = load_image(ctx_omnivlm, &params, image);
         if (!image_embed) {
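The sparams override above is a blunt way to force greedy decoding: with top_k = 1 only the single highest-probability token survives filtering, and top_p = 1.0 disables nucleus truncation, so sampling always returns the argmax. A toy illustration of that selection rule (not llama.cpp's sampler):

```cpp
// Toy illustration of why top_k = 1 amounts to greedy decoding: only the
// most probable candidate survives, so the sampled token is always argmax.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> probs = {0.10f, 0.55f, 0.05f, 0.30f}; // toy distribution
    const int top_k = 1;

    std::vector<int> ids(probs.size());
    for (size_t i = 0; i < ids.size(); ++i) ids[i] = (int) i;
    std::partial_sort(ids.begin(), ids.begin() + top_k, ids.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });
    ids.resize(top_k); // with top_k = 1 this is exactly the argmax

    std::printf("selected token id: %d\n", ids[0]);
    return 0;
}
```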
@@ -1,15 +1,24 @@
 // WARNING: this .cpp file is only for debugging. do not user directly.
 #include "omni-vlm-wrapper.h"
+#include <iostream>
+
+using std::cout;
+using std::endl;
 
 int main(int argc, char ** argv) {
-    const char* llm_model = "<path to llm gguf.>";
-    const char* mmproj_model = "<path to mm projector gguf>";
-    const char* image_path = "<path where image is located.>";
+    const char* llm_model = "";
+    const char* mmproj_model = "";
+    const char* image_path = "";
     const char* prompt = "";
 
-    omnivlm_init(llm_model, mmproj_model);
-    omnivlm_inference(prompt, image_path);
+    omnivlm_init(llm_model, mmproj_model, "vlm-81-ocr");
+
+    const char* res;
+    res = omnivlm_inference(prompt, image_path);
+    cout << "RES: " << res << endl;
+    res = omnivlm_inference(prompt, image_path);
+    cout << "RES: " << res << endl;
     omnivlm_free();
 
     return 0;
@@ -65,7 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }
 
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
+    clip_set_omni_vlm_version(ctx_clip, params);
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
@@ -135,14 +136,14 @@ static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct o
 
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
 
-    std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
-        + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
-    size_t image_pos = full_prompt.find("<|image_pad|>");
+    // std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
+    //     + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+    size_t image_pos = params->prompt.find("<|image_pad|>");
     std::string system_prompt, user_prompt;
 
     // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-    system_prompt = full_prompt.substr(0, image_pos);
-    user_prompt = full_prompt.substr(image_pos + std::string("<|image_pad|>").length());
+    system_prompt = params->prompt.substr(0, image_pos);
+    user_prompt = params->prompt.substr(image_pos + std::string("<|image_pad|>").length());
     if (params->verbose_prompt) {
         auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true);
         for (int i = 0; i < (int) tmp.size(); i++) {
@@ -157,6 +158,9 @@ static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct o
         }
     }
 
+    params->sparams.top_k = 1;
+    params->sparams.top_p = 1.0f;
+
     eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
     omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past);
     eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -217,8 +221,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 }
 
 // inference interface definition
-void omnivlm_init(const char* llm_model_path, const char* projector_model_path) {
-    const char* argv = "hello-omni-vlm-wrapper-cli";
+void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) {
+    std::cout << "debug0 " << llm_model_path << std::endl;
+    std::cout << "debug1 " << omni_vlm_version << std::endl;
+    const char* argv = "omni-wrapper-py";
     char* nc_argv = const_cast<char*>(argv);
     if (!gpt_params_parse(1, &nc_argv, params)) {
         print_usage(1, &nc_argv, {});
@@ -226,6 +232,17 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path)
     }
     params.model = llm_model_path;
     params.mmproj = projector_model_path;
+    params.omni_vlm_version = omni_vlm_version;
+
+    std::string omni_vlm_ver = params.omni_vlm_version;
+    std::cout << "\t\t DEBUG omni_ver" << std::endl;
+    std::cout << params.omni_vlm_version << std::endl;
+    if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") {
+        fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version);
+        fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__);
+        throw std::runtime_error("You set wrong vlm_version info strings.");
+    }
+
     model = omnivlm_init(&params);
     if (model == nullptr) {
         fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__);
@@ -237,6 +254,16 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path)
 const char* omnivlm_inference(const char *prompt, const char *imag_path) {
     std::string image = imag_path;
     params.prompt = prompt;
+
+    if (params.omni_vlm_version == "vlm-81-ocr") {
+        params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>";
+    } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
+        params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+    } else {
+        LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str());
+        throw std::runtime_error("You set wrong vlm_version info strings.");
+    }
+
     auto * image_embed = load_image(ctx_omnivlm, &params, image);
     if (!image_embed) {
         LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
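Putting the wrapper changes together, a caller now passes the version string once at init time and plain user text on each inference call; the wrapper builds the chat template itself. A minimal sketch with placeholder file paths and no error handling:

```cpp
// Sketch of the intended wrapper usage after this change. "model.gguf",
// "mmproj.gguf", and "test.png" are placeholder paths.
#include "omni-vlm-wrapper.h"
#include <cstdio>

int main() {
    omnivlm_init("model.gguf", "mmproj.gguf", "vlm-81-instruct");

    // For the instruct variants the wrapper wraps this text in the template;
    // for "vlm-81-ocr" the prompt argument is ignored.
    const char * res = omnivlm_inference("Describe this image for me", "test.png");
    std::printf("RES: %s\n", res);

    omnivlm_free();
    return 0;
}
```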
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path);
+OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version);
 
 OMNIVLM_API const char* omnivlm_inference(const char* prompt, const char* imag_path);
 
@@ -60,11 +60,11 @@ _lib = _load_shared_library(_lib_base_name, base_path)
 omni_char_p = ctypes.c_char_p
 
 
-def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p):
-    return _lib.omnivlm_init(llm_model_path, mmproj_model_path)
+def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p):
+    return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version)
 
 
-_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p]
+_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p]
 _lib.omnivlm_init.restype = None
 
 
@@ -11,11 +11,12 @@ class NexaOmniVlmInference:
     A class used for vision language model inference.
     """
 
-    def __init__(self, llm_model_path: str, mmproj_model_path: str):
+    def __init__(self, llm_model_path: str, mmproj_model_path: str, omni_vlm_version: str):
         self.llm_model = ctypes.c_char_p(llm_model_path.encode("utf-8"))
         self.mmproj_model = ctypes.c_char_p(mmproj_model_path.encode("utf-8"))
+        self.omni_vlm_version = ctypes.c_char_p(omni_vlm_version.encode("utf-8"))
 
-        omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model)
+        omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model, self.omni_vlm_version)
 
     def inference(self, prompt: str, image_path: str):
         prompt = ctypes.c_char_p(prompt.encode("utf-8"))
@@ -34,19 +35,25 @@ if __name__ == "__main__":
     )
     parser.add_argument("--model", type=str, help="Path to the llm model file")
     parser.add_argument("--mmproj", type=str, help="Path to the mmproj file")
+    parser.add_argument("--omni-vlm-version", type=str, help="omni-vlm-version info ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')")
     # parser.add_argument("--prompt", type=str, help="prompt string.")
     # parser.add_argument("--image-path", type=str, help="Path to the image.")
 
     args = parser.parse_args()
 
-    omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj)
+    print("DEBUG")
+    print(args.omni_vlm_version)
+    omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj, args.omni_vlm_version)
     # omni_vlm_obj.inference(args.prompt, args.image_path)
     while True:
+        if args.omni_vlm_version != "vlm-81-ocr":
             print("Input your prompt:")
             prompt = input()
             if prompt == "":
                 print("ERROR: you input an empty prompt, try again.")
                 continue
+        else:
+            prompt = ""
         print("Input your image path:")
         image_path = input()
         while not os.path.exists(image_path):