Merge pull request #19 from NexaAI/master

include latest vlm and audio lm changes
This commit is contained in:
Zack Li 2024-11-11 12:25:51 -08:00 committed by GitHub
commit 5f2d958492
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 229 additions and 306 deletions

.github/CODEOWNERS vendored Normal file
View file

@ -0,0 +1 @@
@zhiyuan8 @alexchen4ai

View file

@ -1442,6 +1442,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
// End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS
if (arg == "--omni-vlm-version") {
CHECK_ARG
params.omni_vlm_version = argv[i];
return true;
}
return false;
}
@ -1688,6 +1693,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"layer range to apply the control vector(s) to, start and end inclusive" });
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
"or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
options.push_back({ "*", " --omni-vlm-version VERSION_STRING", "omni vlm version string (one of 'vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n" "(default: 'vlm-81-ocr')"});
options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });

View file

@ -265,6 +265,8 @@ struct gpt_params {
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
std::string omni_vlm_version = "vlm-81-ocr";
};
void gpt_params_parse_from_env(gpt_params & params);

View file

@ -72,19 +72,14 @@ class MainActivity(
val models = listOf(
Downloadable(
"Phi-2 7B (Q4_0, 1.6 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
File(extFilesDir, "phi-2-q4_0.gguf"),
"Llama3.2-1B-Instruct (Q4_0, 735 MB)",
Uri.parse("https://public-storage.nexa4ai.com/Llama3.2-1B-Instruct/q4_0.gguf"),
File(extFilesDir, "Llama3.2-1B-Instruct-q4_0.gguf"),
),
Downloadable(
"TinyLlama 1.1B (f16, 2.2 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
),
Downloadable(
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
"octopus",
Uri.parse("https://public-storage.nexa4ai.com/Octopus-v2/q4_0.gguf"),
File(extFilesDir, "octopus-q4_0.gguf")
),
)

View file

@ -33,6 +33,7 @@ project("llama-android")
#load local llama.cpp
add_subdirectory(../../../../../../ build-llama)
add_subdirectory(../../../../../../examples/llava build-llava)
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
@ -50,4 +51,5 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
llama
common
android
log)
log
llava)

View file

@ -6,6 +6,7 @@
#include <unistd.h>
#include "llama.h"
#include "common.h"
#include "llava.h"
// Write C++ code here.
//

View file

@ -36,7 +36,7 @@ class LLamaAndroid {
}
}.asCoroutineDispatcher()
private val nlen: Int = 64
private val nlen: Int = 256
private external fun log_to_android()
private external fun load_model(filename: String): Long

View file

@ -23,6 +23,8 @@
// Constants
//
void* internal_chars = nullptr;
static const char *AUDIO_TOKEN = "<|AUDIO|>";
//
@ -570,7 +572,7 @@ static omni_params get_omni_params_from_context_params(omni_context_params &para
all_params.gpt.n_gpu_layers = params.n_gpu_layers;
all_params.gpt.model = params.model;
all_params.gpt.prompt = params.prompt;
// Initialize whisper params
all_params.whisper.model = params.mmproj;
all_params.whisper.fname_inp = {params.file};
@ -703,6 +705,10 @@ struct omni_context *omni_init_context(omni_context_params &params)
void omni_free(struct omni_context *ctx_omni)
{
if(internal_chars != nullptr)
{
free(internal_chars);
}
if (ctx_omni->ctx_whisper)
{
whisper_free(ctx_omni->ctx_whisper);
@ -792,7 +798,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params &para
return embed_proj;
}
void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
{
int n_past = 0;
@ -833,12 +839,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
for (int i = 0; i < max_tgt_len; i++)
{
const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0)
break;
if (strstr(tmp, "###"))
break; // Yi-VL behavior
printf("%s", tmp);
// printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>"))
break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>"))
@ -847,16 +852,22 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
break; // mistral llava-1.6
fflush(stdout);
response += tmp;
}
llama_sampling_free(ctx_sampling);
printf("\n");
if(internal_chars != nullptr) { free(internal_chars); }
internal_chars = malloc(sizeof(char)*(response.size()+1));
strncpy((char*)(internal_chars), response.c_str(), response.size());
((char*)(internal_chars))[response.size()] = '\0';
return (const char*)(internal_chars);
}
void omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
{
omni_params all_params = get_omni_params_from_context_params(params);
ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params);
omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}

View file

@ -54,11 +54,11 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &param
OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);
OMNI_AUDIO_API void omni_process_full(
OMNI_AUDIO_API const char* omni_process_full(
struct omni_context *ctx_omni,
omni_context_params &params
);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,22 +1,30 @@
# omni-vlm
Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants,
Currently this implementation supports:
After API is confirmed, more models will be supported / uploaded.
* [nano-vlm-instruct](https://huggingface.co/NexaAIDev/nano-vlm-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/nano-vlm-instruct-gguf/tree/main))
* [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-ocr-gguf/tree/main))
* [vlm-81-instruct](https://huggingface.co/NexaAIDev/vlm-81-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-instruct-gguf/tree/main))
After the API is stable, more models will be supported.
## Usage
Build with cmake in the `llama-cpp-experiments` folder:
```bash
Build with cmake in the `llama.cpp` folder:
```console
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```
After building, run `./omni-vlm-cli` to see the usage. For example:
```bash
```console
./omni-vlm-cli \
-m Nano-Llm-494M-F16.gguf \
--mmproj mmproj-omni-vlm-f16.gguf \
--image example/omni-vlm/cat.png
-m <llm-F16.gguf> \
--mmproj <mmproj-F16.gguf> \
--image example/omni-vlm/cat.png \
--omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```
See the next section to convert gguf files from the original safetensors.
@ -27,6 +35,7 @@ See next section to convert gguf files from original safetensors.
)
## Omni-vlm gguf conversion
1) First, clone the omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct
@ -34,7 +43,7 @@ git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct
2) Install the required Python packages:
```sh
```console
pip install -r examples/omni-vlm/requirements.txt
```
@ -104,6 +113,5 @@ After successfully compiling omni_vlm_wrapper_shared dynamic library, run:
python omni_vlm_demo.py \
--model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
--prompt="Describe this image for me" \
--image-path cat.png
--omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```

View file

@ -6,6 +6,7 @@
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "common.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
@ -167,7 +168,11 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
};
enum omni_vlm_version_type {
VLM_81_OCR,
VLM_81_INSTRUCT,
NANO_VLM_INSTRUCT,
};
//
// utilities to get data from a gguf file
//
@ -294,115 +299,6 @@ static projector_type clip_projector_type_from_string(const std::string & name)
return PROJECTOR_TYPE_UNKNOWN;
}
#ifdef CLIP_DEBUG_FUNCTIONS
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return;
}
// PPM header: P6 format, width, height, and max color value
file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
// Write pixel data
for (size_t i = 0; i < img.buf.size(); i += 3) {
// PPM expects binary data in RGB format, which matches our image buffer
file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
}
file.close();
}
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return;
}
int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
int bytesPerPixel = 3;
int widthInBytes = img.nx * bytesPerPixel;
int paddingAmount = (4 - (widthInBytes % 4)) % 4;
int stride = widthInBytes + paddingAmount;
// Bitmap file header
unsigned char fileHeader[14] = {
'B','M', // Signature
0,0,0,0, // Image file size in bytes
0,0,0,0, // Reserved
54,0,0,0 // Start of pixel array
};
// Total file size
fileSize = 54 + (stride * img.ny);
fileHeader[2] = (unsigned char)(fileSize);
fileHeader[3] = (unsigned char)(fileSize >> 8);
fileHeader[4] = (unsigned char)(fileSize >> 16);
fileHeader[5] = (unsigned char)(fileSize >> 24);
// Bitmap information header (BITMAPINFOHEADER)
unsigned char infoHeader[40] = {
40,0,0,0, // Size of this header (40 bytes)
0,0,0,0, // Image width
0,0,0,0, // Image height
1,0, // Number of color planes
24,0, // Bits per pixel
0,0,0,0, // No compression
0,0,0,0, // Image size (can be 0 for no compression)
0,0,0,0, // X pixels per meter (not specified)
0,0,0,0, // Y pixels per meter (not specified)
0,0,0,0, // Total colors (color table not used)
0,0,0,0 // Important colors (all are important)
};
// Width and height in the information header
infoHeader[4] = (unsigned char)(img.nx);
infoHeader[5] = (unsigned char)(img.nx >> 8);
infoHeader[6] = (unsigned char)(img.nx >> 16);
infoHeader[7] = (unsigned char)(img.nx >> 24);
infoHeader[8] = (unsigned char)(img.ny);
infoHeader[9] = (unsigned char)(img.ny >> 8);
infoHeader[10] = (unsigned char)(img.ny >> 16);
infoHeader[11] = (unsigned char)(img.ny >> 24);
// Write file headers
file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
// Pixel data
std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
for (int x = 0; x < img.nx; ++x) {
// Each pixel
size_t pixelIndex = (y * img.nx + x) * 3;
unsigned char pixel[3] = {
img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
img.buf[pixelIndex + 1],
img.buf[pixelIndex]
};
file.write(reinterpret_cast<char*>(pixel), 3);
}
// Write padding for the row
file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
}
file.close();
}
// debug function to convert f32 to u8
static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
dst.nx = src.nx;
dst.ny = src.ny;
dst.buf.resize(3 * src.nx * src.ny);
for (size_t i = 0; i < src.buf.size(); ++i) {
dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
}
}
#endif
//
// clip layers
//
@ -564,6 +460,7 @@ struct clip_ctx {
struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
omni_vlm_version_type omni_vlm_ver_type;
float image_mean[3];
float image_std[3];
@ -785,6 +682,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
if(ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_OCR || ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_INSTRUCT) {
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*9, embeddings->ne[1]/9, 1);
}
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
@ -1308,6 +1209,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return new_clip;
}
void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
if (params->omni_vlm_version == "vlm-81-ocr") {
ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
} else if (params->omni_vlm_version == "vlm-81-instruct") {
ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
} else if (params->omni_vlm_version == "nano-vlm-instruct") {
ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
} else {
throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
}
}
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
ctx_clip->load_image_size = load_image_size;
}
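
A minimal caller sketch for the new `clip_set_omni_vlm_version()` hook, using only APIs visible in this diff (`clip_model_load`, `gpt_params::omni_vlm_version`); the surrounding helper function is illustrative and not part of the PR:

```cpp
#include "clip.h"
#include "common.h"

// Illustrative helper (not in this PR): load the mmproj/CLIP model and
// propagate the user-selected omni-vlm version before any image is encoded.
static clip_ctx * load_projector(const gpt_params & params) {
    // verbosity 0, matching the change in omni-vlm-cli.cpp / omni-vlm-wrapper.cpp
    clip_ctx * ctx_clip = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 0);
    if (ctx_clip == nullptr) {
        return nullptr;
    }
    // Throws std::runtime_error for anything other than
    // "vlm-81-ocr", "vlm-81-instruct" or "nano-vlm-instruct".
    clip_set_omni_vlm_version(ctx_clip, &params);
    return ctx_clip;
}
```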

View file

@ -42,6 +42,9 @@ struct clip_image_f32_batch {
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
struct gpt_params;
CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);

BIN examples/omni-vlm/latex.png (new binary file, 5.8 KiB; not shown)

View file

@ -12,6 +12,10 @@
#include <cstdlib>
#include <cstring>
#include <vector>
// #include <iostream>
//
// using std::cout;
// using std::endl;
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
@ -149,7 +153,7 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
}
}
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
// LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
@ -165,6 +169,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
LOG("\n");
params->sparams.temp = 0.0f;
params->sparams.top_k = 1;
params->sparams.top_p = 1.0f;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling) {
LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__);
@ -177,8 +184,8 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
response += tmp;
if (strcmp(tmp, "<|im_end|>") == 0) break;
if (strcmp(tmp, "</s>") == 0) break;
// if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
// LOG("%s", tmp);
// if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
// if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
// if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@ -212,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
clip_set_omni_vlm_version(ctx_clip, params);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@ -249,9 +256,6 @@ int main(int argc, char ** argv) {
gpt_params params;
// if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
// return 1;
// }
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
@ -261,8 +265,21 @@ int main(int argc, char ** argv) {
print_usage(argc, argv, {});
return 1;
}
if (params.omni_vlm_version != "vlm-81-ocr" && params.prompt.empty()) {
LOG_TEE("%s : prompt is empty.\n", __func__);
print_usage(argc, argv, {});
return 1;
}
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe this image for me\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
if (params.omni_vlm_version == "vlm-81-ocr") {
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|vision_start|><|image_pad|><|vision_end|><|im_end|>";
} else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
} else {
LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str());
print_usage(argc, argv, {});
return 1;
}
auto * model = omnivlm_init(&params);
if (model == NULL) {
@ -271,8 +288,8 @@ int main(int argc, char ** argv) {
}
auto * ctx_omnivlm = omnivlm_init_context(&params, model);
for (auto & image : params.image) {
auto * ctx_omnivlm = omnivlm_init_context(&params, model);
auto * image_embed = load_image(ctx_omnivlm, &params, image);
if (!image_embed) {
LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
@ -283,9 +300,9 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx_omnivlm->ctx_llama);
omnivlm_image_embed_free(image_embed);
ctx_omnivlm->model = NULL;
omnivlm_free(ctx_omnivlm);
}
ctx_omnivlm->model = NULL;
omnivlm_free(ctx_omnivlm);
llama_free_model(model);

View file

@ -1,15 +1,24 @@
// WARNING: this .cpp file is only for debugging. Do not use directly.
#include "omni-vlm-wrapper.h"
#include <iostream>
using std::cout;
using std::endl;
int main(int argc, char ** argv) {
const char* llm_model = "<path to llm gguf.>";
const char* mmproj_model = "<path to mm projector gguf>";
const char* image_path = "<path where image is located.>";
const char* llm_model = "";
const char* mmproj_model = "";
const char* image_path = "";
const char* prompt = "";
omnivlm_init(llm_model, mmproj_model);
omnivlm_inference(prompt, image_path);
omnivlm_inference(prompt, image_path);
omnivlm_init(llm_model, mmproj_model, "vlm-81-ocr");
const char* res;
res = omnivlm_inference(prompt, image_path);
cout << "RES: " << res << endl;
res = omnivlm_inference(prompt, image_path);
cout << "RES: " << res << endl;
omnivlm_free();
return 0;

View file

@ -24,6 +24,8 @@ struct omnivlm_context {
struct llama_model * model = NULL;
};
void* internal_chars = nullptr;
static struct gpt_params params;
static struct llama_model* model;
static struct omnivlm_context* ctx_omnivlm;
@ -63,7 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
clip_set_omni_vlm_version(ctx_clip, params);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
@ -128,19 +131,19 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
return ret.c_str();
}
static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
+ prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
size_t image_pos = full_prompt.find("<|image_pad|>");
// std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
// + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
size_t image_pos = params->prompt.find("<|image_pad|>");
std::string system_prompt, user_prompt;
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = full_prompt.substr(0, image_pos);
user_prompt = full_prompt.substr(image_pos + std::string("<|image_pad|>").length());
system_prompt = params->prompt.substr(0, image_pos);
user_prompt = params->prompt.substr(image_pos + std::string("<|image_pad|>").length());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
@ -155,6 +158,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
}
}
params->sparams.top_k = 1;
params->sparams.top_p = 1.0f;
eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@ -172,11 +178,11 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "<|im_end|>") == 0) break;
if (strcmp(tmp, "</s>") == 0) break;
// if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
// printf("%s", tmp);
response += tmp;
// if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
// if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
// if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@ -186,6 +192,13 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
llama_sampling_free(ctx_sampling);
printf("\n");
// const char* ret_char_ptr = (const char*)(malloc(sizeof(char)*response.size()));
if(internal_chars != nullptr) { free(internal_chars); }
internal_chars = malloc(sizeof(char)*(response.size()+1));
strncpy((char*)(internal_chars), response.c_str(), response.size());
((char*)(internal_chars))[response.size()] = '\0';
return (const char*)(internal_chars);
}
static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) {
@ -208,8 +221,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
}
// inference interface definition
void omnivlm_init(const char* llm_model_path, const char* projector_model_path) {
const char* argv = "hello-omni-vlm-wrapper-cli";
void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) {
const char* argv = "omni-wrapper-py";
char* nc_argv = const_cast<char*>(argv);
if (!gpt_params_parse(1, &nc_argv, params)) {
print_usage(1, &nc_argv, {});
@ -217,30 +230,56 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path)
}
params.model = llm_model_path;
params.mmproj = projector_model_path;
params.omni_vlm_version = omni_vlm_version;
std::string omni_vlm_ver = params.omni_vlm_version;
if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") {
fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version);
fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__);
throw std::runtime_error("You set wrong vlm_version info strings.");
}
model = omnivlm_init(&params);
if (model == nullptr) {
fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__);
throw std::runtime_error("Failed to init omnivlm model");
}
ctx_omnivlm = omnivlm_init_context(&params, model);
}
void omnivlm_inference(const char *prompt, const char *imag_path) {
const char* omnivlm_inference(const char *prompt, const char *imag_path) {
ctx_omnivlm = omnivlm_init_context(&params, model);
std::string image = imag_path;
params.prompt = prompt;
if (params.omni_vlm_version == "vlm-81-ocr") {
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>";
} else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
} else {
LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str());
throw std::runtime_error("You set wrong vlm_version info strings.");
}
auto * image_embed = load_image(ctx_omnivlm, &params, image);
if (!image_embed) {
LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
throw std::runtime_error("failed to load image " + image);
}
// process the prompt
process_prompt(ctx_omnivlm, image_embed, &params, params.prompt);
const char* ret_chars = process_prompt(ctx_omnivlm, image_embed, &params, params.prompt);
// llama_perf_print(ctx_omnivlm->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
omnivlm_image_embed_free(image_embed);
ctx_omnivlm->model = nullptr;
omnivlm_free(ctx_omnivlm);
ctx_omnivlm = nullptr;
return ret_chars;
}
void omnivlm_free() {
if(internal_chars != nullptr) { free(internal_chars); }
ctx_omnivlm->model = NULL;
omnivlm_free(ctx_omnivlm);
llama_free_model(model);

View file

@ -20,9 +20,9 @@
extern "C" {
#endif
OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path);
OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version);
OMNIVLM_API void omnivlm_inference(const char* prompt, const char* imag_path);
OMNIVLM_API const char* omnivlm_inference(const char* prompt, const char* imag_path);
OMNIVLM_API void omnivlm_free();
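
A minimal sketch of driving the updated C API, along the lines of the debug demo above; the gguf paths are placeholders, and the returned pointer stays valid only until the next `omnivlm_inference()` call or `omnivlm_free()`, since the wrapper reuses its internal buffer:

```cpp
#include <cstdio>
#include "omni-vlm-wrapper.h"

int main() {
    // Placeholder model paths; substitute real gguf files.
    omnivlm_init("nano-llm-494M-F16.gguf", "mmproj-omni-vlm-f16.gguf", "vlm-81-instruct");

    // For "vlm-81-ocr" the prompt is ignored; for the instruct variants it is
    // inserted into the chat template inside omnivlm_inference().
    const char * res = omnivlm_inference("Describe this image for me", "cat.png");
    std::printf("%s\n", res);

    res = omnivlm_inference("What colors appear in the image?", "cat.png");
    std::printf("%s\n", res);

    omnivlm_free();   // also releases the buffer backing the returned string
    return 0;
}
```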

View file

@ -258,111 +258,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
// cout << "\t\t A NICE START" << endl;
// cout << "\t\t" << *n_img_pos << endl;
/*
if (clip_is_minicpmv(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
n_img_pos_out += clip_n_patches(ctx_clip);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
LOG_ERR("Unable to encode image\n");
return false;
}
}
else {
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
for (size_t i = 0; i < img_res_v.size; i++) {
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
std::vector<std::pair<int, int>> grid_pinpoints;
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
}
// free all img_res_v - not needed anymore
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
const int32_t image_size = clip_image_size(ctx_clip);
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
// debug image/segment/normalization content:
// clip_image_u8 * tmp = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
*/
LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

View file

@ -60,11 +60,11 @@ _lib = _load_shared_library(_lib_base_name, base_path)
omni_char_p = ctypes.c_char_p
def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p):
return _lib.omnivlm_init(llm_model_path, mmproj_model_path)
def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p):
return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version)
_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p]
_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p]
_lib.omnivlm_init.restype = None
@ -73,7 +73,7 @@ def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p):
_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p]
_lib.omnivlm_inference.restype = None
_lib.omnivlm_inference.restype = omni_char_p
def omnivlm_free():

View file

@ -11,16 +11,17 @@ class NexaOmniVlmInference:
A class used for vision language model inference.
"""
def __init__(self, llm_model_path: str, mmproj_model_path: str):
def __init__(self, llm_model_path: str, mmproj_model_path: str, omni_vlm_version: str):
self.llm_model = ctypes.c_char_p(llm_model_path.encode("utf-8"))
self.mmproj_model = ctypes.c_char_p(mmproj_model_path.encode("utf-8"))
self.omni_vlm_version = ctypes.c_char_p(omni_vlm_version.encode("utf-8"))
omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model)
omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model, self.omni_vlm_version)
def inference(self, prompt: str, image_path: str):
prompt = ctypes.c_char_p(prompt.encode("utf-8"))
image_path = ctypes.c_char_p(image_path.encode("utf-8"))
omni_vlm_cpp.omnivlm_inference(prompt, image_path)
return omni_vlm_cpp.omnivlm_inference(prompt, image_path)
def __del__(self):
omni_vlm_cpp.omnivlm_free()
@ -34,22 +35,30 @@ if __name__ == "__main__":
)
parser.add_argument("--model", type=str, help="Path to the llm model file")
parser.add_argument("--mmproj", type=str, help="Path to the mmproj file")
parser.add_argument("--omni-vlm-version", type=str, help="omni-vlm-version info ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')")
# parser.add_argument("--prompt", type=str, help="prompt string.")
# parser.add_argument("--image-path", type=str, help="Path to the image.")
args = parser.parse_args()
omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj)
print("DEBUG")
print(args.omni_vlm_version)
omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj, args.omni_vlm_version)
# omni_vlm_obj.inference(args.prompt, args.image_path)
while True:
print("Input your prompt:")
prompt = input()
if prompt == "":
print("ERROR: you input an empty prompt, try again.")
continue
if args.omni_vlm_version != "vlm-81-ocr":
print("Input your prompt:")
prompt = input()
if prompt == "":
print("ERROR: you input an empty prompt, try again.")
continue
else:
prompt = ""
print("Input your image path:")
image_path = input()
while not os.path.exists(image_path):
print("ERROR: can not find image in your input path, please check and input agian.")
image_path = input()
omni_vlm_obj.inference(prompt, image_path)
response = omni_vlm_obj.inference(prompt, image_path)
print("\tresponse:")
print(response.decode('utf-8'))

View file

@ -22,6 +22,7 @@
//
// Constants
//
void* internal_chars = nullptr;
static const char *AUDIO_TOKEN = "<|AUDIO|>";
@ -565,16 +566,16 @@ bool omni_params_parse(int argc, char **argv, omni_params &params)
static omni_params get_omni_params_from_context_params(omni_context_params &params)
{
omni_params all_params;
// Initialize gpt params
all_params.gpt.n_gpu_layers = params.n_gpu_layers;
all_params.gpt.model = params.model;
all_params.gpt.prompt = params.prompt;
// Initialize whisper params
all_params.whisper.model = params.mmproj;
all_params.whisper.fname_inp = {params.file};
if (all_params.gpt.n_threads <= 0)
{
all_params.gpt.n_threads = std::thread::hardware_concurrency();
@ -703,6 +704,11 @@ struct omni_context *omni_init_context(omni_context_params &params)
void omni_free(struct omni_context *ctx_omni)
{
if(internal_chars != nullptr)
{
free(internal_chars);
}
if (ctx_omni->ctx_whisper)
{
whisper_free(ctx_omni->ctx_whisper);
@ -792,7 +798,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params &para
return embed_proj;
}
void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
{
int n_past = 0;
@ -841,12 +847,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
for (int i = 0; i < max_tgt_len; i++)
{
const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0)
break;
if (strstr(tmp, "###"))
break; // Yi-VL behavior
printf("%s", tmp);
// printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>"))
break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>"))
@ -855,16 +860,23 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
break; // mistral llava-1.6
fflush(stdout);
response += tmp;
}
llama_sampling_free(ctx_sampling);
printf("\n");
if(internal_chars != nullptr) { free(internal_chars); }
internal_chars = malloc(sizeof(char)*(response.size()+1));
strncpy((char*)(internal_chars), response.c_str(), response.size());
((char*)(internal_chars))[response.size()] = '\0';
return (const char*)(internal_chars);
}
void omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
{
omni_params all_params = get_omni_params_from_context_params(params);
ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params);
omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}

View file

@ -54,11 +54,11 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &param
OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);
OMNI_AUDIO_API void omni_process_full(
OMNI_AUDIO_API const char* omni_process_full(
struct omni_context *ctx_omni,
omni_context_params &params
);
#ifdef __cplusplus
}
#endif
#endif
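
A comparable sketch for the audio path, showing what the new `const char *` return of `omni_process_full()` gives the caller. The field names come from `get_omni_params_from_context_params()` in this diff; the header name, the default initialization, and the file paths are assumptions:

```cpp
#include <cstdio>
#include "omni.h"   // assumed header name for the omni audio API shown above

// Placeholder paths; plain char arrays so the assignments below compile whether
// the omni_context_params fields are C strings or std::string.
static char model_path[]  = "audio-llm-q4_0.gguf";
static char mmproj_path[] = "whisper-encoder-f16.gguf";
static char audio_path[]  = "sample.wav";
static char prompt_text[] = "Transcribe the audio.";

int main() {
    omni_context_params params {};   // assumes value-init gives usable defaults
    params.model        = model_path;
    params.mmproj       = mmproj_path;
    params.file         = audio_path;
    params.prompt       = prompt_text;
    params.n_gpu_layers = 0;

    omni_context * ctx = omni_init_context(params);
    if (ctx == nullptr) {
        return 1;
    }

    // The generated text is now returned instead of only being printed;
    // the backing buffer is owned by the library and released in omni_free().
    const char * out = omni_process_full(ctx, params);
    std::printf("%s\n", out);

    omni_free(ctx);
    return 0;
}
```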