Update to get some results; still need to check the ViT and the LLM

Yutong Dai 2024-09-16 17:11:12 +00:00
parent cc553a0ae0
commit 30b751ef06
12 changed files with 4840 additions and 341 deletions

.gitignore

@@ -155,3 +155,6 @@ examples/xgenmm copy/imgs/image-1d100e9-1.jpg
examples/xgenmm copy/imgs/image-1d100e9.jpg
examples/xgenmm/imgs/4patches_embeddings.pt
examples/xgenmm/imgs/attention_mask_4patchhes.pt
examples/xgenmm/models/tokenizers/*
models/*.inp
models/*.out


@@ -542,18 +542,12 @@ class Model:
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
res = "qwen2"
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
# ref: https://huggingface.co/databricks/dbrx-base
res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-v2-en"
@@ -572,15 +566,9 @@ class Model:
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
res = "chatglm-bpe"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
# ref: https://huggingface.co/core42/jais-13b
res = "jais"
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
res = "codeshell"
@@ -596,9 +584,6 @@ class Model:
if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
# ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
res = "gpt3-finnish"
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
res = "exaone"
if res is None:
logger.warning("\n")


@@ -40,6 +40,139 @@
#include <cinttypes>
#include <limits>
void print_my_tensor(ggml_tensor *tensor, const char *name = "", int verbosity = 0)
{
if (tensor->ne[2] == 1)
{
printf("---> %s: (%ld, %ld)\n", name, tensor->ne[0], tensor->ne[1]);
}
else if (ggml_is_3d(tensor))
{
printf("---> %s: (%ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2]);
}
else
{
printf("---> %s: (%ld, %ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
}
if (verbosity == 1)
{
printf("*********************************************************************\n");
if (tensor->ne[2] == 1)
{
const float *mat = (float *)tensor->data;
int dim0 = tensor->ne[1];
int dim1 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6)
{
for (int i = 0; i < dim0; i++)
{
for (int j = 0; j < dim1; j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("\n");
}
if (dim0 > 3)
{
printf("...................... omit ......................\n");
for (int i = dim0 - 3; i < dim0; i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("\n");
}
}
}
}
else if (ggml_is_3d(tensor))
{
const float *data = (float *)tensor->data;
int dim0 = tensor->ne[2];
int dim1 = tensor->ne[1];
int dim2 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6 && dim2 < 6)
{
for (int i = 0; i < dim0; i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < dim1; j++)
{
for (int k = 0; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < std::min(dim1, 3); j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("........................\n");
for (int j = dim1 - 3; j < dim1; j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("---------------------------------------------------\n");
}
printf("\n");
}
}
}
printf("*********************************************************************\n");
printf("\n");
}
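// Example usage of the debug helper above (a sketch: it assumes the tensor holds F32 data that is
// readable from host memory, e.g. on the CPU backend or after ggml_backend_tensor_get):
//   print_my_tensor(embeddings, "embeddings");     // shape only
//   print_my_tensor(embeddings, "embeddings", 1);  // shape plus a truncated dump of the values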
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -602,7 +735,7 @@ struct clip_ctx {
struct clip_image_size * load_image_size;
};
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false, ggml_tensor *attn_bias_input = nullptr) {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
return nullptr;
@@ -1047,7 +1180,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false, ggml_tensor *attn_bias_input = nullptr) {
static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
return nullptr;
@@ -1119,9 +1252,7 @@ static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
}
// loop over layers
if (ctx->has_minicpmv_projector) {
n_layer += 1;
}
n_layer += 1;
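// the loop below runs n_layer - 1 iterations, so bumping the count by one makes it cover every
// encoder layer from hparams (previously this was done only for the minicpmv projector)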
for (int il = 0; il < n_layer - 1; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
@@ -1218,7 +1349,7 @@ static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image
static ggml_cgraph *clip_build_graph_xgenmm_projector(clip_ctx *ctx, int batch_size, ggml_tensor *img_embeddings, ggml_tensor *attn_bias_input = nullptr)
{
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
// const auto & hparams = model.hparams;
// const float eps = hparams.eps; // double check this value
const float eps = 1e-5;
@@ -2493,7 +2624,7 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
@@ -2676,12 +2807,10 @@ bool clip_image_encode_tokenizer(struct clip_ctx * ctx, int batch_size, ggml_ten
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
ggml_backend_graph_compute(ctx->backend, gf);
struct ggml_tensor * llm_inputs = gf->nodes[gf->n_nodes - 1];
print_my_tensor(llm_inputs, "llm_inputs", 1);
// exit(0);
ggml_backend_tensor_get(llm_inputs, image_embd, 0, ggml_nbytes(llm_inputs));
clip_free(ctx);
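// NOTE: the clip context is freed right here, so the caller must not touch `ctx` again after
// clip_image_encode_tokenizer() returns; the projected tokens were already copied into the
// caller-provided image_embd buffer above.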
// ggml_free(tensor.ctx);
// if (ctx0){
// ggml_free(ctx0);
// }
return true;
}
@@ -3029,7 +3158,7 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
// build the inference graph
ggml_cgraph * gf = clip_image_build_graph_vit(ctx, imgs, ctx->load_image_size, true);
ggml_cgraph * gf = clip_image_build_graph_vit(ctx, imgs);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
// set inputs
@@ -3039,8 +3168,8 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
int image_size_width = image_size;
int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
// const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
// const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}


@@ -4,10 +4,20 @@ conda activate xgenmm-flamingo
# # step 1: surgery
# python xgenmm_surgery.py
# step 2: convert to gguf (vit + projector)
# # step 2: convert vit + projector to gguf
python xgenmm_convert_image_encoder_to_gguf.py \
--surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
--output_dirname gguf_test \
--version siglip_kosmos_phi3_4k_instruct \
--use_f32
# python xgenmm_convert_image_encoder_to_gguf.py \
# --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
# --output_dirname gguf_test \
# --version siglip_kosmos_phi3_4k_instruct \
# --use_f32
# step 3: convert llm to gguf
# https://github.com/ggerganov/llama.cpp/discussions/7927
HF_TOKEN=<your_huggingface_token>  # do not commit a real token here
LLM_PATH=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/llm
# LLM_OUTPUT_FILE=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf/phi3_.gguf
# downloads the tokenizer models of the specified models from Hugging Face and generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
cd ../..
# python convert_hf_to_gguf_update.py $HF_TOKEN
python convert_hf_to_gguf.py $LLM_PATH
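# If an explicit output path / precision is needed, convert_hf_to_gguf.py also accepts --outfile and
# --outtype (double check with `python convert_hf_to_gguf.py --help` in this checkout), e.g.:
# python convert_hf_to_gguf.py $LLM_PATH --outfile $LLM_OUTPUT_FILE --outtype f16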

File diff suppressed because it is too large.

Binary file not shown.

New image added (3.1 MiB).


@@ -16,8 +16,26 @@ make xgenmm-cli
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image> Describe this image.<|end|>\n<|assistant|>\n"
./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
# -c 4096 --temp 0.01 --repeat-penalty 1.05 \
# --image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg\
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
./xgenmm-cli --model /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
--mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
-c 4096 --temp 0.01 --repeat-penalty 1.05 \
--image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg\
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
--image /export/home/llama.cpp/examples/xgenmm/imgs/receipt.jpg\
--prompt "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n Describe this image.<|end|>\n<|assistant|>\n" \
--seed 42 --ctx-size 4096 --predict 1024 \
--temp 0 --verbose-prompt
#
# ./xgenmm-cli --model /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
# --image /export/home/llama.cpp/examples/xgenmm/imgs/receipt.jpg\
# --prompt "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n What is the address of this restaurant?<|end|>\n<|assistant|>\n" \
# --seed 42 --ctx-size 4096 --predict 1024 \
# --temp 0 --verbose-prompt


@@ -1,5 +1,3 @@
// refer to example/minicpmv-cli
#include "ggml.h"
#include "log.h"
#include "common.h"
@@ -11,97 +9,18 @@
#include <cstdlib>
#include <vector>
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
static struct llama_model * llava_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
}
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
static struct clip_ctx * clip_init_context(gpt_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
// std::cout << __LINE__ << std::endl;
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
static bool eval_tokens(struct llama_context *ctx_llama, std::vector<llama_token> tokens, int n_batch, int *n_past)
{
int N = (int)tokens.size();
for (int i = 0; i < N; i += n_batch)
{
int n_eval = (int)tokens.size() - i;
if (n_eval > n_batch)
{
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0)))
{
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
@@ -110,111 +29,170 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
static bool eval_id(struct llama_context *ctx_llama, int id, int *n_past)
{
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
static bool eval_string(struct llama_context *ctx_llama, const char *str, int n_batch, int *n_past, bool add_bos)
{
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
printf("prompt: %s", str);
for (auto token : embd_inp){
printf("%6d, ", token);
}
printf("\n");
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
}
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
llava_image_embed_free(slice_embed);
}
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
std::string system_prompt;
int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_llama,
int * n_past) {
static const char *sample(struct llama_sampling_context *ctx_sampling, struct llama_context *ctx_llama, int *n_past)
{
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
if (llama_token_is_eog(llama_get_model(ctx_llama), id))
{
ret = "</s>";
} else {
}
else
{
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
static const char *IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char *IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string &prompt, size_t &begin_out, size_t &end_out)
{
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string &prompt)
{
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// TODO: Implement this function llava_image_embed_make_with_prompt_base64 for xgenmm
// static llava_image_embed *llava_image_embed_make_with_prompt_base64(struct clip_ctx *ctx_clip, int n_threads,
// const std::string &prompt)
// {
// size_t img_base64_str_start, img_base64_str_end;
// find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
// if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos)
// {
// LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN,
// IMG_BASE64_TAG_END);
// return NULL;
// }
// auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
// auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
// auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count);
// auto required_bytes = base64::required_encode_size(base64_str.size());
// auto img_bytes = std::vector<unsigned char>(required_bytes);
// base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
// auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
// if (!embed)
// {
// LOG_TEE("%s: could not load image from base64 string.\n", __func__);
// return NULL;
// }
// return embed;
// }
static std::string remove_image_from_prompt(const std::string &prompt, const char *replacement = "")
{
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos)
{
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
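// Illustration (hypothetical prompt): an input such as
//   describe <img src="data:image/jpeg;base64,/9j/4AAQ..."> please
// is reduced by remove_image_from_prompt() to "describe  please"; decoding the base64 payload
// itself is left to the (still commented-out) llava_image_embed_make_with_prompt_base64().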
struct llava_context
{
struct clip_ctx *ctx_clip = NULL;
struct llama_context *ctx_llama = NULL;
struct llama_model *model = NULL;
};
// static void process_eval_image_embed(struct llava_context *ctx_llava, const struct llava_image_embed *embeds,
// int n_batch, int *n_past, int idx)
// {
// float *image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
// std::memcpy(image_embed,
// embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip),
// clip_embd_nbytes(ctx_llava->ctx_clip));
// auto slice_embed = (llava_image_embed *)malloc(sizeof(llava_image_embed));
// slice_embed->embed = image_embed;
// slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
// llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
// llava_image_embed_free(slice_embed);
// }
static void print_usage(int argc, char **argv, const gpt_params &params)
{
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\n example usage:\n");
LOG_TEE(
"\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image "
"<path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in "
"detail.\"]\n",
argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed *load_image(llava_context *ctx_llava, gpt_params *params, const std::string &fname)
{
// load and preprocess the image
llava_image_embed *embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt))
{
// if (!params->image.empty())
// {
// LOG_TEE("using base64 encoded image instead of command line image path\n");
// }
// embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
// if (!embed)
// {
// LOG_TEE("%s: can't load image from prompt\n", __func__);
// return NULL;
// }
// params->prompt = remove_image_from_prompt(prompt);
printf("not implemented\n");
exit(1);
}
else
{
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
if (!embed)
{
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
return NULL;
}
auto model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
return embed;
}
static void process_prompt(struct llava_context *ctx_llava, struct llava_image_embed *image_embed, gpt_params *params,
@@ -233,6 +211,7 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
// phi3-tokenizer https://github.com/ggerganov/llama.cpp/issues/7938
if (params->verbose_prompt)
{
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
@@ -267,7 +246,6 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
}
}
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -286,9 +264,18 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
std::string response = "";
for (int i = 0; i < max_tgt_len; i++)
{
// printf("i: %d\n", i);
const char *tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
// printf("%s", tmp);
if (strcmp(tmp, "<|end|>") == 0){
printf("\n STOP GENERATING because I saw <|end|>\n");
break;
}
if (strcmp(tmp, "</s>") == 0) {
printf("\n STOP GENERATING because I saw </s>\n");
break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>"))
@@ -303,90 +290,207 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
printf("\n");
}
static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
std::cout << "clip model has been loaded \n\n";
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
}
std::cout<< "Start Processing Prompt: " << std::endl;
// TODO:
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
return NULL;
}
static struct llama_model * llava_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
auto model = llava_init(params);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
LOG_TEE("%s: error: unable to load model\n" , __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model);
return model;
}
static struct llava_context *llava_init_context(gpt_params *params, llama_model *model)
{
const char *clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty())
{
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/1);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx =
params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context *ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL)
{
LOG_TEE("%s: error: failed to create the llama_context\n", __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_prompt(ctx_llava, embeds, params, params->prompt);
// process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
ctx_llava->model = model;
return ctx_llava;
}
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
printf(
"YD::: aborting before clip_free: it would segfault because header.n_kv is empty "
"(clip_free -> gguf_free(ctx->ctx_gguf) -> for (uint64_t i = 0; i < ctx->header.n_kv; ++i))\n");
exit(1);
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
LOG_TEE("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
return ctx_sampling;
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
// static struct clip_ctx * clip_init_context(gpt_params * params) {
// const char * clip_path = params->mmproj.c_str();
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
return tmp;
// auto prompt = params->prompt;
// if (prompt.empty()) {
// prompt = "describe the image in detail.";
// }
// // std::cout << __LINE__ << std::endl;
// auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
// return ctx_clip;
// }
// TODO: REMOVE THIS FUNCTION
// static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
// std::string system_prompt;
// int idx = 0;
// int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
// int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
// if (has_minicpmv_projector == 2) {
// system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
// }
// else if (has_minicpmv_projector == 3) {
// system_prompt = "<|im_start|>user\n";
// }
// LOG_TEE("%s: image token past: %d\n", __func__, n_past);
// eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
// process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
// eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
// if (num_image_embeds > 1) {
// size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
// eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
// for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
// for (size_t j = 0; j < num_image_embeds_col; ++j) {
// eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
// process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
// eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
// if (j == num_image_embeds_col - 1) {
// eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
// }
// }
// }
// eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
// }
// LOG_TEE("%s: image token past: %d\n", __func__, n_past);
// }
// static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
// auto ctx_clip = clip_init_context(params);
// std::cout << "clip model has been loaded \n\n";
// auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
// if (!embeds) {
// std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
// return NULL;
// }
// std::cout<< "Start Processing Prompt: " << std::endl;
// // TODO:
// // process the prompt
// if (params->prompt.empty() && params->interactive == false) {
// LOG_TEE("prompt should be given or interactive mode should be on");
// return NULL;
// }
// auto model = llava_init(params);
// if (model == NULL) {
// fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
// return NULL;
// }
// const int64_t t_llava_init_start_us = ggml_time_us();
// auto ctx_llava = llava_init_context(params, model);
// ctx_llava->ctx_clip = ctx_clip;
// const int64_t t_llava_init_end_us = ggml_time_us();
// float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
// LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
// const int64_t t_process_image_start_us = ggml_time_us();
// process_prompt(ctx_llava, embeds, params, params->prompt);
// // process_image(ctx_llava, embeds, params, n_past);
// const int64_t t_process_image_end_us = ggml_time_us();
// float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
// LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
// llava_image_embed_free(embeds);
// return ctx_llava;
// }
// static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
// std::string user_prompt = prompt;
// int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
// if (!is_first) {
// if (has_minicpmv_projector == 2) {
// user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
// }
// else if (has_minicpmv_projector == 3) {
// user_prompt = "<|im_start|>user\n" + prompt;
// }
// }
// eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// if (has_minicpmv_projector == 2) {
// eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
// }
// else if (has_minicpmv_projector == 3) {
// eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
// }
// // generate the response
// LOG_TEE("\n");
// struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
// return ctx_sampling;
// }
// static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
// const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
// return tmp;
// }
static void llama_log_callback_logTee(ggml_log_level level, const char *text, void *user_data)
{
(void)level;
(void)user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
print_usage(argc, argv, params);
return 1;
}
@@ -399,67 +503,67 @@ int main(int argc, char ** argv) {
if (params.mmproj.empty() || (params.image.empty())) {
gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv);
print_usage(argc, argv, params);
return 1;
}
for (auto & image : params.image) { // only single image for now
int n_past = 0;
// auto ctx_llava = minicpmv_init(&params, image, n_past);
auto ctx_llava = xgenmm_init(&params, image, n_past); // generate vision tokens
std::cout << "Start llava generation: " << std::endl;
auto model = llava_init(&params);
if (model == NULL)
{
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt))
{
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
// // TODO: integrate base llm
// if (!params.prompt.empty()) {
// LOG_TEE("<user>%s\n", params.prompt.c_str());
// LOG_TEE("<assistant>");
// auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
// const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
// std::string response = "";
// bool have_tmp = false;
// for (int i = 0; i < max_tgt_len; i++) {
// auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
// response += tmp;
// if (strcmp(tmp, "</s>") == 0){
// if(!have_tmp)continue;
// else break;
// }
// if (strstr(tmp, "###")) break; // Yi-VL behavior
// have_tmp = true;
// printf("%s", tmp);
// if (strstr(response.c_str(), "<user>")) break; // minicpm-v
// fflush(stdout);
// }
// llama_sampling_free(ctx_sampling);
// }else {
// while (true) {
// LOG_TEE("<user>");
// std::string prompt;
// std::getline(std::cin, prompt);
// LOG_TEE("<assistant>");
// auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
// const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
// std::string response = "";
// for (int i = 0; i < max_tgt_len; i++) {
// auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
// response += tmp;
// if (strcmp(tmp, "</s>") == 0) break;
// if (strstr(tmp, "###")) break; // Yi-VL behavior
// printf("%s", tmp);// mistral llava-1.6
// if (strstr(response.c_str(), "<user>")) break; // minicpm-v
// fflush(stdout);
// }
// llama_sampling_free(ctx_sampling);
// }
// }
// printf("\n");
// llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
else
{
for (auto &image : params.image)
{
printf("image: %s\n", image.c_str());
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
printf("n_image_pos: %d\n", image_embed->n_image_pos);
if (!image_embed)
{
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
}
llama_free_model(model);
// prompt_contains_image(params.prompt);
// for (auto & image : params.image) { // only single image for now
// int n_past = 0;
// auto ctx_llava = xgenmm_init(&params, image, n_past); // generate vision tokens
// std::cout << "Start llava generation: " << std::endl;
// llama_print_timings(ctx_llava->ctx_llama);
// ctx_llava->model = NULL;
// llava_free(ctx_llava);
// }
printf("Remember to remove print_tensor function in xgenmm.cpp and clip.cpp\n");
return 0;
}


@@ -14,6 +14,162 @@
#include "llama.h"
#include "xgenmm.h"
struct tensor_from_gguf
{
struct ggml_tensor *data;
struct ggml_context *ctx;
};
bool load_tensor_from_file(const char *filename, tensor_from_gguf &tensor)
{
struct gguf_init_params params = {
/*.no_alloc =*/false,
/*.ctx =*/&tensor.ctx,
};
gguf_context *ctx = gguf_init_from_file(filename, params);
if (!ctx)
{
fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
return false;
}
tensor.data = ggml_get_tensor(tensor.ctx, "data");
gguf_free(ctx);  // the gguf metadata context is no longer needed; the tensor data lives in tensor.ctx
return true;
}
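// Minimal usage sketch for the loader above (debug only; assumes the gguf file contains a tensor
// named "data" stored as F32 so that print_tensor below can read it):
//   tensor_from_gguf t;
//   if (load_tensor_from_file("/path/to/tensor.gguf", t))
//   {
//       print_tensor(t.data, "loaded", 1);
//       ggml_free(t.ctx);  // the caller owns the ggml context holding the tensor data
//   }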
void print_tensor(ggml_tensor *tensor, const char *name = "", int verbosity = 0)
{
if (tensor->ne[2] == 1)
{
printf("---> %s: (%ld, %ld)\n", name, tensor->ne[0], tensor->ne[1]);
}
else if (ggml_is_3d(tensor))
{
printf("---> %s: (%ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2]);
}
else
{
printf("---> %s: (%ld, %ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
}
if (verbosity == 1)
{
printf("*********************************************************************\n");
if (tensor->ne[2] == 1)
{
const float *mat = (float *)tensor->data;
int dim0 = tensor->ne[1];
int dim1 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6)
{
for (int i = 0; i < dim0; i++)
{
for (int j = 0; j < dim1; j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("\n");
}
if (dim0 > 3)
{
printf("...................... omit ......................\n");
for (int i = dim0 - 3; i < dim0; i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("\n");
}
}
}
}
else if (ggml_is_3d(tensor))
{
const float *data = (float *)tensor->data;
int dim0 = tensor->ne[2];
int dim1 = tensor->ne[1];
int dim2 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6 && dim2 < 6)
{
for (int i = 0; i < dim0; i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < dim1; j++)
{
for (int k = 0; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < std::min(dim1, 3); j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("........................\n");
for (int j = dim1 - 3; j < dim1; j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("---------------------------------------------------\n");
}
printf("\n");
}
}
}
printf("*********************************************************************\n");
printf("\n");
}
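// NOTE: print_tensor (like print_my_tensor in clip.cpp) indexes the data as a dense, contiguous,
// row-major F32 buffer; for quantized, permuted, or non-contiguous tensors the printed values would
// be meaningless, so treat it purely as a debugging aid.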
// RGB uint8 image
struct clip_image_u8
{
@@ -418,6 +574,33 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
ggml_graph_compute_with_ctx(mask.ctx, gf, 1);
attention_mask = gf->nodes[gf->n_nodes - 1];
// memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask));
{
printf((" ========================= DEBUG =========================\n"));
printf("Load pre-computed image embeddings and attention_mask\n");
std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
tensor_from_gguf tensor;
bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
if (!is_successful)
{
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
return false;
}
result = tensor.data;
// print_tensor(result, "result", 1);
filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
is_successful = load_tensor_from_file(filename.c_str(), tensor);
if (!is_successful)
{
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
return false;
}
attention_mask = tensor.data;
// print_tensor(attention_mask, "attention_mask", 1);
num_patches_width = 2;
num_patches_height = 2;
}
// compute attention masks outside of the graph
struct ggml_tensor * attn_bias_input;
@@ -463,10 +646,19 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
ggml_build_forward_expand(gf_temp, attn_bias);
ggml_graph_compute_with_ctx(ctx0, gf_temp, 1);
attn_bias_input = attn_bias;
}else{
attn_bias_input = NULL;
}
int batch_size = num_patches_width * num_patches_height + 1;
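// presumably the grid of image crops plus the base image; with the hard-coded 2x2 debug grid above
// this gives the 5 entries matching the "5patches" reference tensors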
// print_tensor(attn_bias_input, "attn_bias_input", 1);
// print_tensor(result, "result", 1);
printf("batch_size: %d\n", batch_size);
const bool encoded = clip_image_encode_tokenizer(
ctx_clip, batch_size, result, attn_bias_input, image_embd);
if (!encoded){
LOG_TEE("%s: failed at image tokenizer (projector step failed)\n", __func__);
return false;
}
ggml_free(model.ctx);
ggml_free(mask.ctx);


@@ -1,5 +0,0 @@
python examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py\
--surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
--version siglip_kosmos_phi3_4k_instruct \
--xgenmm_projector /export/home/Projects/xgenmm-quantization/target_models/MiniCPM-Llama3-V-2_5/minicpmv.projector \
--use_f32


@@ -92,8 +92,9 @@ if __name__ == "__main__":
torch.save(projector_tensors, save_path)
# processors
tokenizer.save_pretrained(f"{save_dir}/tokenizer")
# will hard code the image_processor in the convert_image_encoder_to_gguf.py
# put the tokenizer in the same dir as the lang model
tokenizer.save_pretrained(f"{save_dir}/llm")
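# (convert_hf_to_gguf.py reads the tokenizer from the model directory it is pointed at, so the
# llm/ folder needs the tokenizer files next to the weights)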
end = time.time()
print(f"🟢 time used: [{end-start:.3f} s]")

Binary file not shown.