Update to get some results; need to check ViT and LLM
This commit is contained in:
parent
cc553a0ae0
commit
30b751ef06
12 changed files with 4840 additions and 341 deletions
.gitignore (vendored, 3 changes)
@@ -155,3 +155,6 @@ examples/xgenmm copy/imgs/image-1d100e9-1.jpg
examples/xgenmm copy/imgs/image-1d100e9.jpg
examples/xgenmm/imgs/4patches_embeddings.pt
examples/xgenmm/imgs/attention_mask_4patchhes.pt
examples/xgenmm/models/tokenizers/*
models/*.inp
models/*.out
@ -542,18 +542,12 @@ class Model:
|
|||
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
|
||||
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
|
||||
res = "refact"
|
||||
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
|
||||
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
|
||||
res = "command-r"
|
||||
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
|
||||
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
|
||||
res = "qwen2"
|
||||
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
|
||||
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
|
||||
res = "olmo"
|
||||
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
|
||||
# ref: https://huggingface.co/databricks/dbrx-base
|
||||
res = "dbrx"
|
||||
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
|
||||
res = "jina-v2-en"
|
||||
|
@ -572,15 +566,9 @@ class Model:
|
|||
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
|
||||
res = "jina-v2-code"
|
||||
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
|
||||
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
|
||||
res = "chatglm-bpe"
|
||||
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
|
||||
# ref: https://huggingface.co/LumiOpen/Viking-7B
|
||||
res = "viking"
|
||||
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
|
||||
# ref: https://huggingface.co/core42/jais-13b
|
||||
res = "jais"
|
||||
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
|
||||
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
|
||||
res = "codeshell"
|
||||
|
@ -596,9 +584,6 @@ class Model:
|
|||
if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
|
||||
# ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
|
||||
res = "gpt3-finnish"
|
||||
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
|
||||
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
|
||||
res = "exaone"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
|
|
|
@ -40,6 +40,139 @@
|
|||
#include <cinttypes>
|
||||
#include <limits>
|
||||
|
||||
void print_my_tensor(ggml_tensor *tensor, const char *name = "", int verbosity = 0)
|
||||
{
|
||||
if (tensor->ne[2] == 1)
|
||||
{
|
||||
printf("---> %s: (%ld, %ld)\n", name, tensor->ne[0], tensor->ne[1]);
|
||||
}
|
||||
else if (ggml_is_3d(tensor))
|
||||
{
|
||||
printf("---> %s: (%ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("---> %s: (%ld, %ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
}
|
||||
if (verbosity == 1)
|
||||
{
|
||||
printf("*********************************************************************\n");
|
||||
if (tensor->ne[2] == 1)
|
||||
{
|
||||
const float *mat = (float *)tensor->data;
|
||||
int dim0 = tensor->ne[1];
|
||||
int dim1 = tensor->ne[0];
|
||||
if (dim0 < 6 && dim1 < 6)
|
||||
{
|
||||
for (int i = 0; i < dim0; i++)
|
||||
{
|
||||
for (int j = 0; j < dim1; j++)
|
||||
{
|
||||
printf("%+.4f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < std::min(dim0, 3); i++)
|
||||
{
|
||||
for (int j = 0; j < std::min(dim1, 3); j++)
|
||||
{
|
||||
printf("%+.6f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("... ");
|
||||
for (int j = dim1 - 3; j < dim1; j++)
|
||||
{
|
||||
printf("%+.6f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
if (dim0 > 3)
|
||||
{
|
||||
printf("...................... omit ......................\n");
|
||||
for (int i = dim0 - 3; i < dim0; i++)
|
||||
{
|
||||
for (int j = 0; j < std::min(dim1, 3); j++)
|
||||
{
|
||||
printf("%+.6f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("... ");
|
||||
for (int j = dim1 - 3; j < dim1; j++)
|
||||
{
|
||||
printf("%+.6f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (ggml_is_3d(tensor))
|
||||
{
|
||||
const float *data = (float *)tensor->data;
|
||||
int dim0 = tensor->ne[2];
|
||||
int dim1 = tensor->ne[1];
|
||||
int dim2 = tensor->ne[0];
|
||||
if (dim0 < 6 && dim1 < 6 && dim2 < 6)
|
||||
{
|
||||
for (int i = 0; i < dim0; i++)
|
||||
{
|
||||
printf("dim0 = %d\n", i);
|
||||
for (int j = 0; j < dim1; j++)
|
||||
{
|
||||
for (int k = 0; k < dim2; k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < std::min(dim0, 3); i++)
|
||||
{
|
||||
printf("dim0 = %d\n", i);
|
||||
for (int j = 0; j < std::min(dim1, 3); j++)
|
||||
{
|
||||
for (int k = 0; k < std::min(dim2, 3); k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("... ");
|
||||
for (int k = dim2 - 3; k < dim2; k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("........................\n");
|
||||
for (int j = dim1 - 3; j < dim1; j++)
|
||||
{
|
||||
for (int k = 0; k < std::min(dim2, 3); k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("... ");
|
||||
for (int k = dim2 - 3; k < dim2; k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("---------------------------------------------------\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("*********************************************************************\n");
|
||||
printf("\n");
|
||||
}
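print_my_tensor() above is a debug-only helper. For reference, a standalone sketch of the same shape-then-values dump built against the public ggml API only; the context size and the dump_f32_tensor name are illustrative, not part of this commit:

    #include <cstdio>
    #include "ggml.h"

    // simplified stand-in for print_my_tensor(): shape first, then the raw F32 values
    static void dump_f32_tensor(const struct ggml_tensor * t, const char * name) {
        printf("---> %s: (%ld, %ld, %ld, %ld)\n", name, t->ne[0], t->ne[1], t->ne[2], t->ne[3]);
        const float * data = (const float *) t->data;
        for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
            for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
                printf("%+.4f ", data[i1 * t->ne[0] + i0]);
            }
            printf("\n");
        }
    }

    int main() {
        struct ggml_init_params params = { 16 * 1024 * 1024, NULL, false }; // small CPU-side scratch context
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // 4 columns, 3 rows
        float * d = (float *) t->data;
        for (int i = 0; i < 4 * 3; ++i) {
            d[i] = 0.1f * i;
        }

        dump_f32_tensor(t, "toy");
        ggml_free(ctx);
        return 0;
    }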
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
|
||||
// RGB uint8 image
|
||||
|
@ -602,7 +735,7 @@ struct clip_ctx {
|
|||
struct clip_image_size * load_image_size;
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false, ggml_tensor *attn_bias_input = nullptr) {
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
|
@ -1047,7 +1180,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
}
|
||||
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false, ggml_tensor *attn_bias_input = nullptr) {
|
||||
static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
|
@ -1119,9 +1252,7 @@ static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image
|
|||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||
}
|
||||
// loop over layers
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
n_layer += 1;
|
||||
}
|
||||
n_layer += 1;
|
||||
for (int il = 0; il < n_layer - 1; il++) {
|
||||
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
||||
|
||||
|
@ -1218,7 +1349,7 @@ static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image
|
|||
static ggml_cgraph *clip_build_graph_xgenmm_projector(clip_ctx *ctx, int batch_size, ggml_tensor *img_embeddings, ggml_tensor *attn_bias_input = nullptr)
|
||||
{
|
||||
const auto & model = ctx->vision_model;
|
||||
const auto & hparams = model.hparams;
|
||||
// const auto & hparams = model.hparams;
|
||||
// const float eps = hparams.eps; // double check this value
|
||||
const float eps = 1e-5;
|
||||
|
||||
|
@ -2493,7 +2624,7 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
|||
void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
|
||||
ggml_backend_buffer_free(ctx->params_buffer);
|
||||
ggml_backend_free(ctx->backend);
|
||||
ggml_gallocr_free(ctx->compute_alloc);
|
||||
|
@ -2676,12 +2807,10 @@ bool clip_image_encode_tokenizer(struct clip_ctx * ctx, int batch_size, ggml_ten
|
|||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
struct ggml_tensor * llm_inputs = gf->nodes[gf->n_nodes - 1];
|
||||
print_my_tensor(llm_inputs, "llm_inputs", 1);
|
||||
// exit(0);
|
||||
ggml_backend_tensor_get(llm_inputs, image_embd, 0, ggml_nbytes(llm_inputs));
|
||||
clip_free(ctx);
|
||||
// ggml_free(tensor.ctx);
|
||||
// if (ctx0){
|
||||
// ggml_free(ctx0);
|
||||
// }
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -3029,7 +3158,7 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
|
|||
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
||||
|
||||
// build the inference graph
|
||||
ggml_cgraph * gf = clip_image_build_graph_vit(ctx, imgs, ctx->load_image_size, true);
|
||||
ggml_cgraph * gf = clip_image_build_graph_vit(ctx, imgs);
|
||||
|
||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
// set inputs
|
||||
|
@ -3039,8 +3168,8 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
|
|||
int image_size_width = image_size;
|
||||
int image_size_height = image_size;
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||
// const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
// const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||
if(ctx->load_image_size==nullptr){
|
||||
ctx->load_image_size= clip_image_size_init();
|
||||
}
|
||||
|
|
|
@@ -4,10 +4,20 @@ conda activate xgenmm-flamingo
# # step 1: surgery
# python xgenmm_surgery.py

# step 2: convert to gguf (vit + projector)
# # step 2: convert vit + projector to gguf

python xgenmm_convert_image_encoder_to_gguf.py \
    --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
    --output_dirname gguf_test \
    --version siglip_kosmos_phi3_4k_instruct \
    --use_f32
# python xgenmm_convert_image_encoder_to_gguf.py \
#     --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
#     --output_dirname gguf_test \
#     --version siglip_kosmos_phi3_4k_instruct \
#     --use_f32

# step 3: convert llm to gguf
# https://github.com/ggerganov/llama.cpp/discussions/7927
HF_TOKEN=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
LLM_PATH=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/llm
# LLM_OUTPUT_FILE=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf/phi3_.gguf
# downloads the tokenizer models of the specified models from Huggingface; generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
cd ../..
# python convert_hf_to_gguf_update.py $HF_TOKEN
python convert_hf_to_gguf.py $LLM_PATH
examples/xgenmm/convert_hf_to_gguf.py (executable file, 4062 changes): file diff suppressed because it is too large.
examples/xgenmm/imgs/receipt.jpg (binary image file, 3.1 MiB; content not shown).
|
@ -16,8 +16,26 @@ make xgenmm-cli
|
|||
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image> Describe this image.<|end|>\n<|assistant|>\n"
|
||||
|
||||
|
||||
./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
|
||||
# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
|
||||
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
|
||||
# -c 4096 --temp 0.01 --repeat-penalty 1.05 \
|
||||
# --image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg\
|
||||
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
|
||||
|
||||
|
||||
./xgenmm-cli --model /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
|
||||
--mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
|
||||
-c 4096 --temp 0.01 --repeat-penalty 1.05 \
|
||||
--image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg\
|
||||
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
|
||||
--image /export/home/llama.cpp/examples/xgenmm/imgs/receipt.jpg\
|
||||
--prompt "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n Describe this image.<|end|>\n<|assistant|>\n" \
|
||||
--seed 42 --ctx-size 4096 --predict 1024 \
|
||||
--temp 0 --verbose-prompt
|
||||
|
||||
#
|
||||
|
||||
|
||||
# ./xgenmm-cli --model /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
|
||||
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
|
||||
# --image /export/home/llama.cpp/examples/xgenmm/imgs/receipt.jpg\
|
||||
# --prompt "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n What is the address of this restirant?<|end|>\n<|assistant|>\n" \
|
||||
# --seed 42 --ctx-size 4096 --predict 1024 \
|
||||
# --temp 0 --verbose-prompt
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
// refer to example/minicpmv-cli
|
||||
|
||||
#include "ggml.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
|
@ -11,97 +9,18 @@
|
|||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
struct llava_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
struct llama_context * ctx_llama = NULL;
|
||||
struct llama_model * model = NULL;
|
||||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
LOG_TEE("%s", text);
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(gpt_params * params) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params->numa);
|
||||
|
||||
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
|
||||
if (params->n_ctx < 2048) {
|
||||
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
|
||||
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
|
||||
ctx_params.n_ctx = 2048;
|
||||
} else {
|
||||
ctx_params.n_ctx = params->n_ctx;
|
||||
}
|
||||
|
||||
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->model = model;
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_free_model(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
static struct clip_ctx * clip_init_context(gpt_params * params) {
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
// std::cout << __LINE__ << std::endl;
|
||||
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
return ctx_clip;
|
||||
}
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
int N = (int) tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > n_batch) {
|
||||
static bool eval_tokens(struct llama_context *ctx_llama, std::vector<llama_token> tokens, int n_batch, int *n_past)
|
||||
{
|
||||
int N = (int)tokens.size();
|
||||
for (int i = 0; i < N; i += n_batch)
|
||||
{
|
||||
int n_eval = (int)tokens.size() - i;
|
||||
if (n_eval > n_batch)
|
||||
{
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0)))
|
||||
{
|
||||
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
|
@ -110,111 +29,170 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
||||
static bool eval_id(struct llama_context *ctx_llama, int id, int *n_past)
|
||||
{
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(ctx_llama, tokens, 1, n_past);
|
||||
}
|
||||
|
||||
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||
std::string str2 = str;
|
||||
static bool eval_string(struct llama_context *ctx_llama, const char *str, int n_batch, int *n_past, bool add_bos)
|
||||
{
|
||||
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
|
||||
printf("prompt: %s", str);
|
||||
for (auto token : embd_inp){
|
||||
printf("%6d, ", token);
|
||||
}
|
||||
printf("\n");
|
||||
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||
}
|
||||
|
||||
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
|
||||
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
|
||||
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||
slice_embed->embed = image_embed;
|
||||
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
||||
llava_image_embed_free(slice_embed);
|
||||
}
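The memcpy above carves one image slice out of a flat embedding buffer; slice idx begins at idx * clip_n_patches() * clip_n_mmproj_embd() floats. The same offset arithmetic in isolation, with made-up sizes:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_patches = 4;  // hypothetical patches per slice
        const int n_embd    = 8;  // hypothetical projected embedding width
        const int n_slices  = 3;

        std::vector<float> flat((size_t) n_slices * n_patches * n_embd);
        for (size_t i = 0; i < flat.size(); ++i) {
            flat[i] = (float) i;
        }

        const int idx = 1; // second slice
        const float * slice = flat.data() + (size_t) idx * n_patches * n_embd;
        printf("slice %d starts at value %g and holds %d floats\n", idx, slice[0], n_patches * n_embd);
        return 0;
    }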
|
||||
|
||||
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
|
||||
std::string system_prompt;
|
||||
int idx = 0;
|
||||
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (num_image_embeds > 1) {
|
||||
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
if (j == num_image_embeds_col - 1) {
|
||||
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
}
|
||||
|
||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
static const char *sample(struct llama_sampling_context *ctx_sampling, struct llama_context *ctx_llama, int *n_past)
|
||||
{
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
||||
static std::string ret;
|
||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id))
|
||||
{
|
||||
ret = "</s>";
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
ret = llama_token_to_piece(ctx_llama, id);
|
||||
}
|
||||
eval_id(ctx_llama, id, n_past);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
|
||||
auto ctx_clip = clip_init_context(params);
|
||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
|
||||
if (!embeds) {
|
||||
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
||||
return NULL;
|
||||
static const char *IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
|
||||
static const char *IMG_BASE64_TAG_END = "\">";
|
||||
|
||||
static void find_image_tag_in_prompt(const std::string &prompt, size_t &begin_out, size_t &end_out)
|
||||
{
|
||||
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
|
||||
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
|
||||
}
|
||||
|
||||
static bool prompt_contains_image(const std::string &prompt)
|
||||
{
|
||||
size_t begin, end;
|
||||
find_image_tag_in_prompt(prompt, begin, end);
|
||||
return (begin != std::string::npos);
|
||||
}
|
||||
|
||||
// TODO: Implement this function llava_image_embed_make_with_prompt_base64 for xgenmm
|
||||
// static llava_image_embed *llava_image_embed_make_with_prompt_base64(struct clip_ctx *ctx_clip, int n_threads,
|
||||
// const std::string &prompt)
|
||||
// {
|
||||
// size_t img_base64_str_start, img_base64_str_end;
|
||||
// find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||
// if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos)
|
||||
// {
|
||||
// LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN,
|
||||
// IMG_BASE64_TAG_END);
|
||||
// return NULL;
|
||||
// }
|
||||
|
||||
// auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
|
||||
// auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
|
||||
// auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count);
|
||||
|
||||
// auto required_bytes = base64::required_encode_size(base64_str.size());
|
||||
// auto img_bytes = std::vector<unsigned char>(required_bytes);
|
||||
// base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
||||
|
||||
// auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
||||
// if (!embed)
|
||||
// {
|
||||
// LOG_TEE("%s: could not load image from base64 string.\n", __func__);
|
||||
// return NULL;
|
||||
// }
|
||||
|
||||
// return embed;
|
||||
// }
|
||||
|
||||
static std::string remove_image_from_prompt(const std::string &prompt, const char *replacement = "")
|
||||
{
|
||||
size_t begin, end;
|
||||
find_image_tag_in_prompt(prompt, begin, end);
|
||||
if (begin == std::string::npos || end == std::string::npos)
|
||||
{
|
||||
return prompt;
|
||||
}
|
||||
auto pre = prompt.substr(0, begin);
|
||||
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
|
||||
return pre + replacement + post;
|
||||
}
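A quick standalone check of the tag helpers above: locate the base64 image tag in a prompt and splice in a replacement, using the same substr() logic as remove_image_from_prompt(). The sample prompt and the "<image>" replacement are illustrative only:

    #include <cstdio>
    #include <cstring>
    #include <string>

    // mirrors IMG_BASE64_TAG_BEGIN / IMG_BASE64_TAG_END above
    static const char * TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
    static const char * TAG_END   = "\">";

    int main() {
        std::string prompt = std::string("Describe ") + TAG_BEGIN + "aGVsbG8=" + TAG_END + " please.";

        size_t begin = prompt.find(TAG_BEGIN);
        size_t end   = prompt.find(TAG_END, begin == std::string::npos ? 0 : begin);

        if (begin != std::string::npos && end != std::string::npos) {
            // text before the tag + replacement + text after the closing tag
            std::string cleaned = prompt.substr(0, begin) + "<image>" + prompt.substr(end + strlen(TAG_END));
            printf("%s\n", cleaned.c_str()); // prints: Describe <image> please.
        }
        return 0;
    }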
|
||||
|
||||
struct llava_context
|
||||
{
|
||||
struct clip_ctx *ctx_clip = NULL;
|
||||
struct llama_context *ctx_llama = NULL;
|
||||
struct llama_model *model = NULL;
|
||||
};
|
||||
|
||||
// static void process_eval_image_embed(struct llava_context *ctx_llava, const struct llava_image_embed *embeds,
|
||||
// int n_batch, int *n_past, int idx)
|
||||
// {
|
||||
// float *image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
// std::memcpy(image_embed,
|
||||
// embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip),
|
||||
// clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
|
||||
// auto slice_embed = (llava_image_embed *)malloc(sizeof(llava_image_embed));
|
||||
// slice_embed->embed = image_embed;
|
||||
// slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
||||
// llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
||||
// llava_image_embed_free(slice_embed);
|
||||
// }
|
||||
|
||||
static void print_usage(int argc, char **argv, const gpt_params ¶ms)
|
||||
{
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
LOG_TEE("\n example usage:\n");
|
||||
LOG_TEE(
|
||||
"\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image "
|
||||
"<path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in "
|
||||
"detail.\"]\n",
|
||||
argv[0]);
|
||||
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed *load_image(llava_context *ctx_llava, gpt_params *params, const std::string &fname)
|
||||
{
|
||||
// load and preprocess the image
|
||||
llava_image_embed *embed = NULL;
|
||||
auto prompt = params->prompt;
|
||||
if (prompt_contains_image(prompt))
|
||||
{
|
||||
// if (!params->image.empty())
|
||||
// {
|
||||
// LOG_TEE("using base64 encoded image instead of command line image path\n");
|
||||
// }
|
||||
// embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
|
||||
// if (!embed)
|
||||
// {
|
||||
// LOG_TEE("%s: can't load image from prompt\n", __func__);
|
||||
// return NULL;
|
||||
// }
|
||||
// params->prompt = remove_image_from_prompt(prompt);
|
||||
printf("not implemented\n");
|
||||
exit(1);
|
||||
}
|
||||
else
|
||||
{
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
|
||||
if (!embed)
|
||||
{
|
||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
if (params->prompt.empty() && params->interactive == false) {
|
||||
LOG_TEE("prompt should be given or interactive mode should be on");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto model = llava_init(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
const int64_t t_llava_init_start_us = ggml_time_us();
|
||||
auto ctx_llava = llava_init_context(params, model);
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
const int64_t t_llava_init_end_us = ggml_time_us();
|
||||
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
||||
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||
|
||||
const int64_t t_process_image_start_us = ggml_time_us();
|
||||
process_image(ctx_llava, embeds, params, n_past);
|
||||
const int64_t t_process_image_end_us = ggml_time_us();
|
||||
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
||||
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||
|
||||
llava_image_embed_free(embeds);
|
||||
return ctx_llava;
|
||||
return embed;
|
||||
}
|
||||
|
||||
static void process_prompt(struct llava_context *ctx_llava, struct llava_image_embed *image_embed, gpt_params *params,
|
||||
|
@ -233,6 +211,7 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
|
|||
system_prompt = prompt.substr(0, image_pos);
|
||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
|
||||
// phi3-tokenizer https://github.com/ggerganov/llama.cpp/issues/7938
|
||||
if (params->verbose_prompt)
|
||||
{
|
||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
||||
|
@ -267,7 +246,6 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
|
@ -286,9 +264,18 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
|
|||
std::string response = "";
|
||||
for (int i = 0; i < max_tgt_len; i++)
|
||||
{
|
||||
// printf("i: %d\n", i);
|
||||
const char *tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
// printf("%s", tmp);
|
||||
if (strcmp(tmp, "<|end|>") == 0){
|
||||
printf("\n STOP GENERATING because I saw <|end|>\n");
|
||||
break;
|
||||
}
|
||||
if (strcmp(tmp, "</s>") == 0) {
|
||||
printf("\n STOP GENERATING because I saw </s>\n");
|
||||
break;
|
||||
}
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
printf("%s", tmp);
|
||||
if (strstr(response.c_str(), "<|im_end|>"))
|
||||
|
@ -303,90 +290,207 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
|
|||
printf("\n");
|
||||
}
|
||||
|
||||
static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
|
||||
auto ctx_clip = clip_init_context(params);
|
||||
std::cout << "clip model has been loaded \n\n";
|
||||
|
||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
|
||||
if (!embeds) {
|
||||
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
||||
return NULL;
|
||||
}
|
||||
std::cout<< "Start Processing Prompt: " << std::endl;
|
||||
// TODO:
|
||||
// process the prompt
|
||||
if (params->prompt.empty() && params->interactive == false) {
|
||||
LOG_TEE("prompt should be given or interactive mode should be on");
|
||||
return NULL;
|
||||
}
|
||||
static struct llama_model * llava_init(gpt_params * params) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params->numa);
|
||||
|
||||
auto model = llava_init(params);
|
||||
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
const int64_t t_llava_init_start_us = ggml_time_us();
|
||||
auto ctx_llava = llava_init_context(params, model);
|
||||
return model;
|
||||
}
|
||||
|
||||
static struct llava_context *llava_init_context(gpt_params *params, llama_model *model)
|
||||
{
|
||||
const char *clip_path = params->mmproj.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty())
|
||||
{
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/1);
|
||||
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
|
||||
ctx_params.n_ctx =
|
||||
params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
||||
|
||||
llama_context *ctx_llama = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL)
|
||||
{
|
||||
LOG_TEE("%s: error: failed to create the llama_context\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
const int64_t t_llava_init_end_us = ggml_time_us();
|
||||
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
||||
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||
|
||||
const int64_t t_process_image_start_us = ggml_time_us();
|
||||
process_prompt(ctx_llava, embeds, params, params->prompt);
|
||||
// process_image(ctx_llava, embeds, params, n_past);
|
||||
const int64_t t_process_image_end_us = ggml_time_us();
|
||||
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
||||
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||
|
||||
llava_image_embed_free(embeds);
|
||||
ctx_llava->model = model;
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
|
||||
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
||||
std::string user_prompt = prompt;
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (!is_first) {
|
||||
if (has_minicpmv_projector == 2) {
|
||||
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
static void llava_free(struct llava_context * ctx_llava) {
|
||||
if (ctx_llava->ctx_clip) {
|
||||
printf(
|
||||
"YD:::Segmentation fault here; Because header.n_kv is empty\n clip_free->gguf_free(ctx->ctx_gguf)-> for "
|
||||
"(uint64_t i = 0; i < ctx->header.n_kv; ++i)\n");
|
||||
exit(1);
|
||||
clip_free(ctx_llava->ctx_clip);
|
||||
ctx_llava->ctx_clip = NULL;
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG_TEE("\n");
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||
return ctx_sampling;
|
||||
llama_free(ctx_llava->ctx_llama);
|
||||
llama_free_model(ctx_llava->model);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
|
||||
// static struct clip_ctx * clip_init_context(gpt_params * params) {
|
||||
// const char * clip_path = params->mmproj.c_str();
|
||||
|
||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
return tmp;
|
||||
// auto prompt = params->prompt;
|
||||
// if (prompt.empty()) {
|
||||
// prompt = "describe the image in detail.";
|
||||
// }
|
||||
// // std::cout << __LINE__ << std::endl;
|
||||
// auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
// return ctx_clip;
|
||||
// }
|
||||
|
||||
|
||||
|
||||
// TODO: REMOVE THIS FUNCTION
|
||||
// static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
|
||||
// std::string system_prompt;
|
||||
// int idx = 0;
|
||||
// int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
|
||||
// int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
// if (has_minicpmv_projector == 2) {
|
||||
// system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
|
||||
// }
|
||||
// else if (has_minicpmv_projector == 3) {
|
||||
// system_prompt = "<|im_start|>user\n";
|
||||
// }
|
||||
// LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
// eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||
// process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
// eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
// if (num_image_embeds > 1) {
|
||||
// size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
|
||||
// eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
|
||||
// for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
|
||||
// for (size_t j = 0; j < num_image_embeds_col; ++j) {
|
||||
// eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
|
||||
// process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
// eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
// if (j == num_image_embeds_col - 1) {
|
||||
// eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
// }
|
||||
// LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
// }
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
|
||||
// auto ctx_clip = clip_init_context(params);
|
||||
// std::cout << "clip model has been loaded \n\n";
|
||||
|
||||
// auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
|
||||
// if (!embeds) {
|
||||
// std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
||||
// return NULL;
|
||||
// }
|
||||
// std::cout<< "Start Processing Prompt: " << std::endl;
|
||||
// // TODO:
|
||||
// // process the prompt
|
||||
// if (params->prompt.empty() && params->interactive == false) {
|
||||
// LOG_TEE("prompt should be given or interactive mode should be on");
|
||||
// return NULL;
|
||||
// }
|
||||
|
||||
// auto model = llava_init(params);
|
||||
// if (model == NULL) {
|
||||
// fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
||||
// return NULL;
|
||||
// }
|
||||
// const int64_t t_llava_init_start_us = ggml_time_us();
|
||||
// auto ctx_llava = llava_init_context(params, model);
|
||||
// ctx_llava->ctx_clip = ctx_clip;
|
||||
// const int64_t t_llava_init_end_us = ggml_time_us();
|
||||
// float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
||||
// LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||
|
||||
// const int64_t t_process_image_start_us = ggml_time_us();
|
||||
// process_prompt(ctx_llava, embeds, params, params->prompt);
|
||||
// // process_image(ctx_llava, embeds, params, n_past);
|
||||
// const int64_t t_process_image_end_us = ggml_time_us();
|
||||
// float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
||||
// LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||
|
||||
// llava_image_embed_free(embeds);
|
||||
// return ctx_llava;
|
||||
// }
|
||||
|
||||
|
||||
// static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
||||
// std::string user_prompt = prompt;
|
||||
// int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
// if (!is_first) {
|
||||
// if (has_minicpmv_projector == 2) {
|
||||
// user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
|
||||
// }
|
||||
// else if (has_minicpmv_projector == 3) {
|
||||
// user_prompt = "<|im_start|>user\n" + prompt;
|
||||
// }
|
||||
// }
|
||||
|
||||
// eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
// if (has_minicpmv_projector == 2) {
|
||||
// eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
|
||||
// }
|
||||
// else if (has_minicpmv_projector == 3) {
|
||||
// eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
// }
|
||||
|
||||
// // generate the response
|
||||
|
||||
// LOG_TEE("\n");
|
||||
|
||||
// struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||
// return ctx_sampling;
|
||||
// }
|
||||
|
||||
// static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
|
||||
|
||||
// const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
// return tmp;
|
||||
// }
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char *text, void *user_data)
|
||||
{
|
||||
(void)level;
|
||||
(void)user_data;
|
||||
LOG_TEE("%s", text);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
show_additional_info(argc, argv);
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -399,67 +503,67 @@ int main(int argc, char ** argv) {
|
|||
|
||||
if (params.mmproj.empty() || (params.image.empty())) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
show_additional_info(argc, argv);
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (auto & image : params.image) { // only single image for now
|
||||
int n_past = 0;
|
||||
// auto ctx_llava = minicpmv_init(¶ms, image, n_past);
|
||||
auto ctx_llava = xgenmm_init(¶ms, image, n_past); // generate vision tokens
|
||||
std::cout << "Start llava generation: " << std::endl;
|
||||
auto model = llava_init(¶ms);
|
||||
if (model == NULL)
|
||||
{
|
||||
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (prompt_contains_image(params.prompt))
|
||||
{
|
||||
auto ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto image_embed = load_image(ctx_llava, ¶ms, "");
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_print_timings(ctx_llava->ctx_llama);
|
||||
|
||||
// // TODO: integrate base llm
|
||||
// if (!params.prompt.empty()) {
|
||||
// LOG_TEE("<user>%s\n", params.prompt.c_str());
|
||||
// LOG_TEE("<assistant>");
|
||||
// auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true);
|
||||
// const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
// std::string response = "";
|
||||
// bool have_tmp = false;
|
||||
// for (int i = 0; i < max_tgt_len; i++) {
|
||||
// auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
|
||||
// response += tmp;
|
||||
// if (strcmp(tmp, "</s>") == 0){
|
||||
// if(!have_tmp)continue;
|
||||
// else break;
|
||||
// }
|
||||
// if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
// have_tmp = true;
|
||||
// printf("%s", tmp);
|
||||
// if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
|
||||
// fflush(stdout);
|
||||
// }
|
||||
// llama_sampling_free(ctx_sampling);
|
||||
// }else {
|
||||
// while (true) {
|
||||
// LOG_TEE("<user>");
|
||||
// std::string prompt;
|
||||
// std::getline(std::cin, prompt);
|
||||
// LOG_TEE("<assistant>");
|
||||
// auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
// const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
// std::string response = "";
|
||||
// for (int i = 0; i < max_tgt_len; i++) {
|
||||
// auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
|
||||
// response += tmp;
|
||||
// if (strcmp(tmp, "</s>") == 0) break;
|
||||
// if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
// printf("%s", tmp);// mistral llava-1.6
|
||||
// if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
// fflush(stdout);
|
||||
// }
|
||||
// llama_sampling_free(ctx_sampling);
|
||||
// }
|
||||
// }
|
||||
// printf("\n");
|
||||
// llama_print_timings(ctx_llava->ctx_llama);
|
||||
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (auto &image : params.image)
|
||||
{
|
||||
printf("image: %s\n", image.c_str());
|
||||
auto ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto image_embed = load_image(ctx_llava, ¶ms, image);
|
||||
printf("n_image_pos: %d\n", image_embed->n_image_pos);
|
||||
if (!image_embed)
|
||||
{
|
||||
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_print_timings(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
|
||||
// prompt_contains_image(params.prompt);
|
||||
// for (auto & image : params.image) { // only single image for now
|
||||
// int n_past = 0;
|
||||
// auto ctx_llava = xgenmm_init(¶ms, image, n_past); // generate vision tokens
|
||||
// std::cout << "Start llava generation: " << std::endl;
|
||||
// llama_print_timings(ctx_llava->ctx_llama);
|
||||
// ctx_llava->model = NULL;
|
||||
// llava_free(ctx_llava);
|
||||
// }
|
||||
printf("Remember to remove print_tensor function in xgenmm.cpp and clip.cpp\n");
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -14,6 +14,162 @@
|
|||
#include "llama.h"
|
||||
#include "xgenmm.h"
|
||||
|
||||
struct tensor_from_gguf
|
||||
{
|
||||
struct ggml_tensor *data;
|
||||
struct ggml_context *ctx;
|
||||
};
|
||||
|
||||
bool load_tensor_from_file(const char *filename, tensor_from_gguf &tensor)
|
||||
{
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc =*/false,
|
||||
/*.ctx =*/&tensor.ctx,
|
||||
};
|
||||
gguf_context *ctx = gguf_init_from_file(filename, params);
|
||||
if (!ctx)
|
||||
{
|
||||
fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
|
||||
return false;
|
||||
}
|
||||
tensor.data = ggml_get_tensor(tensor.ctx, "data");
|
||||
|
||||
return true;
|
||||
}
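A standalone sketch of what load_tensor_from_file() above does: open a .gguf file, fetch the tensor named "data", and inspect it. The path is hypothetical, and unlike the helper (which, as written, never frees the gguf_context it opens) the sketch releases both contexts:

    #include <cstdio>
    #include "ggml.h"   // in this tree the gguf_* API is declared here

    int main() {
        struct ggml_context * data_ctx = NULL;
        struct gguf_init_params params = {
            /*.no_alloc =*/ false,      // load tensor data, not just metadata
            /*.ctx      =*/ &data_ctx,  // receives a ggml_context holding the tensors
        };

        struct gguf_context * gctx = gguf_init_from_file("/tmp/vision_features.gguf", params);
        if (!gctx) {
            fprintf(stderr, "gguf_init_from_file() failed\n");
            return 1;
        }

        struct ggml_tensor * t = ggml_get_tensor(data_ctx, "data");
        if (t) {
            printf("'data': (%ld, %ld, %ld, %ld), %zu bytes\n",
                   t->ne[0], t->ne[1], t->ne[2], t->ne[3], ggml_nbytes(t));
        }

        gguf_free(gctx);
        ggml_free(data_ctx);
        return 0;
    }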
|
||||
|
||||
|
||||
void print_tensor(ggml_tensor *tensor, const char *name = "", int verbosity = 0)
|
||||
{
|
||||
if (tensor->ne[2] == 1)
|
||||
{
|
||||
printf("---> %s: (%ld, %ld)\n", name, tensor->ne[0], tensor->ne[1]);
|
||||
}
|
||||
else if (ggml_is_3d(tensor))
|
||||
{
|
||||
printf("---> %s: (%ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("---> %s: (%ld, %ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
}
|
||||
if (verbosity == 1)
|
||||
{
|
||||
printf("*********************************************************************\n");
|
||||
if (tensor->ne[2] == 1)
|
||||
{
|
||||
const float *mat = (float *)tensor->data;
|
||||
int dim0 = tensor->ne[1];
|
||||
int dim1 = tensor->ne[0];
|
||||
if (dim0 < 6 && dim1 < 6)
|
||||
{
|
||||
for (int i = 0; i < dim0; i++)
|
||||
{
|
||||
for (int j = 0; j < dim1; j++)
|
||||
{
|
||||
printf("%+.4f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < std::min(dim0, 3); i++)
|
||||
{
|
||||
for (int j = 0; j < std::min(dim1, 3); j++)
|
||||
{
|
||||
printf("%+.6f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("... ");
|
||||
for (int j = dim1 - 3; j < dim1; j++)
|
||||
{
|
||||
printf("%+.6f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
if (dim0 > 3)
|
||||
{
|
||||
printf("...................... omit ......................\n");
|
||||
for (int i = dim0 - 3; i < dim0; i++)
|
||||
{
|
||||
for (int j = 0; j < std::min(dim1, 3); j++)
|
||||
{
|
||||
printf("%+.6f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("... ");
|
||||
for (int j = dim1 - 3; j < dim1; j++)
|
||||
{
|
||||
printf("%+.6f ", mat[i * dim1 + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (ggml_is_3d(tensor))
|
||||
{
|
||||
const float *data = (float *)tensor->data;
|
||||
int dim0 = tensor->ne[2];
|
||||
int dim1 = tensor->ne[1];
|
||||
int dim2 = tensor->ne[0];
|
||||
if (dim0 < 6 && dim1 < 6 && dim2 < 6)
|
||||
{
|
||||
for (int i = 0; i < dim0; i++)
|
||||
{
|
||||
printf("dim0 = %d\n", i);
|
||||
for (int j = 0; j < dim1; j++)
|
||||
{
|
||||
for (int k = 0; k < dim2; k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < std::min(dim0, 3); i++)
|
||||
{
|
||||
printf("dim0 = %d\n", i);
|
||||
for (int j = 0; j < std::min(dim1, 3); j++)
|
||||
{
|
||||
for (int k = 0; k < std::min(dim2, 3); k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("... ");
|
||||
for (int k = dim2 - 3; k < dim2; k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("........................\n");
|
||||
for (int j = dim1 - 3; j < dim1; j++)
|
||||
{
|
||||
for (int k = 0; k < std::min(dim2, 3); k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("... ");
|
||||
for (int k = dim2 - 3; k < dim2; k++)
|
||||
{
|
||||
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("---------------------------------------------------\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("*********************************************************************\n");
|
||||
printf("\n");
|
||||
}
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8
|
||||
{
|
||||
|
@ -418,6 +574,33 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
|
|||
ggml_graph_compute_with_ctx(mask.ctx, gf, 1);
|
||||
attention_mask = gf->nodes[gf->n_nodes - 1];
|
||||
// memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask));
|
||||
|
||||
{
|
||||
printf((" ========================= DEBUG =========================\n"));
|
||||
printf("Load pre-computed image embeddings and attention_mask\n");
|
||||
std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
|
||||
tensor_from_gguf tensor;
|
||||
bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
|
||||
if (!is_successful)
|
||||
{
|
||||
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
result = tensor.data;
|
||||
// print_tensor(result, "result", 1);
|
||||
filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
|
||||
is_successful = load_tensor_from_file(filename.c_str(), tensor);
|
||||
if (!is_successful)
|
||||
{
|
||||
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
attention_mask = tensor.data;
|
||||
// print_tensor(attention_mask, "attention_mask", 1);
|
||||
num_patches_width = 2;
|
||||
num_patches_height = 2;
|
||||
}
|
||||
|
||||
|
||||
// compute attention masks outside of the graph
|
||||
struct ggml_tensor * attn_bias_input;
|
||||
|
@ -463,10 +646,19 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
|
|||
ggml_build_forward_expand(gf_temp, attn_bias);
|
||||
ggml_graph_compute_with_ctx(ctx0, gf_temp, 1);
|
||||
attn_bias_input = attn_bias;
|
||||
}else{
|
||||
attn_bias_input = NULL;
|
||||
}
|
||||
int batch_size = num_patches_width * num_patches_height + 1;
|
||||
// print_tensor(attn_bias_input, "attn_bias_input", 1);
|
||||
// print_tensor(result, "result", 1);
|
||||
printf("batch_size: %d\n", batch_size);
|
||||
const bool encoded = clip_image_encode_tokenizer(
|
||||
ctx_clip, batch_size, result, attn_bias_input, image_embd);
|
||||
if (!encoded){
|
||||
LOG_TEE("%s: failed at image tokenizer (projector step failed)\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_free(model.ctx);
|
||||
ggml_free(mask.ctx);
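The attn_bias built above follows the standard additive-mask trick: positions belonging to padded patches get a large negative bias so softmax drives their attention weight to zero. The graph code is not reproduced here, so the snippet below is only a generic illustration of that idea, not the exact computation in this file:

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> mask = {1, 1, 1, 0, 0};       // 3 valid patch positions, 2 padded
        std::vector<float> bias(mask.size());

        for (size_t i = 0; i < mask.size(); ++i) {
            bias[i] = (mask[i] > 0.5f) ? 0.0f : -1e9f;   // added to attention scores before softmax
        }
        for (float b : bias) {
            printf("%g ", b);                            // 0 0 0 -1e+09 -1e+09
        }
        printf("\n");
        return 0;
    }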
|
||||
|
|
|
@ -1,5 +0,0 @@
|
|||
python examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py\
|
||||
--surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
|
||||
--version siglip_kosmos_phi3_4k_instruct \
|
||||
--xgenmm_projector /export/home/Projects/xgenmm-quantization/target_models/MiniCPM-Llama3-V-2_5/minicpmv.projector \
|
||||
--use_f32
|
|
@@ -92,8 +92,9 @@ if __name__ == "__main__":
    torch.save(projector_tensors, save_path)

    # processors
    tokenizer.save_pretrained(f"{save_dir}/tokenizer")
    # will hard code the image_processor in the convert_image_encoder_to_gguf.py

    # put the tokenizer in the same dir as the lang model
    tokenizer.save_pretrained(f"{save_dir}/llm")

    end = time.time()
    print(f"🟢 time used: [{end-start:.3f} s]")
xgenmm-cli (binary file; content not shown).