add merge type
This commit is contained in:
parent
f645b0bc8c
commit
f70fdf5a86
4 changed files with 41 additions and 34 deletions
|
@ -667,22 +667,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
struct ggml_tensor * embeddings = inp;
|
struct ggml_tensor * embeddings = inp;
|
||||||
struct ggml_tensor * pos_embed = nullptr;
|
struct ggml_tensor * pos_embed = nullptr;
|
||||||
if (ctx->has_llava_projector) {
|
if (ctx->has_llava_projector) {
|
||||||
printf("use has_llava_projector\n");
|
printf(" use has_llava_projector\n");
|
||||||
// concat class_embeddings and patch_embeddings
|
// concat class_embeddings and patch_embeddings
|
||||||
if (ctx->has_class_embedding) {
|
if (ctx->has_class_embedding) {
|
||||||
printf("I am in!\n");
|
|
||||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||||
printf("created embeddings new 3d tensors\n");
|
|
||||||
ggml_set_name(embeddings, "embeddings");
|
ggml_set_name(embeddings, "embeddings");
|
||||||
ggml_set_input(embeddings);
|
ggml_set_input(embeddings);
|
||||||
printf("ggml_set_input\n");
|
|
||||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||||
|
printf(" first acc worked\n");
|
||||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||||
|
printf(" second acc worked\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// printf("hi1!");
|
// printf(" after ctx->has_llava_projector\n");
|
||||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||||
ggml_set_name(positions, "positions");
|
ggml_set_name(positions, "positions");
|
||||||
ggml_set_input(positions);
|
ggml_set_input(positions);
|
||||||
|
@ -2500,6 +2499,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
|
|
||||||
// build the inference graph
|
// build the inference graph
|
||||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
||||||
|
printf(" build graph done\n");
|
||||||
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
|
||||||
// set inputs
|
// set inputs
|
||||||
const auto & model = ctx->vision_model;
|
const auto & model = ctx->vision_model;
|
||||||
|
@ -2546,6 +2546,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||||
free(data);
|
free(data);
|
||||||
}
|
}
|
||||||
|
printf(" before ctx->has_minicpmv_projector\n");
|
||||||
if (ctx->has_minicpmv_projector) {
|
if (ctx->has_minicpmv_projector) {
|
||||||
{
|
{
|
||||||
// inspired from siglip:
|
// inspired from siglip:
|
||||||
|
@ -2638,7 +2639,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
|
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
printf(" before ggml_backend_graph_compute\n");
|
||||||
ggml_backend_graph_compute(ctx->backend, gf);
|
ggml_backend_graph_compute(ctx->backend, gf);
|
||||||
|
printf(" after ggml_backend_graph_compute\n");
|
||||||
// the last node is the embedding tensor
|
// the last node is the embedding tensor
|
||||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||||
// copy the embeddings to the location passed by the user
|
// copy the embeddings to the location passed by the user
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
# source /export/share/yutong/miniconda3/bin/activate
|
source /export/share/yutong/miniconda3/bin/activate
|
||||||
# conda activate xgenmm-flamingo
|
conda activate xgenmm-flamingo
|
||||||
# which python
|
# which python
|
||||||
# # step 1: surgery
|
# # step 1: surgery
|
||||||
# python xgenmm_surgery.py
|
# python xgenmm_surgery.py
|
||||||
|
|
||||||
# step 2: convert to gguf (vit + projector)
|
# step 2: convert to gguf (vit + projector)
|
||||||
|
|
||||||
python examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py \
|
python xgenmm_convert_image_encoder_to_gguf.py \
|
||||||
--surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
|
--surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
|
||||||
--output_dirname gguf_test \
|
--output_dirname gguf_test \
|
||||||
--version siglip_kosmos_phi3_4k_instruct \
|
--version siglip_kosmos_phi3_4k_instruct \
|
||||||
|
|
|
@ -502,33 +502,34 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
|
||||||
int main(){
|
int main(){
|
||||||
|
|
||||||
|
|
||||||
const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
|
// const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
|
||||||
|
const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf";
|
||||||
struct clip_ctx * ctx = clip_model_load(clip_path, /*verbosity=*/2);
|
struct clip_ctx * ctx = clip_model_load(clip_path, /*verbosity=*/2);
|
||||||
printf("Model loaded\n");
|
// printf("Model loaded\n");
|
||||||
for (int i=0; i < 3; i++){
|
// for (int i=0; i < 3; i++){
|
||||||
ctx->image_mean[i] = 0.5;
|
// ctx->image_mean[i] = 0.5;
|
||||||
ctx->image_std[i] = 0.5;
|
// ctx->image_std[i] = 0.5;
|
||||||
}
|
// }
|
||||||
LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
|
// LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
|
||||||
LOG_TEE("v_image_std %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
|
// LOG_TEE("v_image_std %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
|
||||||
// [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
|
// // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
|
// ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
|
// ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
|
// ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
|
// ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
|
// ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
|
// ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
|
// ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
|
// ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
|
// ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
|
||||||
ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
|
// ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
|
||||||
for (int i = 0; i < 10; i++)
|
// for (int i = 0; i < 10; i++)
|
||||||
{
|
// {
|
||||||
printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
|
// printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
|
||||||
}
|
// }
|
||||||
printf("\n");
|
// printf("\n");
|
||||||
ctx->vision_model.hparams.image_size = 384;
|
// ctx->vision_model.hparams.image_size = 384;
|
||||||
printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
|
// printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
|
||||||
/*
|
/*
|
||||||
part of:
|
part of:
|
||||||
llava_image_embed_make_with_filename
|
llava_image_embed_make_with_filename
|
||||||
|
@ -618,6 +619,7 @@ int main(){
|
||||||
printf("image_embd_v.size():%d\n", image_embd_v.size());
|
printf("image_embd_v.size():%d\n", image_embd_v.size());
|
||||||
for (size_t i = 0; i < img_res_v.size; i++)
|
for (size_t i = 0; i < img_res_v.size; i++)
|
||||||
{
|
{
|
||||||
|
printf("encode patch %d\n", i);
|
||||||
image_embd_v[i] =
|
image_embd_v[i] =
|
||||||
(float*)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
(float*)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
||||||
const bool encoded = clip_image_encode(
|
const bool encoded = clip_image_encode(
|
||||||
|
|
|
@ -267,6 +267,8 @@ if __name__ == "__main__":
|
||||||
# ggml implements gelu_with_tanh approximation
|
# ggml implements gelu_with_tanh approximation
|
||||||
use_gelu = "gelu" in vision_config["hidden_act"].lower()
|
use_gelu = "gelu" in vision_config["hidden_act"].lower()
|
||||||
fout.add_bool("clip.use_gelu", use_gelu)
|
fout.add_bool("clip.use_gelu", use_gelu)
|
||||||
|
fout.add_string("clip.vision.mm_patch_merge_type", 'spatial_unpad')
|
||||||
|
print("hard coded mm_patch_merge_type as spatial_unpad")
|
||||||
|
|
||||||
# for VIT model
|
# for VIT model
|
||||||
with print_time("Loading vision encoder and converting to gguf"):
|
with print_time("Loading vision encoder and converting to gguf"):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue