push for juntao
This commit is contained in:
parent e353c037d8
commit 07baee57c9
8 changed files with 1189 additions and 111 deletions

.gitignore (vendored): 20 additions
@@ -133,3 +133,23 @@ poetry.toml
# Test models for lora adapters
/lora-tests
examples/xgenmm/imgs/*.csv
examples/xgenmm copy/clip.cpp
examples/xgenmm copy/clip.h
examples/xgenmm copy/CMakeLists.txt
examples/xgenmm copy/convert.sh
examples/xgenmm copy/debug.py
examples/xgenmm copy/playground.ipynb
examples/xgenmm copy/test_anyres_img.cpp
examples/xgenmm copy/xgenmm_convert_image_encoder_to_gguf.py
examples/xgenmm copy/xgenmm_surgery.py
examples/xgenmm copy/xgenmm.cpp
examples/xgenmm copy/xgenmm.h
examples/xgenmm copy/bak/xgenmm-surgery copy.py
examples/xgenmm copy/imgs/image_original_resize.csv
examples/xgenmm copy/imgs/image_res_0.csv
examples/xgenmm copy/imgs/image_res_1.csv
examples/xgenmm copy/imgs/image_res_2.csv
examples/xgenmm copy/imgs/image_res_3.csv
examples/xgenmm copy/imgs/image_res_4.csv
examples/xgenmm copy/imgs/image-1d100e9-1.jpg
examples/xgenmm copy/imgs/image-1d100e9.jpg
@@ -38,6 +38,13 @@ target_link_libraries(test_anyres_img PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(xgenmm PRIVATE cxx_std_11)

set(TARGET test_anyres_handle_patches)
add_executable(test_anyres_handle_patches test_anyres_handle_patches.cpp)
install(TARGETS test_anyres_handle_patches RUNTIME)
target_link_libraries(test_anyres_handle_patches PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(xgenmm PRIVATE cxx_std_11)

# not implemented yet
# set(TARGET xgenmm-cli)
# add_executable(xgenmm-cli xgenmm-cli.cpp)
@@ -85,6 +85,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_XGENMM_PROJ "clip.has_xgenmm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_N_EMBD "clip.%s.embedding_length"
@@ -140,13 +141,17 @@ static std::string format(const char * fmt, ...) {
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"

#define TN_XGENMM_ATTN "perceiver_resampler.blk.%d.attn.%s.%s"
#define TN_XGENMM_FFN "perceiver_resampler.blk.%d.ffn.%s.%s"

enum projector_type {
enum projector_type
{
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_RESAMPLER,
    PROJECTOR_TYPE_PERCIVER_RESAMPLER,
    PROJECTOR_TYPE_UNKNOWN,
};
@@ -155,6 +160,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_LDP,       "ldp" },
    { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
    { PROJECTOR_TYPE_PERCIVER_RESAMPLER, "PercevierResampler"}
};
@@ -436,6 +442,30 @@ struct clip_layer {
    struct ggml_tensor * ln_2_b;
};

struct xgenmm_perceiver_resampler_layer
{
    // PerceiverAttention
    int dim = 1152;
    int dim_head = 96;
    int heads = 16;
    float scale = std::pow(dim_head, -0.5);
    struct ggml_tensor *mm_model_k_w;
    struct ggml_tensor *mm_model_q_w;
    struct ggml_tensor *mm_model_v_w;
    struct ggml_tensor *mm_model_o_w;
    struct ggml_tensor *mm_model_ln_media_w;
    struct ggml_tensor *mm_model_ln_media_b;
    struct ggml_tensor *mm_model_ln_latents_w;
    struct ggml_tensor *mm_model_ln_latents_b;

    // Forward
    int mult = 4;
    struct ggml_tensor *mm_model_ffn_ln_w;
    struct ggml_tensor *mm_model_ffn_ln_b;
    struct ggml_tensor *mm_model_ffn_linear_up_w;
    struct ggml_tensor *mm_model_ffn_linear_down_w;
};

struct clip_vision_model {
    struct clip_hparams hparams;
@@ -524,13 +554,25 @@ struct clip_vision_model {
    struct ggml_tensor * mm_model_ln_kv_b;
    struct ggml_tensor * mm_model_ln_post_w;
    struct ggml_tensor * mm_model_ln_post_b;

    // XGenMM projection
    struct ggml_tensor *mm_model_latents;
    struct ggml_tensor *mm_model_projection_w;
    struct ggml_tensor *mm_model_projection_b;
    std::vector<xgenmm_perceiver_resampler_layer> mm_model_layers;
    struct ggml_tensor *mm_model_norm_w;
    struct ggml_tensor *mm_model_norm_b;
};

struct clip_ctx {
    bool has_text_encoder = false;
    bool has_vision_encoder = false;
    bool has_llava_projector = false;
    bool has_minicpmv_projector = false;
    bool has_xgenmm_projector = false;
    int minicpmv_version = 2;

    struct clip_vision_model vision_model;
@@ -560,7 +602,7 @@ struct clip_ctx {
    struct clip_image_size * load_image_size;
};

static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false, ggml_tensor *attn_bias_input = nullptr) {
    if (!ctx->has_vision_encoder) {
        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return nullptr;
@@ -584,6 +626,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            image_size_height = imgs->data->ny;
        }
    }
    if (ctx->has_xgenmm_projector) {
        // TODO: implement something here, for example image masks
        printf("use has_xgenmm_projector\n");
    }
    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
@@ -591,7 +637,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    const int n_head = hparams.n_head;
    const int d_head = hidden_size / n_head;
    int n_layer = hparams.n_layer;
    const float eps = hparams.eps;

    const int batch_size = imgs->size;
@@ -625,25 +671,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    struct ggml_tensor * pos_embed = nullptr;

    if (ctx->has_llava_projector) {
        printf("use has_llava_projector\n");
        // concat class_embeddings and patch_embeddings
        if (ctx->has_class_embedding) {
            printf("I am in!\n");
            embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
            printf("created embeddings new 3d tensors\n");
            ggml_set_name(embeddings, "embeddings");
            ggml_set_input(embeddings);
            printf("ggml_set_input\n");
            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
            embeddings = ggml_acc(ctx0, embeddings, inp,
                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
        }
    }

    printf("hi1!");
    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);
    printf("hi2!");

    embeddings =
        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

    printf("hi3!");
    if (ctx->has_minicpmv_projector) {
        int pos_w = image_size_width/patch_size;
        int pos_h = image_size_height/patch_size;
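Side note on the two ggml_acc calls above: they realize the class/patch concat by accumulating each source into the right byte offset of a zero-initialized (hidden_size, num_positions, batch) tensor. A minimal numpy sketch of the same idea, with hypothetical sizes and the batch dimension dropped:

import numpy as np

hidden_size, num_patches = 1152, 576          # illustrative values only
class_embedding = np.random.randn(hidden_size).astype(np.float32)
inp = np.random.randn(num_patches, hidden_size).astype(np.float32)

embeddings = np.zeros((num_patches + 1, hidden_size), dtype=np.float32)
embeddings[0, :]  = class_embedding  # ggml_acc(..., offset 0)
embeddings[1:, :] = inp              # ggml_acc(..., offset class_embedding->nb[1])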
@@ -1008,6 +1059,124 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            GGML_ASSERT(false);
        }
    }
    // xgenmm-projector
    else if (ctx->has_xgenmm_projector)
    {
        if (ctx->proj_type == PROJECTOR_TYPE_PERCIVER_RESAMPLER)
        {
            struct ggml_tensor *self_latents   = model.mm_model_latents;
            struct ggml_tensor *img_embeddings = embeddings;
            // FIXME: hard coded for now
            int n_layer = 6;
            const float scale       = model.mm_model_layers[0].scale;
            const int   num_head    = model.mm_model_layers[0].heads;
            const int   dim_head    = model.mm_model_layers[0].dim_head;
            const int   q_len       = self_latents->ne[1];
            const int   kv_len      = img_embeddings->ne[1] + self_latents->ne[1];  // concat img_embeddings and latents
            const int   hidden_size = dim_head * num_head;
            // TODO: repeat for (batch_size, n_query_tokens, dim)
            ggml_tensor *latents = self_latents;

            for (int il = 0; il < n_layer; ++il)
            {
                struct ggml_tensor *residual = latents;
                auto &layer = model.mm_model_layers[il];

                // layer norm
                struct ggml_tensor *img_embeddings_normalized = ggml_norm(ctx0, img_embeddings, eps);
                img_embeddings_normalized =
                    ggml_add(ctx0, ggml_mul(ctx0, img_embeddings_normalized, layer.mm_model_ln_media_w),
                             layer.mm_model_ln_media_b);

                latents = ggml_norm(ctx0, latents, eps);
                latents =
                    ggml_add(ctx0, ggml_mul(ctx0, latents, layer.mm_model_ln_latents_w), layer.mm_model_ln_latents_b);

                // cross attention
                {
                    struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.mm_model_q_w, latents);
                    Q = ggml_scale_inplace(ctx0, Q, scale);
                    struct ggml_tensor *kv_inputs = ggml_concat(ctx0, img_embeddings_normalized, latents, 1);
                    // if (vision_attn_masks){
                    //     // printf("vision_attn_masks dim0: %ld, dim1: %ld\n", vision_attn_masks->ne[0],
                    //     // vision_attn_masks->ne[1]); create all one tensor
                    //     const int dim0 = latents->ne[1];  // seq length
                    //     const int dim1 = batch_size;
                    //     struct ggml_tensor *all_one_tensor = ggml_new_tensor_2d(ctx0, latents->type, dim0, dim1);
                    //     ggml_set_name(all_one_tensor, "all_one_tensor");
                    //     ggml_set_input(all_one_tensor);
                    //
                    //     vision_attn_masks = ggml_concat(ctx0, vision_attn_masks, all_one_tensor, 0);
                    // }
                    struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.mm_model_k_w, kv_inputs);
                    struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.mm_model_v_w, kv_inputs);
                    // permute
                    Q = ggml_reshape_4d(ctx0, Q, dim_head, num_head, q_len, batch_size);
                    Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
                    Q = ggml_reshape_3d(ctx0, Q, dim_head, q_len, num_head * batch_size);

                    K = ggml_reshape_4d(ctx0, K, dim_head, num_head, kv_len, batch_size);
                    K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
                    K = ggml_reshape_3d(ctx0, K, dim_head, kv_len, num_head * batch_size);

                    V = ggml_reshape_4d(ctx0, V, dim_head, num_head, kv_len, batch_size);
                    V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
                    V = ggml_reshape_3d(ctx0, V, kv_len, dim_head, num_head * batch_size);

                    struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);

                    // Apply vision attention mask here.
                    // if (vision_attn_masks){
                    // }
                    if (attn_bias_input)
                    {
                        KQ = ggml_add(ctx0, KQ, attn_bias_input);
                    }

                    // ggml_soft_max_inplace uses a numerically stable softmax implementation:
                    // ggml_soft_max_inplace(ctx0, KQ) == (sim - sim.amax(dim=-1,
                    // keepdim=True).detach()).softmax(dim=-1)
                    KQ = ggml_soft_max_inplace(ctx0, KQ);

                    struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
                    KQV = ggml_reshape_4d(ctx0, KQV, dim_head, q_len, num_head, batch_size);
                    KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
                    KQV = ggml_cont_3d(ctx0, KQV, hidden_size, q_len, batch_size);

                    latents = ggml_mul_mat(ctx0, layer.mm_model_o_w, KQV);
                }

                // residual connection
                latents = ggml_add(ctx0, latents, residual);
                residual = latents;  // update residual

                // FFN
                {
                    // layer norm
                    latents = ggml_norm(ctx0, latents, eps);
                    latents = ggml_add(ctx0, ggml_mul(ctx0, latents, layer.mm_model_ffn_ln_w), layer.mm_model_ffn_ln_b);
                    // feed forward
                    latents = ggml_mul_mat(ctx0, layer.mm_model_ffn_linear_up_w, latents);
                    latents = ggml_gelu_inplace(ctx0, latents);
                    latents = ggml_mul_mat(ctx0, layer.mm_model_ffn_linear_down_w, latents);
                }

                // residual connection
                latents = ggml_add(ctx0, latents, residual);
            }

            // post layer norm
            latents = ggml_norm(ctx0, latents, eps);
            latents = ggml_add(ctx0, ggml_mul(ctx0, latents, model.mm_model_norm_w), model.mm_model_norm_b);
            latents =
                ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_projection_w, latents), model.mm_model_projection_b);
            embeddings = latents;
        }
        else
        {
            GGML_ASSERT(false);
        }
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);
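For orientation, a minimal single-image numpy sketch of what one perceiver-resampler block above computes: pre-norm the media and latents, let the latents cross-attend to the concatenation [media; latents], project back with a residual, then a pre-norm GELU FFN with a second residual. The keys in the p dict are hypothetical stand-ins for the mm_model_* tensors, the eps value is illustrative, and batching is dropped.

import numpy as np

def layer_norm(x, w, b, eps=1e-6):
    mu, var = x.mean(-1, keepdims=True), x.var(-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * w + b

def softmax(x):
    e = np.exp(x - x.max(-1, keepdims=True))  # same stabilization as ggml_soft_max_inplace
    return e / e.sum(-1, keepdims=True)

def gelu(x):  # tanh approximation, as in ggml
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

def perceiver_block(latents, media, p, heads=16, dim_head=96):
    q_in  = layer_norm(latents, p["ln_latents_w"], p["ln_latents_b"])
    m     = layer_norm(media,   p["ln_media_w"],   p["ln_media_b"])
    kv_in = np.concatenate([m, q_in], axis=0)       # latents attend to media + latents

    def heads_first(x, w):                          # (seq, dim) -> (heads, seq, dim_head)
        return (x @ w.T).reshape(-1, heads, dim_head).transpose(1, 0, 2)

    Q = heads_first(q_in,  p["to_q"]) * dim_head ** -0.5
    K = heads_first(kv_in, p["to_k"])
    V = heads_first(kv_in, p["to_v"])
    attn = softmax(Q @ K.transpose(0, 2, 1)) @ V    # (heads, q_len, dim_head)
    out  = attn.transpose(1, 0, 2).reshape(len(latents), heads * dim_head)
    latents = latents + out @ p["to_out"].T         # residual 1

    h = layer_norm(latents, p["ffn_ln_w"], p["ffn_ln_b"])
    h = gelu(h @ p["ffn_up"].T) @ p["ffn_down"].T
    return latents + h                              # residual 2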
@@ -1450,6 +1619,40 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
        vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
    }
    else if (new_clip->proj_type == PROJECTOR_TYPE_PERCIVER_RESAMPLER) {
        vision_model.mm_model_latents = ggml_get_tensor(new_clip->ctx_data, "perceiver_resampler.latents");
        vision_model.mm_model_projection_w =
            ggml_get_tensor(new_clip->ctx_data, "perceiver_resampler.projection.weight");
        vision_model.mm_model_projection_b =
            ggml_get_tensor(new_clip->ctx_data, "perceiver_resampler.projection.bias");
        // FIXME: hard coded for now
        int n_layer = 6;
        vision_model.mm_model_layers.resize(n_layer);
        for (int il = 0; il < n_layer; ++il)
        {
            auto &layer = vision_model.mm_model_layers[il];
            layer.mm_model_k_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "to_k", "weight"));
            layer.mm_model_q_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "to_q", "weight"));
            layer.mm_model_v_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "to_v", "weight"));
            layer.mm_model_o_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "to_out", "weight"));
            layer.mm_model_ln_media_w =
                get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "norm_media", "weight"));
            layer.mm_model_ln_media_b =
                get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "norm_media", "bias"));
            layer.mm_model_ln_latents_w =
                get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "norm_latents", "weight"));
            layer.mm_model_ln_latents_b =
                get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "norm_latents", "bias"));
            layer.mm_model_ffn_ln_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_FFN, il, "ln", "weight"));
            layer.mm_model_ffn_ln_b = get_tensor(new_clip->ctx_data, format(TN_XGENMM_FFN, il, "ln", "bias"));
            layer.mm_model_ffn_linear_up_w =
                get_tensor(new_clip->ctx_data, format(TN_XGENMM_FFN, il, "linear_up", "weight"));
            layer.mm_model_ffn_linear_down_w =
                get_tensor(new_clip->ctx_data, format(TN_XGENMM_FFN, il, "linear_down", "weight"));
        }
        vision_model.mm_model_norm_w = get_tensor(new_clip->ctx_data, "perceiver_resampler.ln.weight");
        vision_model.mm_model_norm_b = get_tensor(new_clip->ctx_data, "perceiver_resampler.ln.bias");
    }
    else {
        std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
        throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
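A quick Python check of how the TN_XGENMM_* patterns above resolve into the gguf tensor names written by the surgery/conversion scripts:

TN_XGENMM_ATTN = "perceiver_resampler.blk.%d.attn.%s.%s"
TN_XGENMM_FFN  = "perceiver_resampler.blk.%d.ffn.%s.%s"

print(TN_XGENMM_ATTN % (3, "to_k", "weight"))       # perceiver_resampler.blk.3.attn.to_k.weight
print(TN_XGENMM_FFN  % (3, "linear_up", "weight"))  # perceiver_resampler.blk.3.ffn.linear_up.weight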
@@ -2009,6 +2212,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
    }
    std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
    printf("best_resolution: %d %d\n", best_resolution.first, best_resolution.second);
    // clip_image_save_to_bmp(*img, "input.bmp");
    resize_and_pad_image(*img, *temp, best_resolution);  // we do not pad with mean-bg color anymore in llava-1.6
    // clip_image_save_to_bmp(*temp, "resized.bmp");
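select_best_resolution picks the grid pinpoint that maximizes the effective (non-upscaled) pixel count and breaks ties by least wasted area. A small Python sketch of the same rule with the 384-base grid this commit uses, to see which candidate a given image maps to:

def select_best_resolution(original, candidates):
    ow, oh = original
    best, best_eff, best_waste = None, 0, float("inf")
    for w, h in candidates:
        scale = min(w / ow, h / oh)
        eff = min(int(ow * scale) * int(oh * scale), ow * oh)  # effective resolution
        waste = w * h - eff                                    # wasted resolution
        if eff > best_eff or (eff == best_eff and waste < best_waste):
            best, best_eff, best_waste = (w, h), eff, waste
    return best

grid = [(384, 768), (768, 384), (768, 768), (1152, 384), (384, 1152)]
print(select_best_resolution((640, 480), grid))  # -> (768, 768)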
examples/xgenmm/convert.sh (new file): 14 additions

@@ -0,0 +1,14 @@
source /export/share/yutong/miniconda3/bin/activate
conda activate xgenmm-flamingo
which python

# # step 1: surgery
# python xgenmm_surgery.py

# step 2: convert to gguf (vit + projector)

python xgenmm_convert_image_encoder_to_gguf.py \
    --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
    --output_dirname gguf_test \
    --version siglip_kosmos_phi3_4k_instruct \
    --use_f32 \
@@ -5,6 +5,54 @@
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# llama.cpp image layout\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "channel: 0\n",
      "row 0: 0 3 6 9 12 \n",
      "row 1: 15 18 21 24 27 \n",
      "row 2: 30 33 36 39 42 \n",
      "------------------------------\n",
      "channel: 1\n",
      "row 0: 1 4 7 10 13 \n",
      "row 1: 16 19 22 25 28 \n",
      "row 2: 31 34 37 40 43 \n",
      "------------------------------\n",
      "channel: 2\n",
      "row 0: 2 5 8 11 14 \n",
      "row 1: 17 20 23 26 29 \n",
      "row 2: 32 35 38 41 44 \n",
      "------------------------------\n"
     ]
    }
   ],
   "source": [
    "nx = 5\n",
    "ny = 3\n",
    "for k in range(3):\n",
    "    print(f'channel: {k}')\n",
    "    for y in range(ny):\n",
    "        print(f'row {y}:', end=' ')\n",
    "        for x in range(nx):\n",
    "            print(f\"{3*(y*nx + x) + k: 3d}\", end=' ')\n",
    "        print()\n",
    "    print('-'*30)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
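The notebook cell above prints the channel-interleaved (RGBRGB...) index layout; the preprocessing code regroups it into the channel-planar layout ggml expects (the commented data[(i * 3 * n) + k * n + y * nx + x] = buf[3 * (y * nx + x) + k] line in the test file below). A numpy sketch of that conversion:

import numpy as np

nx, ny = 5, 3
interleaved = np.arange(3 * nx * ny)        # buf[3*(y*nx + x) + k]
planar = interleaved.reshape(ny * nx, 3).T  # -> [k, y*nx + x], channel-planar
print(planar.reshape(3, ny, nx)[0])         # channel 0: rows 0 3 6 9 12 / 15 18 21 24 27 / ...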
examples/xgenmm/test_anyres_handle_patches.cpp (new file): 637 additions

@@ -0,0 +1,637 @@
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <type_traits>
#include <typeinfo>
#include <vector>

#include "clip.h"
#include "common.h"
#include "ggml.h"
#include "llama.h"
#include "xgenmm.h"
#ifndef _MSC_VER
#include <cxxabi.h>
#endif
#include <memory>
#include <string>
// added: headers required below (std::min/max, std::round, strerror,
// std::cout/std::cerr, std::numeric_limits, std::map, std::pair)
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <limits>
#include <map>
#include <utility>

template <class T>
std::string type_name()
{
    typedef typename std::remove_reference<T>::type TR;
    std::unique_ptr<char, void (*)(void*)> own(
#ifndef _MSC_VER
        abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr),
#else
        nullptr,
#endif
        std::free);
    std::string r = own != nullptr ? own.get() : typeid(TR).name();
    if (std::is_const<TR>::value) r += " const";
    if (std::is_volatile<TR>::value) r += " volatile";
    if (std::is_lvalue_reference<T>::value)
        r += "&";
    else if (std::is_rvalue_reference<T>::value)
        r += "&&";
    return r;
}

struct clip_image_u8
{
    int nx;
    int ny;

    std::vector<uint8_t> buf;
};

struct clip_image_f32
{
    int nx;
    int ny;

    std::vector<float> buf;
};

inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); }

static bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height)
{
    const int nx = img.nx;
    const int ny = img.ny;

    dst.nx = target_width;
    dst.ny = target_height;
    dst.buf.resize(3 * target_width * target_height);

    float Cc;
    float C[5];
    float d0, d2, d3, a0, a1, a2, a3;
    int i, j, k, jj;
    int x, y;
    float dx, dy;
    float tx, ty;

    tx = (float)nx / (float)target_width;
    ty = (float)ny / (float)target_height;

    // Bicubic interpolation; adapted from ViT.cpp, inspired by:
    // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
    // -> https://en.wikipedia.org/wiki/Bicubic_interpolation

    for (i = 0; i < target_height; i++)
    {
        for (j = 0; j < target_width; j++)
        {
            x = (int)(tx * j);
            y = (int)(ty * i);

            dx = tx * j - x;
            dy = ty * i - y;

            for (k = 0; k < 3; k++)
            {
                for (jj = 0; jj <= 3; jj++)
                {
                    d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];

                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                    a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;

                    C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;

                    d0 = C[0] - C[1];
                    d2 = C[2] - C[1];
                    d3 = C[3] - C[1];
                    a0 = C[1];
                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                    a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
                    Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;

                    const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
                    dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
                }
            }
        }
    }

    return true;
}

enum projector_type
{
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_RESAMPLER,
    PROJECTOR_TYPE_UNKNOWN,
};

static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    {PROJECTOR_TYPE_MLP, "mlp"},
    {PROJECTOR_TYPE_LDP, "ldp"},
    {PROJECTOR_TYPE_LDPV2, "ldpv2"},
    {PROJECTOR_TYPE_RESAMPLER, "resampler"},
};

struct clip_hparams
{
    int32_t image_size;
    int32_t patch_size;
    int32_t hidden_size;
    int32_t n_intermediate;
    int32_t projection_dim;
    int32_t n_head;
    int32_t n_layer;

    float eps;

    char mm_patch_merge_type[32] = "flat";  // spatial_unpad or flat (default)

    int32_t image_grid_pinpoints[32];
    int32_t image_crop_resolution;
};

struct clip_layer
{
    // attention
    struct ggml_tensor* k_w;
    struct ggml_tensor* k_b;
    struct ggml_tensor* q_w;
    struct ggml_tensor* q_b;
    struct ggml_tensor* v_w;
    struct ggml_tensor* v_b;

    struct ggml_tensor* o_w;
    struct ggml_tensor* o_b;

    // layernorm 1
    struct ggml_tensor* ln_1_w;
    struct ggml_tensor* ln_1_b;

    // ff
    struct ggml_tensor* ff_i_w;
    struct ggml_tensor* ff_i_b;

    struct ggml_tensor* ff_o_w;
    struct ggml_tensor* ff_o_b;

    // layernorm 2
    struct ggml_tensor* ln_2_w;
    struct ggml_tensor* ln_2_b;
};

struct clip_vision_model
{
    struct clip_hparams hparams;

    // embeddings
    struct ggml_tensor* class_embedding;
    struct ggml_tensor* patch_embeddings;
    struct ggml_tensor* patch_bias;
    struct ggml_tensor* position_embeddings;

    struct ggml_tensor* pre_ln_w;
    struct ggml_tensor* pre_ln_b;

    std::vector<clip_layer> layers;

    struct ggml_tensor* post_ln_w;
    struct ggml_tensor* post_ln_b;

    struct ggml_tensor* projection;

    // LLaVA projection
    struct ggml_tensor* mm_0_w = NULL;
    struct ggml_tensor* mm_0_b = NULL;
    struct ggml_tensor* mm_2_w = NULL;
    struct ggml_tensor* mm_2_b = NULL;

    struct ggml_tensor* image_newline = NULL;

    // Yi type models with mlp+normalization projection
    struct ggml_tensor* mm_1_w = NULL;  // Yi type models have 0, 1, 3, 4
    struct ggml_tensor* mm_1_b = NULL;
    struct ggml_tensor* mm_3_w = NULL;
    struct ggml_tensor* mm_3_b = NULL;
    struct ggml_tensor* mm_4_w = NULL;
    struct ggml_tensor* mm_4_b = NULL;

    // MobileVLM projection
    struct ggml_tensor* mm_model_mlp_1_w;
    struct ggml_tensor* mm_model_mlp_1_b;
    struct ggml_tensor* mm_model_mlp_3_w;
    struct ggml_tensor* mm_model_mlp_3_b;
    struct ggml_tensor* mm_model_block_1_block_0_0_w;
    struct ggml_tensor* mm_model_block_1_block_0_1_w;
    struct ggml_tensor* mm_model_block_1_block_0_1_b;
    struct ggml_tensor* mm_model_block_1_block_1_fc1_w;
    struct ggml_tensor* mm_model_block_1_block_1_fc1_b;
    struct ggml_tensor* mm_model_block_1_block_1_fc2_w;
    struct ggml_tensor* mm_model_block_1_block_1_fc2_b;
    struct ggml_tensor* mm_model_block_1_block_2_0_w;
    struct ggml_tensor* mm_model_block_1_block_2_1_w;
    struct ggml_tensor* mm_model_block_1_block_2_1_b;
    struct ggml_tensor* mm_model_block_2_block_0_0_w;
    struct ggml_tensor* mm_model_block_2_block_0_1_w;
    struct ggml_tensor* mm_model_block_2_block_0_1_b;
    struct ggml_tensor* mm_model_block_2_block_1_fc1_w;
    struct ggml_tensor* mm_model_block_2_block_1_fc1_b;
    struct ggml_tensor* mm_model_block_2_block_1_fc2_w;
    struct ggml_tensor* mm_model_block_2_block_1_fc2_b;
    struct ggml_tensor* mm_model_block_2_block_2_0_w;
    struct ggml_tensor* mm_model_block_2_block_2_1_w;
    struct ggml_tensor* mm_model_block_2_block_2_1_b;

    // MobileVLM_V2 projection
    struct ggml_tensor* mm_model_mlp_0_w;
    struct ggml_tensor* mm_model_mlp_0_b;
    struct ggml_tensor* mm_model_mlp_2_w;
    struct ggml_tensor* mm_model_mlp_2_b;
    struct ggml_tensor* mm_model_peg_0_w;
    struct ggml_tensor* mm_model_peg_0_b;

    // MINICPMV projection
    struct ggml_tensor* mm_model_pos_embed_k;
    struct ggml_tensor* mm_model_query;
    struct ggml_tensor* mm_model_proj;
    struct ggml_tensor* mm_model_kv_proj;
    struct ggml_tensor* mm_model_attn_q_w;
    struct ggml_tensor* mm_model_attn_q_b;
    struct ggml_tensor* mm_model_attn_k_w;
    struct ggml_tensor* mm_model_attn_k_b;
    struct ggml_tensor* mm_model_attn_v_w;
    struct ggml_tensor* mm_model_attn_v_b;
    struct ggml_tensor* mm_model_attn_o_w;
    struct ggml_tensor* mm_model_attn_o_b;
    struct ggml_tensor* mm_model_ln_q_w;
    struct ggml_tensor* mm_model_ln_q_b;
    struct ggml_tensor* mm_model_ln_kv_w;
    struct ggml_tensor* mm_model_ln_kv_b;
    struct ggml_tensor* mm_model_ln_post_w;
    struct ggml_tensor* mm_model_ln_post_b;
};

struct clip_ctx
{
    bool has_text_encoder = false;
    bool has_vision_encoder = false;
    bool has_llava_projector = false;
    bool has_minicpmv_projector = false;
    bool has_xgenmm_projector = false;
    int minicpmv_version = 2;

    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;

    float image_mean[3];
    float image_std[3];
    bool use_gelu = false;
    int32_t ftype = 1;

    bool has_class_embedding = true;
    bool has_pre_norm = true;
    bool has_post_norm = false;
    bool has_patch_bias = false;

    struct gguf_context* ctx_gguf;
    struct ggml_context* ctx_data;

    std::vector<uint8_t> buf_compute_meta;

    // memory buffers to evaluate the model
    ggml_backend_buffer_t params_buffer = NULL;

    ggml_backend_t backend = NULL;
    ggml_gallocr_t compute_alloc = NULL;

    struct clip_image_size* load_image_size;
};

static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long* sizeOut)
{
    auto file = fopen(path, "rb");
    if (file == NULL)
    {
        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

    fseek(file, 0, SEEK_END);
    auto fileSize = ftell(file);
    fseek(file, 0, SEEK_SET);

    auto buffer = (unsigned char*)malloc(fileSize);  // Allocate memory to hold the file data
    if (buffer == NULL)
    {
        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
    }
    errno = 0;
    size_t ret = fread(buffer, 1, fileSize, file);  // Read the file into the buffer
    if (ferror(file))
    {
        die_fmt("read error: %s", strerror(errno));
    }
    if (ret != (size_t)fileSize)
    {
        die("unexpectedly reached end of file");
    }
    fclose(file);  // Close the file

    *bytesOut = buffer;
    *sizeOut = fileSize;
    return true;
}

void print_img(clip_image_u8* img)
{
    const int nx = img->nx;
    const int ny = img->ny;
    printf("num pixels: %zu\n", img->buf.size());  // %zu: buf.size() is size_t
    printf("raw img: nx:%d | ny:%d\n", nx, ny);

    const int n = nx * ny;
    for (int k = 0; k < 3; k++)
    {
        for (int y = 0; y < 5; y++)
        {
            for (int x = 0; x < 10; x++)
            {
                // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k];
                printf("%d ", img->buf[3 * (y * nx + x) + k]);
            }
            printf("\n");
        }
        printf("\n");
    }
}

void img_to_csv(clip_image_u8* img, const char* filename)
{
    std::ofstream outFile(filename);
    if (!outFile.is_open())
    {
        std::cerr << "Error opening file!" << std::endl;
    }
    const int nx = img->nx;
    const int ny = img->ny;

    for (int k = 0; k < 3; k++)
    {
        for (int y = 0; y < ny; y++)
        {
            for (int x = 0; x < nx; x++)
            {
                outFile << int(img->buf[3 * (y * nx + x) + k]);
                if (x < nx - 1)
                {
                    outFile << ",";
                }
            }
            outFile << std::endl;
        }
        outFile << std::endl;
    }

    outFile.close();
    printf("file saved to %s\n", filename);
}

void tensor_to_csv(clip_image_f32* img, const char* filename)
{
    std::ofstream outFile(filename);
    if (!outFile.is_open())
    {
        std::cerr << "Error opening file!" << std::endl;
    }
    const int nx = img->nx;
    const int ny = img->ny;

    for (int k = 0; k < 3; k++)
    {
        for (int y = 0; y < ny; y++)
        {
            for (int x = 0; x < nx; x++)
            {
                outFile << float(img->buf[3 * (y * nx + x) + k]);
                if (x < nx - 1)
                {
                    outFile << ",";
                }
            }
            outFile << std::endl;
        }
        outFile << std::endl;
    }

    outFile.close();
    printf("file saved to %s\n", filename);
}

struct clip_image_grid_shape
{
    int first;
    int second;
};

static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size,
                                                  const std::vector<std::pair<int, int>>& possible_resolutions)
{
    int original_width = original_size.first;
    int original_height = original_size.second;

    std::pair<int, int> best_fit;
    int max_effective_resolution = 0;
    int min_wasted_resolution = std::numeric_limits<int>::max();

    for (const auto& resolution : possible_resolutions)
    {
        int width = resolution.first;
        int height = resolution.second;
        float scale =
            std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
        int downscaled_width = static_cast<int>(original_width * scale);
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale,
        // downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution ||
            (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution))
        {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
            best_fit = resolution;
        }
    }

    return best_fit;
}

static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size,
                                                                const std::vector<std::pair<int, int>>& grid_pinpoints,
                                                                int image_patch_size)
{
    /**
        Conversion from gguf flat array to vector:
        std::vector<std::pair<int, int>> possible_resolutions;
        for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
        }
    */
    auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
    return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
}

int main(){
    const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
    struct clip_ctx * ctx = clip_model_load(clip_path, /*verbosity=*/2);
    printf("Model loaded\n");
    for (int i=0; i < 3; i++){
        ctx->image_mean[i] = 0.5;
        ctx->image_std[i] = 0.5;
    }
    LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
    LOG_TEE("v_image_std %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
    // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
    ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
    ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
    for (int i = 0; i < 10; i++)
    {
        printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
    }
    printf("\n");
    ctx->vision_model.hparams.image_size = 384;
    printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
    /*
        part of:
            llava_image_embed_make_with_filename
    */
    const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg";  // Porcelain
    // const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg";
    unsigned char* image_bytes;
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded)
    {
        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return 1;  // fixed: was `return NULL;` in an int-returning main
    }

    /*
        part of:
            llava_image_embed_make_with_bytes
    */
    clip_image_u8* img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img))
    {
        clip_image_u8_free(img);
        LOG_TEE("%s: can't load image from bytes, is it a valid image?\n", __func__);
        return 1;
    }

    /*
        part of:
            encode_image_with_clip
    */
    clip_image_f32_batch img_res_v;
    img_res_v.size = 0;
    img_res_v.data = nullptr;
    if (!clip_image_preprocess(ctx, img, &img_res_v))
    {
        LOG_TEE("%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return 1;
    }
    printf("img->nx:%d | img->ny:%d\n", img->nx, img->ny);
    printf("Batch size: img_res_v.size:%zu\n", img_res_v.size);

    // std::cout << "decltype(img_res_v.data) is " << type_name<decltype(img_res_v.data)>() << '\n';

    // printf("Image Dimension in this batch: img_res_v.data->nx:%d | img_res_v.data->ny:%d\n", img_res_v.data->nx,
    //        img_res_v.data->ny);
    // printf("img_res_v.data->buf.size():%zu\n", img_res_v.data->buf.size());

    // std::cout << "decltype(img_res_v.data[0]) is " << type_name<decltype(img_res_v.data[0])>() << '\n';
    // std::cout << "decltype(img_res_v.data[0].buf[0]) is " << type_name<decltype(img_res_v.data[0].buf[0])>() << '\n';
    // for (size_t i = 0; i < img_res_v.size; i++) {
    //     const int nx = img_res_v.data[i].nx;
    //     const int ny = img_res_v.data[i].ny;
    //     const int vec_len = img_res_v.data[i].buf.size();
    //     printf("i:%d | nx:%d | ny:%d | vec len:%d\n", i, nx, ny, vec_len);
    // }

    const char* mm_patch_merge_type = clip_patch_merge_type(ctx);
    printf("mm_patch_merge_type:%s\n", mm_patch_merge_type);

    struct clip_ctx* ctx_clip = ctx;
    const int32_t* image_grid = clip_image_grid(ctx_clip);

    std::vector<std::pair<int, int>> grid_pinpoints;
    for (int i = 0; i < 32 && image_grid[i] != 0; i += 2)
    {
        grid_pinpoints.push_back({image_grid[i], image_grid[i + 1]});
    }
    for (const auto& point : grid_pinpoints)
    {
        std::cout << "(" << point.first << ", " << point.second << ")" << std::endl;
    }

    const int32_t image_size = clip_image_size(ctx_clip);
    printf("image_size:%d\n", image_size);

    struct clip_image_grid_shape grid_shape =
        get_anyres_image_grid_shape({img->nx, img->ny}, grid_pinpoints, image_size);

    printf("grid_shape.first:%d | grid_shape.second:%d\n", grid_shape.first, grid_shape.second);

    std::vector<float*> image_embd_v;
    image_embd_v.resize(img_res_v.size);
    printf("image_embd_v.size():%zu\n", image_embd_v.size());
    for (size_t i = 0; i < img_res_v.size; i++)
    {
        image_embd_v[i] =
            (float*)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
        const bool encoded = clip_image_encode(
            ctx_clip, 1, &img_res_v.data[i],
            image_embd_v[i]);  // image data is in 3x336x336 format and will be converted to 336x336x3 inside
        if (!encoded)
        {
            LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1, (int)img_res_v.size);
            return 1;
        }
    }

    return 0;
}

// make test_anyres_handle_patches && ./bin/test_anyres_handle_patches
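The bicubic_resize above evaluates one cubic kernel across x for the four neighboring rows and then the same kernel across y on the results, with coefficients built from finite differences around the second sample. A small Python check of that 1-D kernel, with the a1/a2/a3 expressions taken verbatim from the code above:

def cubic(p0, p1, p2, p3, t):
    # differences around the center sample p1, as in bicubic_resize
    d0, d2, d3 = p0 - p1, p2 - p1, p3 - p1
    a0 = p1
    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3
    a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2
    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3
    return a0 + a1 * t + a2 * t * t + a3 * t ** 3

# interpolating samples on a straight line reproduces the line exactly
print(cubic(0.0, 1.0, 2.0, 3.0, 0.25))  # -> 1.25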
@@ -255,19 +255,21 @@ struct clip_vision_model
    struct ggml_tensor* mm_model_ln_post_b;
};

struct clip_ctx {
    bool has_text_encoder = false;
    bool has_vision_encoder = false;
struct clip_ctx
{
    bool has_text_encoder = false;
    bool has_vision_encoder = false;
    bool has_llava_projector = false;
    bool has_minicpmv_projector = false;
    int minicpmv_version = 2;
    bool has_xgenmm_projector = true;
    int minicpmv_version = 2;

    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;

    float image_mean[3];
    float image_std[3];
    bool use_gelu = false;
    int32_t ftype = 1;

    bool has_class_embedding = true;
@@ -275,18 +277,18 @@ struct clip_ctx {
    bool has_post_norm = false;
    bool has_patch_bias = false;

    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_data;
    struct gguf_context* ctx_gguf;
    struct ggml_context* ctx_data;

    std::vector<uint8_t> buf_compute_meta;

    // memory buffers to evaluate the model
    ggml_backend_buffer_t params_buffer = NULL;

    ggml_backend_t backend = NULL;
    ggml_gallocr_t compute_alloc = NULL;

    struct clip_image_size * load_image_size;
    struct clip_image_size* load_image_size;
};

static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long* sizeOut)
@@ -476,7 +478,7 @@ int main(){
    }
    printf("\n");
    ctx->vision_model.hparams.image_size = 384;
    printf("params.image_size:%d\n", ctx->vision_model.hparams.image_size);
    printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
    /*
        part of:
            llava_image_embed_make_with_filename
@@ -504,17 +506,17 @@ int main(){
        return NULL;
    }

    print_img(img);
    // print_img(img);

    clip_image_u8* image_original_resize = clip_image_u8_init();
    bicubic_resize(*img, *image_original_resize, 384, 384);
    // clip_image_u8* image_original_resize = clip_image_u8_init();
    // bicubic_resize(*img, *image_original_resize, 384, 384);

    printf("**********************************\n");
    // printf("**********************************\n");

    print_img(image_original_resize);
    img_to_csv(image_original_resize, "/export/home/llama.cpp/examples/xgenmm/imgs/image_original_resize.csv");
    printf("num pixels: %d\n", image_original_resize->buf.size());
    printf("raw img: nx:%d | ny:%d\n", image_original_resize->nx, image_original_resize->ny);
    // print_img(image_original_resize);
    // img_to_csv(image_original_resize, "/export/home/llama.cpp/examples/xgenmm/imgs/image_original_resize.csv");
    // printf("num pixels: %d\n", image_original_resize->buf.size());
    // printf("raw img: nx:%d | ny:%d\n", image_original_resize->nx, image_original_resize->ny);

    /*
        part of:
@@ -523,7 +525,6 @@ int main(){
    clip_image_f32_batch img_res_v;
    img_res_v.size = 0;
    img_res_v.data = nullptr;

    if (!clip_image_preprocess(ctx, img, &img_res_v))
    {
        LOG_TEE("%s: unable to preprocess image\n", __func__);
@@ -531,36 +532,34 @@ int main(){
        return false;
    }
    printf("img->nx:%ld | img->ny:%ld\n", img->nx, img->ny);
    // printf("img_res_v.size:%ld\n", img_res_v.size);
    printf("img_res_v.size:%ld\n", img_res_v.size);
    printf("img_res_v->nx:%ld | img_res_v->ny:%ld\n", img_res_v.data->nx, img_res_v.data->ny);
    // std::cout << img_res_v.data->nx << " | " << img_res_v.data->ny << std::endl;
    // std::cout << img_res_v.data->buf.size() << std::endl;

    const char* mm_patch_merge_type = clip_patch_merge_type(ctx);
    printf("mm_patch_merge_type:%s\n", mm_patch_merge_type);

    std::string basename = "/export/home/llama.cpp/examples/xgenmm/imgs/image_res";
    for (size_t i = 0; i < img_res_v.size; i++) {
        const int nx = img_res_v.data[i].nx;
        const int ny = img_res_v.data[i].ny;
        printf("i:%d | nx:%d | ny:%d\n", i, nx, ny);
    // std::string basename = "/export/home/llama.cpp/examples/xgenmm/imgs/image_res";
    // for (size_t i = 0; i < img_res_v.size; i++) {
    //     const int nx = img_res_v.data[i].nx;
    //     const int ny = img_res_v.data[i].ny;
    //     printf("i:%d | nx:%d | ny:%d\n", i, nx, ny);

        const int n = nx * ny;
    //     const int n = nx * ny;

        for (int k = 0; k < 1; k++) {
            for (int y = 0; y < 5; y++) {
                for (int x = 0; x < 10; x++) {
                    // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k];
                    printf("%.4f ", img_res_v.data[i].buf[3 * (y * nx + x) + k]);
                }
                printf("\n");
            }
            printf("\n");
        }
        std::string fileName = basename + "_" + std::to_string(i) + ".csv";
        tensor_to_csv(&img_res_v.data[i], fileName.c_str());
    }
    // for (int k = 0; k < 1; k++) {
    //     for (int y = 0; y < 5; y++) {
    //         for (int x = 0; x < 10; x++) {
    //             // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k];
    //             printf("%.4f ", img_res_v.data[i].buf[3 * (y * nx + x) + k]);
    //         }
    //         printf("\n");
    //     }
    //     printf("\n");
    // }
    // std::string fileName = basename + "_" + std::to_string(i) + ".csv";
    // tensor_to_csv(&img_res_v.data[i], fileName.c_str());
    // }

    // /*
@@ -72,6 +72,60 @@ def bytes_to_unicode():
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

#########################################
#### below are additions for xgenmm
#########################################

def _replace_name_vit(s, v):
    s = "vision_model." + s
    if re.match("vision_model.embeddings.position_embedding", s):
        v = v.unsqueeze(0)
        return {s: v}
    return {s: v}

def _replace_attn_layer(key, value):
    # Check for the special case first
    if re.match(r'layers\.(\d+)\.0\.to_kv\.weight', key):
        idx = re.search(r'layers\.(\d+)\.0\.to_kv\.weight', key).group(1)
        KVweight = value.chunk(2, dim=0)
        return {f'blk.{idx}.attn.to_k.weight': KVweight[0],
                f'blk.{idx}.attn.to_v.weight': KVweight[1]
                }

    # Apply general replacements for the other patterns
    # Define the replacement patterns
    patterns = [
        (r'layers\.(\d+)\.0\.norm_media\.(weight|bias)', r'blk.\1.attn.norm_media.\2'),
        (r'layers\.(\d+)\.0\.norm_latents\.(weight|bias)', r'blk.\1.attn.norm_latents.\2'),
        (r'layers\.(\d+)\.0\.to_q\.(weight)', r'blk.\1.attn.to_q.\2'),
        (r'layers\.(\d+)\.0\.to_out\.(weight)', r'blk.\1.attn.to_out.\2'),
        (r'layers\.(\d+)\.1\.0\.(weight|bias)', r'blk.\1.ffn.ln.\2'),
        (r'layers\.(\d+)\.1\.1\.weight', r'blk.\1.ffn.linear_up.weight'),
        (r'layers\.(\d+)\.1\.3\.weight', r'blk.\1.ffn.linear_down.weight'),
    ]
    for pattern, replacement in patterns:
        key = re.sub(pattern, replacement, key)

    return {key: value}

def replace_tensor_name_xgenmm_projector(ckpt):
    identifier = 'perceiver_resampler.'
    new_state_dict = {}
    for k, v in ckpt.items():
        # handle the layers
        if 'layers' in k:
            new_kvs = _replace_attn_layer(k, v)
            for new_k, new_v in new_kvs.items():
                new_state_dict[identifier + new_k] = new_v
        elif k == 'norm.weight':
            new_k = 'ln.weight'
            new_state_dict[identifier + new_k] = v
        elif k == 'norm.bias':
            new_k = 'ln.bias'
            new_state_dict[identifier + new_k] = v
        else:
            new_state_dict[identifier + k] = v
    return new_state_dict

class print_time():
    def __init__(self, task):
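To illustrate the renaming scheme, a quick self-contained demo of the fused to_kv split (the key and shapes are hypothetical; numpy's split plays the role of torch's chunk(2, dim=0) here):

import re
import numpy as np

key, value = 'layers.3.0.to_kv.weight', np.zeros((2 * 1536, 1152))  # hypothetical fused KV weight

idx = re.match(r'layers\.(\d+)\.0\.to_kv\.weight', key).group(1)
k_w, v_w = np.split(value, 2, axis=0)  # same effect as value.chunk(2, dim=0) in torch
print(f'perceiver_resampler.blk.{idx}.attn.to_k.weight', k_w.shape)
print(f'perceiver_resampler.blk.{idx}.attn.to_v.weight', v_w.shape)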
@@ -90,12 +144,14 @@ def get_args():
    parser.add_argument("--surgery_dir", type=str, default='/export/share/yutong/xgenmm/llamacpp_wd')
    parser.add_argument('--version', type=str, default='siglip_kosmos_phi3_4k_instruct', help='help identify the version of the saved ckpt')
    # options kept from the llama.cpp project
    parser.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
    parser.add_argument("--use_f32", action="store_true", default=False, help="Use f32 instead of f16")
    parser.add_argument("--text-only", action="store_true", required=False,
    parser.add_argument("--text_only", action="store_true", required=False,
                        help="Save a text-only model. It can't be used to encode images")
    parser.add_argument("--vision-only", action="store_true", required=False,
    parser.add_argument("--vision_only", action="store_true", required=False,
                        help="Save a vision-only model. It can't be used to encode texts")
    parser.add_argument("--xgenmm-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for XgenMM models.")
    parser.add_argument("--xgenmm_projector", help="Path to xgenmm projector file. If specified, save an image encoder for XgenMM models.")
    parser.add_argument("--xgenmm_vit", help="Path to vit file.")
    parser.add_argument("--output_dirname", default="gguf", help="Output directory")

    return parser.parse_args()
@@ -108,7 +164,7 @@ if __name__ == "__main__":
        exit(1)

    if args.use_f32:
        print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
        print("🟡 WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

    # possible data types
    # ftype == 0 -> float32
@@ -122,16 +178,16 @@ if __name__ == "__main__":
        ftype = 0

    ckpt_dir = f"{args.surgery_dir}/{args.version}"
    args.xgenmm_projector = f"ckpt_dir/xgenmm.projector"
    if args.xgenmm_projector is None:
        args.xgenmm_projector = f"{ckpt_dir}/xgenmm.projector"
    if args.xgenmm_vit is None:
        args.xgenmm_vit = f"{ckpt_dir}/vision_encoder/xgenmm.vision_encoder"
    output_dir = f"{ckpt_dir}/{args.output_dirname}"

    with print_time("Loading vision encoder"):
        vision_encoder_config_path = f"{args.surgery_dir}/{args.version}/vision_encoder/config.json"
        with open(vision_encoder_config_path, 'r') as f:
            vision_config = json.load(f)
        vision_encoder_config = SiglipVisionConfig(**vision_config)
        # vision_encoder = SiglipVisionTransformer(vision_encoder_config)
        # vision_encoder_ckpt = torch.load(f'{ckpt_dir}/vision_encoder/xgenmm.vision_encoder')
        # vision_encoder.load_state_dict(vision_encoder_ckpt)

    vision_encoder_config_path = f"{args.surgery_dir}/{args.version}/vision_encoder/config.json"
    with open(vision_encoder_config_path, 'r') as f:
        vision_config = json.load(f)

    fname_middle = None
    has_text_encoder = True
@@ -151,10 +207,9 @@ if __name__ == "__main__":
        fname_middle = ""

    output_dir = f"{ckpt_dir}/gguf"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_prefix = os.path.basename(output_dir).replace("ggml_", "")

    fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")

    fout = GGUFWriter(path=fname_out, arch="clip")
@@ -181,52 +236,146 @@
    "num_attention_heads", "layer_norm_eps", "num_hidden_layers", "hidden_act"
    unused: "attention_dropout", "model_type", "num_channels"
    """
    fout.add_uint32("clip.vision.image_size", vision_config["image_size"])
    fout.add_uint32("clip.vision.patch_size", vision_config["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vision_config["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), vision_config["intermediate_size"])
    # TODO: need to check the value of projection_dim; follow minicpmv and set it to 0
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vision_config["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), vision_config["layer_norm_eps"])
    # TODO: check this as it might cause bugs
    # original llava implementation:
    # block_count = vision_config["num_hidden_layers"] - 1 if has_xgenmm_projector else vision_config["num_hidden_layers"]
    # we differ from llava-1.6, which used the second-to-last layer's hidden states as the image features.
    block_count = vision_config["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
    print(KEY_BLOCK_COUNT)
    # xgenmm uses anyres with a grid configuration:
    # 1*2, 2*1, 2*2, 3*1, 1*3, the same as llava-1.6, so we just hard-code it here
    image_grid_pinpoints = [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008]
    fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
    with print_time("add vit configs to gguf"):
        fout.add_uint32("clip.vision.image_size", vision_config["image_size"])
        fout.add_uint32("clip.vision.patch_size", vision_config["patch_size"])
        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vision_config["hidden_size"])
        fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), vision_config["intermediate_size"])
        # TODO: need to check the value of projection_dim; follow minicpmv and set it to 0
        fout.add_uint32("clip.vision.projection_dim", 0)
        fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vision_config["num_attention_heads"])
        fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), vision_config["layer_norm_eps"])
        # TODO: check this as it might cause bugs
        # original llava implementation:
        # block_count = vision_config["num_hidden_layers"] - 1 if has_xgenmm_projector else vision_config["num_hidden_layers"]
        # we differ from llava-1.6, which used the second-to-last layer's hidden states as the image features.
        block_count = vision_config["num_hidden_layers"]
        fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
        # xgenmm uses anyres with a grid configuration:
        # 1*2, 2*1, 2*2, 3*1, 1*3, the same as llava-1.6, so we just hard-code it here
        # the base resolution is 384
        image_grid_pinpoints = [384, 768, 768, 384, 768, 768, 1152, 384, 384, 1152]
        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)

        image_mean = [0.5, 0.5, 0.5]
        image_std = [0.5, 0.5, 0.5]
        fout.add_array("clip.vision.image_mean", image_mean)
        fout.add_array("clip.vision.image_std", image_std)

        # vision_config["hidden_act"] is gelu_pytorch_tanh
        # ggml implements the gelu-with-tanh approximation
        use_gelu = "gelu" in vision_config["hidden_act"].lower()
        fout.add_bool("clip.use_gelu", use_gelu)

    # for the ViT model
    with print_time("Loading vision encoder and converting to gguf"):
        vision_encoder_config = SiglipVisionConfig(**vision_config)
        vision_encoder = SiglipVisionTransformer(vision_encoder_config)
        vision_encoder_ckpt = torch.load(f'{ckpt_dir}/vision_encoder/xgenmm.vision_encoder')
        vision_encoder.load_state_dict(vision_encoder_ckpt)
        state_dict = vision_encoder.state_dict()
        new_state_dict = {}
        for k_, v_ in state_dict.items():
            kvs = _replace_name_vit(k_, v_)
            for nk, nv in kvs.items():
                # split in_proj_weight into q_proj_weight, k_proj_weight, v_proj_weight
                if nk == "vision_model.head.attention.in_proj_weight":
                    dim = int(nv.shape[0] / 3)
                    nk_1 = "vision_model.head.attention.q_proj_weight"
                    nv_1 = nv[:dim, :]
                    nk_2 = "vision_model.head.attention.k_proj_weight"
                    nv_2 = nv[dim:2*dim, :]
                    nk_3 = "vision_model.head.attention.v_proj_weight"
                    nv_3 = nv[2*dim:, :]
                    new_state_dict[nk_1] = nv_1
                    new_state_dict[nk_2] = nv_2
                    new_state_dict[nk_3] = nv_3
                # split in_proj_bias into q_proj_bias, k_proj_bias, v_proj_bias
                elif nk == "vision_model.head.attention.in_proj_bias":
                    dim = int(nv.shape[0] / 3)
                    nk_1 = "vision_model.head.attention.q_proj_bias"
                    nv_1 = nv[:dim]
                    nk_2 = "vision_model.head.attention.k_proj_bias"
                    nv_2 = nv[dim:2*dim]
                    nk_3 = "vision_model.head.attention.v_proj_bias"
                    nv_3 = nv[2*dim:]
                    new_state_dict[nk_1] = nv_1
                    new_state_dict[nk_2] = nv_2
                    new_state_dict[nk_3] = nv_3
                else:
                    new_state_dict[nk] = nv

        state_dict = new_state_dict
        for name, data in state_dict.items():
            if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_xgenmm_projector):
                # we don't need this
                print(f"skipping parameter: {name}")
                continue

            name = get_tensor_name(name)
            data = data.squeeze().numpy()

            n_dims = len(data.shape)

            ftype_cur = 0
            if n_dims == 4:
                print(f"tensor {name} is always saved in f16")
                data = data.astype(np.float16)
                ftype_cur = 1
            elif ftype == 1:
                if name[-7:] == ".weight" and n_dims == 2:
                    print("  Converting to float16")
                    data = data.astype(np.float16)
                    ftype_cur = 1
                else:
                    print("  Converting to float32")
                    data = data.astype(np.float32)
                    ftype_cur = 0
            else:
                if data.dtype != np.float32:
                    print("  Converting to float32")
                    data = data.astype(np.float32)
                    ftype_cur = 0

            print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
            fout.add_tensor(name, data)

    image_mean = [0.5, 0.5, 0.5]
    image_std = [0.5, 0.5, 0.5]
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

    # TODO: need to check; vision_config["hidden_act"] is gelu_pytorch_tanh
    use_gelu = "gelu" in vision_config["hidden_act"].lower()
    fout.add_bool("clip.use_gelu", use_gelu)
    print("🟢 ViT tensors added!")

    if has_xgenmm_projector:
        projector = torch.load(args.xgenmm_projector)
        fout.add_uint32("clip.projector.input_dim", projector["input_dim"])
        fout.add_uint32("clip.projector.output_dim", projector["output_dim"])
        fout.add_uint32("clip.projector.num_heads", projector["num_heads"])
        fout.add_uint32("clip.projector.num_layers", projector["num_layers"])
        fout.add_uint32("clip.projector.hidden_dim", projector["hidden_dim"])
        fout.add_float32("clip.projector.dropout", projector["dropout"])
        fout.add_string("clip.projector.activation", projector["activation"])
        fout.add_string("clip.projector.norm", projector["norm"])
        fout.add_string("clip.projector.pooling", projector["pooling"])
        fout.add_string("clip.projector.pooling_norm", projector["pooling_norm"])
        fout.add_string("clip.projector.pooling_activation
        with print_time("Loading projector and converting to gguf"):
            projector_ckpt = torch.load(args.xgenmm_projector)
            projector = replace_tensor_name_xgenmm_projector(projector_ckpt)
            if args.use_f32:
                ftype = 0
            else:
                ftype = 1
            ftype_cur = ftype
            for name, tensor in projector.items():
                tensor = tensor.squeeze().numpy()
                # note: once an ln tensor flips ftype_cur to 0 below, later tensors
                # also take the f32 path; ftype_cur is not reset per tensor
                if ftype_cur == 1:
                    if 'ln.bias' in name or 'ln.weight' in name:
                        tensor = tensor.astype(np.float32)
                        ftype_cur = 0
                        print(f'❗ {name} is set to np.float32')
                    else:
                        tensor = tensor.astype(np.float16)
                        ftype_cur = 1
                        print(f'❗ {name} is set to np.float16')
                else:
                    if tensor.dtype != np.float32:
                        tensor = tensor.astype(np.float32)
                        print(f'❗ {name} is set to np.float32')
                    ftype_cur = 0

                print(f"{name} - {ftype_str[ftype_cur]} - shape = {tensor.shape}")
                fout.add_tensor(name, tensor)
            print("🟢 Projector tensors added\n")

    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()
    print("Done. Output file: " + fname_out)
    with print_time("write to gguf file"):
        fout.write_header_to_file()
        fout.write_kv_data_to_file()
        fout.write_tensors_to_file()
        fout.close()
        print("🟢 Done. Output file: " + fname_out)
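The dtype policy used in the ViT conversion loop above can be summarized as one small pure function. A sketch under stated assumptions (the helper and the example tensor name are hypothetical, not part of the script): 4-D conv kernels always go to f16, 2-D .weight tensors go to f16 when f16 output is requested, and everything else stays f32.

import numpy as np

def pick_dtype(name, n_dims, want_f16):
    # mirrors the rules applied to the ViT tensors above
    if n_dims == 4:
        return np.float16  # conv kernels are always stored in f16
    if want_f16 and name.endswith(".weight") and n_dims == 2:
        return np.float16
    return np.float32

print(pick_dtype("v.patch_embd.weight", 4, want_f16=True))  # float16
print(pick_dtype("v.blk.0.ln1.weight", 1, want_f16=True))   # float32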