minicpmv works but missing uhd slices

Xuan Son Nguyen 2025-01-22 22:42:00 +01:00
parent ba489b4743
commit c0d93dd509
11 changed files with 423 additions and 281 deletions


@@ -1372,12 +1372,14 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" },
-{ LLM_TENSOR_V_RESMPL_ATTN_IN, "v.resmpl.attn_in" },
+{ LLM_TENSOR_V_RESMPL_ATTN_Q, "v.resmpl.attn_q" },
+{ LLM_TENSOR_V_RESMPL_ATTN_K, "v.resmpl.attn_k" },
+{ LLM_TENSOR_V_RESMPL_ATTN_V, "v.resmpl.attn_v" },
{ LLM_TENSOR_V_RESMPL_ATTN_OUT, "v.resmpl.attn_out" },
-{ LLM_TENSOR_V_RESMPL_KV_PROJ, "v.resmpl.kv_proj" },
-{ LLM_TENSOR_V_RESMPL_NORM_POST, "v.resmpl.norm_post" },
-{ LLM_TENSOR_V_RESMPL_NORM_KV, "v.resmpl.norm_kv" },
-{ LLM_TENSOR_V_RESMPL_NORM_Q, "v.resmpl.norm_q" },
+{ LLM_TENSOR_V_RESMPL_KV, "v.resmpl.kv" },
+{ LLM_TENSOR_V_RESMPL_KV_NORM, "v.resmpl.kv_norm" },
+{ LLM_TENSOR_V_RESMPL_POST_NORM, "v.resmpl.post_norm" },
+{ LLM_TENSOR_V_RESMPL_Q_NORM, "v.resmpl.q_norm" },
{ LLM_TENSOR_V_RESMPL_PROJ, "v.resmpl.proj" },
{ LLM_TENSOR_V_RESMPL_QUERY, "v.resmpl.query" },
}
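Note: the `%d` placeholder in each repeating-layer name is substituted with the block index, and a `.weight`/`.bias` suffix is appended, when `tn(...)` builds the concrete GGUF tensor name. A simplified standalone sketch of that formatting (hypothetical helper, not the actual LLM_TN implementation):

    #include <cstdio>
    #include <string>

    // e.g. ("v.enc.blk.%d.ffn_up", 3, "weight") -> "v.enc.blk.3.ffn_up.weight"
    static std::string tensor_name(const char * templ, int il, const char * suffix) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), templ, il);
        return std::string(buf) + "." + suffix;
    }

    int main() {
        std::printf("%s\n", tensor_name("v.enc.blk.%d.ffn_up", 3, "weight").c_str());
    }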
@@ -1531,6 +1533,24 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+// vision
+{LLM_TENSOR_V_MMPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_MMPROJ_MLP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_MMPROJ_PEG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_ENC_EMBD_CLS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
+{LLM_TENSOR_V_ENC_EMBD_PATCH, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
+{LLM_TENSOR_V_ENC_EMBD_POS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
+{LLM_TENSOR_V_ENC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_ENC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_ENC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_ENC_INPUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+{LLM_TENSOR_V_ENC_OUTPUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+{LLM_TENSOR_V_ENC_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_ENC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+{LLM_TENSOR_V_PRE_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+{LLM_TENSOR_V_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+// TODO: add minicpmv resampler tensors
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}


@@ -371,12 +371,14 @@ enum llm_tensor {
LLM_TENSOR_V_POST_NORM,
// vision - minicpmv
LLM_TENSOR_V_RESMPL_POS_EMBD_K,
-LLM_TENSOR_V_RESMPL_ATTN_IN,
+LLM_TENSOR_V_RESMPL_ATTN_Q,
+LLM_TENSOR_V_RESMPL_ATTN_K,
+LLM_TENSOR_V_RESMPL_ATTN_V,
LLM_TENSOR_V_RESMPL_ATTN_OUT,
-LLM_TENSOR_V_RESMPL_KV_PROJ,
-LLM_TENSOR_V_RESMPL_NORM_POST,
-LLM_TENSOR_V_RESMPL_NORM_KV,
-LLM_TENSOR_V_RESMPL_NORM_Q,
+LLM_TENSOR_V_RESMPL_KV,
+LLM_TENSOR_V_RESMPL_KV_NORM,
+LLM_TENSOR_V_RESMPL_POST_NORM,
+LLM_TENSOR_V_RESMPL_Q_NORM,
LLM_TENSOR_V_RESMPL_PROJ,
LLM_TENSOR_V_RESMPL_QUERY,
};


@@ -1248,7 +1248,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.rope_type = llama_model_rope_type(this);
// vision model
-auto & vparams = clip.hparams;
+auto & vparams = vit.hparams;
std::string vision_type;
ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
if (vision_type == "vit") {
@@ -3451,10 +3451,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
__func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
}
}
// load tensors for vision model
-auto & vparams = clip.hparams;
+auto & vparams = vit.hparams;
if (has_vision) {
// language params
const int64_t n_embd = hparams.n_embd;
@@ -3467,101 +3466,122 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t patch_size = vparams.patch_size;
const auto tn = LLM_TN(vparams.arch);
-// clip is CPU-only for now
-clip.buft = ggml_backend_cpu_buffer_type();
-ggml_context * ctx_vision = ctx_map.at(clip.buft);
-clip.layers.resize(n_vlayer);
+// TODO: vit is cpu only for now
+vit.buft = ggml_backend_cpu_buffer_type();
+ggml_context * ctx_vision = ctx_map.at(vit.buft);
+vit.layers.resize(n_vlayer);
switch (vparams.arch) {
case LLM_ARCH_VISION_LLAVA:
case LLM_ARCH_VISION_MOBILEVLM:
{
if (vparams.arch == LLM_ARCH_VISION_LLAVA) {
-clip.mm_1_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff});
-clip.mm_1_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff});
-clip.mm_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff});
-clip.mm_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff});
+vit.mm_1_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff});
+vit.mm_1_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff});
+vit.mm_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff});
+vit.mm_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff});
} else if (vparams.arch == LLM_ARCH_VISION_MOBILEVLM) {
-clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd});
-clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd});
-clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd});
-clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd});
-clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd});
-clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd});
+vit.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd});
+vit.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd});
+vit.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd});
+vit.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd});
+vit.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd});
+vit.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd});
}
-clip.class_embedding = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd});
-clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
-clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
+vit.class_embedding = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd});
+vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
+vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
-clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd});
-clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd});
-clip.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-clip.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+vit.pre_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd});
+vit.pre_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd});
+vit.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+vit.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
for (int i = 0; i < n_vlayer; ++i) {
-auto & layer = clip.layers[i];
+auto & layer = vit.layers[i];
-layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
-layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd});
-layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
-layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd});
-layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
-layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd});
+layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0);
+layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0);
+layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0);
+layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0);
+layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0);
+layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0);
-layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
-layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff});
-layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
-layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd});
+layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0);
+layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0);
+layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0);
+layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0);
-layer.norm_in_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd});
-layer.norm_in_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd});
-layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
-layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd});
+layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0);
+layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0);
+layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0);
+layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0);
-layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
-layer.output_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd});
+layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0);
+layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0);
}
} break;
case LLM_ARCH_VISION_MINICPMV:
{
-clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
-clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
+vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
+vit.patch_bias = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd});
+vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
-// TODO: load all resampler tensors
+// resampler
+int rs_n_embd = llama_vision_n_mmproj_embd(vit);
+vit.mm_model_pos_embed_k = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POS_EMBD_K, "weight"), {rs_n_embd, max_pos_embd});
+vit.mm_model_query = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_QUERY, "weight"), {rs_n_embd, 64}); // 64 learnable queries for MiniCPM-V 2.6 (2.5 uses 96, see build_minicpmv)
+vit.mm_model_proj = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_PROJ, "weight"), {rs_n_embd, rs_n_embd});
+vit.mm_model_kv_proj = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV, "weight"), {n_vembd, rs_n_embd});
+vit.mm_model_attn_q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "weight"), {rs_n_embd, rs_n_embd});
+vit.mm_model_attn_q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "bias" ), {rs_n_embd});
+vit.mm_model_attn_k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_K, "weight"), {rs_n_embd, rs_n_embd});
+vit.mm_model_attn_k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_K, "bias" ), {rs_n_embd});
+vit.mm_model_attn_v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_V, "weight"), {rs_n_embd, rs_n_embd});
+vit.mm_model_attn_v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_V, "bias" ), {rs_n_embd});
+vit.mm_model_attn_o_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "weight"), {rs_n_embd, rs_n_embd});
+vit.mm_model_attn_o_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "bias" ), {rs_n_embd});
+vit.mm_model_ln_q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_Q_NORM, "weight"), {rs_n_embd});
+vit.mm_model_ln_q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_Q_NORM, "bias" ), {rs_n_embd});
+vit.mm_model_ln_kv_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV_NORM, "weight"), {rs_n_embd});
+vit.mm_model_ln_kv_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV_NORM, "bias" ), {rs_n_embd});
+vit.mm_model_ln_post_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "weight"), {rs_n_embd});
+vit.mm_model_ln_post_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "bias" ), {rs_n_embd});
for (int i = 0; i < n_vlayer; ++i) {
-auto & layer = clip.layers[i];
+auto & layer = vit.layers[i];
-layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
-layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd});
-layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
-layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd});
-layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
-layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd});
+layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0);
+layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0);
+layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0);
+layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0);
+layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0);
+layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0);
-layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
-layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff});
-layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
-layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd});
+layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0);
+layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0);
+layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0);
+layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0);
-layer.norm_in_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd});
-layer.norm_in_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd});
-layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
-layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd});
+layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0);
+layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0);
+layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0);
+layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0);
-layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
-layer.output_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd});
+layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0);
+layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0);
}
} break;
default:
throw std::runtime_error("unknown vision architecture");
}
-if (llama_vision_n_mmproj_embd(clip) != hparams.n_embd) {
+if (llama_vision_n_mmproj_embd(vit) != hparams.n_embd) {
throw std::runtime_error("model has vision, but n_mmproj_embd != n_embd"); // note: without `throw`, the exception object is constructed and silently discarded
}
}
}
ml.done_getting_tensors();


@@ -365,7 +365,7 @@ struct llama_model {
// vision
bool has_vision = false;
-llama_vision_model clip;
+llama_vision_model vit;
private:
struct impl;


@@ -19,8 +19,6 @@ struct img_size;
static int bmp_export(const struct llama_image_u8 &img, const std::string &location);
#endif
-#define VISION_GRAPH_MAX_NODE 1024
struct img_size {
int width;
int height;
@@ -48,9 +46,9 @@ uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel) {
} else if (proj_type == VISION_PROJECTOR_TYPE_LDPV2) {
return vmodel.mm_model_peg_0_b->ne[0];
} else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_5) {
-return 4096;
+return 4096; // resampler output dim (MiniCPM-Llama3-V 2.5, matches Llama3-8B n_embd)
} else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_6) {
-return 3584;
+return 3584; // resampler output dim (MiniCPM-V 2.6, matches Qwen2-7B n_embd)
} else {
GGML_ASSERT(false && "invalid proj type");
}
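These two constants mirror the hidden size of the attached language model, which is exactly the invariant the `n_mmproj_embd != n_embd` check in `load_tensors` enforces. A minimal standalone sketch of that relationship (hypothetical names, not the library API):

    #include <cstdint>
    #include <cstdio>

    enum minicpmv_version { MINICPMV_2_5, MINICPMV_2_6 };

    // the resampler projects image features out at the LM embedding width
    static uint32_t resampler_out_embd(minicpmv_version v) {
        return v == MINICPMV_2_5 ? 4096   // Llama3-8B hidden size
                                 : 3584;  // Qwen2-7B hidden size
    }

    int main() {
        const uint32_t lm_n_embd = 3584; // e.g. MiniCPM-V 2.6
        if (resampler_out_embd(MINICPMV_2_6) != lm_n_embd) {
            std::fprintf(stderr, "model has vision, but n_mmproj_embd != n_embd\n");
            return 1;
        }
        std::puts("projector width matches LM width");
    }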
@@ -761,16 +759,21 @@ struct llama_vision_graph_builder {
return cur;
}
// graph for each vision arch
-struct ggml_cgraph * build_llava() {
-struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false);
+struct ggml_tensor * build_vit() {
struct ggml_tensor * cur = build_inp();
cur = build_pre_norm(cur);
for (int il = 0; il < n_layers; il++) {
cur = build_layer(cur, il);
}
cur = build_post_norm(cur);
+return cur;
+}
+// graph for each vision arch
+struct ggml_cgraph * build_llava() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false);
+struct ggml_tensor * cur = build_vit();
// llava projector
{
@@ -825,6 +828,78 @@ struct llama_vision_graph_builder {
return gf;
}
+struct ggml_cgraph * build_minicpmv() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false);
+struct ggml_tensor * cur = build_vit();
+// minicpmv resampler projector
+{
+int hidden_size = llama_vision_n_mmproj_embd(*ctx.model);
+struct ggml_tensor * q = model.mm_model_query;
+// layernorm
+{
+q = ggml_norm(ctx0, q, eps);
+q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+}
+struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, cur);
+// layernorm
+{
+v = ggml_norm(ctx0, v, eps);
+v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+}
+// position
+struct ggml_tensor * k = ggml_add(ctx0, v, model.mm_model_pos_embed_k);
+// attention
+{
+const int d_head = 128;
+int n_head = hidden_size/d_head;
+int num_query = -1;
+if (model.hparams.proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_5) {
+num_query = 96;
+} else if (model.hparams.proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_6) {
+num_query = 64;
+}
+struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+// permute
+Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); // TODO: do this when converting the model
+Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); // TODO: do this when converting the model
+K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // TODO: do this when converting the model
+V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+KQ = ggml_soft_max_inplace(ctx0, KQ);
+struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); // TODO: do this when converting the model
+KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
+}
+// layernorm
+{
+cur = ggml_norm(ctx0, cur, eps);
+cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
+}
+cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+}
+ggml_set_name(cur, "output");
+ggml_build_forward_expand(gf, cur);
+return gf;
+}
};
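A quick way to sanity-check the resampler attention shapes above: ggml stores dimensions as ne = [cols, rows, ...], and ggml_mul_mat(A, B) requires A->ne[0] == B->ne[0], producing [A->ne[1], B->ne[1], ...]. The standalone sketch below (assuming the ggml headers/library are available; the concrete sizes are illustrative, taken from MiniCPM-V 2.6's d_head = 128 and hidden size 3584) prints the KQ and KQV shapes:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params params = { 16u*1024*1024, nullptr, /*no_alloc =*/ true };
        struct ggml_context * ctx = ggml_init(params);

        const int d_head = 128, n_head = 3584/128, num_query = 64, num_positions = 1024, B = 1;

        // shapes after the permutes/reshapes in build_minicpmv:
        struct ggml_tensor * Q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_head, num_query,     n_head*B);
        struct ggml_tensor * K = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_head, num_positions, n_head*B);
        struct ggml_tensor * V = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, num_positions, d_head, n_head*B);

        struct ggml_tensor * KQ  = ggml_mul_mat(ctx, K, Q);   // -> [num_positions, num_query, n_head*B]
        struct ggml_tensor * KQV = ggml_mul_mat(ctx, V, KQ);  // -> [d_head,        num_query, n_head*B]

        std::printf("KQ : %lld x %lld x %lld\n", (long long) KQ->ne[0],  (long long) KQ->ne[1],  (long long) KQ->ne[2]);
        std::printf("KQV: %lld x %lld x %lld\n", (long long) KQV->ne[0], (long long) KQV->ne[1], (long long) KQV->ne[2]);
        ggml_free(ctx);
    }

The softmax then runs over ne[0] of KQ, i.e. over the num_positions axis, which is the expected attention over image patch positions.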
static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_vision_tokens & inp) {
@@ -852,8 +927,11 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
case LLM_ARCH_VISION_MOBILEVLM:
gf = builder.build_llava();
break;
+case LLM_ARCH_VISION_MINICPMV:
+gf = builder.build_minicpmv();
+break;
default:
-GGML_ASSERT(false && "unsupported arch");
+GGML_ASSERT(false && "unsupported vision arch");
}
// alloc memory for graph
@@ -903,8 +981,8 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
free(positions_data);
}
{
-struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "inp_patches");
+struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "inp_patches");
+if (patches) {
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
@@ -962,7 +1040,8 @@ struct llama_vision_tokens * llama_vision_tokenize(
case LLM_ARCH_VISION_MOBILEVLM:
return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp));
case LLM_ARCH_VISION_MINICPMV:
-return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp));
+//return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp)); // UHD slicing disabled for now, hence "missing uhd slices" in the commit title
+return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp));
default:
GGML_ASSERT(false && "unsupported arch");
}


@@ -7,6 +7,8 @@
#include <vector>
#include <array>
+#define VISION_GRAPH_MAX_NODE 2048
enum vision_projector_type {
VISION_PROJECTOR_TYPE_UNKNOWN,
VISION_PROJECTOR_TYPE_MLP,
@@ -108,24 +110,24 @@ struct llama_vision_model {
struct ggml_tensor * mm_model_peg_0_b = nullptr;
// MINICPMV projection
-struct ggml_tensor * mm_model_pos_embed_k;
-struct ggml_tensor * mm_model_query;
-struct ggml_tensor * mm_model_proj;
-struct ggml_tensor * mm_model_kv_proj;
-struct ggml_tensor * mm_model_attn_q_w;
-struct ggml_tensor * mm_model_attn_q_b;
-struct ggml_tensor * mm_model_attn_k_w;
-struct ggml_tensor * mm_model_attn_k_b;
-struct ggml_tensor * mm_model_attn_v_w;
-struct ggml_tensor * mm_model_attn_v_b;
-struct ggml_tensor * mm_model_attn_o_w;
-struct ggml_tensor * mm_model_attn_o_b;
-struct ggml_tensor * mm_model_ln_q_w;
-struct ggml_tensor * mm_model_ln_q_b;
-struct ggml_tensor * mm_model_ln_kv_w;
-struct ggml_tensor * mm_model_ln_kv_b;
-struct ggml_tensor * mm_model_ln_post_w;
-struct ggml_tensor * mm_model_ln_post_b;
+struct ggml_tensor * mm_model_pos_embed_k = nullptr;
+struct ggml_tensor * mm_model_query = nullptr;
+struct ggml_tensor * mm_model_proj = nullptr;
+struct ggml_tensor * mm_model_kv_proj = nullptr;
+struct ggml_tensor * mm_model_attn_q_w = nullptr;
+struct ggml_tensor * mm_model_attn_q_b = nullptr;
+struct ggml_tensor * mm_model_attn_k_w = nullptr;
+struct ggml_tensor * mm_model_attn_k_b = nullptr;
+struct ggml_tensor * mm_model_attn_v_w = nullptr;
+struct ggml_tensor * mm_model_attn_v_b = nullptr;
+struct ggml_tensor * mm_model_attn_o_w = nullptr;
+struct ggml_tensor * mm_model_attn_o_b = nullptr;
+struct ggml_tensor * mm_model_ln_q_w = nullptr;
+struct ggml_tensor * mm_model_ln_q_b = nullptr;
+struct ggml_tensor * mm_model_ln_kv_w = nullptr;
+struct ggml_tensor * mm_model_ln_kv_b = nullptr;
+struct ggml_tensor * mm_model_ln_post_w = nullptr;
+struct ggml_tensor * mm_model_ln_post_b = nullptr;
struct ggml_tensor * image_newline = nullptr;
};


@@ -9838,9 +9838,9 @@ struct llama_context * llama_init_from_model(
}
if (model->has_vision) {
-ctx->vctx.model = &model->clip;
+ctx->vctx.model = &model->vit;
ctx->vctx.sched = ctx->sched.get();
-const size_t max_nodes = 1024;
+const size_t max_nodes = VISION_GRAPH_MAX_NODE; // TODO: make it dynamic
ctx->vctx.buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
}