add SmolVLM

Xuan Son Nguyen 2025-01-23 15:51:30 +01:00
parent 25a97ce4cb
commit c3a654c0fb
9 changed files with 171 additions and 10 deletions


@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_VISION_LLAVA, "llava" },
{ LLM_ARCH_VISION_MOBILEVLM, "mobilevlm" },
{ LLM_ARCH_VISION_MINICPMV, "minicpmv" },
{ LLM_ARCH_VISION_IDEFICS3, "idefics3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -214,6 +215,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" },
{ LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" },
{ LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" },
{ LLM_KV_VISION_VIT_SCALE_FACTOR, "vision.vit.scale_factor" },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
@@ -1388,6 +1390,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_V_TOK_EMBD_END_SLICE, "v.tok_embd.end_slice" },
}
},
{
LLM_ARCH_VISION_IDEFICS3,
{
{ LLM_TENSOR_V_MMPROJ_FC, "v.mmproj.fc" },
{ LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
{ LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_PRE_NORM, "v.pre_norm" },
{ LLM_TENSOR_V_POST_NORM, "v.post_norm" },
}
},
{
LLM_ARCH_UNKNOWN,
{
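The per-layer v.enc.blk.%d.* names registered above follow the same convention as the existing vision arches: the loader fills in the block index and appends a "weight" or "bias" suffix. A minimal illustrative helper showing how such a pattern is expected to expand (this sketch is not the actual LLM_TN machinery in llama.cpp):

    #include <cstdio>
    #include <string>

    // Illustrative only: tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", 0) is
    // expected to resolve to "v.enc.blk.0.attn_q.weight".
    static std::string expand_tensor_name(const char * pattern, const char * suffix, int bid) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), pattern, bid); // fill in the block index
        return std::string(buf) + "." + suffix;
    }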


@@ -70,6 +70,7 @@ enum llm_arch {
LLM_ARCH_VISION_LLAVA,
LLM_ARCH_VISION_MOBILEVLM,
LLM_ARCH_VISION_MINICPMV,
LLM_ARCH_VISION_IDEFICS3,
LLM_ARCH_UNKNOWN,
};
@@ -218,6 +219,7 @@ enum llm_kv {
LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,
LLM_KV_VISION_VIT_HEAD_COUNT,
LLM_KV_VISION_VIT_LAYERNORM_EPS,
LLM_KV_VISION_VIT_SCALE_FACTOR,
// deprecated:
LLM_KV_TOKENIZER_PREFIX_ID,
@@ -354,6 +356,7 @@ enum llm_tensor {
LLM_TENSOR_POS_NET_ATTN_OUT,
// vision
LLM_TENSOR_V_MMPROJ,
LLM_TENSOR_V_MMPROJ_FC,
LLM_TENSOR_V_MMPROJ_MLP,
LLM_TENSOR_V_MMPROJ_PEG,
LLM_TENSOR_V_ENC_EMBD_CLS,


@@ -1265,6 +1265,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true);
ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true);
ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true);
ml.get_key(LLM_KV_VISION_VIT_SCALE_FACTOR, vparams.scale_factor, false);
{
std::string name;
ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
@@ -3555,6 +3556,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
vit.mm_tok_embd_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_SLICE, "weight"), {n_embd});
vit.mm_tok_embd_end_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd});
for (int i = 0; i < n_vlayer; ++i) {
auto & layer = vit.layers[i];
layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0);
layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0);
layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0);
layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0);
layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0);
layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0);
layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0);
layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0);
layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0);
layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0);
layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0);
layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0);
layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0);
layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0);
}
} break;
case LLM_ARCH_VISION_IDEFICS3:
{
int scale_factor = vit.hparams.scale_factor;
vit.projection = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_FC, "weight"), {n_vembd * scale_factor * scale_factor, n_embd});
vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
vit.patch_bias = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd});
vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
vit.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd});
vit.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd});
for (int i = 0; i < n_vlayer; ++i) {
auto & layer = vit.layers[i];
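The {n_vembd * scale_factor * scale_factor, n_embd} shape of LLM_TENSOR_V_MMPROJ_FC follows from the pixel shuffle applied later in the graph: each output token concatenates scale_factor^2 patch embeddings before being mapped to the language model width. A sketch of the bookkeeping under hypothetical sizes (the 1152 ViT width and factor of 2 are assumptions for illustration, not values from this commit):

    // hypothetical sizes, for illustration only
    const int n_vembd      = 1152; // assumed SigLIP-style ViT hidden size
    const int scale_factor = 2;    // assumed pixel-shuffle factor
    // input width of the FC projector: one vector per output token,
    // built from scale_factor^2 concatenated patch embeddings
    const int fc_in = n_vembd * scale_factor * scale_factor; // 4608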
@@ -4085,6 +4122,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_VISION_LLAVA:
case LLM_ARCH_VISION_MOBILEVLM:
case LLM_ARCH_VISION_MINICPMV:
case LLM_ARCH_VISION_IDEFICS3:
GGML_ABORT("vision arch does not use RoPE");
// all model arches should be listed explicitly here


@@ -42,7 +42,9 @@ struct llama_image_u8 {
uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel) {
auto & proj_type = vmodel.hparams.proj_type;
if (proj_type == VISION_PROJECTOR_TYPE_MLP) {
-return vmodel.mm_2_b->ne[0];
+return vmodel.mm_2_b
+    ? vmodel.mm_2_b->ne[0]
+    : vmodel.projection->ne[1]; // idefics3
} else if (proj_type == VISION_PROJECTOR_TYPE_LDPV2) {
return vmodel.mm_model_peg_0_b->ne[0];
} else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_5) {
@@ -903,6 +905,40 @@ struct llama_vision_graph_builder {
return gf;
}
struct ggml_cgraph * build_idefics3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false);
struct ggml_tensor * cur = build_vit();
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
{
const int scale_factor = model.hparams.scale_factor;
const int n_embd = cur->ne[0];
const int seq = cur->ne[1];
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int height = std::sqrt(seq);
const int width = std::sqrt(seq);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
height / scale_factor,
width / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
seq / (scale_factor * scale_factor),
bsz);
cur = ggml_mul_mat(ctx0, model.projection, cur);
}
ggml_set_name(cur, "output");
ggml_build_forward_expand(gf, cur);
return gf;
}
};
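The reshape/permute chain in build_idefics3 implements the Idefics3 pixel shuffle (a space-to-depth step, per the transformers link in the code): every scale_factor x scale_factor block of patches is folded into the channel dimension, so the sequence shrinks by scale_factor^2 before the FC projection. A self-contained walk-through of the shapes under assumed sizes (a 32x32 patch grid, n_embd of 1152, and scale_factor of 2 are all hypothetical):

    #include <cassert>

    int main() {
        const int n_embd = 1152, s = 2;        // assumed sizes
        const int seq    = 32 * 32;            // assumed square patch grid
        // after the two reshape/permute rounds: [n_embd*s*s, seq/(s*s), bsz]
        const int n_embd_out = n_embd * s * s; // 4608-wide folded tokens
        const int seq_out    = seq / (s * s);  // 256 tokens remain
        assert(n_embd_out == 4608 && seq_out == 256);
        // ggml_mul_mat with the projection tensor then maps each folded
        // token from n_embd*s*s down to the language model width
        return 0;
    }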
static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_vision_tokens & inp) {
@@ -933,6 +969,9 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_vision_tokens & inp) {
case LLM_ARCH_VISION_MINICPMV:
gf = builder.build_minicpmv();
break;
case LLM_ARCH_VISION_IDEFICS3:
gf = builder.build_idefics3();
break;
default:
GGML_ASSERT(false && "unsupported vision arch");
}
@@ -1064,6 +1103,7 @@ struct llama_vision_tokens * llama_vision_tokenize(
switch (vctx.model->hparams.arch) {
case LLM_ARCH_VISION_LLAVA:
case LLM_ARCH_VISION_MOBILEVLM:
case LLM_ARCH_VISION_IDEFICS3:
return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp));
case LLM_ARCH_VISION_MINICPMV:
//return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp));


@@ -48,6 +48,9 @@ struct llama_vision_model {
std::array<int32_t, 32> image_grid_pinpoints; // TODO: should this be array of (x, y) pairs?
int32_t image_crop_resolution;
// idefics3
int scale_factor = 0;
};
struct vision_hparams hparams;
ggml_backend_buffer_type_t buft;
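Defaulting scale_factor to 0 pairs with the optional (false) get_key in load_hparams above: arches that never write vision.vit.scale_factor simply leave it unset. A hypothetical helper (not from this commit) showing how a caller could rely on that:

    // hypothetical, for illustration: image token count after pixel shuffle
    static int n_tokens_after_shuffle(int n_patches, int scale_factor) {
        if (scale_factor <= 0) {
            return n_patches; // key absent: non-idefics3 vision arch
        }
        return n_patches / (scale_factor * scale_factor);
    }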