add mobilevlm

Xuan Son Nguyen 2025-01-19 16:29:20 +01:00
parent 6cabdda0df
commit d0068ef0ed
9 changed files with 210 additions and 61 deletions

View file

@@ -67,6 +67,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
static const std::map<vision_arch, const char *> VISION_ARCH_NAMES = {
{ VISION_ARCH_LLAVA, "llava" },
{ VISION_ARCH_MOBILEVLM, "mobilevlm" },
{ VISION_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1345,7 +1346,27 @@ static const std::map<vision_arch, std::map<vision_tensor, const char *>> VISION
{ VISION_TENSOR_PRE_NORM, "v.pre_norm" },
{ VISION_TENSOR_POST_NORM, "v.post_norm" },
}
}
},
{
VISION_ARCH_MOBILEVLM,
{
{ VISION_TENSOR_MMPROJ_MLP, "v.mmproj.mlp.%d" },
{ VISION_TENSOR_MMPROJ_PEG, "v.mmproj.peg.%d" },
{ VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" },
{ VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ VISION_TENSOR_PRE_NORM, "v.pre_norm" },
{ VISION_TENSOR_POST_NORM, "v.post_norm" },
}
},
};
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
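Note: the format strings in the table above turn into concrete GGUF tensor names by filling the %d placeholder with the encoder block (or projector layer) index and appending a ".weight"/".bias" suffix, as the tn(..., "weight", i) call sites later in this commit suggest. A minimal sketch of that expansion (illustrative only, not the in-tree VISION_TN implementation):

// Illustrative only: assumes snprintf-style substitution of the block index
// plus a "." + suffix appended afterwards.
#include <cstdio>
#include <string>

static std::string vision_tn(const char * fmt, const char * suffix, int bid) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, bid); // fill in the block/layer index
    return std::string(buf) + "." + suffix;    // append "weight" or "bias"
}

int main() {
    std::printf("%s\n", vision_tn("v.enc.blk.%d.attn_q", "weight", 0).c_str()); // v.enc.blk.0.attn_q.weight
    std::printf("%s\n", vision_tn("v.mmproj.peg.%d",     "bias",   0).c_str()); // v.mmproj.peg.0.bias
    return 0;
}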
@@ -1499,6 +1520,10 @@ std::string LLM_KV::operator()(llm_kv kv) const {
template<>
std::string BASE_TN_IMPL<llm_arch, llm_tensor>::str() const {
if (LLM_TENSOR_NAMES.find(arch) == LLM_TENSOR_NAMES.end()) {
throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch));
}
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}
@@ -1515,6 +1540,10 @@ std::string BASE_TN_IMPL<llm_arch, llm_tensor>::str() const {
template<>
std::string BASE_TN_IMPL<vision_arch, vision_tensor>::str() const {
if (VISION_TENSOR_NAMES.find(arch) == VISION_TENSOR_NAMES.end()) {
throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch));
}
if (VISION_TENSOR_NAMES.at(arch).find(tensor) == VISION_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}
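The practical effect of the two lookups above: an arch with no name table at all is a hard error, while a tensor that simply has no entry for that arch (for example VISION_TENSOR_MMPROJ under mobilevlm, whose table only defines v.mmproj.mlp.* and v.mmproj.peg.*) resolves to "__missing__" and can be treated as absent. A rough sketch of that behaviour with illustrative string keys instead of the real enums:

// Sketch only; not the in-tree BASE_TN_IMPL code.
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

using name_table = std::map<std::string, std::map<std::string, std::string>>;

static std::string tensor_name(const name_table & names,
                               const std::string & arch, const std::string & tensor) {
    auto it_arch = names.find(arch);
    if (it_arch == names.end()) {
        throw std::runtime_error("Cannot find tensor name mapping for arch " + arch);
    }
    auto it_t = it_arch->second.find(tensor);
    return it_t == it_arch->second.end() ? std::string("__missing__") : it_t->second;
}

int main() {
    name_table names = { { "mobilevlm", { { "MMPROJ_MLP", "v.mmproj.mlp.%d" } } } };
    std::printf("%s\n", tensor_name(names, "mobilevlm", "MMPROJ").c_str()); // __missing__
    return 0;
}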

View file

@@ -72,6 +72,7 @@ enum llm_arch {
enum vision_arch {
VISION_ARCH_UNKNOWN,
VISION_ARCH_LLAVA,
VISION_ARCH_MOBILEVLM,
};
enum llm_kv {
@@ -356,6 +357,8 @@ enum llm_tensor {
enum vision_tensor {
VISION_TENSOR_MMPROJ,
VISION_TENSOR_MMPROJ_MLP,
VISION_TENSOR_MMPROJ_PEG,
VISION_TENSOR_ENC_EMBD_CLS,
VISION_TENSOR_ENC_EMBD_PATCH,
VISION_TENSOR_ENC_EMBD_POS,

View file

@@ -1280,6 +1280,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::string arch;
ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
vparams.arch = vision_arch_from_string(arch);
if (vparams.arch == VISION_ARCH_UNKNOWN) {
throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));
}
}
} else if (!vision_type.empty()) {
throw std::runtime_error(format("unsupported vision type: %s", vision_type.c_str()));
@@ -1288,6 +1291,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// arch-specific CLIP hparams
switch (vparams.arch) {
case VISION_ARCH_LLAVA:
case VISION_ARCH_MOBILEVLM:
{
ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true);
} break;
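The architecture string is read from the model's metadata and mapped onto the enum before any arch-specific keys are consumed; anything that does not map cleanly is rejected up front. A minimal sketch of that mapping, assuming vision_arch_from_string simply scans the VISION_ARCH_NAMES table (the in-tree implementation is not part of this diff):

// Illustrative stand-ins for vision_arch / VISION_ARCH_NAMES from this commit.
#include <cstdio>
#include <map>
#include <string>

enum vision_arch { VISION_ARCH_UNKNOWN, VISION_ARCH_LLAVA, VISION_ARCH_MOBILEVLM };

static vision_arch vision_arch_from_string(const std::string & name) {
    static const std::map<std::string, vision_arch> names = {
        { "llava",     VISION_ARCH_LLAVA     },
        { "mobilevlm", VISION_ARCH_MOBILEVLM },
    };
    auto it = names.find(name);
    return it == names.end() ? VISION_ARCH_UNKNOWN : it->second; // caller throws on UNKNOWN
}

int main() {
    std::printf("%d\n", (int) vision_arch_from_string("mobilevlm")); // 2 -> VISION_ARCH_MOBILEVLM
    return 0;
}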
@@ -3410,58 +3414,71 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// load tensors for vision model
auto & vparams = clip.hparams;
if (has_vision) {
const int64_t n_layer = vparams.n_layer;
const int64_t n_embd = vparams.hidden_size;
const int64_t n_ff = vparams.n_intermediate;
const int64_t max_pos_embd = vparams.max_pos_embd;
const int64_t n_channel = 3; // always RGB
const int64_t patch_size = vparams.patch_size;
// language params
const int64_t n_embd = hparams.n_embd;
// vision params
const int64_t n_vlayer = vparams.n_layer;
const int64_t n_vembd = vparams.hidden_size;
const int64_t n_vff = vparams.n_intermediate;
const int64_t max_pos_embd = vparams.max_pos_embd;
const int64_t n_channel = 3; // always RGB
const int64_t patch_size = vparams.patch_size;
const auto tn = VISION_TN(vparams.arch);
// clip is CPU-only for now
clip.buft = ggml_backend_cpu_buffer_type();
ggml_context * ctx_vision = ctx_map.at(clip.buft);
clip.layers.resize(n_layer);
clip.layers.resize(n_vlayer);
switch (vparams.arch) {
case VISION_ARCH_LLAVA:
case VISION_ARCH_MOBILEVLM:
{
clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff});
clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff});
if (vparams.arch == VISION_ARCH_LLAVA) {
clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_vembd, n_vff});
clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_vff});
clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_vff, n_vff});
clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_vff});
} else if (vparams.arch == VISION_ARCH_MOBILEVLM) {
clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd});
clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 0), {n_embd});
clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 2), {n_embd, n_embd});
clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 2), {n_embd});
clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd});
clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "bias", 0), {n_embd});
}
clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_embd});
clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd});
clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_embd, max_pos_embd});
clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_vembd});
clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd});
clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd});
clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_vembd});
clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_vembd});
clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
for (int i = 0; i < n_layer; ++i) {
for (int i = 0; i < n_vlayer; ++i) {
auto & layer = clip.layers[i];
layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd});
layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_embd});
layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd});
layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_embd});
layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_embd});
layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_vembd});
layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_vembd});
layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_vembd});
layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_ff});
layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_ff, n_embd});
layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_embd});
layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_vff});
layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_vembd});
layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_embd});
layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_embd});
layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_embd});
layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_embd});
layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd});
layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_vembd});
layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_vembd});
layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_embd, n_embd});
layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_embd});
layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_vembd});
}
} break;
default:
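Put together, a converted MobileVLM v2 GGUF is expected to carry the following projector tensors on top of the shared CLIP encoder weights (shapes in ggml ne order; names assume the %d-plus-suffix expansion sketched earlier). The concrete sizes are only an example, matching a 1024-dim ViT-L/14 encoder and the 2048-dim language model used in the clip graph comments further down. Note that the {n_channel, n_channel, 1, n_embd} shape of the PEG weight describes a 3x3 depthwise convolution kernel; the 3 acts as the kernel size here, not as the RGB channel count.

// Example inventory with assumed sizes (n_vembd = 1024, n_embd = 2048); not in-tree code.
#include <cstdio>
#include <cstdint>

struct tensor_spec { const char * name; int64_t ne[4]; };

int main() {
    const int64_t n_vembd = 1024, n_embd = 2048;
    const tensor_spec mmproj[] = {
        { "v.mmproj.mlp.0.weight", { n_vembd, n_embd, 1, 1 } },
        { "v.mmproj.mlp.0.bias",   { n_embd,  1,      1, 1 } },
        { "v.mmproj.mlp.2.weight", { n_embd,  n_embd, 1, 1 } },
        { "v.mmproj.mlp.2.bias",   { n_embd,  1,      1, 1 } },
        { "v.mmproj.peg.0.weight", { 3, 3, 1, n_embd } },        // 3x3 depthwise kernel
        { "v.mmproj.peg.0.bias",   { n_embd, 1, 1, 1 } },
    };
    for (const auto & t : mmproj) {
        std::printf("%-24s [%lld, %lld, %lld, %lld]\n", t.name,
                    (long long) t.ne[0], (long long) t.ne[1],
                    (long long) t.ne[2], (long long) t.ne[3]);
    }
    return 0;
}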

View file

@@ -58,8 +58,11 @@ static int clip_n_patches(const clip_context & ctx) {
}
uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) {
if (clip_model.hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
auto & proj_type = clip_model.hparams.proj_type;
if (proj_type == CLIP_PROJECTOR_TYPE_MLP) {
return clip_model.mm_2_b->ne[0];
} else if (proj_type == CLIP_PROJECTOR_TYPE_LDPV2) {
return clip_model.mm_model_peg_0_b->ne[0];
} else {
GGML_ASSERT(false && "invalid proj type");
}
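The projector output width is recovered from the final bias tensor rather than from a dedicated hparam: for y = Wx + b the bias has exactly one element per output feature, so mm_2_b->ne[0] (MLP) or mm_model_peg_0_b->ne[0] (LDPv2) is the width of the image embeddings handed to the language model. A trivial illustration with an assumed size (2048 matches the MobileVLM v2 example shapes used elsewhere):

// Stand-in for mm_model_peg_0_b; ->ne[0] would report the same length.
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> peg_0_bias(2048);
    auto n_mmproj_embd = peg_0_bias.size(); // projector output width
    std::printf("n_mmproj_embd = %zu\n", n_mmproj_embd);
    return 0;
}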
@@ -559,6 +562,30 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
} else if (hparams.proj_type == CLIP_PROJECTOR_TYPE_LDPV2) {
int n_patch = 24;
struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
mlp_0 = ggml_gelu(ctx0, mlp_0);
struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
// mlp_2 ne = [2048, 576, 1, 1]
// AVG Pool Layer 2*2, strides = 2
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
// mlp_2 ne = [576, 2048, 1, 1]
mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
// mlp_2 ne [24, 24, 2048, 1]
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
// weight ne = [3, 3, 2048, 1]
struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
embeddings = peg_0;
} else {
GGML_ASSERT(false && "unsupported proj type");
}
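For intuition, the LDPv2 path above cuts the number of image tokens by 4x: starting from 576 patch embeddings (a 24x24 grid, consistent with the hard-coded n_patch = 24 and the usual 336px/14px CLIP-L setup), the MLP lifts them to the LM width, the 2x2 average pool with stride 2 shrinks the grid to 12x12, the 3x3 depthwise PEG convolution (stride 1, padding 1, so the spatial extent is preserved) adds a positional residual, and the final reshape yields 144 tokens. A small shape walkthrough (n_embd = 2048 is again only an example value):

// Shape bookkeeping only; mirrors the ggml ops above without running them.
#include <cstdio>

int main() {
    const int n_patch = 24, n_embd = 2048;
    int ne0 = n_embd, ne1 = n_patch * n_patch;   // after mlp_2: [2048, 576]
    std::printf("mlp_2      : [%d, %d]\n", ne0, ne1);
    int w = n_patch, h = n_patch, c = n_embd;    // reshape_4d: [24, 24, 2048]
    w /= 2; h /= 2;                              // 2x2 avg pool, stride 2 -> [12, 12, 2048]
    std::printf("pooled     : [%d, %d, %d]\n", w, h, c);
    // 3x3 depthwise conv, stride 1, pad 1 keeps the spatial extent: still [12, 12, 2048]
    std::printf("peg_0      : [%d, %d, %d]\n", w, h, c);
    std::printf("embeddings : [%d, %d]  (%d image tokens)\n", c, w * h, w * h); // [2048, 144]
    return 0;
}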

View file

@@ -10,6 +10,7 @@
enum clip_projector_type {
CLIP_PROJECTOR_TYPE_UNKNOWN,
CLIP_PROJECTOR_TYPE_MLP,
CLIP_PROJECTOR_TYPE_LDPV2,
};
enum mm_patch_merge {
@@ -98,6 +99,14 @@ struct clip_vision_model {
struct ggml_tensor * mm_2_w = nullptr;
struct ggml_tensor * mm_2_b = nullptr;
// MobileVLM_V2 projection
struct ggml_tensor * mm_model_mlp_0_w = nullptr;
struct ggml_tensor * mm_model_mlp_0_b = nullptr;
struct ggml_tensor * mm_model_mlp_2_w = nullptr;
struct ggml_tensor * mm_model_mlp_2_b = nullptr;
struct ggml_tensor * mm_model_peg_0_w = nullptr;
struct ggml_tensor * mm_model_peg_0_b = nullptr;
struct ggml_tensor * image_newline = nullptr;
};
@@ -138,6 +147,8 @@ inline mm_patch_merge mm_patch_merge_from_name(std::string & name) {
inline clip_projector_type clip_projector_type_from_name(std::string & name) {
if (name == "mlp") {
return CLIP_PROJECTOR_TYPE_MLP;
} else if (name == "ldpv2") {
return CLIP_PROJECTOR_TYPE_LDPV2;
}
return CLIP_PROJECTOR_TYPE_UNKNOWN;
}