From bd0714b977ae7cb59569c28ced0a9a1186ba68d9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 21 Jan 2025 14:27:16 +0100
Subject: [PATCH] reuse LLM_ARCH and LLM_TENSOR

---
 src/llama-arch.cpp   | 160 ++++++++++++++++++-------------------------
 src/llama-arch.h     |  95 ++++++++++---------------
 src/llama-model.cpp  | 118 +++++++++++++++----------------
 src/llama-vision.cpp |   6 +-
 src/llama-vision.h   |  14 +---
 5 files changed, 167 insertions(+), 226 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 48336943c..a2e848c11 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -63,6 +63,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
     { LLM_ARCH_CHAMELEON,        "chameleon"        },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_VISION_LLAVA,     "llava"            },
+    { LLM_ARCH_VISION_MOBILEVLM, "mobilevlm"        },
+    { LLM_ARCH_VISION_MINICPMV,  "minicpmv"         },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
@@ -1314,6 +1317,70 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
+    // vision
+    {
+        LLM_ARCH_VISION_LLAVA,
+        {
+            { LLM_TENSOR_V_MMPROJ,          "v.mmproj_%d" },
+            { LLM_TENSOR_V_ENC_EMBD_CLS,    "v.enc.embd.cls" },
+            { LLM_TENSOR_V_ENC_EMBD_PATCH,  "v.enc.embd.patch" },
+            { LLM_TENSOR_V_ENC_EMBD_POS,    "v.enc.embd.pos" },
+            { LLM_TENSOR_V_ENC_ATTN_Q,      "v.enc.blk.%d.attn_q" },
+            { LLM_TENSOR_V_ENC_ATTN_K,      "v.enc.blk.%d.attn_k" },
+            { LLM_TENSOR_V_ENC_ATTN_V,      "v.enc.blk.%d.attn_v" },
+            { LLM_TENSOR_V_ENC_INPUT_NORM,  "v.enc.blk.%d.input_norm" },
+            { LLM_TENSOR_V_ENC_OUTPUT,      "v.enc.blk.%d.output" },
+            { LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
+            { LLM_TENSOR_V_ENC_FFN_UP,      "v.enc.blk.%d.ffn_up" },
+            { LLM_TENSOR_V_ENC_FFN_DOWN,    "v.enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_V_PRE_NORM,        "v.pre_norm" },
+            { LLM_TENSOR_V_POST_NORM,       "v.post_norm" },
+        }
+    },
+    {
+        LLM_ARCH_VISION_MOBILEVLM,
+        {
+            { LLM_TENSOR_V_MMPROJ_MLP,      "v.mmproj.mlp.%d" },
+            { LLM_TENSOR_V_MMPROJ_PEG,      "v.mmproj.peg.%d" },
+            { LLM_TENSOR_V_ENC_EMBD_CLS,    "v.enc.embd.cls" },
+            { LLM_TENSOR_V_ENC_EMBD_PATCH,  "v.enc.embd.patch" },
+            { LLM_TENSOR_V_ENC_EMBD_POS,    "v.enc.embd.pos" },
+            { LLM_TENSOR_V_ENC_ATTN_Q,      "v.enc.blk.%d.attn_q" },
+            { LLM_TENSOR_V_ENC_ATTN_K,      "v.enc.blk.%d.attn_k" },
+            { LLM_TENSOR_V_ENC_ATTN_V,      "v.enc.blk.%d.attn_v" },
+            { LLM_TENSOR_V_ENC_INPUT_NORM,  "v.enc.blk.%d.input_norm" },
+            { LLM_TENSOR_V_ENC_OUTPUT,      "v.enc.blk.%d.output" },
+            { LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
+            { LLM_TENSOR_V_ENC_FFN_UP,      "v.enc.blk.%d.ffn_up" },
+            { LLM_TENSOR_V_ENC_FFN_DOWN,    "v.enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_V_PRE_NORM,        "v.pre_norm" },
+            { LLM_TENSOR_V_POST_NORM,       "v.post_norm" },
+        }
+    },
+    {
+        LLM_ARCH_VISION_MINICPMV,
+        {
+            { LLM_TENSOR_V_ENC_EMBD_PATCH,    "v.enc.embd.patch" },
+            { LLM_TENSOR_V_ENC_EMBD_POS,      "v.enc.embd.pos" },
+            { LLM_TENSOR_V_ENC_ATTN_Q,        "v.enc.blk.%d.attn_q" },
+            { LLM_TENSOR_V_ENC_ATTN_K,        "v.enc.blk.%d.attn_k" },
+            { LLM_TENSOR_V_ENC_ATTN_V,        "v.enc.blk.%d.attn_v" },
+            { LLM_TENSOR_V_ENC_INPUT_NORM,    "v.enc.blk.%d.input_norm" },
+            { LLM_TENSOR_V_ENC_OUTPUT,        "v.enc.blk.%d.output" },
+            { LLM_TENSOR_V_ENC_OUTPUT_NORM,   "v.enc.blk.%d.output_norm" },
+            { LLM_TENSOR_V_ENC_FFN_UP,        "v.enc.blk.%d.ffn_up" },
+            { LLM_TENSOR_V_ENC_FFN_DOWN,      "v.enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_V_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" },
+            { LLM_TENSOR_V_RESMPL_ATTN_IN,    "v.resmpl.attn_in" },
+            { LLM_TENSOR_V_RESMPL_ATTN_OUT,   "v.resmpl.attn_out" },
+            { LLM_TENSOR_V_RESMPL_KV_PROJ,    "v.resmpl.kv_proj" },
+            { LLM_TENSOR_V_RESMPL_NORM_POST, "v.resmpl.norm_post" },
+            { LLM_TENSOR_V_RESMPL_NORM_KV,   "v.resmpl.norm_kv" },
+            { LLM_TENSOR_V_RESMPL_NORM_Q,    "v.resmpl.norm_q" },
+            { LLM_TENSOR_V_RESMPL_PROJ,      "v.resmpl.proj" },
+            { LLM_TENSOR_V_RESMPL_QUERY,     "v.resmpl.query" },
+        }
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1322,72 +1389,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     },
 };
 
-static const std::map<vision_arch, std::map<vision_tensor, const char *>> VISION_TENSOR_NAMES = {
-    {
-        VISION_ARCH_LLAVA,
-        {
-            { VISION_TENSOR_MMPROJ,          "v.mmproj_%d" },
-            { VISION_TENSOR_ENC_EMBD_CLS,    "v.enc.embd.cls" },
-            { VISION_TENSOR_ENC_EMBD_PATCH,  "v.enc.embd.patch" },
-            { VISION_TENSOR_ENC_EMBD_POS,    "v.enc.embd.pos" },
-            { VISION_TENSOR_ENC_ATTN_Q,      "v.enc.blk.%d.attn_q" },
-            { VISION_TENSOR_ENC_ATTN_K,      "v.enc.blk.%d.attn_k" },
-            { VISION_TENSOR_ENC_ATTN_V,      "v.enc.blk.%d.attn_v" },
-            { VISION_TENSOR_ENC_INPUT_NORM,  "v.enc.blk.%d.input_norm" },
-            { VISION_TENSOR_ENC_OUTPUT,      "v.enc.blk.%d.output" },
-            { VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
-            { VISION_TENSOR_ENC_FFN_UP,      "v.enc.blk.%d.ffn_up" },
-            { VISION_TENSOR_ENC_FFN_DOWN,    "v.enc.blk.%d.ffn_down" },
-            { VISION_TENSOR_PRE_NORM,        "v.pre_norm" },
-            { VISION_TENSOR_POST_NORM,       "v.post_norm" },
-        }
-    },
-    {
-        VISION_ARCH_MOBILEVLM,
-        {
-            { VISION_TENSOR_MMPROJ_MLP,      "v.mmproj.mlp.%d" },
-            { VISION_TENSOR_MMPROJ_PEG,      "v.mmproj.peg.%d" },
-            { VISION_TENSOR_ENC_EMBD_CLS,    "v.enc.embd.cls" },
-            { VISION_TENSOR_ENC_EMBD_PATCH,  "v.enc.embd.patch" },
-            { VISION_TENSOR_ENC_EMBD_POS,    "v.enc.embd.pos" },
-            { VISION_TENSOR_ENC_ATTN_Q,      "v.enc.blk.%d.attn_q" },
-            { VISION_TENSOR_ENC_ATTN_K,      "v.enc.blk.%d.attn_k" },
-            { VISION_TENSOR_ENC_ATTN_V,      "v.enc.blk.%d.attn_v" },
-            { VISION_TENSOR_ENC_INPUT_NORM,  "v.enc.blk.%d.input_norm" },
-            { VISION_TENSOR_ENC_OUTPUT,      "v.enc.blk.%d.output" },
-            { VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
-            { VISION_TENSOR_ENC_FFN_UP,      "v.enc.blk.%d.ffn_up" },
-            { VISION_TENSOR_ENC_FFN_DOWN,    "v.enc.blk.%d.ffn_down" },
-            { VISION_TENSOR_PRE_NORM,        "v.pre_norm" },
-            { VISION_TENSOR_POST_NORM,       "v.post_norm" },
-        }
-    },
-    {
-        VISION_ARCH_MINICPMV,
-        {
-            { VISION_TENSOR_ENC_EMBD_PATCH,    "v.enc.embd.patch" },
-            { VISION_TENSOR_ENC_EMBD_POS,      "v.enc.embd.pos" },
-            { VISION_TENSOR_ENC_ATTN_Q,        "v.enc.blk.%d.attn_q" },
-            { VISION_TENSOR_ENC_ATTN_K,        "v.enc.blk.%d.attn_k" },
-            { VISION_TENSOR_ENC_ATTN_V,        "v.enc.blk.%d.attn_v" },
-            { VISION_TENSOR_ENC_INPUT_NORM,    "v.enc.blk.%d.input_norm" },
-            { VISION_TENSOR_ENC_OUTPUT,        "v.enc.blk.%d.output" },
-            { VISION_TENSOR_ENC_OUTPUT_NORM,   "v.enc.blk.%d.output_norm" },
-            { VISION_TENSOR_ENC_FFN_UP,        "v.enc.blk.%d.ffn_up" },
-            { VISION_TENSOR_ENC_FFN_DOWN,      "v.enc.blk.%d.ffn_down" },
-            { VISION_TENSOR_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" },
-            { VISION_TENSOR_RESMPL_ATTN_IN,    "v.resmpl.attn_in" },
-            { VISION_TENSOR_RESMPL_ATTN_OUT,   "v.resmpl.attn_out" },
-            { VISION_TENSOR_RESMPL_KV_PROJ,    "v.resmpl.kv_proj" },
-            { VISION_TENSOR_RESMPL_NORM_POST,  "v.resmpl.norm_post" },
-            { VISION_TENSOR_RESMPL_NORM_KV,    "v.resmpl.norm_kv" },
-            { VISION_TENSOR_RESMPL_NORM_Q,     "v.resmpl.norm_q" },
-            { VISION_TENSOR_RESMPL_PROJ,       "v.resmpl.proj" },
-            { VISION_TENSOR_RESMPL_QUERY,      "v.resmpl.query" },
-        }
-    },
-};
-
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TOKEN_EMBD,                 {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {LLM_TENSOR_POS_EMBD,                   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
@@ -1537,12 +1538,7 @@ std::string LLM_KV::operator()(llm_kv kv) const {
     return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
 }
 
-template<>
-std::string BASE_TN_IMPL<llm_arch, llm_tensor>::str() const {
-    if (LLM_TENSOR_NAMES.find(arch) == LLM_TENSOR_NAMES.end()) {
-        throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch));
-    }
-
+std::string LLM_TN_IMPL::str() const {
     if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
         return "__missing__";
     }
@@ -1557,26 +1553,6 @@ std::string BASE_TN_IMPL<llm_arch, llm_tensor>::str() const {
     return name;
 }
 
-template<>
-std::string BASE_TN_IMPL<vision_arch, vision_tensor>::str() const {
-    if (VISION_TENSOR_NAMES.find(arch) == VISION_TENSOR_NAMES.end()) {
-        throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch));
-    }
-
-    if (VISION_TENSOR_NAMES.at(arch).find(tensor) == VISION_TENSOR_NAMES.at(arch).end()) {
-        return "__missing__";
-    }
-
-    std::string name = ::format(VISION_TENSOR_NAMES.at(arch).at(tensor), bid, xid);
-
-    if (suffix != nullptr) {
-        name += ".";
-        name += suffix;
-    }
-
-    return name;
-}
-
 const char * llm_arch_name(llm_arch arch) {
     auto it = LLM_ARCH_NAMES.find(arch);
     if (it == LLM_ARCH_NAMES.end()) {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 5629dc46d..da118e1e1 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -66,16 +66,13 @@ enum llm_arch {
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_WAVTOKENIZER_DEC,
+    // vision
+    LLM_ARCH_VISION_LLAVA,
+    LLM_ARCH_VISION_MOBILEVLM,
+    LLM_ARCH_VISION_MINICPMV,
     LLM_ARCH_UNKNOWN,
 };
 
-enum vision_arch {
-    VISION_ARCH_UNKNOWN,
-    VISION_ARCH_LLAVA,
-    VISION_ARCH_MOBILEVLM,
-    VISION_ARCH_MINICPMV,
-};
-
 enum llm_kv {
     LLM_KV_GENERAL_TYPE,
     LLM_KV_GENERAL_ARCHITECTURE,
@@ -354,35 +351,33 @@ enum llm_tensor {
     LLM_TENSOR_POS_NET_ATTN_K,
     LLM_TENSOR_POS_NET_ATTN_V,
     LLM_TENSOR_POS_NET_ATTN_OUT,
-};
-
-enum vision_tensor {
-    VISION_TENSOR_MMPROJ,
-    VISION_TENSOR_MMPROJ_MLP,
-    VISION_TENSOR_MMPROJ_PEG,
-    VISION_TENSOR_ENC_EMBD_CLS,
-    VISION_TENSOR_ENC_EMBD_PATCH,
-    VISION_TENSOR_ENC_EMBD_POS,
-    VISION_TENSOR_ENC_ATTN_Q,
-    VISION_TENSOR_ENC_ATTN_K,
-    VISION_TENSOR_ENC_ATTN_V,
-    VISION_TENSOR_ENC_INPUT_NORM,
-    VISION_TENSOR_ENC_OUTPUT,
-    VISION_TENSOR_ENC_OUTPUT_NORM,
-    VISION_TENSOR_ENC_FFN_UP,
-    VISION_TENSOR_ENC_FFN_DOWN,
-    VISION_TENSOR_PRE_NORM,
-    VISION_TENSOR_POST_NORM,
-    // minicpmv
-    VISION_TENSOR_RESMPL_POS_EMBD_K,
-    VISION_TENSOR_RESMPL_ATTN_IN,
-    VISION_TENSOR_RESMPL_ATTN_OUT,
-    VISION_TENSOR_RESMPL_KV_PROJ,
-    VISION_TENSOR_RESMPL_NORM_POST,
-    VISION_TENSOR_RESMPL_NORM_KV,
-    VISION_TENSOR_RESMPL_NORM_Q,
-    VISION_TENSOR_RESMPL_PROJ,
-    VISION_TENSOR_RESMPL_QUERY,
+    // vision
+    LLM_TENSOR_V_MMPROJ,
+    LLM_TENSOR_V_MMPROJ_MLP,
+    LLM_TENSOR_V_MMPROJ_PEG,
+    LLM_TENSOR_V_ENC_EMBD_CLS,
+    LLM_TENSOR_V_ENC_EMBD_PATCH,
+    LLM_TENSOR_V_ENC_EMBD_POS,
+    LLM_TENSOR_V_ENC_ATTN_Q,
+    LLM_TENSOR_V_ENC_ATTN_K,
+    LLM_TENSOR_V_ENC_ATTN_V,
+    LLM_TENSOR_V_ENC_INPUT_NORM,
+    LLM_TENSOR_V_ENC_OUTPUT,
+    LLM_TENSOR_V_ENC_OUTPUT_NORM,
+    LLM_TENSOR_V_ENC_FFN_UP,
+    LLM_TENSOR_V_ENC_FFN_DOWN,
+    LLM_TENSOR_V_PRE_NORM,
+    LLM_TENSOR_V_POST_NORM,
+    // vision - minicpmv
+    LLM_TENSOR_V_RESMPL_POS_EMBD_K,
+    LLM_TENSOR_V_RESMPL_ATTN_IN,
+    LLM_TENSOR_V_RESMPL_ATTN_OUT,
+    LLM_TENSOR_V_RESMPL_KV_PROJ,
+    LLM_TENSOR_V_RESMPL_NORM_POST,
+    LLM_TENSOR_V_RESMPL_NORM_KV,
+    LLM_TENSOR_V_RESMPL_NORM_Q,
+    LLM_TENSOR_V_RESMPL_PROJ,
+    LLM_TENSOR_V_RESMPL_QUERY,
 };
 
 enum llm_tensor_layer {
@@ -408,10 +403,9 @@ struct LLM_KV {
 // std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");     -> "token_embd.bias"
 // std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
 //
-template<typename Tname, typename Ttensor>
-struct BASE_TN_IMPL {
-    const Tname   arch;
-    const Ttensor tensor;
+struct LLM_TN_IMPL {
+    const llm_arch   arch;
+    const llm_tensor tensor;
     const char * const suffix;
     const int bid;
     const int xid;
@@ -422,16 +416,15 @@ struct BASE_TN_IMPL {
         return str();
     }
 
-    friend bool operator==(const std::string & str, const BASE_TN_IMPL & tn) {
+    friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
         return str == tn.str();
     }
 
-    friend bool operator!=(const std::string & str, const BASE_TN_IMPL & tn) {
+    friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
         return str != tn.str();
     }
 };
 
-using LLM_TN_IMPL = BASE_TN_IMPL<llm_arch, llm_tensor>;
 
 struct LLM_TN {
     LLM_TN(llm_arch arch) : arch(arch) {}
@@ -446,20 +439,6 @@ struct LLM_TN {
     }
 };
 
-struct VISION_TN {
-    VISION_TN(vision_arch arch) : arch(arch) {}
-
-    vision_arch arch;
-
-    BASE_TN_IMPL<vision_arch, vision_tensor> operator()(vision_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
-        return { arch, tensor, suffix, bid, xid };
-    }
-
-    BASE_TN_IMPL<vision_arch, vision_tensor> operator()(vision_tensor tensor, int bid = -1, int xid = -1) const {
-        return { arch, tensor, nullptr, bid, xid };
-    }
-};
-
 struct llm_tensor_info {
     llm_tensor_layer layer;
 
@@ -470,6 +449,4 @@ const char * llm_arch_name(llm_arch arch);
 
 llm_arch llm_arch_from_string(const std::string & name);
 
-vision_arch vision_arch_from_string(const std::string & name);
-
 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index a305fa463..0ea66d254 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1281,8 +1281,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         {
             std::string arch;
             ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true);
-            vparams.arch = vision_arch_from_string(arch);
-            if (vparams.arch == VISION_ARCH_UNKNOWN) {
+            vparams.arch = llm_arch_from_string(arch);
+            if (vparams.arch == LLM_ARCH_UNKNOWN) {
                 throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));
             }
         }
@@ -3421,7 +3421,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             const int64_t max_pos_embd = vparams.max_pos_embd;
             const int64_t n_channel    = 3; // always RGB
             const int64_t patch_size   = vparams.patch_size;
-            const auto tn = VISION_TN(vparams.arch);
+            const auto tn = LLM_TN(vparams.arch);
 
             // clip is CPU-only for now
             clip.buft = ggml_backend_cpu_buffer_type();
@@ -3429,85 +3429,85 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             clip.layers.resize(n_vlayer);
 
             switch (vparams.arch) {
-                case VISION_ARCH_LLAVA:
-                case VISION_ARCH_MOBILEVLM:
+                case LLM_ARCH_VISION_LLAVA:
+                case LLM_ARCH_VISION_MOBILEVLM:
                     {
-                        if (vparams.arch == VISION_ARCH_LLAVA) {
-                            clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_vembd, n_vff});
-                            clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_vff});
-                            clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_vff, n_vff});
-                            clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_vff});
-                        } else if (vparams.arch == VISION_ARCH_MOBILEVLM) {
-                            clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd});
-                            clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 0), {n_embd});
-                            clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 2), {n_embd, n_embd});
-                            clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 2), {n_embd});
-                            clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd});
-                            clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "bias", 0), {n_embd});
+                        if (vparams.arch == LLM_ARCH_VISION_LLAVA) {
+                            clip.mm_1_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff});
+                            clip.mm_1_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff});
+                            clip.mm_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff});
+                            clip.mm_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff});
+                        } else if (vparams.arch == LLM_ARCH_VISION_MOBILEVLM) {
+                            clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd});
+                            clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd});
+                            clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd});
+                            clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd});
+                            clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd});
+                            clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd});
                         }
 
-                        clip.class_embedding     = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS  ), {n_vembd});
-                        clip.patch_embeddings    = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
-                        clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
+                        clip.class_embedding     = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_CLS  ), {n_vembd});
+                        clip.patch_embeddings    = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
+                        clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
 
-                        clip.pre_norm_w  = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_vembd});
-                        clip.pre_norm_b  = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias"  ), {n_vembd});
-                        clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                        clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias"  ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        clip.pre_norm_w  = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd});
+                        clip.pre_norm_b  = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "bias"  ), {n_vembd});
+                        clip.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        clip.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias"  ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         for (int i = 0; i < n_vlayer; ++i) {
                             auto & layer = clip.layers[i];
 
-                            layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
-                            layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias"  , i), {n_vembd});
-                            layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
-                            layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias"  , i), {n_vembd});
-                            layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
-                            layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias"  , i), {n_vembd});
+                            layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
+                            layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias"  , i), {n_vembd});
+                            layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
+                            layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias"  , i), {n_vembd});
+                            layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
+                            layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias"  , i), {n_vembd});
 
-                            layer.ffn_up_w   = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
-                            layer.ffn_up_b   = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias"  , i), {n_vff});
-                            layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
-                            layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias"  , i), {n_vembd});
+                            layer.ffn_up_w   = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
+                            layer.ffn_up_b   = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias"  , i), {n_vff});
+                            layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
+                            layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias"  , i), {n_vembd});
 
-                            layer.norm_in_w  = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd});
-                            layer.norm_in_b  = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias"  , i), {n_vembd});
-                            layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
-                            layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias"  , i), {n_vembd});
+                            layer.norm_in_w  = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd});
+                            layer.norm_in_b  = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias"  , i), {n_vembd});
+                            layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
+                            layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias"  , i), {n_vembd});
 
-                            layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
-                            layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias"  , i), {n_vembd});
+                            layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
+                            layer.output_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias"  , i), {n_vembd});
                         }
                     } break;
-                case VISION_ARCH_MINICPMV:
+                case LLM_ARCH_VISION_MINICPMV:
                     {
-                        clip.patch_embeddings    = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
-                        clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
+                        clip.patch_embeddings    = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
+                        clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
 
                         // TODO: load all resampler tensors
 
                         for (int i = 0; i < n_vlayer; ++i) {
                             auto & layer = clip.layers[i];
 
-                            layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
-                            layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias"  , i), {n_vembd});
-                            layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
-                            layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias"  , i), {n_vembd});
-                            layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
-                            layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias"  , i), {n_vembd});
+                            layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
+                            layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias"  , i), {n_vembd});
+                            layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
+                            layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias"  , i), {n_vembd});
+                            layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
+                            layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias"  , i), {n_vembd});
 
-                            layer.ffn_up_w   = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
-                            layer.ffn_up_b   = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias"  , i), {n_vff});
-                            layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
-                            layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias"  , i), {n_vembd});
+                            layer.ffn_up_w   = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
+                            layer.ffn_up_b   = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias"  , i), {n_vff});
+                            layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
+                            layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias"  , i), {n_vembd});
 
-                            layer.norm_in_w  = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd});
-                            layer.norm_in_b  = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias"  , i), {n_vembd});
-                            layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
-                            layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias"  , i), {n_vembd});
+                            layer.norm_in_w  = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd});
+                            layer.norm_in_b  = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias"  , i), {n_vembd});
+                            layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
+                            layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias"  , i), {n_vembd});
 
-                            layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
-                            layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias"  , i), {n_vembd});
+                            layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
+                            layer.output_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias"  , i), {n_vembd});
                         }
                     } break;
                 default:
diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp
index 73c960315..e348d31da 100644
--- a/src/llama-vision.cpp
+++ b/src/llama-vision.cpp
@@ -393,7 +393,7 @@ struct minicpmv_preprocessor {
 
 static llama_vision_patches clip_image_preprocess_minicpmv(const clip_context & ctx, const clip_image_u8 & img) {
     auto & params = ctx.model->hparams;
-    GGML_ASSERT(params.arch == VISION_ARCH_MINICPMV);
+    GGML_ASSERT(params.arch == LLM_ARCH_VISION_MINICPMV);
 
     static const int max_slice_nums = 9;
     minicpmv_preprocessor preprocessor;
@@ -775,7 +775,7 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches
     auto & model = *ctx.model;
     auto & hparams = ctx.model->hparams;
 
-    if (hparams.arch == VISION_ARCH_LLAVA) {
+    if (hparams.arch == LLM_ARCH_VISION_LLAVA) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
 
@@ -895,7 +895,7 @@ struct llama_vision_patches * llama_vision_patches_init(
         struct llama_context * ctx,
         llama_vision_bitmap * bmp) {
     clip_context & vctx = ctx->vctx;
-    if (vctx.model->hparams.arch == VISION_ARCH_MINICPMV) {
+    if (vctx.model->hparams.arch == LLM_ARCH_VISION_MINICPMV) {
         return new llama_vision_patches(clip_image_preprocess_minicpmv(vctx, *bmp));
     }
     return new llama_vision_patches(clip_image_preprocess(vctx, *bmp));
diff --git a/src/llama-vision.h b/src/llama-vision.h
index 19377abef..a9304867f 100644
--- a/src/llama-vision.h
+++ b/src/llama-vision.h
@@ -22,7 +22,7 @@ enum mm_patch_merge {
 };
 
 struct clip_hparams {
-    vision_arch arch = VISION_ARCH_UNKNOWN;
+    llm_arch arch = LLM_ARCH_UNKNOWN;
 
     uint32_t image_size;
     uint32_t patch_size;
@@ -157,18 +157,6 @@ struct llama_vision_patches {
     std::vector<std::vector<uint8_t>> buf; // preprocessed image data
 };
 
-inline vision_arch vision_arch_from_string(const std::string & name) {
-    if (name == "llava") {
-        return VISION_ARCH_LLAVA;
-    } else if (name == "mobilevlm") {
-        return VISION_ARCH_MOBILEVLM;
-    } else if (name == "minicpmv") {
-        return VISION_ARCH_MINICPMV;
-    }
-
-    return VISION_ARCH_UNKNOWN;
-}
-
 inline mm_patch_merge mm_patch_merge_from_name(std::string & name) {
     if (name == "flat") {
         return MM_PATCH_MERGE_FLAT;
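
---

For context, a standalone sketch of the name-building scheme this patch unifies: vision tensors now resolve through the same formatted-name lookup as every other architecture ("v.enc.blk.%d.attn_q" + block id + optional suffix), instead of a parallel VISION_TN path. The toy table and `tn()` helper below only mimic `LLM_TN_IMPL::str()` under stated assumptions (single `%d`, no `xid`); they are illustrative, not code from the patch.

    // sketch: mirrors how LLM_TN_IMPL::str() builds "v.enc.blk.3.attn_q.weight"
    // from a per-arch format-string table; toy subset, not the repo's tables
    #include <cstdio>
    #include <map>
    #include <string>

    enum llm_tensor { LLM_TENSOR_V_ENC_ATTN_Q, LLM_TENSOR_V_ENC_FFN_UP };

    static const std::map<llm_tensor, const char *> names = {
        { LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
        { LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" } == decltype(names.begin()->second){} ? std::pair<llm_tensor, const char *>{} : std::pair<llm_tensor, const char *>{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
    };

    // format with the block id, then append the ".weight"/".bias" suffix
    static std::string tn(llm_tensor t, const char * suffix, int bid) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), names.at(t), bid);
        std::string name = buf;
        if (suffix) {
            name += ".";
            name += suffix;
        }
        return name;
    }

    int main() {
        // prints "v.enc.blk.3.attn_q.weight"
        std::printf("%s\n", tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", 3).c_str());
        // prints "v.enc.blk.0.ffn_up.bias"
        std::printf("%s\n", tn(LLM_TENSOR_V_ENC_FFN_UP, "bias", 0).c_str());
    }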