Rename GGUF vision KV keys from "clip" to "vit"

This commit is contained in:
Xuan Son Nguyen 2025-01-21 10:51:26 +01:00
parent 4a7ab89d75
commit 431bb08059
6 changed files with 103 additions and 103 deletions

View file

@ -195,21 +195,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_VISION_PATCH_SIZE, "vision.patch_size" },
{ LLM_KV_VISION_IMAGE_MEAN, "vision.image_mean" },
{ LLM_KV_VISION_IMAGE_STD, "vision.image_std" },
{ LLM_KV_VISION_CLIP_ARCHITECTURE, "vision.clip.architecture" },
{ LLM_KV_VISION_CLIP_CONTEXT_LENGTH, "vision.clip.context_length" },
{ LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, "vision.clip.embedding_length" },
{ LLM_KV_VISION_CLIP_BLOCK_COUNT, "vision.clip.block_count" },
{ LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, "vision.clip.feed_forward_length" },
{ LLM_KV_VISION_CLIP_PROJECTION_TYPE, "vision.clip.projection_type" },
{ LLM_KV_VISION_CLIP_PROJECTION_DIM, "vision.clip.projection_dim" },
{ LLM_KV_VISION_CLIP_USE_GELU, "vision.clip.use_gelu" },
{ LLM_KV_VISION_CLIP_MAX_POS_EMBD, "vision.clip.max_position_embeddings" },
{ LLM_KV_VISION_CLIP_MAX_SLICES, "vision.clip.max_slices" },
{ LLM_KV_VISION_CLIP_PROJECTOR_TYPE, "vision.clip.projector_type" },
{ LLM_KV_VISION_CLIP_SELECT_LAYER, "vision.clip.select_layer" },
{ LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" },
{ LLM_KV_VISION_CLIP_HEAD_COUNT, "vision.clip.attention.head_count" },
{ LLM_KV_VISION_CLIP_LAYERNORM_EPS, "vision.clip.attention.layer_norm_epsilon" },
{ LLM_KV_VISION_VIT_ARCHITECTURE, "vision.vit.architecture" },
{ LLM_KV_VISION_VIT_CONTEXT_LENGTH, "vision.vit.context_length" },
{ LLM_KV_VISION_VIT_EMBEDDING_LENGTH, "vision.vit.embedding_length" },
{ LLM_KV_VISION_VIT_BLOCK_COUNT, "vision.vit.block_count" },
{ LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, "vision.vit.feed_forward_length" },
{ LLM_KV_VISION_VIT_PROJECTION_TYPE, "vision.vit.projection_type" },
{ LLM_KV_VISION_VIT_PROJECTION_DIM, "vision.vit.projection_dim" },
{ LLM_KV_VISION_VIT_USE_GELU, "vision.vit.use_gelu" },
{ LLM_KV_VISION_VIT_MAX_POS_EMBD, "vision.vit.max_position_embeddings" },
{ LLM_KV_VISION_VIT_MAX_SLICES, "vision.vit.max_slices" },
{ LLM_KV_VISION_VIT_PROJECTOR_TYPE, "vision.vit.projector_type" },
{ LLM_KV_VISION_VIT_SELECT_LAYER, "vision.vit.select_layer" },
{ LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" },
{ LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" },
{ LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },

View file

@ -205,21 +205,21 @@ enum llm_kv {
LLM_KV_VISION_PATCH_SIZE,
LLM_KV_VISION_IMAGE_MEAN,
LLM_KV_VISION_IMAGE_STD,
LLM_KV_VISION_CLIP_ARCHITECTURE,
LLM_KV_VISION_CLIP_CONTEXT_LENGTH,
LLM_KV_VISION_CLIP_EMBEDDING_LENGTH,
LLM_KV_VISION_CLIP_BLOCK_COUNT,
LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH,
LLM_KV_VISION_CLIP_PROJECTION_TYPE,
LLM_KV_VISION_CLIP_PROJECTION_DIM,
LLM_KV_VISION_CLIP_USE_GELU,
LLM_KV_VISION_CLIP_MAX_POS_EMBD,
LLM_KV_VISION_CLIP_MAX_SLICES,
LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
LLM_KV_VISION_CLIP_SELECT_LAYER,
LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,
LLM_KV_VISION_CLIP_HEAD_COUNT,
LLM_KV_VISION_CLIP_LAYERNORM_EPS,
LLM_KV_VISION_VIT_ARCHITECTURE,
LLM_KV_VISION_VIT_CONTEXT_LENGTH,
LLM_KV_VISION_VIT_EMBEDDING_LENGTH,
LLM_KV_VISION_VIT_BLOCK_COUNT,
LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH,
LLM_KV_VISION_VIT_PROJECTION_TYPE,
LLM_KV_VISION_VIT_PROJECTION_DIM,
LLM_KV_VISION_VIT_USE_GELU,
LLM_KV_VISION_VIT_MAX_POS_EMBD,
LLM_KV_VISION_VIT_MAX_SLICES,
LLM_KV_VISION_VIT_PROJECTOR_TYPE,
LLM_KV_VISION_VIT_SELECT_LAYER,
LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,
LLM_KV_VISION_VIT_HEAD_COUNT,
LLM_KV_VISION_VIT_LAYERNORM_EPS,
// deprecated:
LLM_KV_TOKENIZER_PREFIX_ID,

View file

@ -1251,23 +1251,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
auto & vparams = clip.hparams;
std::string vision_type;
ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
if (vision_type == "clip-vit") {
LLAMA_LOG_INFO("%s: loading clip-vit vision model\n", __func__);
if (vision_type == "vit") {
LLAMA_LOG_INFO("%s: loading ViT vision model\n", __func__);
has_vision = true;
ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, vparams.image_std, 3, true);
ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, vparams.hidden_size, true);
ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, vparams.n_layer, true);
ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true);
ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true);
ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true);
ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true);
ml.get_key(LLM_KV_VISION_VIT_EMBEDDING_LENGTH, vparams.hidden_size, true);
ml.get_key(LLM_KV_VISION_VIT_BLOCK_COUNT, vparams.n_layer, true);
ml.get_key(LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
ml.get_key(LLM_KV_VISION_VIT_HEAD_COUNT, vparams.n_head, true);
ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true);
ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true);
ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true);
{
std::string name;
ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
vparams.proj_type = clip_projector_type_from_name(name);
if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
@ -1275,12 +1275,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
{
std::string name;
ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false);
ml.get_key(LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, name, false);
vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
}
{
std::string arch;
ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true);
vparams.arch = vision_arch_from_string(arch);
if (vparams.arch == VISION_ARCH_UNKNOWN) {
throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));