Rename GGUF vision KV keys from `vision.clip.*` to `vision.vit.*` (vision type "clip-vit" becomes "vit")
This commit is contained in:
parent
4a7ab89d75
commit
431bb08059
6 changed files with 103 additions and 103 deletions
|
@ -195,21 +195,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_VISION_PATCH_SIZE, "vision.patch_size" },
|
||||
{ LLM_KV_VISION_IMAGE_MEAN, "vision.image_mean" },
|
||||
{ LLM_KV_VISION_IMAGE_STD, "vision.image_std" },
|
||||
{ LLM_KV_VISION_CLIP_ARCHITECTURE, "vision.clip.architecture" },
|
||||
{ LLM_KV_VISION_CLIP_CONTEXT_LENGTH, "vision.clip.context_length" },
|
||||
{ LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, "vision.clip.embedding_length" },
|
||||
{ LLM_KV_VISION_CLIP_BLOCK_COUNT, "vision.clip.block_count" },
|
||||
{ LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, "vision.clip.feed_forward_length" },
|
||||
{ LLM_KV_VISION_CLIP_PROJECTION_TYPE, "vision.clip.projection_type" },
|
||||
{ LLM_KV_VISION_CLIP_PROJECTION_DIM, "vision.clip.projection_dim" },
|
||||
{ LLM_KV_VISION_CLIP_USE_GELU, "vision.clip.use_gelu" },
|
||||
{ LLM_KV_VISION_CLIP_MAX_POS_EMBD, "vision.clip.max_position_embeddings" },
|
||||
{ LLM_KV_VISION_CLIP_MAX_SLICES, "vision.clip.max_slices" },
|
||||
{ LLM_KV_VISION_CLIP_PROJECTOR_TYPE, "vision.clip.projector_type" },
|
||||
{ LLM_KV_VISION_CLIP_SELECT_LAYER, "vision.clip.select_layer" },
|
||||
{ LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" },
|
||||
{ LLM_KV_VISION_CLIP_HEAD_COUNT, "vision.clip.attention.head_count" },
|
||||
{ LLM_KV_VISION_CLIP_LAYERNORM_EPS, "vision.clip.attention.layer_norm_epsilon" },
|
||||
{ LLM_KV_VISION_VIT_ARCHITECTURE, "vision.vit.architecture" },
|
||||
{ LLM_KV_VISION_VIT_CONTEXT_LENGTH, "vision.vit.context_length" },
|
||||
{ LLM_KV_VISION_VIT_EMBEDDING_LENGTH, "vision.vit.embedding_length" },
|
||||
{ LLM_KV_VISION_VIT_BLOCK_COUNT, "vision.vit.block_count" },
|
||||
{ LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, "vision.vit.feed_forward_length" },
|
||||
{ LLM_KV_VISION_VIT_PROJECTION_TYPE, "vision.vit.projection_type" },
|
||||
{ LLM_KV_VISION_VIT_PROJECTION_DIM, "vision.vit.projection_dim" },
|
||||
{ LLM_KV_VISION_VIT_USE_GELU, "vision.vit.use_gelu" },
|
||||
{ LLM_KV_VISION_VIT_MAX_POS_EMBD, "vision.vit.max_position_embeddings" },
|
||||
{ LLM_KV_VISION_VIT_MAX_SLICES, "vision.vit.max_slices" },
|
||||
{ LLM_KV_VISION_VIT_PROJECTOR_TYPE, "vision.vit.projector_type" },
|
||||
{ LLM_KV_VISION_VIT_SELECT_LAYER, "vision.vit.select_layer" },
|
||||
{ LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" },
|
||||
{ LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" },
|
||||
{ LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" },
|
||||
|
||||
// deprecated
|
||||
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
||||
|
|
|
@ -205,21 +205,21 @@ enum llm_kv {
|
|||
LLM_KV_VISION_PATCH_SIZE,
|
||||
LLM_KV_VISION_IMAGE_MEAN,
|
||||
LLM_KV_VISION_IMAGE_STD,
|
||||
LLM_KV_VISION_CLIP_ARCHITECTURE,
|
||||
LLM_KV_VISION_CLIP_CONTEXT_LENGTH,
|
||||
LLM_KV_VISION_CLIP_EMBEDDING_LENGTH,
|
||||
LLM_KV_VISION_CLIP_BLOCK_COUNT,
|
||||
LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH,
|
||||
LLM_KV_VISION_CLIP_PROJECTION_TYPE,
|
||||
LLM_KV_VISION_CLIP_PROJECTION_DIM,
|
||||
LLM_KV_VISION_CLIP_USE_GELU,
|
||||
LLM_KV_VISION_CLIP_MAX_POS_EMBD,
|
||||
LLM_KV_VISION_CLIP_MAX_SLICES,
|
||||
LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
|
||||
LLM_KV_VISION_CLIP_SELECT_LAYER,
|
||||
LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,
|
||||
LLM_KV_VISION_CLIP_HEAD_COUNT,
|
||||
LLM_KV_VISION_CLIP_LAYERNORM_EPS,
|
||||
LLM_KV_VISION_VIT_ARCHITECTURE,
|
||||
LLM_KV_VISION_VIT_CONTEXT_LENGTH,
|
||||
LLM_KV_VISION_VIT_EMBEDDING_LENGTH,
|
||||
LLM_KV_VISION_VIT_BLOCK_COUNT,
|
||||
LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH,
|
||||
LLM_KV_VISION_VIT_PROJECTION_TYPE,
|
||||
LLM_KV_VISION_VIT_PROJECTION_DIM,
|
||||
LLM_KV_VISION_VIT_USE_GELU,
|
||||
LLM_KV_VISION_VIT_MAX_POS_EMBD,
|
||||
LLM_KV_VISION_VIT_MAX_SLICES,
|
||||
LLM_KV_VISION_VIT_PROJECTOR_TYPE,
|
||||
LLM_KV_VISION_VIT_SELECT_LAYER,
|
||||
LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,
|
||||
LLM_KV_VISION_VIT_HEAD_COUNT,
|
||||
LLM_KV_VISION_VIT_LAYERNORM_EPS,
|
||||
|
||||
// deprecated:
|
||||
LLM_KV_TOKENIZER_PREFIX_ID,
|
||||
|
|
|
@ -1251,23 +1251,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
auto & vparams = clip.hparams;
|
||||
std::string vision_type;
|
||||
ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
|
||||
if (vision_type == "clip-vit") {
|
||||
LLAMA_LOG_INFO("%s: loading clip-vit vision model\n", __func__);
|
||||
if (vision_type == "vit") {
|
||||
LLAMA_LOG_INFO("%s: loading ViT vision model\n", __func__);
|
||||
has_vision = true;
|
||||
ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
|
||||
ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
|
||||
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
|
||||
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, vparams.image_std, 3, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, vparams.hidden_size, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, vparams.n_layer, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_EMBEDDING_LENGTH, vparams.hidden_size, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_BLOCK_COUNT, vparams.n_layer, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_HEAD_COUNT, vparams.n_head, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true);
|
||||
{
|
||||
std::string name;
|
||||
ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
|
||||
vparams.proj_type = clip_projector_type_from_name(name);
|
||||
if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
|
||||
throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
|
||||
|
@ -1275,12 +1275,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
{
|
||||
std::string name;
|
||||
ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false);
|
||||
ml.get_key(LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, name, false);
|
||||
vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
|
||||
}
|
||||
{
|
||||
std::string arch;
|
||||
ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
|
||||
ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true);
|
||||
vparams.arch = vision_arch_from_string(arch);
|
||||
if (vparams.arch == VISION_ARCH_UNKNOWN) {
|
||||
throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue