add SmolVLM

Xuan Son Nguyen 2025-01-23 15:51:30 +01:00
parent 25a97ce4cb
commit c3a654c0fb
9 changed files with 171 additions and 10 deletions


@@ -292,7 +292,10 @@ class Model:
             self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"])
             self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"])
             self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"])
-            self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
+            try:
+                self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
+            except KeyError:
+                self.gguf_writer.add_vision_vit_select_layer(0)

         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
@@ -506,8 +509,9 @@ class Model:
             hparams = json.load(f)
         if "text_config" in hparams:
             text_config = hparams["text_config"]
+            model_id = text_config.get("_name_or_path", None)
             # for example, llava-1.5-7b-hf misses the language model config, need to retrieve it via model ID
-            if "_name_or_path" in text_config:
+            if model_id is not None and model_id != "None" and model_id != "":
                 text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
             hparams = {**text_config, **hparams}
         return hparams
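The guard now skips the AutoConfig lookup not only when "_name_or_path" is missing, but also when it is present yet empty or the literal string "None". A small sketch of which values trigger a lookup (inputs are illustrative):

    # Illustrative inputs; only the presence/value of "_name_or_path" matters here
    for text_config in (
        {"_name_or_path": "meta-llama/Llama-2-7b-hf"},  # real model ID -> fetch config
        {"_name_or_path": ""},                          # empty string  -> skip
        {"_name_or_path": "None"},                      # literal "None" -> skip
        {},                                             # key missing    -> skip
    ):
        model_id = text_config.get("_name_or_path", None)
        should_fetch = model_id is not None and model_id != "None" and model_id != ""
        print(repr(model_id), "fetch" if should_fetch else "skip")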
@@ -1616,7 +1620,7 @@ class StableLMModel(Model):
             raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM", "Idefics3ForConditionalGeneration")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
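Registering "Idefics3ForConditionalGeneration" routes SmolVLM-style checkpoints, whose config.json lists that architecture, to LlamaModel. A hypothetical, much-reduced sketch of how a registry-by-architecture-name dispatch works (not the converter's actual decorator):

    # Hypothetical registry sketch: map config.json "architectures" entries to handler classes
    _registry: dict[str, type] = {}

    def register(*names: str):
        def wrap(cls: type) -> type:
            for name in names:
                _registry[name] = cls
            return cls
        return wrap

    @register("LlamaForCausalLM", "Idefics3ForConditionalGeneration")
    class LlamaModel:
        pass

    # a SmolVLM config.json would list "Idefics3ForConditionalGeneration"
    print(_registry["Idefics3ForConditionalGeneration"].__name__)  # LlamaModel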
@@ -1640,6 +1644,11 @@ class LlamaModel(Model):
             self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict()
             self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM

+        if "vision_config" in self.hparams and model_type == "idefics3":
+            self.vparams = self.hparams["vision_config"]
+            self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
+            self.vision_arch = gguf.MODEL_ARCH.VISION_IDEFICS3
+
         if self.vparams is not None and self.vision_arch is not None:
             self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
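The new branch picks up the nested vision_config and reads preprocessor_config.json from the model directory instead of a separate vision model ID. A sketch of the config shape this branch expects; the values are placeholders, not copied from a real SmolVLM checkpoint:

    # Placeholder values illustrating the nested layout an Idefics3/SmolVLM config exposes
    hparams = {
        "model_type": "idefics3",
        "scale_factor": 3,
        "vision_config": {
            "image_size": 384,
            "patch_size": 14,
            "num_hidden_layers": 27,
            "num_attention_heads": 16,
        },
    }

    model_type = hparams["model_type"]
    if "vision_config" in hparams and model_type == "idefics3":
        vparams = hparams["vision_config"]
        # the tensor name map is later sized from the vision encoder's layer count
        print("vision layers:", vparams["num_hidden_layers"])  # vision layers: 27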
@@ -1694,14 +1703,20 @@ class LlamaModel(Model):
         # For vision model
         if self.vparams is not None:
+            max_pos_embd = -1
             self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
             # TODO: should not hardcode these, but they are currently missing from config.json
             if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
                 self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
                 self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
+            if self.vision_arch == gguf.MODEL_ARCH.VISION_IDEFICS3:
+                self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
+                self.gguf_writer.add_vision_vit_scale_factor(self.hparams["scale_factor"])
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
             self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05)
-            max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)

     @staticmethod
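The position-embedding count is now computed per architecture: the number of image patches, plus one for the CLS token in the CLIP-style encoders (LLaVA, MobileVLM); the Idefics3/SigLIP-style encoder has no CLS token, hence no +1 in that branch. A worked example with assumed sizes (real values come from vision_config):

    # Assumed sizes for illustration only
    image_size, patch_size = 384, 14
    n_patches = (image_size // patch_size) ** 2  # 27 * 27 = 729 patch positions
    print(n_patches + 1)  # 730: CLIP-style encoder with a prepended CLS token
    print(n_patches)      # 729: Idefics3/SigLIP-style encoder, patches only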
@@ -1717,19 +1732,23 @@ class LlamaModel(Model):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name

         # For vision model
         if name.startswith("language_model"):
             name = name.replace("language_model.", "")
+        if name.startswith("model.text_model"):
+            name = name.replace("text_model.", "") # for SmolVLM
         else:
             name = name.replace("model.vision_tower.", "")
-        if "post_layernorm" in name:
+        if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3:
             return [] # skip post_layernorm

-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if not is_vision_tensor:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
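Two things happen in this hunk: SmolVLM text-model tensors ("model.text_model.*") are renamed so the existing LLaMA tensor mapping applies, and the q/k permutation (a LLaMA-specific RoPE weight reordering) is now restricted to language-model tensors so vision weights pass through unchanged. A small sketch with illustrative tensor names:

    # Illustrative tensor names; only the prefixes matter for the logic above
    names = [
        "model.text_model.layers.0.self_attn.q_proj.weight",
        "model.vision_model.encoder.layers.0.self_attn.q_proj.weight",
    ]
    for name in names:
        is_vision_tensor = "vision_tower" in name or "vision_model" in name
        if name.startswith("model.text_model"):
            name = name.replace("text_model.", "")  # -> model.layers.0.self_attn.q_proj.weight
        # permuting q/k only makes sense for the LLaMA language model's RoPE layout
        print(name, "| permute q/k:", not is_vision_tensor)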