add llava to conversion
commit cd806a7e88 (parent 1b2f992cd2)
4 changed files with 200 additions and 4 deletions
@@ -66,6 +66,11 @@ class Model:
     dir_model_card: Path
     is_lora: bool
 
+    # for vision model
+    vparams: dict[str, Any] | None = None
+    v_tensor_map: gguf.TensorNameMap
+    v_tensor_names: set[str] | None
+
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
@@ -210,9 +215,13 @@ class Model:
 
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
-        if new_name is None:
+        new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes)
+        if new_name is not None:
+            return new_name
+        elif new_name_vision is not None:
+            return new_name_vision
+        else:
             raise ValueError(f"Can not map tensor {name!r}")
-        return new_name
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
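Note (not part of the diff): a minimal sketch of how the fallback above is expected to behave, assuming a LlamaModel instance `model` built from a LLaVA checkpoint so that both tensor_map and v_tensor_map are populated:

    # language-model tensors still resolve through the regular tensor_map
    model.map_tensor_name("model.layers.0.self_attn.q_proj.weight")
    # -> "blk.0.attn_q.weight"

    # vision-tower tensors fall through to v_tensor_map
    model.map_tensor_name("vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight")
    # -> "v.enc.blk.0.attn_q.weight" (per the LLAVA_VISION mappings added below)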
@@ -452,7 +461,10 @@ class Model:
     @staticmethod
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            return json.load(f)
+            hparams = json.load(f)
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+        return hparams
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
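Note (not part of the diff): a hedged sketch of what the text_config merge does for a LLaVA-style config.json; the values are made up for illustration:

    raw = {
        "architectures": ["LlavaForConditionalGeneration"],
        "vision_config": {"image_size": 336, "patch_size": 14},
        "text_config": {"hidden_size": 4096, "num_hidden_layers": 32},
    }
    hparams = {**raw, **raw["text_config"]}
    hparams["hidden_size"]    # 4096 -- text-model fields now readable at the top level
    hparams["vision_config"]  # still present, picked up by LlamaModel.__init__ below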
@@ -1501,10 +1513,17 @@ class StableLMModel(Model):
             raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if "vision_config" in self.hparams:
+            self.vparams = self.hparams["vision_config"]
+        if self.vparams is not None:
+            self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"])
+
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
@@ -1554,6 +1573,17 @@ class LlamaModel(Model):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
 
+        # For vision model
+        if self.vparams is not None:
+            self.gguf_writer.add_vision_type("clip")
+            self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
+            self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
+            self.gguf_writer.add_vision_clip_architecture("llava")
+            self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
+            self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
+            self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
+            self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
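Note (not part of the diff): with a typical CLIP ViT-L/14-336 vision_config (the values below are illustrative, not taken from this commit), the calls above would write GGUF metadata along the lines of:

    vision.type                      = "clip"
    vision.image_size                = 336
    vision.patch_size                = 14
    vision.clip.architecture         = "llava"
    vision.clip.block_count          = 24
    vision.clip.embedding_length     = 1024
    vision.clip.feed_forward_length  = 4096
    vision.clip.attention.head_count = 16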
@@ -1568,6 +1598,9 @@ class LlamaModel(Model):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
+        if name.startswith("language_model"):
+            name = name.replace("language_model.", "")
+
         if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight", "k_proj.bias")):
@@ -178,6 +178,26 @@ class Keys:
         TYPE       = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
+    class Vision:
+        # only support vision.type = "clip" for now
+        TYPE       = "vision.type"
+        IMAGE_SIZE = "vision.image_size"
+        PATCH_SIZE = "vision.patch_size"
+        IMAGE_MEAN = "vision.image_mean"
+        IMAGE_STD  = "vision.image_std"
+
+        class Clip:
+            ARCHITECTURE        = "vision.clip.architecture"
+            CONTEXT_LENGTH      = "vision.clip.context_length"
+            EMBEDDING_LENGTH    = "vision.clip.embedding_length"
+            BLOCK_COUNT         = "vision.clip.block_count"
+            FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length"
+            PROJECTION_TYPE     = "vision.clip.projection_type"
+            PROJECTION_DIM      = "vision.clip.projection_dim"
+            USE_GELU            = "vision.clip.use_gelu"
+            HEAD_COUNT          = "vision.clip.attention.head_count"
+            LAYERNORM_EPS       = "vision.clip.attention.layer_norm_epsilon"
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
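Note (not part of the diff): these nested classes hold plain string constants; e.g. Keys.Vision.Clip.HEAD_COUNT evaluates to "vision.clip.attention.head_count", which is the key the new GGUFWriter helpers further down write.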
@@ -238,6 +258,8 @@ class MODEL_ARCH(IntEnum):
     GRANITE      = auto()
     GRANITE_MOE  = auto()
     CHAMELEON    = auto()
+    # vision models
+    LLAVA_VISION = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -345,6 +367,22 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_DOWN      = auto()
     ENC_FFN_UP        = auto()
     ENC_OUTPUT_NORM   = auto()
+    # vision
+    V_MMPROJ_A        = auto()
+    V_MMPROJ_B        = auto()
+    V_ENC_EMBD_CLS    = auto()
+    V_ENC_EMBD_PATCH  = auto()
+    V_ENC_EMBD_POS    = auto()
+    V_ENC_ATTN_Q      = auto()
+    V_ENC_ATTN_K      = auto()
+    V_ENC_ATTN_V      = auto()
+    V_ENC_INPUT_NORM  = auto()
+    V_ENC_OUTPUT      = auto()
+    V_ENC_OUTPUT_NORM = auto()
+    V_ENC_FFN_UP      = auto()
+    V_ENC_FFN_DOWN    = auto()
+    V_PRE_NORM        = auto()
+    V_POST_NORM       = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -397,6 +435,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GRANITE:      "granite",
     MODEL_ARCH.GRANITE_MOE:  "granitemoe",
     MODEL_ARCH.CHAMELEON:    "chameleon",
+    # vision
+    MODEL_ARCH.LLAVA_VISION: "llava",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -504,6 +544,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ENC_FFN_DOWN:      "enc.blk.{bid}.ffn_down",
     MODEL_TENSOR.ENC_FFN_UP:        "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM:   "enc.output_norm",
+    # vision
+    MODEL_TENSOR.V_MMPROJ_A:        "v.mmproj_a",
+    MODEL_TENSOR.V_MMPROJ_B:        "v.mmproj_b",
+    MODEL_TENSOR.V_ENC_EMBD_CLS:    "v.enc.embd.cls",
+    MODEL_TENSOR.V_ENC_EMBD_PATCH:  "v.enc.embd.patch",
+    MODEL_TENSOR.V_ENC_EMBD_POS:    "v.enc.embd.pos",
+    MODEL_TENSOR.V_ENC_ATTN_Q:      "v.enc.blk.{bid}.attn_q",
+    MODEL_TENSOR.V_ENC_ATTN_K:      "v.enc.blk.{bid}.attn_k",
+    MODEL_TENSOR.V_ENC_ATTN_V:      "v.enc.blk.{bid}.attn_v",
+    MODEL_TENSOR.V_ENC_INPUT_NORM:  "v.enc.blk.{bid}.input_norm",
+    MODEL_TENSOR.V_ENC_OUTPUT:      "v.enc.blk.{bid}.output",
+    MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.enc.blk.{bid}.output_norm",
+    MODEL_TENSOR.V_ENC_FFN_UP:      "v.enc.blk.{bid}.ffn_up",
+    MODEL_TENSOR.V_ENC_FFN_DOWN:    "v.enc.blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_PRE_NORM:        "v.pre_norm",
+    MODEL_TENSOR.V_POST_NORM:       "v.post_norm",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1279,6 +1335,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.LLAVA_VISION: [
+        MODEL_TENSOR.V_MMPROJ_A,
+        MODEL_TENSOR.V_MMPROJ_B,
+        MODEL_TENSOR.V_ENC_EMBD_CLS,
+        MODEL_TENSOR.V_ENC_EMBD_PATCH,
+        MODEL_TENSOR.V_ENC_EMBD_POS,
+        MODEL_TENSOR.V_ENC_ATTN_Q,
+        MODEL_TENSOR.V_ENC_ATTN_K,
+        MODEL_TENSOR.V_ENC_ATTN_V,
+        MODEL_TENSOR.V_ENC_INPUT_NORM,
+        MODEL_TENSOR.V_ENC_OUTPUT,
+        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.V_ENC_FFN_UP,
+        MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_PRE_NORM,
+        MODEL_TENSOR.V_POST_NORM,
+    ],
     # TODO
 }
 
@@ -814,6 +814,36 @@ class GGUFWriter:
     def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
         self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
 
+    def add_vision_type(self, value: str) -> None:
+        self.add_string(Keys.Vision.TYPE, value)
+
+    def add_vision_image_size(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.IMAGE_SIZE, value)
+
+    def add_vision_patch_size(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.PATCH_SIZE, value)
+
+    def add_vision_clip_architecture(self, value: str) -> None:
+        self.add_string(Keys.Vision.Clip.ARCHITECTURE, value)
+
+    def add_vision_clip_context_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value)
+
+    def add_vision_clip_embedding_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value)
+
+    def add_vision_clip_block_count(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value)
+
+    def add_vision_clip_feed_forward_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value)
+
+    def add_vision_clip_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)
+
+    def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
+        self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None
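Note (not part of the diff): a hedged usage sketch of the new helpers; the writer construction arguments and the numeric values are assumptions, not taken from this commit:

    from gguf import GGUFWriter

    writer = GGUFWriter("llava.gguf", arch="llama")  # assumed constructor arguments
    writer.add_vision_type("clip")
    writer.add_vision_image_size(336)                # placeholder values
    writer.add_vision_patch_size(14)
    writer.add_vision_clip_architecture("llava")
    writer.add_vision_clip_block_count(24)
    writer.add_vision_clip_layer_norm_epsilon(1e-5)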
@@ -679,6 +679,66 @@ class TensorNameMap:
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm", # t5
         ),
+
+        MODEL_TENSOR.V_MMPROJ_A: (
+            "multi_modal_projector.linear_1",
+        ),
+
+        MODEL_TENSOR.V_MMPROJ_B: (
+            "multi_modal_projector.linear_2",
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_CLS: (
+            "vision_tower.vision_model.embeddings.class_embedding",
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
+            "vision_tower.vision_model.embeddings.patch_embedding",
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_POS: (
+            "vision_tower.vision_model.embeddings.position_embedding",
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_Q: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_K: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_V: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+        ),
+
+        MODEL_TENSOR.V_ENC_INPUT_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
+        ),
+
+        MODEL_TENSOR.V_ENC_OUTPUT: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
+        ),
+
+        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_UP: (
+            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_DOWN: (
+            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+        ),
+
+        MODEL_TENSOR.V_PRE_NORM: (
+            "vision_tower.vision_model.pre_layrnorm",
+        ),
+
+        MODEL_TENSOR.V_POST_NORM: (
+            "vision_tower.vision_model.post_layernorm",
+        ),
     }
 
     # architecture-specific block mappings
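Note (not part of the diff): a short sketch of how these mappings resolve a checkpoint tensor name once the LLAVA_VISION map is built (as LlamaModel.__init__ does above); the block count here is a placeholder:

    import gguf

    v_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, 24)
    v_map.get_name("vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight",
                   try_suffixes=(".weight", ".bias"))
    # -> "v.enc.blk.3.attn_k.weight", following TENSOR_NAMES for V_ENC_ATTN_K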