add back convert hf to gguf

parent 0a81051ae2
commit 6cabdda0df

7 changed files with 266 additions and 6 deletions
@@ -202,6 +202,9 @@ class Keys:
        FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
        FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
        FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
        # Vision models
        IMAGE_START_ID = "tokenizer.ggml.image_start_token_id"
        IMAGE_END_ID = "tokenizer.ggml.image_end_token_id"
        # deprecated:
        PREFIX_ID = "tokenizer.ggml.prefix_token_id"
        SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
@@ -211,6 +214,31 @@ class Keys:
        TYPE = "adapter.type"
        LORA_ALPHA = "adapter.lora.alpha"

    class Vision:
        # only support vision.type = "clip-vit" for now
        TYPE = "vision.type"
        IMAGE_SIZE = "vision.image_size"
        PATCH_SIZE = "vision.patch_size"
        IMAGE_MEAN = "vision.image_mean"
        IMAGE_STD = "vision.image_std"

        class Clip:
            ARCHITECTURE = "vision.clip.architecture"
            CONTEXT_LENGTH = "vision.clip.context_length"
            EMBEDDING_LENGTH = "vision.clip.embedding_length"
            BLOCK_COUNT = "vision.clip.block_count"
            FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length"
            PROJECTION_TYPE = "vision.clip.projection_type"
            PROJECTION_DIM = "vision.clip.projection_dim"
            USE_GELU = "vision.clip.use_gelu"
            MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
            MAX_SLICES = "vision.clip.max_slices"
            PROJECTOR_TYPE = "vision.clip.projector_type"
            SELECT_LAYER = "vision.clip.select_layer"
            PATCH_MERGE_TYPE = "vision.clip.patch_merge_type"
            HEAD_COUNT = "vision.clip.attention.head_count"
            LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"

#
# recommended mapping of model tensor names for storage in gguf
#
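Note: these nested classes are only namespaces for the GGUF metadata key strings, so callers can refer to the keys symbolically instead of hard-coding the text. A minimal sketch (the `gguf` package import path is assumed, it is not shown in this diff):

    from gguf.constants import Keys

    # The nested classes just group the raw KV key strings added above.
    assert Keys.Vision.IMAGE_SIZE == "vision.image_size"
    assert Keys.Vision.Clip.PROJECTOR_TYPE == "vision.clip.projector_type"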
@@ -279,6 +307,8 @@ class MODEL_ARCH(IntEnum):
    GRANITE_MOE = auto()
    CHAMELEON = auto()
    WAVTOKENIZER_DEC = auto()
    # vision models
    LLAVA_VISION = auto()


class MODEL_TENSOR(IntEnum):
@@ -390,6 +420,7 @@ class MODEL_TENSOR(IntEnum):
    ENC_OUTPUT_NORM = auto()
    CLS = auto() # classifier
    CLS_OUT = auto() # classifier output projection
    # wavtokenizer
    CONV1D = auto()
    CONVNEXT_DW = auto()
    CONVNEXT_NORM = auto()
@@ -406,6 +437,21 @@ class MODEL_TENSOR(IntEnum):
    POSNET_ATTN_K = auto()
    POSNET_ATTN_V = auto()
    POSNET_ATTN_OUT = auto()
    # vision
    V_MMPROJ = auto()
    V_ENC_EMBD_CLS = auto()
    V_ENC_EMBD_PATCH = auto()
    V_ENC_EMBD_POS = auto()
    V_ENC_ATTN_Q = auto()
    V_ENC_ATTN_K = auto()
    V_ENC_ATTN_V = auto()
    V_ENC_INPUT_NORM = auto()
    V_ENC_OUTPUT = auto()
    V_ENC_OUTPUT_NORM = auto()
    V_ENC_FFN_UP = auto()
    V_ENC_FFN_DOWN = auto()
    V_PRE_NORM = auto()
    V_POST_NORM = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -593,6 +639,21 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
    MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
    MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
    # vision
    MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}",
    MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls",
    MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.enc.embd.patch",
    MODEL_TENSOR.V_ENC_EMBD_POS: "v.enc.embd.pos",
    MODEL_TENSOR.V_ENC_ATTN_Q: "v.enc.blk.{bid}.attn_q",
    MODEL_TENSOR.V_ENC_ATTN_K: "v.enc.blk.{bid}.attn_k",
    MODEL_TENSOR.V_ENC_ATTN_V: "v.enc.blk.{bid}.attn_v",
    MODEL_TENSOR.V_ENC_INPUT_NORM: "v.enc.blk.{bid}.input_norm",
    MODEL_TENSOR.V_ENC_OUTPUT: "v.enc.blk.{bid}.output",
    MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.enc.blk.{bid}.output_norm",
    MODEL_TENSOR.V_ENC_FFN_UP: "v.enc.blk.{bid}.ffn_up",
    MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down",
    MODEL_TENSOR.V_PRE_NORM: "v.pre_norm",
    MODEL_TENSOR.V_POST_NORM: "v.post_norm",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
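Note: the `{bid}` placeholder in these templates is the encoder block index, which a converter formats once per layer. A small sketch of how the names expand (the `gguf` import path is assumed):

    from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

    # Per-block template -> concrete tensor name for block 3.
    print(TENSOR_NAMES[MODEL_TENSOR.V_ENC_ATTN_Q].format(bid=3))  # v.enc.blk.3.attn_q
    # Non-block names have no placeholder and are used as-is.
    print(TENSOR_NAMES[MODEL_TENSOR.V_ENC_EMBD_CLS])              # v.enc.embd.cls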
@@ -1534,6 +1595,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.POSNET_ATTN_V,
        MODEL_TENSOR.POSNET_ATTN_OUT,
    ],
    MODEL_ARCH.LLAVA_VISION: [
        MODEL_TENSOR.V_MMPROJ,
        MODEL_TENSOR.V_ENC_EMBD_CLS,
        MODEL_TENSOR.V_ENC_EMBD_PATCH,
        MODEL_TENSOR.V_ENC_EMBD_POS,
        MODEL_TENSOR.V_ENC_ATTN_Q,
        MODEL_TENSOR.V_ENC_ATTN_K,
        MODEL_TENSOR.V_ENC_ATTN_V,
        MODEL_TENSOR.V_ENC_INPUT_NORM,
        MODEL_TENSOR.V_ENC_OUTPUT,
        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
        MODEL_TENSOR.V_ENC_FFN_UP,
        MODEL_TENSOR.V_ENC_FFN_DOWN,
        MODEL_TENSOR.V_PRE_NORM,
        MODEL_TENSOR.V_POST_NORM,
    ],
    # TODO
}
@@ -1615,6 +1692,15 @@ class PoolingType(IntEnum):
    CLS = 2


class CLIPProjectorType(Enum):
    MLP = 'mlp'


class CLIPPatchMergeType(Enum):
    FLAT = 'flat'
    SPATIAL_UNPAD = 'spatial_unpad'


class GGMLQuantizationType(IntEnum):
    F32 = 0
    F16 = 1
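Note: these are plain value enums; as the writer methods further down show, only the string payload (`.value`) ends up in the GGUF metadata. A quick illustration (import path assumed):

    from gguf.constants import CLIPProjectorType, CLIPPatchMergeType

    # The writer serializes the enum's string value, not the member name.
    print(CLIPProjectorType.MLP.value)             # mlp
    print(CLIPPatchMergeType.SPATIAL_UNPAD.value)  # spatial_unpad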
@@ -27,6 +27,8 @@ from .constants import (
    PoolingType,
    TokenType,
    ExpertGatingFuncType,
    CLIPPatchMergeType,
    CLIPProjectorType,
)

from .quants import quant_shape_from_byte_shape
@@ -874,6 +876,57 @@ class GGUFWriter:

    def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
        self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)

    def add_vision_type(self, value: str) -> None:
        self.add_string(Keys.Vision.TYPE, value)

    def add_vision_image_size(self, value: int) -> None:
        self.add_uint32(Keys.Vision.IMAGE_SIZE, value)

    def add_vision_patch_size(self, value: int) -> None:
        self.add_uint32(Keys.Vision.PATCH_SIZE, value)

    def add_vision_clip_architecture(self, value: str) -> None:
        self.add_string(Keys.Vision.Clip.ARCHITECTURE, value)

    def add_vision_clip_context_length(self, value: int) -> None:
        self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value)

    def add_vision_clip_embedding_length(self, value: int) -> None:
        self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value)

    def add_vision_clip_block_count(self, value: int) -> None:
        self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value)

    def add_vision_clip_feed_forward_length(self, value: int) -> None:
        self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value)

    def add_vision_clip_head_count(self, value: int) -> None:
        self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)

    def add_vision_clip_max_position_embeddings(self, value: int) -> None:
        self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)

    def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
        self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)

    def add_vision_clip_max_slices(self, value: int) -> None:
        self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)

    def add_vision_clip_select_layer(self, value: int) -> None:
        self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)

    def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
        self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)

    def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
        self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)

    def add_vision_clip_image_mean(self, value: Sequence[float]) -> None:
        self.add_array(Keys.Vision.IMAGE_MEAN, value)

    def add_vision_clip_image_std(self, value: Sequence[float]) -> None:
        self.add_array(Keys.Vision.IMAGE_STD, value)

    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
        if not isinstance(value, str):
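Note: a rough usage sketch for the new setters during HF-to-GGUF conversion. The method names come from this diff, but the output path, architecture string, and all numeric values below are hypothetical placeholders rather than any real model config:

    import gguf
    from gguf.constants import CLIPProjectorType, CLIPPatchMergeType

    writer = gguf.GGUFWriter("vision-model.gguf", arch="llava")  # hypothetical path/arch

    # Vision metadata (illustrative values only).
    writer.add_vision_type("clip-vit")
    writer.add_vision_image_size(336)
    writer.add_vision_patch_size(14)
    writer.add_vision_clip_block_count(24)
    writer.add_vision_clip_embedding_length(1024)
    writer.add_vision_clip_feed_forward_length(4096)
    writer.add_vision_clip_head_count(16)
    writer.add_vision_clip_layer_norm_epsilon(1e-5)
    writer.add_vision_clip_projector_type(CLIPProjectorType.MLP)
    writer.add_vision_clip_patch_merge_type(CLIPPatchMergeType.FLAT)
    writer.add_vision_clip_image_mean([0.481, 0.457, 0.408])
    writer.add_vision_clip_image_std([0.268, 0.261, 0.275])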
@@ -787,6 +787,64 @@ class TensorNameMap:
        MODEL_TENSOR.POSNET_ATTN_OUT: (
            "backbone.posnet.{bid}.proj_out", # wavtokenizer
        ),

        #############################################################################

        MODEL_TENSOR.V_MMPROJ: (
            "multi_modal_projector.linear_{bid}",
        ),

        MODEL_TENSOR.V_ENC_EMBD_CLS: (
            "vision_tower.vision_model.embeddings.class_embedding",
        ),

        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
            "vision_tower.vision_model.embeddings.patch_embedding",
        ),

        MODEL_TENSOR.V_ENC_EMBD_POS: (
            "vision_tower.vision_model.embeddings.position_embedding",
        ),

        MODEL_TENSOR.V_ENC_ATTN_Q: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
        ),

        MODEL_TENSOR.V_ENC_ATTN_K: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
        ),

        MODEL_TENSOR.V_ENC_ATTN_V: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
        ),

        MODEL_TENSOR.V_ENC_INPUT_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
        ),

        MODEL_TENSOR.V_ENC_OUTPUT: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
        ),

        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
        ),

        MODEL_TENSOR.V_ENC_FFN_UP: (
            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
        ),

        MODEL_TENSOR.V_ENC_FFN_DOWN: (
            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
        ),

        MODEL_TENSOR.V_PRE_NORM: (
            "vision_tower.vision_model.pre_layrnorm",
        ),

        MODEL_TENSOR.V_POST_NORM: (
            "vision_tower.vision_model.post_layernorm",
        ),
    }

    # architecture-specific block mappings
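Note: with these entries in place, the existing TensorNameMap lookup can translate Hugging Face checkpoint names into the GGUF names above. A sketch assuming the usual gguf-py API (`TensorNameMap(arch, n_blocks)` and `get_name`), which is not shown in this diff:

    from gguf.constants import MODEL_ARCH
    from gguf.tensor_mapping import TensorNameMap

    # Build the map for the new LLAVA_VISION arch with a hypothetical 24 encoder blocks.
    tmap = TensorNameMap(MODEL_ARCH.LLAVA_VISION, 24)

    hf_name = "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight"
    print(tmap.get_name(hf_name, try_suffixes=(".weight", ".bias")))  # v.enc.blk.0.attn_q.weight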