diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1c8f912a9..e6b4cd5f2 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -471,7 +471,7 @@ class Model:
             text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
             hparams = {**text_config, **hparams}
         return hparams
-
+
     @staticmethod
     def load_preprocessor_config(dir_model: Path):
         file_path = dir_model / "preprocessor_config.json"
@@ -1590,7 +1590,7 @@ class LlamaModel(Model):
 
         # For vision model
         if self.vparams is not None and self.preprocessor_config is not None:
-            self.gguf_writer.add_vision_type("clip")
+            self.gguf_writer.add_vision_type("clip-vit")
             self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
             self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
             self.gguf_writer.add_vision_clip_architecture("llava")
@@ -1600,6 +1600,8 @@ class LlamaModel(Model):
             self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
             self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
             self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
+            self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
+            self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
             max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
             # TODO: should not hardcode these, but they are currently missing from config.json
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index b83dc311a..d351a56d1 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -173,13 +173,15 @@ class Keys:
         MIDDLE_ID      = "tokenizer.ggml.middle_token_id"
         EOT_ID         = "tokenizer.ggml.eot_token_id"
         EOM_ID         = "tokenizer.ggml.eom_token_id"
+        IMAGE_START_ID = "tokenizer.ggml.image_start_token_id"
+        IMAGE_END_ID   = "tokenizer.ggml.image_end_token_id"
 
     class Adapter:
         TYPE       = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
     class Vision:
-        # only support vision.type = "clip" for now
+        # only support vision.type = "clip-vit" for now
         TYPE       = "vision.type"
         IMAGE_SIZE = "vision.image_size"
         PATCH_SIZE = "vision.patch_size"
@@ -196,7 +198,10 @@ class Keys:
             PROJECTION_DIM    = "vision.clip.projection_dim"
             USE_GELU          = "vision.clip.use_gelu"
             MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
+            MAX_SLICES        = "vision.clip.max_slices"
             PROJECTOR_TYPE    = "vision.clip.projector_type"
+            SELECT_LAYER      = "vision.clip.select_layer"
+            PATCH_MERGE_TYPE  = "vision.clip.patch_merge_type"
             HEAD_COUNT        = "vision.clip.attention.head_count"
             LAYERNORM_EPS     = "vision.clip.attention.layer_norm_epsilon"
 
@@ -1430,6 +1435,11 @@ class CLIPProjectorType(Enum):
     MLP = 'mlp'
 
 
+class CLIPPatchMergeType(Enum):
+    FLAT = 'flat'
+    SPATIAL_UNPAD = 'spatial_unpad'
+
+
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index e44ef9a1d..02c2cf64e 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -27,6 +27,7 @@ from .constants import (
     PoolingType,
     TokenType,
     CLIPProjectorType,
+    CLIPPatchMergeType,
 )
 
 from .quants import quant_shape_from_byte_shape
@@ -848,6 +849,15 @@ class GGUFWriter:
     def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
         self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
 
+    def add_vision_clip_max_slices(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)
+
+    def add_vision_clip_select_layer(self, value: int) -> None:
+        self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)
+
+    def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
+        self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)
+
     def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
         self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
 
diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp
index dab3b999a..2950579ee 100644
--- a/src/llama-vision.cpp
+++ b/src/llama-vision.cpp
@@ -54,6 +54,22 @@ struct clip_image_f32 {
 using clip_image_f32_batch = std::vector<clip_image_f32>;
 using clip_image_f8_batch  = std::vector<clip_image_f8>;
 
+clip_projector_type projector_type_from_name(std::string & name) {
+    if (name == "mlp") {
+        return CLIP_PROJECTOR_TYPE_MLP;
+    }
+    return CLIP_PROJECTOR_TYPE_UNKNOWN;
+}
+
+mm_patch_merge mm_patch_merge_from_name(std::string & name) {
+    if (name == "flat") {
+        return MM_PATCH_MERGE_FLAT;
+    } else if (name == "spatial_unpad") {
+        return MM_PATCH_MERGE_SPATIAL_UNPAD;
+    }
+    return MM_PATCH_MERGE_UNKNOWN;
+}
+
 int clip_n_patches(const clip_context & ctx) {
     auto & hparams = ctx.model->hparams;
     int n_patches = (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
@@ -456,7 +472,7 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
     }
 
     // loop over layers
-    for (int il = 0; il < (int)hparams.n_layer - 2; il++) {
+    for (int il = 0; il < (int)hparams.n_layer + hparams.select_layer; il++) {
         struct ggml_tensor * cur = embeddings;
 
         // layernorm1
diff --git a/src/llama-vision.h b/src/llama-vision.h
index dfcab10a5..1b2dbf5a4 100644
--- a/src/llama-vision.h
+++ b/src/llama-vision.h
@@ -11,10 +11,12 @@ enum vision_arch {
 };
 
 enum clip_projector_type {
+    CLIP_PROJECTOR_TYPE_UNKNOWN,
     CLIP_PROJECTOR_TYPE_MLP,
 };
 
 enum mm_patch_merge {
+    MM_PATCH_MERGE_UNKNOWN,
     MM_PATCH_MERGE_FLAT,
     MM_PATCH_MERGE_SPATIAL_UNPAD,
 };
@@ -30,11 +32,12 @@ struct clip_hparams {
     uint32_t n_head;
     uint32_t n_layer;
     uint32_t max_pos_embd;
+    int32_t select_layer = 0;
     bool use_gelu = false;
 
     float eps;
 
-    clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_MLP;
+    clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_UNKNOWN;
     mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_FLAT;
 
     std::array<float, 3> image_mean;
@@ -112,6 +115,8 @@ struct clip_context {
     std::vector<float> output; // size == n_output * n_embd
 };
 
+mm_patch_merge mm_patch_merge_from_name(std::string & name);
+clip_projector_type projector_type_from_name(std::string & name);
 int clip_n_patches(const clip_context & ctx);
 int clip_n_mmproj_embd(const clip_context & ctx);
 int clip_n_embd(const clip_context & ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index 08b1aa17e..c4f1c1d76 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -401,7 +401,10 @@ enum llm_kv {
     LLM_KV_VISION_CLIP_PROJECTION_DIM,
     LLM_KV_VISION_CLIP_USE_GELU,
     LLM_KV_VISION_CLIP_MAX_POS_EMBD,
+    LLM_KV_VISION_CLIP_MAX_SLICES,
     LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
+    LLM_KV_VISION_CLIP_SELECT_LAYER,
+    LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,
     LLM_KV_VISION_CLIP_HEAD_COUNT,
     LLM_KV_VISION_CLIP_LAYERNORM_EPS,
 };
@@ -527,7 +530,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VISION_CLIP_PROJECTION_DIM,   "vision.clip.projection_dim" },
     { LLM_KV_VISION_CLIP_USE_GELU,         "vision.clip.use_gelu" },
     { LLM_KV_VISION_CLIP_MAX_POS_EMBD,     "vision.clip.max_position_embeddings" },
+    { LLM_KV_VISION_CLIP_MAX_SLICES,       "vision.clip.max_slices" },
     { LLM_KV_VISION_CLIP_PROJECTOR_TYPE,   "vision.clip.projector_type" },
+    { LLM_KV_VISION_CLIP_SELECT_LAYER,     "vision.clip.select_layer" },
+    { LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" },
     { LLM_KV_VISION_CLIP_HEAD_COUNT,       "vision.clip.attention.head_count" },
     { LLM_KV_VISION_CLIP_LAYERNORM_EPS,    "vision.clip.attention.layer_norm_epsilon" },
 };
@@ -6227,9 +6233,8 @@ static void llm_load_hparams(
     auto & vparams = model.clip.hparams;
     std::string vision_type;
     ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
-    if (vision_type == "clip") {
+    if (vision_type == "clip-vit") {
         model.has_vision = true;
-        std::string proj_type;
         ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
         ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
         ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
@@ -6239,18 +6244,28 @@ static void llm_load_hparams(
         ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
         ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true);
         ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true);
-        ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, proj_type, true);
-        if (proj_type == "mlp") {
-            vparams.proj_type = CLIP_PROJECTOR_TYPE_MLP;
-        } else {
-            throw std::runtime_error(format("unsupported clip projector type: %s", proj_type.c_str()));
+        ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true);
+        {
+            std::string name;
+            ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
+            vparams.proj_type = projector_type_from_name(name);
+            if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
+                throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
+            }
         }
-        std::string arch;
-        ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
-        for (auto & it : VISION_ARCH_NAMES) {
-            if (arch == it.second) {
-                vparams.arch = it.first;
-                break;
+        {
+            std::string name;
+            ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false);
+            vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
+        }
+        {
+            std::string arch;
+            ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
+            for (auto & it : VISION_ARCH_NAMES) {
+                if (arch == it.second) {
+                    vparams.arch = it.first;
+                    break;
+                }
             }
         }
     } else if (!vision_type.empty()) {