Avoid dropping last image encoder layer in llava models
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
parent 65935431b4
commit d85580c41c
1 changed file with 9 additions and 7 deletions
@@ -162,7 +162,7 @@ if args.use_f32:
 
 if args.clip_model_is_siglip:
     model = SiglipVisionModel.from_pretrained(dir_model)
-    processor = None # TODO - optionally handle processor to correctly extract image stats etc
+    processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
     model = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
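A note on the TODO dropped above: for SigLIP towers the script keeps processor = None. Below is a minimal sketch of what that TODO hinted at, assuming the model directory contains a preprocessor_config.json; SiglipImageProcessor and its default stats come from transformers, not from this diff:

from transformers import SiglipImageProcessor

dir_model = "./siglip-model-dir"  # placeholder, stands in for the script's dir_model
processor = SiglipImageProcessor.from_pretrained(dir_model)

# image stats a converter could record instead of hard-coding them
image_mean = processor.image_mean  # SigLIP defaults to [0.5, 0.5, 0.5]
image_std = processor.image_std    # likewise [0.5, 0.5, 0.5]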
@@ -228,10 +228,12 @@ if has_text_encoder:
     fout.add_token_list(tokens)
 
 if has_vision_encoder:
-    # vision feature layer may be an integer or an array.
-    # TODO - it seems like llama cpp may not handle this correctly
-    # normally; check if HF llava next models can run through this converter...
-    if "vision_feature_layer" in v_hparams:
+    ## FIXME Need to pull this out of the overall model config, not just the top one?
+    # TODO or document that vision_feature_layer can be set here, but it's usually in the
+    # llava config and not the vision config itself;
+    # Handle vision feature layers in transformers, where features may be taken
+    # from layers that are not the last. NOTE - these values can be unsigned...
+    if "vision_feature_layer" in config:
         feature_layers = v_hparams["vision_feature_layer"]
         if isinstance(feature_layers, int):
             feature_layers = [feature_layers]
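The comments added above describe the intent: vision_feature_layer usually lives in the top-level llava config rather than the vision config, it can be an int or a list, and its values can be negative. A minimal sketch of that normalization, assuming HF's convention that hidden_states holds the embedding output plus one entry per encoder block; the helper name and the -2 default are assumptions, not code from this repo:

def normalize_feature_layers(config: dict, v_hparams: dict, n_layers: int) -> list[int]:
    """Return vision feature layers as non-negative hidden-state indices."""
    layers = config.get("vision_feature_layer",
                        v_hparams.get("vision_feature_layer", -2))
    if isinstance(layers, int):
        layers = [layers]
    # hidden_states has n_layers + 1 entries (embeddings + one per encoder block),
    # so a negative index l maps to (n_layers + 1) + l
    return [l if l >= 0 else n_layers + 1 + l for l in layers]

# e.g. a 24-block CLIP tower with vision_feature_layer = [-2, -1] -> [23, 24];
# index 24 is only reachable if the last encoder block is not dropped.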
@@ -251,7 +253,7 @@ if has_vision_encoder:
     fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
-    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
+    block_count = v_hparams["num_hidden_layers"] #- 1 if has_llava_projector else v_hparams["num_hidden_layers"]
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
     # /**
     #     "image_grid_pinpoints": [
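A quick worked example of the block_count change above, using the 24 encoder blocks of a CLIP ViT-L/14 vision tower as an assumed size:

num_hidden_layers = 24   # assumed: CLIP ViT-L/14 vision tower
has_llava_projector = True

old_block_count = num_hidden_layers - 1 if has_llava_projector else num_hidden_layers  # 23
new_block_count = num_hidden_layers                                                    # 24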
@@ -319,7 +321,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
 
 
 if has_llava_projector:
-    model.vision_model.encoder.layers.pop(-1)
+    # model.vision_model.encoder.layers.pop(-1)
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
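The pop() commented out above is how the converter used to discard the last encoder block whenever a llava projector was present. Below is a sketch of the idea this commit moves toward, keeping every block and selecting the wanted hidden state at extraction time; the checkpoint name and dummy input are placeholders, and only the standard transformers API is assumed:

import torch
from transformers import CLIPVisionModel

model = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
pixel_values = torch.zeros(1, 3, 224, 224)  # dummy image batch

with torch.no_grad():
    out = model(pixel_values=pixel_values, output_hidden_states=True)

# hidden_states[0] is the embedding output and hidden_states[-1] the last block,
# so the penultimate (-2) and final (-1) features both stay available.
penultimate_features = out.hidden_states[-2]
last_features = out.hidden_states[-1]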