Add early work-in-progress (WIP) scripts for multimodal Granite GGUF

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-01-16 14:54:31 -07:00 · 2025-01-16 14:54:31 -07:00 · 6ccf234031
commit 6ccf234031
parent d774ab3acc
4 changed files with 119 additions and 20 deletions
--- a/examples/llava/llava_surgery_v2.py
+++ b/examples/llava/llava_surgery_v2.py
@@ -40,7 +40,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
    # file_type = 'pytorch'
    model_path = os.path.dirname(checkpoint_path)
    print(f"Searching for vision tower tensors in {checkpoint_path}")
-    clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))]
+    clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit.") or k.startswith("vision_tower"))]

    if len(clip_tensors) > 0:
        print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
@@ -85,10 +85,10 @@ def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
    return newline_checkpoint_path, projector_checkpoint_path

 def newline_criteria(checkpoint):
-    return any(k.startswith("model.image_newline") for k in checkpoint.keys())
+    return any(k.startswith("model.image_newline") or k.startswith("image_newline") for k in checkpoint.keys())

 def proj_criteria(checkpoint):
-    return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
+    return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") or k.startswith("multi_modal_projector") for k in checkpoint.keys())


 # Command-line interface setup
@@ -123,14 +123,14 @@ first_checkpoint = None
 if newline_checkpoint_path is not None:
    print(f"Taking newline from {newline_checkpoint_path}")
    first_checkpoint, file_type = load_model(newline_checkpoint_path)
-    first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
+    first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline") or k.startswith("image_newline")]

 # Load the checkpoint
 mm_tensors = []
 last_checkpoint = None
 if projector_checkpoint_path is not None:
    last_checkpoint, file_type = load_model(projector_checkpoint_path)
-    mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
+    mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.") or k.startswith("multi_modal_projector")]

 if len(mm_tensors) == 0:
    if last_checkpoint is not None:
@@ -146,14 +146,24 @@ print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
 projector = {}
 for name in mm_tensors:
    assert last_checkpoint is not None
-    projector[name] = last_checkpoint[name].float()
+    # HACK - this should probably be in the second script...
+    new_name = name
+    if new_name.startswith("multi_modal_projector.linear_1"):
+        new_name = new_name.replace("multi_modal_projector.linear_1", "mm.0")
+    elif new_name.startswith("multi_modal_projector.linear_2"):
+        new_name = new_name.replace("multi_modal_projector.linear_2", "mm.2")
+    projector[new_name] = last_checkpoint[name].float()
 for name in first_mm_tensors:
    assert first_checkpoint is not None
-    projector[name] = first_checkpoint[name].float()
+    # HACK - this should probably be in the second script too...
+    new_name = name
+    if new_name == "image_newline":
+        new_name = "model.image_newline"
+    projector[new_name] = first_checkpoint[name].float()

 if len(projector) > 0:
    save_model(projector, f"{args.model}/llava.projector", 'pytorch')

 print("Done!")
-print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")