Update convert-image-encoder-to-gguf.py

will now search for projector
2024-02-02 01:48:14 +01:00 · 2024-02-02 01:48:14 +01:00 · a27b9a45df
commit a27b9a45df
parent 1f9367c134
1 changed files with 131 additions and 299 deletions
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@ -1,327 +1,159 @@
 import argparse
 import glob
 import os
 import json
 import torch
-import numpy as np
+from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
 from gguf import *
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
 TEXT = "clip.text"
 VISION = "clip.vision"
-def k(raw_key: str, arch: str) -> str:
+# Function to determine if file is a SafeTensor file
-    return raw_key.format(arch=arch)
+def is_safetensor_file(file_path):
    return file_path.endswith('.safetensors')
-def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+# Unified loading function
-    if name in (
+def load_model(file_path):
-        "logit_scale",
+    if is_safetensor_file(file_path):
-        "text_model.embeddings.position_ids",
+        # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor'
-        "vision_model.embeddings.position_ids",
+        tensors = {}
-    ):
+        with safe_open(file_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                tensors[key] = f.get_tensor(key).clone()
        return tensors, 'safetensor'
    else:
        return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
 # Unified saving function
 def save_model(model, file_path, file_type):
    if file_type == 'safetensor':
        # safe_save(model, file_path)
        save_file(model, file_path)
    else:
        torch.save(model, file_path)
 # Adapted function to clean vision tower from checkpoint
 def clean_vision_tower_from_checkpoint(checkpoint_path):
    checkpoint, file_type = load_model(checkpoint_path)
    # file_type = 'pytorch'
    model_path = os.path.dirname(checkpoint_path)
    print(f"Searching for vision tower tensors in {checkpoint_path}")
    clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ]
    if len(clip_tensors) > 0:
        print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
        # Adapted for file type
        clip_path = os.path.join(model_path, "llava.clip")
        if os.path.exists(clip_path):
            existing_clip, _ = load_model(clip_path)
        else:
            existing_clip = {}
        # Update existing_clip with new tensors, avoid duplicates
        for name in clip_tensors:
            simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
            print(f"Adding {simple_name} to llava.clip")
            if simple_name not in existing_clip:
                existing_clip[simple_name] = checkpoint[name]
        # Save the updated clip tensors back to llava.clip
        save_model(existing_clip, clip_path, 'pytorch')
        # Remove the tensors from the original checkpoint
        for name in clip_tensors:
            del checkpoint[name]
        # Save the updated checkpoint
        checkpoint_path = checkpoint_path
        save_model(checkpoint, checkpoint_path, file_type)
        return True
    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
        return True
    if name.startswith("v") and not has_vision:
        return True
    if name.startswith("t") and not has_text:
        return True
    return False
 def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
    newline_checkpoint_path = None
    projector_checkpoint_path = None
-def get_tensor_name(name: str) -> str:
+    for path in checkpoint_paths:
-    if "projection" in name:
+        checkpoint, _ = load_model(path)
-        return name
+        if newline_criteria(checkpoint) and newline_checkpoint_path is None:
            newline_checkpoint_path = path
        if projector(checkpoint):
            projector_checkpoint_path = path
-    if "mm_projector" in name:
+    return newline_checkpoint_path, projector_checkpoint_path
        return name.replace("model.mm_projector", "mm")
-    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+def newline_criteria(checkpoint):
    return any(k.startswith("model.image_newline") for k in checkpoint.keys())
 def proj_criteria(checkpoint):
    return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
-def bytes_to_unicode():
+# Command-line interface setup
-    """
+ap = argparse.ArgumentParser()
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
+ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model")
-    The reversible bpe codes work on unicode strings.
+ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
 ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
 ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
 ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
 ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--clip_model_is_openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
 default_image_mean = [0.48145466, 0.4578275, 0.40821073]
 default_image_std = [0.26862954, 0.26130258, 0.27577711]
 ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
 # with proper
 args = ap.parse_args()
 if args.clean_vision_tower:
    # Generalized to handle both PyTorch and SafeTensors models
    model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
    # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
    checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
    for projector_checkpoint_path in checkpoint_paths:
        print(f"Cleaning {projector_checkpoint_path}")
        if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
            print(f"No vision tower found in {projector_checkpoint_path}")
            # we break once none is found, so far all models append them at the end
            break
    print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.")
-if args.text_only and args.vision_only:
+# Now we look for the projector in the last checkpoint
-    print("--text-only and --image-only arguments cannot be specified at the same time.")
+model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
-    exit(1)
+checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
 # last_checkpoint_path = checkpoint_paths[0]
 # first_checkpoint_path = checkpoint_paths[-1]
 newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
-if args.use_f32:
+print(f"Taking projector from {projector_checkpoint_path}")
-    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+print(f"Taking newline from {newline_checkpoint_path}")
-# output in the same directory as the model if output_dir is None
+# Load the checkpoint
-dir_model = args.model_dir
+first_checkpoint, file_type = load_model(newline_checkpoint_path)
-
+last_checkpoint, file_type = load_model(projector_checkpoint_path)
-if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
-    vocab = None
+first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
    tokens = None
 else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = [key for key in vocab]
 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
    if args.clip_model_is_vision:
        v_hparams = config
        t_hparams = None
    else:
        v_hparams = config["vision_config"]
        t_hparams = config["text_config"]
 # possible data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
 #
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 ftype = 1
 if args.use_f32:
    ftype = 0
 if args.clip_model_is_vision or args.clip_model_is_openclip:
    model = CLIPVisionModel.from_pretrained(dir_model)
    processor = None
 else:
    model = CLIPModel.from_pretrained(dir_model)
    processor = CLIPProcessor.from_pretrained(dir_model)
 fname_middle = None
 has_text_encoder = True
 has_vision_encoder = True
 has_llava_projector = False
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
 elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
 elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
 else:
    fname_middle = ""
 output_dir = args.output_dir if args.output_dir is not None else dir_model
 os.makedirs(output_dir, exist_ok=True)
 output_prefix = os.path.basename(output_dir).replace("ggml_", "")
 fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
 fout = GGUFWriter(path=fname_out, arch="clip")
 fout.add_bool("clip.has_text_encoder", has_text_encoder)
 fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
 fout.add_bool("clip.has_llava_projector", has_llava_projector)
 fout.add_file_type(ftype)
 model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
 fout.add_name(model_name)
 if args.text_only:
    fout.add_description("text-only CLIP model")
 elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
    # add projector type
    fout.add_string("clip.projector_type", args.projector_type)
 else:
    fout.add_description("two-tower CLIP model")
 if has_text_encoder:
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
    fout.add_token_list(tokens)
 if has_vision_encoder:
    # vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
                            #     /**
                            #      "image_grid_pinpoints": [
                            #         [
                            #         336,
                            #         672
                            #         ],
                            #         [
                            #         672,
                            #         336
                            #         ],
                            #         [
                            #         672,
                            #         672
                            #         ],
                            #         [
                            #         1008,
                            #         336
                            #         ],
                            #         [
                            #         336,
                            #         1008
                            #         ]
                            #     ],
                            #     Flattened:
                            #     [
                            #         336, 672,
                            #         672, 336,
                            #         672, 672,
                            #         1008, 336,
                            #         336, 1008
                            #     ]
                            #  * 
                            #  */
    if "image_grid_pinpoints" in v_hparams:
        # flatten it
        image_grid_pinpoints = []
        for pinpoint in v_hparams["image_grid_pinpoints"]:
            image_grid_pinpoints.extend(pinpoint)
        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
    if "image_crop_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
    if "image_aspect_ratio" in v_hparams:
        fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
    if "image_split_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
    if "mm_patch_merge_type" in v_hparams:
        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
    if "mm_projector_type" in v_hparams:
        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
    else:
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)
-use_gelu = v_hparams["hidden_act"] == "gelu"
+if len(mm_tensors) == 0:
-fout.add_bool("clip.use_gelu", use_gelu)
+    for k, v in last_checkpoint.items():
        print(k)
    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
    print("No tensors found. Is this a LLaVA model?")
    exit()
 print(f"Found {len(mm_tensors)} tensors to extract.")
 print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
 # projector = {name: checkpoint.[name].float() for name in mm_tensors}
 projector = {}
 for name in mm_tensors:
    projector[name] = last_checkpoint[name].float()
 for name in first_mm_tensors:
    projector[name] = first_checkpoint[name].float()
-if has_llava_projector:
+save_model(projector, f"{args.model}/llava.projector", 'pytorch')
    model.vision_model.encoder.layers.pop(-1)
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
        # pw and dw conv ndim==4
        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)
-        fout.add_tensor(name, data)
+for name in mm_tensors:
    del last_checkpoint[name]
 for name in first_mm_tensors:
    del first_checkpoint[name]
-    print("Projector tensors added\n")
+if len(mm_tensors) > 0:
    save_model(last_checkpoint, projector_checkpoint_path, file_type)
 if len(first_mm_tensors) > 0:
    save_model(first_checkpoint, newline_checkpoint_path, file_type)
-state_dict = model.state_dict()
+print("Done!")
-for name, data in state_dict.items():
+print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
-    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
        # we don't need this
        print(f"skipping parameter: {name}")
        continue
    name = get_tensor_name(name)
    data = data.squeeze().numpy()
    n_dims = len(data.shape)
    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)
 fout.write_header_to_file()
 fout.write_kv_data_to_file()
 fout.write_tensors_to_file()
 fout.close()
 print("Done. Output file: " + fname_out)