Update convert-image-encoder-to-gguf.py

whoops
2024-02-02 02:07:29 +01:00 · 2024-02-02 02:07:29 +01:00 · 440b2ae2b1
commit 440b2ae2b1
parent a27b9a45df
1 changed file with 299 additions and 131 deletions
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -1,159 +1,327 @@
 import argparse
 import glob
 import os
 import json
 import torch
-from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
+import numpy as np
 from gguf import *
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
- 
+TEXT = "clip.text"
-# Function to determine if file is a SafeTensor file
+VISION = "clip.vision"
 def is_safetensor_file(file_path):
    return file_path.endswith('.safetensors')
-# Unified loading function
+def k(raw_key: str, arch: str) -> str:
-def load_model(file_path):
+    return raw_key.format(arch=arch)
    if is_safetensor_file(file_path):
        # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor'
        tensors = {}
        with safe_open(file_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                tensors[key] = f.get_tensor(key).clone()
        return tensors, 'safetensor'
    else:
        return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
-# Unified saving function
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
-def save_model(model, file_path, file_type):
+    if name in (
-    if file_type == 'safetensor':
+        "logit_scale",
-        # safe_save(model, file_path)
+        "text_model.embeddings.position_ids",
-        save_file(model, file_path)
+        "vision_model.embeddings.position_ids",
-    else:
+    ):
        torch.save(model, file_path)
 # Adapted function to clean vision tower from checkpoint
 def clean_vision_tower_from_checkpoint(checkpoint_path):
    checkpoint, file_type = load_model(checkpoint_path)
    # file_type = 'pytorch'
    model_path = os.path.dirname(checkpoint_path)
    print(f"Searching for vision tower tensors in {checkpoint_path}")
    clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ]
    if len(clip_tensors) > 0:
        print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
        # Adapted for file type
        clip_path = os.path.join(model_path, "llava.clip")
        if os.path.exists(clip_path):
            existing_clip, _ = load_model(clip_path)
        else:
            existing_clip = {}
        # Update existing_clip with new tensors, avoid duplicates
        for name in clip_tensors:
            simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
            print(f"Adding {simple_name} to llava.clip")
            if simple_name not in existing_clip:
                existing_clip[simple_name] = checkpoint[name]
        # Save the updated clip tensors back to llava.clip
        save_model(existing_clip, clip_path, 'pytorch')
        # Remove the tensors from the original checkpoint
        for name in clip_tensors:
            del checkpoint[name]
        # Save the updated checkpoint
        checkpoint_path = checkpoint_path
        save_model(checkpoint, checkpoint_path, file_type)
        return True
    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
        return True
    if name.startswith("v") and not has_vision:
        return True
    if name.startswith("t") and not has_text:
        return True
    return False
 def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
    newline_checkpoint_path = None
    projector_checkpoint_path = None
-    for path in checkpoint_paths:
+def get_tensor_name(name: str) -> str:
-        checkpoint, _ = load_model(path)
+    if "projection" in name:
-        if newline_criteria(checkpoint) and newline_checkpoint_path is None:
+        return name
            newline_checkpoint_path = path
        if projector(checkpoint):
            projector_checkpoint_path = path
-    return newline_checkpoint_path, projector_checkpoint_path
+    if "mm_projector" in name:
        return name.replace("model.mm_projector", "mm")
-def newline_criteria(checkpoint):
+    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
    return any(k.startswith("model.image_newline") for k in checkpoint.keys())
 def proj_criteria(checkpoint):
    return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
-# Command-line interface setup
+def bytes_to_unicode():
-ap = argparse.ArgumentParser()
+    """
-ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model")
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
-ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
+    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
 ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
 ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
 ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
 ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--clip_model_is_openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
 default_image_mean = [0.48145466, 0.4578275, 0.40821073]
 default_image_std = [0.26862954, 0.26130258, 0.27577711]
 ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
 # with proper
 args = ap.parse_args()
 if args.clean_vision_tower:
    # Generalized to handle both PyTorch and SafeTensors models
    model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
    # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
    checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
    for projector_checkpoint_path in checkpoint_paths:
        print(f"Cleaning {projector_checkpoint_path}")
        if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
            print(f"No vision tower found in {projector_checkpoint_path}")
            # we break once none is found, so far all models append them at the end
            break
    print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.")
-# Now we look for the projector in the last checkpoint
+if args.text_only and args.vision_only:
-model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
+    print("--text-only and --image-only arguments cannot be specified at the same time.")
-checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
+    exit(1)
 # last_checkpoint_path = checkpoint_paths[0]
 # first_checkpoint_path = checkpoint_paths[-1]
 newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
-print(f"Taking projector from {projector_checkpoint_path}")
+if args.use_f32:
-print(f"Taking newline from {newline_checkpoint_path}")
+    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
-# Load the checkpoint
+# output in the same directory as the model if output_dir is None
-first_checkpoint, file_type = load_model(newline_checkpoint_path)
+dir_model = args.model_dir
-last_checkpoint, file_type = load_model(projector_checkpoint_path)
+
-mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
-first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
+    vocab = None
    tokens = None
 else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = [key for key in vocab]
 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
    if args.clip_model_is_vision:
        v_hparams = config
        t_hparams = None
    else:
        v_hparams = config["vision_config"]
        t_hparams = config["text_config"]
 # possible data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
 #
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 ftype = 1
 if args.use_f32:
    ftype = 0
 if args.clip_model_is_vision or args.clip_model_is_openclip:
    model = CLIPVisionModel.from_pretrained(dir_model)
    processor = None
 else:
    model = CLIPModel.from_pretrained(dir_model)
    processor = CLIPProcessor.from_pretrained(dir_model)
 fname_middle = None
 has_text_encoder = True
 has_vision_encoder = True
 has_llava_projector = False
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
 elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
 elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
 else:
    fname_middle = ""
 output_dir = args.output_dir if args.output_dir is not None else dir_model
 os.makedirs(output_dir, exist_ok=True)
 output_prefix = os.path.basename(output_dir).replace("ggml_", "")
 fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
 fout = GGUFWriter(path=fname_out, arch="clip")
 fout.add_bool("clip.has_text_encoder", has_text_encoder)
 fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
 fout.add_bool("clip.has_llava_projector", has_llava_projector)
 fout.add_file_type(ftype)
 model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
 fout.add_name(model_name)
 if args.text_only:
    fout.add_description("text-only CLIP model")
 elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
    # add projector type
    fout.add_string("clip.projector_type", args.projector_type)
 else:
    fout.add_description("two-tower CLIP model")
 if has_text_encoder:
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
    fout.add_token_list(tokens)
 if has_vision_encoder:
    # vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
                            #     /**
                            #      "image_grid_pinpoints": [
                            #         [
                            #         336,
                            #         672
                            #         ],
                            #         [
                            #         672,
                            #         336
                            #         ],
                            #         [
                            #         672,
                            #         672
                            #         ],
                            #         [
                            #         1008,
                            #         336
                            #         ],
                            #         [
                            #         336,
                            #         1008
                            #         ]
                            #     ],
                            #     Flattened:
                            #     [
                            #         336, 672,
                            #         672, 336,
                            #         672, 672,
                            #         1008, 336,
                            #         336, 1008
                            #     ]
                            #  * 
                            #  */
    if "image_grid_pinpoints" in v_hparams:
        # flatten it
        image_grid_pinpoints = []
        for pinpoint in v_hparams["image_grid_pinpoints"]:
            image_grid_pinpoints.extend(pinpoint)
        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
    if "image_crop_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
    if "image_aspect_ratio" in v_hparams:
        fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
    if "image_split_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
    if "mm_patch_merge_type" in v_hparams:
        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
    if "mm_projector_type" in v_hparams:
        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
    else:
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)
-if len(mm_tensors) == 0:
+use_gelu = v_hparams["hidden_act"] == "gelu"
-    for k, v in last_checkpoint.items():
+fout.add_bool("clip.use_gelu", use_gelu)
        print(k)
    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
    print("No tensors found. Is this a LLaVA model?")
    exit()
 print(f"Found {len(mm_tensors)} tensors to extract.")
 print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
 # projector = {name: checkpoint.[name].float() for name in mm_tensors}
 projector = {}
 for name in mm_tensors:
    projector[name] = last_checkpoint[name].float()
 for name in first_mm_tensors:
    projector[name] = first_checkpoint[name].float()
 save_model(projector, f"{args.model}/llava.projector", 'pytorch')
-for name in mm_tensors:
+if has_llava_projector:
-    del last_checkpoint[name]
+    model.vision_model.encoder.layers.pop(-1)
-for name in first_mm_tensors:
+    projector = torch.load(args.llava_projector)
-    del first_checkpoint[name]
+    for name, data in projector.items():
        name = get_tensor_name(name)
        # pw and dw conv ndim==4
        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)
-if len(mm_tensors) > 0:
+        fout.add_tensor(name, data)
    save_model(last_checkpoint, projector_checkpoint_path, file_type)
 if len(first_mm_tensors) > 0:
    save_model(first_checkpoint, newline_checkpoint_path, file_type)
-print("Done!")
+    print("Projector tensors added\n")
-print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
+
-print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
+state_dict = model.state_dict()
 for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue
    name = get_tensor_name(name)
    data = data.squeeze().numpy()
    n_dims = len(data.shape)
    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)
 fout.write_header_to_file()
 fout.write_kv_data_to_file()
 fout.write_tensors_to_file()
 fout.close()
 print("Done. Output file: " + fname_out)