From 440b2ae2b1cdaa53b5b546c569d658bd8cecfa6a Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Fri, 2 Feb 2024 02:07:29 +0100
Subject: [PATCH] Update convert-image-encoder-to-gguf.py

whoops
---
 .../llava/convert-image-encoder-to-gguf.py | 430 ++++++++++++------
 1 file changed, 299 insertions(+), 131 deletions(-)

diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index a65b05f8a..115b6b35b 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -1,159 +1,327 @@
 import argparse
-import glob
 import os
+import json
+
 import torch
-from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
+import numpy as np
+from gguf import *
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
 
-
-# Function to determine if file is a SafeTensor file
-def is_safetensor_file(file_path):
-    return file_path.endswith('.safetensors')
+TEXT = "clip.text"
+VISION = "clip.vision"
 
 
-# Unified loading function
-def load_model(file_path):
-    if is_safetensor_file(file_path):
-        # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor'
-        tensors = {}
-        with safe_open(file_path, framework="pt", device="cpu") as f:
-            for key in f.keys():
-                tensors[key] = f.get_tensor(key).clone()
-        return tensors, 'safetensor'
-    else:
-        return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
 
-# Unified saving function
-def save_model(model, file_path, file_type):
-    if file_type == 'safetensor':
-        # safe_save(model, file_path)
-        save_file(model, file_path)
-    else:
-        torch.save(model, file_path)
-
-
-# Adapted function to clean vision tower from checkpoint
-def clean_vision_tower_from_checkpoint(checkpoint_path):
-    checkpoint, file_type = load_model(checkpoint_path)
-    # file_type = 'pytorch'
-    model_path = os.path.dirname(checkpoint_path)
-    print(f"Searching for vision tower tensors in {checkpoint_path}")
-    clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ]
-
-    if len(clip_tensors) > 0:
-        print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
-        # Adapted for file type
-        clip_path = os.path.join(model_path, "llava.clip")
-
-        if os.path.exists(clip_path):
-            existing_clip, _ = load_model(clip_path)
-        else:
-            existing_clip = {}
-        # Update existing_clip with new tensors, avoid duplicates
-        for name in clip_tensors:
-            simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
-            print(f"Adding {simple_name} to llava.clip")
-            if simple_name not in existing_clip:
-                existing_clip[simple_name] = checkpoint[name]
-
-        # Save the updated clip tensors back to llava.clip
-        save_model(existing_clip, clip_path, 'pytorch')
-
-        # Remove the tensors from the original checkpoint
-        for name in clip_tensors:
-            del checkpoint[name]
-
-        # Save the updated checkpoint
-        checkpoint_path = checkpoint_path
-        save_model(checkpoint, checkpoint_path, file_type)
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
         return True
+
+    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
 
     return False
 
-def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
-    newline_checkpoint_path = None
-    projector_checkpoint_path = None
-    for path in checkpoint_paths:
-        checkpoint, _ = load_model(path)
-        if newline_criteria(checkpoint) and newline_checkpoint_path is None:
-            newline_checkpoint_path = path
-        if projector(checkpoint):
-            projector_checkpoint_path = path
+def get_tensor_name(name: str) -> str:
+    if "projection" in name:
+        return name
 
-    return newline_checkpoint_path, projector_checkpoint_path
+    if "mm_projector" in name:
+        return name.replace("model.mm_projector", "mm")
 
-def newline_criteria(checkpoint):
-    return any(k.startswith("model.image_newline") for k in checkpoint.keys())
-
-def proj_criteria(checkpoint):
-    return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
+    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
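+
+# For example, get_tensor_name() maps:
+#   "vision_model.encoder.layers.0.self_attn.k_proj.weight" -> "v.blk.0.attn_k.weight"
+#   "text_model.final_layer_norm.weight"                    -> "t.post_ln.weight"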
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip_model_is_vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper args = ap.parse_args() -if args.clean_vision_tower: - # Generalized to handle both PyTorch and SafeTensors models - model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) - # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] - checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] - for projector_checkpoint_path in checkpoint_paths: - print(f"Cleaning {projector_checkpoint_path}") - if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): - print(f"No vision tower found in {projector_checkpoint_path}") - # we break once none is found, so far all models append them at the end - break - print("Done! 
 
-# Now we look for the projector in the last checkpoint
-model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
-checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
-# last_checkpoint_path = checkpoint_paths[0]
-# first_checkpoint_path = checkpoint_paths[-1]
-newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
+if args.text_only and args.vision_only:
+    print("--text-only and --vision-only arguments cannot be specified at the same time.")
+    exit(1)
 
-print(f"Taking projector from {projector_checkpoint_path}")
-print(f"Taking newline from {newline_checkpoint_path}")
+if args.use_f32:
+    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
 
-# Load the checkpoint
-first_checkpoint, file_type = load_model(newline_checkpoint_path)
-last_checkpoint, file_type = load_model(projector_checkpoint_path)
-mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
-first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+    tokens = [key for key in vocab]
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    config = json.load(f)
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = config["text_config"]
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+    ftype = 0
+
+if args.clip_model_is_vision or args.clip_model_is_openclip:
+    model = CLIPVisionModel.from_pretrained(dir_model)
+    processor = None
+else:
+    model = CLIPModel.from_pretrained(dir_model)
+    processor = CLIPProcessor.from_pretrained(dir_model)
+
+fname_middle = None
+has_text_encoder = True
+has_vision_encoder = True
+has_llava_projector = False
+if args.text_only:
+    fname_middle = "text-"
+    has_vision_encoder = False
+elif args.llava_projector is not None:
+    fname_middle = "mmproj-"
+    has_text_encoder = False
+    has_llava_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
+else:
+    fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_llava_projector", has_llava_projector)
+fout.add_file_type(ftype)
+model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
+fout.add_name(model_name)
+if args.text_only:
+    fout.add_description("text-only CLIP model")
+elif args.vision_only and not has_llava_projector:
+    fout.add_description("vision-only CLIP model")
+elif has_llava_projector:
+    fout.add_description("image encoder for LLaVA")
+    # add projector type
+    fout.add_string("clip.projector_type", args.projector_type)
+else:
+    fout.add_description("two-tower CLIP model")
+
+if has_text_encoder:
+    # text_model hparams
+    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
+    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
+    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
+    fout.add_token_list(tokens)
+
+if has_vision_encoder:
+    # vision_model hparams
+    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
+    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
+    fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
+    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
+    # "image_grid_pinpoints" is a list of [width, height] pairs, e.g.
+    #     [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
+    # and it is stored flattened:
+    #     [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008]
+    if "image_grid_pinpoints" in v_hparams:
+        # flatten it
+        image_grid_pinpoints = []
+        for pinpoint in v_hparams["image_grid_pinpoints"]:
+            image_grid_pinpoints.extend(pinpoint)
+        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
+    if "image_crop_resolution" in v_hparams:
+        fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
+    if "image_aspect_ratio" in v_hparams:
+        fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
+    if "image_split_resolution" in v_hparams:
+        fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
+    if "mm_patch_merge_type" in v_hparams:
+        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
+    if "mm_projector_type" in v_hparams:
+        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
+
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
 
-if len(mm_tensors) == 0:
-    for k, v in last_checkpoint.items():
-        print(k)
-    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
-    print("No tensors found. Is this a LLaVA model?")
-    exit()
+use_gelu = v_hparams["hidden_act"] == "gelu"
+fout.add_bool("clip.use_gelu", use_gelu)
 
-print(f"Found {len(mm_tensors)} tensors to extract.")
-print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
-# projector = {name: checkpoint.[name].float() for name in mm_tensors}
-projector = {}
-for name in mm_tensors:
-    projector[name] = last_checkpoint[name].float()
-for name in first_mm_tensors:
-    projector[name] = first_checkpoint[name].float()
-
-save_model(projector, f"{args.model}/llava.projector", 'pytorch')
-for name in mm_tensors:
-    del last_checkpoint[name]
-for name in first_mm_tensors:
-    del first_checkpoint[name]
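+# Note: LLaVA takes its image features from the second-to-last layer of the
+# vision tower, so the final vision layer is dropped below (this matches
+# block_count = num_hidden_layers - 1 above).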
+if has_llava_projector:
+    model.vision_model.encoder.layers.pop(-1)
+    projector = torch.load(args.llava_projector)
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
+            data = data.squeeze().numpy().astype(np.float16)
+        else:
+            data = data.squeeze().numpy().astype(np.float32)
 
-if len(mm_tensors) > 0:
-    save_model(last_checkpoint, projector_checkpoint_path, file_type)
-if len(first_mm_tensors) > 0:
-    save_model(first_checkpoint, newline_checkpoint_path, file_type)
+        fout.add_tensor(name, data)
 
-print("Done!")
-print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
-print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
+    print("Projector tensors added\n")
 
+state_dict = model.state_dict()
+for name, data in state_dict.items():
+    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
+        # we don't need this
+        print(f"skipping parameter: {name}")
+        continue
+
+    name = get_tensor_name(name)
+    data = data.squeeze().numpy()
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        print(f"tensor {name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if name[-7:] == ".weight" and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
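
For a quick sanity check of the converter's output, the gguf-py package bundled with llama.cpp can read the file back. A minimal sketch, assuming a LLaVA projector conversion whose output file is named mmproj-model-f16.gguf (the name the script builds from fname_middle and ftype_str; adjust the path to your run):

    # inspect the GGUF written above: metadata keys and tensor names/shapes
    from gguf import GGUFReader

    reader = GGUFReader("mmproj-model-f16.gguf")  # hypothetical output path

    for field_name in reader.fields:  # keys written via fout.add_*
        print("kv:", field_name)

    for tensor in reader.tensors:     # e.g. "mm.0.weight", "v.blk.0.attn_k.weight"
        print("tensor:", tensor.name, tuple(tensor.shape))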