From bd2d24aa0dd3f12843a7d69f3ab1117d44f085dd Mon Sep 17 00:00:00 2001
From: Yutong Dai
Date: Wed, 24 Jul 2024 07:12:58 +0000
Subject: [PATCH] halfway

---
 .../xgenmm-surgery copy.py}                   |   8 +-
 .../xgenmm_convert_image_encoder_to_gguf.py   | 214 ++++++++++++++++++
 examples/xgenmm/xgenmm_surgery.py             |  97 ++++++++
 3 files changed, 316 insertions(+), 3 deletions(-)
 rename examples/xgenmm/{xgenmm-surgery.py => bak/xgenmm-surgery copy.py} (93%)
 create mode 100644 examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
 create mode 100644 examples/xgenmm/xgenmm_surgery.py

diff --git a/examples/xgenmm/xgenmm-surgery.py b/examples/xgenmm/bak/xgenmm-surgery copy.py
similarity index 93%
rename from examples/xgenmm/xgenmm-surgery.py
rename to examples/xgenmm/bak/xgenmm-surgery copy.py
index 951cc7bc0..e1569473b 100644
--- a/examples/xgenmm/xgenmm-surgery.py
+++ b/examples/xgenmm/bak/xgenmm-surgery copy.py
@@ -44,7 +44,7 @@ if __name__ == "__main__":
     if not os.path.exists(save_dir):
         os.makedirs(save_dir)
     # get a list of vl connector keys
-    projector_tensors = {v.float(): v for k, v in ckpt.items() if k.startswith(PROJECTOR)}
+    projector_tensors = {k: v.float() for k, v in ckpt.items() if k.startswith(PROJECTOR)}
     print("🟡 Saving projector ckpt...")
     save_path = f"{save_dir}/xgenmm.projector"
     start = time.time()
@@ -53,9 +53,9 @@ if __name__ == "__main__":
     print(f"🟢 time used: [{end-start:.3f} s] | Save projector ckpt at: {save_path}")

     # here we use the siglip vision encoder
-    vision_encoder_tensors = {v.float(): v for k, v in ckpt.items() if k.startswith(VISION_ENCODER_KEY)}
+    vision_encoder_tensors = {k: v.float() for k, v in ckpt.items() if k.startswith(VISION_ENCODER_KEY)}
     print("🟡 Saving vision encoder ckpt...")
-    save_path = f"{save_dir}/xgenmm.clip"
+    save_path = f"{save_dir}/xgenmm.vision_encoder"
     start = time.time()
     torch.save(vision_encoder_tensors, save_path)
     end = time.time()
@@ -95,5 +95,7 @@ if __name__ == "__main__":
     start = time.time()
     llm = model.lang_model.save_pretrained(f"{save_dir}/model")
     tokenizer.save_pretrained(f"{save_dir}/model")
+    vision_encoder_config = model.vision_encoder.config
+    vision_encoder_config.save_pretrained(f"{save_dir}/vit_config")
     end = time.time()
     print(f"🟢 time used: [{end-start:.3f} s] | Save LLM ckpt at: {save_dir}/model")
\ No newline at end of file
diff --git a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
new file mode 100644
index 000000000..9e5889f21
--- /dev/null
+++ b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
@@ -0,0 +1,214 @@
+import os
+import re
+import torch
+import argparse
+import json
+import numpy as np
+import time
+
+from gguf import *
+from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer, SiglipVisionConfig
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if has_minicpmv and name in ["visual_projection.weight"]:
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    if "projection" in name:
+        return name
+    if "mm_projector" in name:
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
+
+    return (name.replace("text_model", "t").replace("vision_model", "v")
+                .replace("encoder.layers", "blk").replace("embeddings.", "")
+                .replace("_proj", "").replace("self_attn.", "attn_")
+                .replace("layer_norm", "ln").replace("layernorm", "ln")
+                .replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up")
+                .replace("embedding", "embd").replace("final", "post")
+                .replace("layrnorm", "ln"))
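+# A quick illustration of the renaming chain above (hypothetical tensor name,
+# shown for documentation only):
+#   get_tensor_name("vision_model.encoder.layers.0.self_attn.q_proj.weight")
+#   -> "v.blk.0.attn_q.weight"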
+
+
+def bytes_to_unicode():
+    """
+    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+class print_time():
+    def __init__(self, task):
+        self.task = task
+
+    def __enter__(self):
+        print(f"🟡 {self.task}")
+        self.t = time.time()
+
+    def __exit__(self, type, value, traceback):
+        print(f'🟢 time used: [{time.time() - self.t:.03f}] secs')
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--surgery_dir", type=str, default='/export/share/yutong/xgenmm/llamacpp_wd')
+    parser.add_argument('--version', type=str, default='siglip_kosmos_phi3_4k_instruct',
+                        help='help identify the version of the saved ckpt')
+    # options kept from the llama.cpp project
+    parser.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+    parser.add_argument("--text-only", action="store_true", required=False,
+                        help="Save a text-only model. It can't be used to encode images")
+    parser.add_argument("--vision-only", action="store_true", required=False,
+                        help="Save a vision-only model. It can't be used to encode texts")
+    parser.add_argument("--xgenmm-projector",
+                        help="Path to xgenmm.projector file. If specified, save an image encoder for XgenMM models.")
+
+    return parser.parse_args()
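+
+# Expected invocation (paths are placeholders; the defaults above point at the
+# author's environment):
+#   python xgenmm_convert_image_encoder_to_gguf.py \
+#       --surgery_dir /path/to/llamacpp_wd \
+#       --version siglip_kosmos_phi3_4k_instruct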
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    if args.text_only and args.vision_only:
+        print("--text-only and --vision-only arguments cannot be specified at the same time.")
+        exit(1)
+
+    if args.use_f32:
+        print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+    # possible data types
+    #   ftype == 0 -> float32
+    #   ftype == 1 -> float16
+    #
+    # map from ftype to string
+    ftype_str = ["f32", "f16"]
+
+    ftype = 1
+    if args.use_f32:
+        ftype = 0
+
+    ckpt_dir = f"{args.surgery_dir}/{args.version}"
+    args.xgenmm_projector = f"{ckpt_dir}/xgenmm.projector"
+
+    with print_time("Loading vision encoder"):
+        vision_encoder_config_path = f"{ckpt_dir}/vision_encoder/config.json"
+        with open(vision_encoder_config_path, 'r') as f:
+            vision_config = json.load(f)
+        vision_encoder_config = SiglipVisionConfig(**vision_config)
+        # vision_encoder = SiglipVisionTransformer(vision_encoder_config)
+        # vision_encoder_ckpt = torch.load(f'{ckpt_dir}/vision_encoder/xgenmm.vision_encoder')
+        # vision_encoder.load_state_dict(vision_encoder_ckpt)
+
+    fname_middle = None
+    has_text_encoder = True
+    has_vision_encoder = True
+    has_xgenmm_projector = False
+    if args.text_only:
+        fname_middle = "text-"
+        has_vision_encoder = False
+    elif args.xgenmm_projector is not None:
+        fname_middle = "mmproj-"
+        has_text_encoder = False
+        has_xgenmm_projector = True
+    elif args.vision_only:
+        fname_middle = "vision-"
+        has_text_encoder = False
+    else:
+        fname_middle = ""
+
+    output_dir = f"{ckpt_dir}/gguf"
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+    fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+
+    fout = GGUFWriter(path=fname_out, arch="clip")
+    fout.add_bool("clip.has_text_encoder", has_text_encoder)
+    fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+    fout.add_bool("clip.has_xgenmm_projector", has_xgenmm_projector)
+    fout.add_file_type(ftype)
+
+    if args.text_only:
+        fout.add_description("text-only CLIP model")
+    elif args.vision_only and not has_xgenmm_projector:
+        fout.add_description("vision-only CLIP model")
+    elif has_xgenmm_projector:
+        fout.add_description("image encoder for XgenMM model")
+        # add projector type
+        fout.add_string("clip.projector_type", "PerceiverResampler")
+    else:
+        fout.add_description("two-tower CLIP model")
+
+    if has_vision_encoder:
+        # In the siglip config, we have the following keys:
+        #   used:   "image_size", "patch_size", "hidden_size", "intermediate_size",
+        #           "num_attention_heads", "layer_norm_eps", "num_hidden_layers", "hidden_act"
+        #   unused: "attention_dropout", "model_type", "num_channels"
+        fout.add_uint32("clip.vision.image_size", vision_config["image_size"])
+        fout.add_uint32("clip.vision.patch_size", vision_config["patch_size"])
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vision_config["hidden_size"])
+        fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), vision_config["intermediate_size"])
+        # TODO: need to check the value of projection_dim; follow minicpmv and set it to 0
+        fout.add_uint32("clip.vision.projection_dim", 0)
+        fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vision_config["num_attention_heads"])
+        fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), vision_config["layer_norm_eps"])
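+        # NOTE: dropping the final ViT block when a projector is present mirrors the
+        # llava converter; this assumes the projector consumes penultimate-layer features.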
vision_config["num_hidden_layers"] - 1 if has_xgenmm_projector else vision_config["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + + # xgenmm use anyres with grids configuration + # 1*2, 2*1, 2*2, 3*1, 1*3, the same as the llava1.6, we just hard code it here + image_grid_pinpoints = [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008] + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + + + image_mean = [0.5, 0.5, 0.5] + image_std = [0.5, 0.5, 0.5] + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + + # TODO: need to check; vision_config["hidden_act"] is gelu_pytorch_tanh + use_gelu = "gelu" in vision_config["hidden_act"].lower() + fout.add_bool("clip.use_gelu", use_gelu) + + fout.write_header_to_file() + fout.write_kv_data_to_file() + fout.write_tensors_to_file() + fout.close() + print("Done. Output file: " + fname_out) \ No newline at end of file diff --git a/examples/xgenmm/xgenmm_surgery.py b/examples/xgenmm/xgenmm_surgery.py new file mode 100644 index 000000000..59ba06c3e --- /dev/null +++ b/examples/xgenmm/xgenmm_surgery.py @@ -0,0 +1,97 @@ +import torch +import argparse +from open_flamingo import create_model_and_transforms +from omegaconf import OmegaConf +import os +import time + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--ckpt_pth", type=str, default='/export/share/manli_shu/models/open-flamingo-dev/anyres_ablation_HFSiglip_patch128-kosmos_non_instruct-phi3_4k_instruct_nq128_pre_V3_5-llava_1p6_ocrmathmix_v4-8x8-ckpt2/checkpoint_0.pt') + parser.add_argument('--save_pth', type=str, default='/export/share/yutong/xgenmm/llamacpp_wd') + parser.add_argument('--version', type=str, default='siglip_kosmos_phi3_4k_instruct', help='help identify the version of the saved ckpt') + return parser.parse_args() + +VISION_ENCODER_KEY = 'vision_encoder' +LLM_KEY = 'lang_model' +PROJECTOR = 'vision_tokenizer' + + +if __name__ == "__main__": + # load ckpt + args = get_args() + print("๐ŸŸก Loading ckpt...") + start = time.time() + ckpt = torch.load(args.ckpt_pth)["model_state_dict"] + end = time.time() + print(f"๐ŸŸข time used: [{end-start:.3f} s] | Done with loading ckpt") + + # sanity check + unexpected_component_keys = set() + for k in list(ckpt.keys()): + matched = False + for c in ['vision_encoder', 'lang_model', 'vision_tokenizer']: + if k.startswith(c): + matched = True + continue + if not matched: + unexpected_component_keys.add(k) + + if len(unexpected_component_keys) > 0: + print(f"โ—โ—โ— Unexpected component keys: {unexpected_component_keys}. 
+
+    save_dir = f"{args.save_pth}/{args.version}"
+
+    print("🟡 Instantiating the model...")
+    start = time.time()
+    cfg = dict(
+        model_family = 'kosmos',
+        lm_path = 'microsoft/Phi-3-mini-4k-instruct',
+        vision_encoder_path = 'google/siglip-so400m-patch14-384',
+        vision_encoder_pretrained = 'google',
+        num_vision_tokens = 128,
+        image_aspect_ratio = 'anyres',
+        anyres_patch_sampling = True,
+        anyres_grids = [[1,2],[2,1],[2,2],[3,1],[1,3]],
+        ckpt_pth = args.ckpt_pth)
+    cfg = OmegaConf.create(cfg)
+    if cfg.model_family in ['kosmos-instruct', 'kosmos', 'llava']:
+        additional_kwargs = {
+            "image_aspect_ratio": cfg.image_aspect_ratio,
+        }
+        if cfg.model_family in ['kosmos-instruct', 'kosmos']:
+            additional_kwargs.update({
+                "num_vision_tokens": cfg.num_vision_tokens,
+                "anyres_patch_sampling": cfg.anyres_patch_sampling,
+            })
+    model, image_processor, tokenizer = create_model_and_transforms(
+        clip_vision_encoder_path=cfg.vision_encoder_path,
+        clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
+        lang_model_path=cfg.lm_path,
+        tokenizer_path=cfg.lm_path,
+        model_family=cfg.model_family,
+        **additional_kwargs)
+    model.load_state_dict(ckpt, strict=True)
+    end = time.time()
+    print(f"🟢 time used: [{end-start:.3f} s] | Done with instantiating the model.")
+
+    print("🟡 Performing the surgery...")
+    start = time.time()
+
+    # LLM
+    model.lang_model.save_pretrained(f"{save_dir}/llm")
+
+    # vision encoder: config + float32 tensors, with the component prefix stripped
+    model.vision_encoder.config.save_pretrained(f"{save_dir}/vision_encoder")
+    vision_encoder_tensors = {k.split(VISION_ENCODER_KEY + '.')[-1]: v.float() for k, v in ckpt.items() if k.startswith(VISION_ENCODER_KEY)}
+    save_path = f"{save_dir}/vision_encoder/xgenmm.vision_encoder"
+    torch.save(vision_encoder_tensors, save_path)
+
+    # projector (vision tokenizer)
+    projector_tensors = {k.split(PROJECTOR + '.')[-1]: v.float() for k, v in ckpt.items() if k.startswith(PROJECTOR)}
+    save_path = f"{save_dir}/xgenmm.projector"
+    torch.save(projector_tensors, save_path)
+
+    # processors
+    tokenizer.save_pretrained(f"{save_dir}/tokenizer")
+    # the image_processor will be hard-coded in xgenmm_convert_image_encoder_to_gguf.py
+
+    end = time.time()
+    print(f"🟢 time used: [{end-start:.3f} s]")
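+
+# Resulting layout under {save_pth}/{version} (a recap of the save calls above):
+#   llm/                 HF weights for the language model
+#   vision_encoder/      SiglipVisionConfig (config.json) + xgenmm.vision_encoder tensors
+#   xgenmm.projector     perceiver-resampler (vision tokenizer) tensors
+#   tokenizer/           tokenizer files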