From 3667a0a4a3274b874b4a290c63693eb9d34e4311 Mon Sep 17 00:00:00 2001
From: ravenouse <85110830+ravenouse@users.noreply.github.com>
Date: Wed, 5 Feb 2025 20:42:35 +0000
Subject: [PATCH] Add example clip cli and enhance tensor name processing in
 Janus converter

---
 examples/llava/clip-cli.cpp                 | 118 ++++++++++++++++++
 .../llava/convert_janus_encoder_to_gguf.py  |  99 +++++++++------
 2 files changed, 182 insertions(+), 35 deletions(-)
 create mode 100644 examples/llava/clip-cli.cpp

diff --git a/examples/llava/clip-cli.cpp b/examples/llava/clip-cli.cpp
new file mode 100644
index 000000000..6f40c5116
--- /dev/null
+++ b/examples/llava/clip-cli.cpp
@@ -0,0 +1,118 @@
+//
+// Example usage of just the vision encoder (CLIP) part of the LLAVA codebase.
+// It loads a CLIP model (gguf file) and an image file,
+// computes the image embedding, and prints out (a few elements of) the embedding.
+//
+// Build and run (for example):
+//   ./bin/llama-clip-cli -c model.gguf -i input.png --threads 1 --verbosity 1
+//   ./bin/llama-clip-cli -c clip.gguf  -i input.png --threads 1 --verbosity 1
+//
+
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
+#include "common.h"
+#include "clip.h"
+#include "llava.h"
+#include "ggml.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <iostream>
+
+// Structure to hold our command line parameters.
+struct vision_params {
+    std::string clip_model; // Path to the CLIP model file (gguf)
+    std::string image_file; // Path to the image file to process
+    int n_threads = 1;      // Number of CPU threads to use
+    int verbosity = 1;      // Verbosity level for model loading
+};
+
+static void print_usage(const char * progname) {
+    LOG("\nUsage: %s -c <clip_model.gguf> -i <image_file> [--threads <n>] [--verbosity <level>]\n\n", progname);
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    vision_params params;
+
+    // Simple command line parsing.
+    if (argc < 5) {
+        print_usage(argv[0]);
+        return 1;
+    }
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (arg == "-c" || arg == "--clip") {
+            if (i + 1 < argc) {
+                params.clip_model = argv[++i];
+            } else {
+                print_usage(argv[0]);
+                return 1;
+            }
+        } else if (arg == "-i" || arg == "--image") {
+            if (i + 1 < argc) {
+                params.image_file = argv[++i];
+            } else {
+                print_usage(argv[0]);
+                return 1;
+            }
+        } else if (arg == "--threads") {
+            if (i + 1 < argc) {
+                params.n_threads = std::atoi(argv[++i]);
+            } else {
+                print_usage(argv[0]);
+                return 1;
+            }
+        } else if (arg == "--verbosity") {
+            if (i + 1 < argc) {
+                params.verbosity = std::atoi(argv[++i]);
+            } else {
+                print_usage(argv[0]);
+                return 1;
+            }
+        } else {
+            // Unknown argument.
+            print_usage(argv[0]);
+            return 1;
+        }
+    }
+
+    if (params.clip_model.empty() || params.image_file.empty()) {
+        print_usage(argv[0]);
+        return 1;
+    }
+
+    // Load the CLIP model.
+    struct clip_ctx * ctx_clip = clip_model_load(params.clip_model.c_str(), params.verbosity);
+    if (!ctx_clip) {
+        LOG_ERR("Failed to load clip model from %s\n", params.clip_model.c_str());
+        return 1;
+    }
+    LOG_INF("Clip model loaded from %s\n", params.clip_model.c_str());
+
+    // Load and process the image.
+    llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, params.n_threads, params.image_file.c_str());
+    if (!embed) {
+        LOG_ERR("Failed to load or process image from %s\n", params.image_file.c_str());
+        clip_free(ctx_clip);
+        return 1;
+    }
+    LOG_INF("Image loaded and processed from %s\n", params.image_file.c_str());
+    LOG_INF("Image embedding computed with %d positions.\n", embed->n_image_pos);
+
+    // Print the first few elements of the embedding as a sanity check.
+    int print_count = (embed->n_image_pos < 10 ? embed->n_image_pos : 10);
+    LOG_INF("First %d elements: ", print_count);
+    for (int i = 0; i < print_count; i++) {
+        LOG_INF("%f ", embed->embed[i]);
+    }
+    LOG_INF("\n");
+
+    llava_image_embed_free(embed);
+    clip_free(ctx_clip);
+
+    return 0;
+}
diff --git a/examples/llava/convert_janus_encoder_to_gguf.py b/examples/llava/convert_janus_encoder_to_gguf.py
index d8678c1d5..bae67f283 100644
--- a/examples/llava/convert_janus_encoder_to_gguf.py
+++ b/examples/llava/convert_janus_encoder_to_gguf.py
@@ -37,17 +37,64 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
     return False
 
-def get_tensor_name(name: str) -> str:
-    if "projection" in name:
-        return name
-    if "mm_projector" in name:
-        name = name.replace("model.mm_projector", "mm")
-        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
-        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
-        return name
+def get_tensor_name_from_janus(name: str) -> str:
+    # Map Janus vision tower names onto the GGUF naming scheme first.
+    name = re.sub(r'^vision_tower\.blocks\.(\d+)\.attn\.qkv\.(weight|bias)$', r'v.blk.\1.attn_qkv.\2', name)
+    name = re.sub(r'^vision_tower\.blocks\.(\d+)\.norm1\.(.*)$', r'v.blk.\1.ln1.\2', name)
+    name = re.sub(r'^vision_tower\.blocks\.(\d+)\.attn\.proj\.(.*)$', r'v.blk.\1.attn_out.\2', name)
+    name = re.sub(r'^vision_tower\.blocks\.(\d+)\.norm2\.(.*)$', r'v.blk.\1.ln2.\2', name)
+    name = re.sub(r'^vision_tower\.blocks\.(\d+)\.mlp\.fc1\.(.*)$', r'v.blk.\1.ffn_down.\2', name)
+    name = re.sub(r'^vision_tower\.blocks\.(\d+)\.mlp\.fc2\.(.*)$', r'v.blk.\1.ffn_up.\2', name)
+    name = re.sub(r'^vision_tower\.patch_embed\.proj\.(.*)$', r'v.patch_embd.\1', name)
+    name = re.sub(r'^vision_tower\.pos_embed$', r'v.position_embd.weight', name)
+    name = re.sub(r'^vision_tower\.norm\.(weight|bias)$', r'v.post_ln.\1', name)
+
+    # Generic fallbacks for anything the patterns above did not catch.
+    name = name.replace("vision_tower", "v")
+    name = name.replace("text_model", "t")
+    name = name.replace("vision_model", "v")
+    name = name.replace("encoder.layers", "blk")
+    name = name.replace("blocks", "blk")
+    name = name.replace("embeddings.", "")
+    name = name.replace("_proj", "")
+    name = name.replace("self_attn.", "attn_")
+    name = name.replace("layer_norm", "ln")
+    name = name.replace("layernorm", "ln")
+    name = name.replace("mlp.fc1", "ffn_down")
+    name = name.replace("mlp.fc2", "ffn_up")
+    name = name.replace("embedding", "embd")
+    name = name.replace("final", "post")
+    name = name.replace("layrnorm", "ln")
+
+    return name
 
-    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+def process_and_save_tensor(tensor: torch.Tensor, new_name: str, ftype: int, fout) -> None:
+    """Process a tensor (squeeze, cast dtype, log) and save it to `fout`."""
+    data = tensor.squeeze().numpy()
+    n_dims = len(data.shape)
+    ftype_str = {0: "f32", 1: "f16"}
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        print(f"tensor {new_name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if new_name.endswith(".weight") and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    print(f"{new_name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(new_name, data)
 
 def bytes_to_unicode():
     """
@@ -261,35 +308,17 @@ for name, data in state_dict.items():
         print(f"skipping parameter: {name}")
         continue
 
-    name = get_tensor_name(name)
-    data = data.squeeze().numpy()
+    name = get_tensor_name_from_janus(name)
 
-    n_dims = len(data.shape)
+    # Handle the fused qkv projection weights and biases: split into q, k, v.
+    if "qkv" in name:
+        q_tensor, k_tensor, v_tensor = torch.chunk(data, 3, dim=0)
 
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0
-    if n_dims == 4:
-        print(f"tensor {name} is always saved in f16")
-        data = data.astype(np.float16)
-        ftype_cur = 1
-    elif ftype == 1:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
+        process_and_save_tensor(q_tensor, name.replace("qkv", "q"), ftype, fout)
+        process_and_save_tensor(k_tensor, name.replace("qkv", "k"), ftype, fout)
+        process_and_save_tensor(v_tensor, name.replace("qkv", "v"), ftype, fout)
     else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
-    fout.add_tensor(name, data)
-
+        process_and_save_tensor(data, name, ftype, fout)
 
 fout.write_header_to_file()
 fout.write_kv_data_to_file()
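
---

Notes on the conversion logic (illustrative sketches, not part of the patch itself):

A few input/output pairs that get_tensor_name_from_janus() should produce,
derived by reading the regexes in the first hunk. The input keys follow the
timm-style naming used by Janus' vision tower; they are written out here for
illustration, not dumped from an actual checkpoint:

    # Assumes get_tensor_name_from_janus() from the patch is in scope.
    examples = {
        "vision_tower.blocks.0.attn.qkv.weight": "v.blk.0.attn_qkv.weight",
        "vision_tower.blocks.0.norm1.weight":    "v.blk.0.ln1.weight",
        "vision_tower.blocks.3.mlp.fc1.bias":    "v.blk.3.ffn_down.bias",
        "vision_tower.patch_embed.proj.weight":  "v.patch_embd.weight",
        "vision_tower.pos_embed":                "v.position_embd.weight",
    }
    for src, dst in examples.items():
        assert get_tensor_name_from_janus(src) == dst, (src, dst)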
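The dtype policy inside process_and_save_tensor(), restated as a tiny
standalone helper to make the three branches explicit (a sketch of the same
rules, not code from the patch):

    def pick_dtype(name: str, n_dims: int, ftype: int) -> str:
        # 4-D tensors (e.g. the patch embedding kernel) are always stored as f16.
        if n_dims == 4:
            return "f16"
        # When f16 output is requested (ftype == 1), only 2-D ".weight"
        # matrices are converted; everything else falls through to f32.
        if ftype == 1 and name.endswith(".weight") and n_dims == 2:
            return "f16"
        # Biases, norm parameters, and all 1-D tensors stay f32.
        return "f32"

    assert pick_dtype("v.blk.0.attn_q.weight", 2, 1) == "f16"
    assert pick_dtype("v.blk.0.attn_q.bias",   1, 1) == "f32"
    assert pick_dtype("v.patch_embd.weight",   4, 0) == "f16"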
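Finally, the qkv handling in the second hunk assumes the fused attn.qkv
projection is laid out as [q; k; v] along dim 0, so torch.chunk(data, 3, dim=0)
recovers the three projections. A standalone shape check (the sizes are
hypothetical, not taken from a real Janus checkpoint):

    import torch

    embd = 8                              # hypothetical embedding width
    qkv_w = torch.randn(3 * embd, embd)   # fused [q; k; v] weight
    q_w, k_w, v_w = torch.chunk(qkv_w, 3, dim=0)
    assert q_w.shape == k_w.shape == v_w.shape == (embd, embd)

    qkv_b = torch.randn(3 * embd)         # fused [q; k; v] bias
    q_b, k_b, v_b = torch.chunk(qkv_b, 3, dim=0)
    assert q_b.shape == k_b.shape == v_b.shape == (embd,)

If a checkpoint packed the projections in a different order, or interleaved
them per head, the split would silently produce wrong tensors, so the layout
is worth verifying against the Janus reference implementation.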