LLaVA is working end-to-end; memory allocation still needs optimization and cleanup
parent: d37ed4750f
commit: 8690f425ec
4 changed files with 78 additions and 92 deletions
@@ -78,7 +78,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_LN_POST    "%s.post_ln.%s"
 #define TN_TEXT_PROJ  "text_projection.weight"
 #define TN_VIS_PROJ   "visual_projection.weight"
-#define TN_LLAVA_PROJ "llava_projector.%s"
+#define TN_LLAVA_PROJ "mm.%d.%s"

 //
 // utilities to get data from a gguf file
@@ -225,8 +225,10 @@ struct clip_vision_model {
     struct ggml_tensor * projection;

     // LLaVA projection
-    struct ggml_tensor * llava_proj_w;
-    struct ggml_tensor * llava_proj_b;
+    struct ggml_tensor * mm_0_w;
+    struct ggml_tensor * mm_0_b;
+    struct ggml_tensor * mm_2_w;
+    struct ggml_tensor * mm_2_b;
 };

 // Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
@@ -283,11 +285,11 @@ size_t get_mem_req_by_size(struct clip_ctx * ctx) {
             return 96 * mb;
         case 589: // large, two-tower
         case 392: // large, vision-only
-        case 375: // large, LLaVA encoder
+        case 377: // large, LLaVA encoder
             if (vision_hparams->image_size == 224) { // input image size = 224
                 return 1200 * mb;
             } else { // input image size = 336
-                return 1800 * mb;
+                return 2900 * mb;
             }
         case 909: // huge, two-tower
         case 520: // huge, vision-only
@@ -572,8 +574,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
         vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
         vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
         if (new_clip->has_llava_projector) {
-            vision_model.llava_proj_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, "weight"));
-            vision_model.llava_proj_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, "bias"));
+            vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
+            vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
+            vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
+            vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
         } else {
             vision_model.post_ln_w = get_tensor(new_clip->ctx, format(TN_LN_POST, "v", "weight"));
             vision_model.post_ln_b = get_tensor(new_clip->ctx, format(TN_LN_POST, "v", "bias"));
@@ -1278,20 +1282,26 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
         embeddings = cur;
     }

     //ggml_set_scratch(ctx0, {0, 0, nullptr});

-    struct ggml_tensor * output = NULL;
     if (ctx->has_llava_projector) {
-        output = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        embeddings = ggml_mul_mat(ctx0, model.llava_proj_w, embeddings);
-        output = ggml_add(ctx0, ggml_repeat(ctx0, model.llava_proj_b, embeddings), embeddings);
-        output = ggml_reshape_2d(ctx0, output, output->ne[0], output->ne[1]);
+        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
         struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
         for (int i = 0; i < num_patches; ++i) {
             ggml_set_i32_1d(patches, i, i+1);
         }
-        output = ggml_get_rows(ctx0, output, patches);
+        embeddings = ggml_get_rows(ctx0, embeddings, patches);
+
+        // mm projection 0
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_0_b, embeddings), embeddings);
+
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+        embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings);
+
+        ggml_set_name(embeddings, "check");
     } else {
         // get the output of cls token, e.g., 0th index
         struct ggml_tensor * cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch_size);
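For orientation, here is a minimal NumPy sketch of what the rewritten LLaVA branch computes, assuming the usual LLaVA-1.5 shapes (1024-dim ViT-L/14 patch embeddings, 577 positions with the class token at row 0, 4096-dim LLaMA hidden size); the shapes and random weights are illustrative only, and ggml_mul_mat / ggml_gelu are modeled as a plain matmul and a tanh-approximate GELU:

```python
import numpy as np

# illustrative shapes: 577 = 1 class token + 24*24 patches, 1024 = ViT hidden, 4096 = LLaMA hidden
hidden_size, n_pos, n_llama = 1024, 577, 4096
embeddings = np.random.randn(n_pos, hidden_size).astype(np.float32)  # encoder output
mm_0_w = np.random.randn(n_llama, hidden_size).astype(np.float32)    # first projector layer
mm_0_b = np.zeros(n_llama, dtype=np.float32)
mm_2_w = np.random.randn(n_llama, n_llama).astype(np.float32)        # second projector layer
mm_2_b = np.zeros(n_llama, dtype=np.float32)

def gelu(x):
    # tanh approximation of GELU
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

patches = embeddings[1:]          # ggml_get_rows with indices i+1: drop the class-token row
h = patches @ mm_0_w.T + mm_0_b   # mm.0: linear
h = gelu(h)                       # GELU
h = h @ mm_2_w.T + mm_2_b         # mm.2: linear
print(h.shape)                    # (576, 4096): one LLaMA-ready embedding per image patch
```

This mirrors the graph above: only the 576 patch rows are kept, and the old single llava_proj linear layer is replaced by the two-layer mm.0 / GELU / mm.2 projector.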
@@ -1312,7 +1322,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
         embeddings = ggml_mul_mat(ctx0, model.projection, embeddings);

         // normalize output embeddings
-        output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, batch_size);
+        struct ggml_tensor * output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, batch_size);

         for (int b = 0; b < batch_size; b++) {
             struct ggml_tensor * embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b));
@@ -1322,11 +1332,13 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
             }
             output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding));
         }
+
+        embeddings = output;
     }
-    ggml_set_name(output, "check");
+    //ggml_set_name(embeddings, "check");

     // run the computation
-    ggml_build_forward_expand(&gf, output);
+    ggml_build_forward_expand(&gf, embeddings);

     /*
     ggml_cplan cplan = ggml_graph_plan(&gf, n_threads);
@@ -1386,7 +1398,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
     printf("used_mem = %zu\n", ggml_used_mem(ctx0));
 #endif

-    memcpy(vec, ggml_get_data_f32(output), sizeof(float) * projection_dim * batch_size);
+    memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings));

     /*
     if (cplan.work_size != 0) {
@@ -39,6 +39,9 @@ def get_tensor_name(name: str) -> str:
     if "projection" in name:
         return name

+    if "mm_projector" in name:
+        return name.replace("model.mm_projector", "mm")
+
     return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")

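A quick illustration of the new mapping, using the key names a LLaVA checkpoint typically carries (the exact keys are an assumption here, not taken from this diff):

```python
def rename(name: str) -> str:
    # projector-related cases of get_tensor_name above
    if "projection" in name:
        return name
    if "mm_projector" in name:
        return name.replace("model.mm_projector", "mm")
    return name

for key in ("model.mm_projector.0.weight", "model.mm_projector.0.bias",
            "model.mm_projector.2.weight", "model.mm_projector.2.bias"):
    print(key, "->", rename(key))
# e.g. model.mm_projector.0.weight -> mm.0.weight, matching TN_LLAVA_PROJ "mm.%d.%s" in clip.cpp
```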
@@ -75,7 +78,7 @@ ap.add_argument("--text-only", action="store_true", required=False,
                 help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                 help="Save a vision-only model. It can't be used to encode texts")
-ap.add_argument("--llava-projector", help="Path to projector.pt file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -138,7 +141,7 @@ else:
 output_dir = args.output_dir if args.output_dir is not None else dir_model
 os.makedirs(output_dir, exist_ok=True)
 output_prefix = os.path.basename(output_dir).replace("ggml_", "")
-fname_out = os.path.join(output_dir, f"{output_prefix}_ggml-{fname_middle}model-{ftype_str[ftype]}.gguf")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
 fout = GGUFWriter(path=fname_out, arch="clip")

 fout.add_bool("clip.has_text_encoder", has_text_encoder)
@@ -191,15 +194,19 @@ fout.add_bool("clip.use_gelu", use_gelu)
 if has_llava_projector:
     model.vision_model.encoder.layers.pop(-1)
     projector = torch.load(args.llava_projector)
-    weight = projector["model.mm_projector.weight"].cpu().squeeze().float().numpy().astype(np.float16)
-    bias = projector['model.mm_projector.bias'].cpu().squeeze().float().numpy().astype(np.float32)
-    fout.add_tensor("llava_projector.weight", weight)
-    fout.add_tensor("llava_projector.bias", bias)
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        if data.ndim == 2:
+            data = data.squeeze().numpy().astype(np.float16)
+        else:
+            data = data.squeeze().numpy().astype(np.float32)
+
+        fout.add_tensor(name, data)

     print("Projector tensors added\n")

-list_vars = model.state_dict()
-for name, data in list_vars.items():
+state_dict = model.state_dict()
+for name, data in state_dict.items():
     if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
         # we don't need this
         print(f"skipping parameter: {name}")
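The same loop applied to a hypothetical projector dict (shapes guessed for a LLaVA-1.5 7B projector, 1024 -> 4096 -> 4096, not read from this diff) shows which tensors end up as f16 and which as f32:

```python
import numpy as np
import torch

# hypothetical stand-in for torch.load(args.llava_projector)
projector = {
    "model.mm_projector.0.weight": torch.zeros(4096, 1024),
    "model.mm_projector.0.bias":   torch.zeros(4096),
    "model.mm_projector.2.weight": torch.zeros(4096, 4096),
    "model.mm_projector.2.bias":   torch.zeros(4096),
}

for name, data in projector.items():
    if data.ndim == 2:
        arr = data.squeeze().numpy().astype(np.float16)   # 2-D weights are stored as f16
    else:
        arr = data.squeeze().numpy().astype(np.float32)   # 1-D biases stay f32
    print(name, tuple(arr.shape), arr.dtype)
```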
@@ -7,12 +7,11 @@
 #include "llama.h"


-static bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int * n_past) {
+static bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));
-    int n_batch = N; // params.n_batch;

-    for (int i = 0; i < (int) N; i += n_batch) {
-        int n_eval = (int) N - i;
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = N - i;
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
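The batching behavior of the new signature, sketched in Python with arbitrary example numbers (576 image embeddings, n_batch = 512):

```python
def chunks(N: int, n_batch: int):
    # mirrors the loop in eval_image_embd: feed N image embeddings n_batch rows at a time
    i = 0
    while i < N:
        n_eval = min(n_batch, N - i)
        yield i, n_eval
        i += n_eval

print(list(chunks(576, 512)))   # [(0, 512), (512, 64)] -> two decode calls for one image
```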
@@ -161,16 +160,16 @@ int main(int argc, char ** argv) {
     }

     if (params.prompt.empty()) {
-        params.prompt = "user: describe the image in detail.\nassistant:";
+        params.prompt = "describe the image in detail.";
     }


-    auto ctx_clip = clip_model_load(clip_path, 1);
+    auto ctx_clip = clip_model_load(clip_path, 3);
     clip_image_u8 img;
     clip_image_f32 img_res;
     clip_image_load_from_file(img_path, &img);
     clip_image_preprocess(ctx_clip, &img, &img_res);
-    float * vec = (float *)malloc(4096 * 256 * sizeof(float));
+    float * vec = (float *)malloc(4096 * 576 * sizeof(float));
     clip_image_encode(ctx_clip, params.n_threads, &img_res, vec, false);
     clip_free(ctx_clip);

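Where the 4096 * 576 buffer size comes from: LLaVA-1.5 uses the 336-pixel ViT-L/14 encoder, so the patch grid is 24 x 24 = 576 rather than the 16 x 16 = 256 of a 224-pixel encoder, and each patch is projected to the 4096-dim LLaMA embedding space. (The patch size of 14 is assumed from the CLIP model family, not stated in this diff.) A quick check:

```python
image_size, patch_size, n_embd = 336, 14, 4096

n_patches = (image_size // patch_size) ** 2
print(n_patches)                                # 576
print(n_patches * n_embd * 4 / (1024 ** 2))     # 9.0 -> ~9 MiB of f32 image embeddings per image
```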
@@ -198,9 +197,10 @@ clip_free(ctx_clip);

     int n_past = 0;
     int max_tgt_len = 256;
-    //eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
-    eval_image_embd(ctx_llama, vec, 256, &n_past);
-    //eval_string(ctx_llama, "assistant:", params.n_batch, &n_past);
+    eval_string(ctx_llama, "user: ", params.n_batch, &n_past);
+    eval_image_embd(ctx_llama, vec, 576, params.n_batch, &n_past);
+    eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
+    eval_string(ctx_llama, "\nassistant:", params.n_batch, &n_past);
     printf("n_past = %d\n", n_past);

     const char* tmp;
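The rewritten eval order wraps the image between a fixed user prefix and an assistant cue instead of baking everything into params.prompt. A purely illustrative sketch of the sequence fed to the llama context (the image segment is of course 576 embedding rows, not text):

```python
segments = [
    ("text",  "user: "),                            # fixed prefix
    ("image", "<576 projected patch embeddings>"),  # eval_image_embd(..., 576, n_batch, ...)
    ("text",  "describe the image in detail."),     # params.prompt (default shown)
    ("text",  "\nassistant:"),                      # cue for the model to answer
]
for kind, value in segments:
    print(f"{kind:5} | {value!r}")
```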
@@ -1,63 +1,30 @@
 import argparse
-from llava.model import LlavaLlamaForCausalLM
-from transformers import AutoTokenizer
-from peft import PeftModel
+import glob
+import os
 import torch

-dtype = torch.bfloat16
-
 ap = argparse.ArgumentParser()
-ap.add_argument("-m", "--model", help="Path to LLaVA RLHF model")
-ap.add_argument("-o", "--output", help="Output directory to save the merged file")
+ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model")
 args = ap.parse_args()

-model_path = f"{args.model}/sft_model"
-lora_path = f"{args.model}/rlhf_lora_adapter_model"
-save_path = args.output
+# find the model part that includes the multimodal projector weights
+path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1]
+checkpoint = torch.load(path)

-model = LlavaLlamaForCausalLM.from_pretrained(
-    model_path,
-    device_map={"": "cuda:0"},
-    torch_dtype=dtype,
-)
-model = PeftModel.from_pretrained(
-    model,
-    lora_path,
-)
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name] for name in mm_tensors}
+torch.save(projector, f"{args.model}/llava.projector")

-model = model.merge_and_unload()
+# remove these tensors from the checkpoint and save it again
+for name in mm_tensors:
+    del checkpoint[name]

-model.save_pretrained(save_path)
+torch.save(checkpoint, path)

-tokenizer = AutoTokenizer.from_pretrained(model_path)
-tokenizer.save_pretrained(save_path)
-
-del model
-del tokenizer
-
-
-# Load the checkpoint
-checkpoint = torch.load(f"{save_path}/pytorch_model-00002-of-00002.bin")
-
-# Extract the tensors we want
-mm_projector_weight = checkpoint['model.mm_projector.weight']
-mm_projector_bias = checkpoint['model.mm_projector.bias']
-
-# Remove the tensors from the checkpoint
-del checkpoint['model.mm_projector.weight']
-del checkpoint['model.mm_projector.bias']
-
-# Create a dictionary with the original names as keys
-mm_projector = {
-    'model.mm_projector.weight': mm_projector_weight,
-    'model.mm_projector.bias': mm_projector_bias
-}
-
-# Save the combined dictionary using torch.save
-torch.save(mm_projector, "projector.pt")
-
-# Save the rest of the model with the same original name
-torch.save(checkpoint, "./llava-7b-rlhf-merged/pytorch_model-00002-of-00002.bin")
-
-Print("Operation complete!")
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
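A small sketch for sanity-checking the surgery output afterwards (the model directory path is a placeholder; the expectation is that llava.projector holds the model.mm_projector.* tensors and that they are gone from the re-saved shard):

```python
import glob
import torch

model_dir = "path/to/llava-v1.5-7b"   # placeholder path

projector = torch.load(f"{model_dir}/llava.projector", map_location="cpu")
print(sorted(projector.keys()))       # expect the model.mm_projector.* weights and biases

last_shard = sorted(glob.glob(f"{model_dir}/pytorch_model*.bin"))[-1]
checkpoint = torch.load(last_shard, map_location="cpu")
leftovers = [k for k in checkpoint if k.startswith("model.mm_projector")]
print("projector tensors still in the checkpoint:", leftovers)   # expect []
```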