From a88c0d5f2626bc65395b678e8758eb855ffc8f67 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 3 Oct 2024 20:15:36 +0200
Subject: [PATCH] wip

---
 common/common.cpp          |  21 +++++++
 common/common.h            |  25 +++++++++
 common/vision.cpp          |   2 +-
 examples/simple/simple.cpp | 111 ++++++++++++++++++++-----------
 include/llama.h            |  22 +++++---
 src/llama-vision.cpp       |  42 ++++++++------
 src/llama-vision.h         |  18 +++---
 src/llama.cpp              |  94 ++++++++++++++++++++++++-------
 8 files changed, 230 insertions(+), 105 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 8d0ed4f95..921928d97 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1474,6 +1474,27 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
+// TODO: this function is hacky, need to be improved
+std::vector<llama_token> llama_tokenize_with_img(
+    const struct llama_context * ctx,
+    const std::string & text,
+    bool   add_special,
+    bool   parse_special) {
+    static const std::string IMG_PLACEMENT = "<img_placement>";
+    std::vector<std::string> parts = string_split(text, IMG_PLACEMENT);
+    std::vector<llama_token> output;
+    for (const auto & part : parts) {
+        bool add_bos = &parts.front() == &part;
+        auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
+        output.insert(output.end(), tokens.begin(), tokens.end());
+        if (&parts.back() != &part) {
+            // add image token to middle of 2 parts
+            output.push_back(TOKEN_IMG_PLACEMENT);
+        }
+    }
+    return output;
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
diff --git a/common/common.h b/common/common.h
index cb87c4479..e6fa1c2d4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -378,6 +378,20 @@ static std::vector<std::string> string_split(const std::string & str, char delim) {
     return values;
 }
 
+// split string by a `std::string delim` instead of `char delim`
+static std::vector<std::string> string_split(std::string s, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    size_t pos = 0;
+    std::string token;
+    while ((pos = s.find(delimiter)) != std::string::npos) {
+        token = s.substr(0, pos);
+        tokens.push_back(token);
+        s.erase(0, pos + delimiter.length());
+    }
+    tokens.push_back(s);
+    return tokens;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 
 void string_process_escapes(std::string & input);
@@ -447,6 +461,17 @@ std::vector<llama_token> llama_tokenize(
     bool   add_special,
     bool   parse_special = false);
 
+const llama_token TOKEN_IMG_PLACEMENT = -1000;
+
+// tokenize with "placeholder" for image embedding tokens
+// "<img_placement>" will be replaced with TOKEN_IMG_PLACEMENT
+// TODO: this function is hacky, need to be improved
+std::vector<llama_token> llama_tokenize_with_img(
+    const struct llama_context * ctx,
+    const std::string & text,
+    bool   add_special,
+    bool   parse_special = false);
+
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
diff --git a/common/vision.cpp b/common/vision.cpp
index 5b003654a..2b37ded16 100644
--- a/common/vision.cpp
+++ b/common/vision.cpp
@@ -31,7 +31,7 @@ llama_img * load_image_from_file(const char * fname) {
     //     printf("\n");
     // }
     // printf("\n");
-    llama_img * result = llama_img_alloc(nx, ny);
+    llama_img * result = llama_img_init(nx, ny);
     memcpy(result->data, img, nx*ny*3);
     stbi_image_free(img);
     return result;
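
Note: the following is a minimal usage sketch, not part of the patch. It shows how the new llama_tokenize_with_img() helper is meant to be consumed, assuming an already-initialized llama_context and the "<img_placement>" literal from common.cpp above.

// sketch only; needs common.h (llama_tokenize_with_img, TOKEN_IMG_PLACEMENT)
static void demo_tokenize_with_img(llama_context * ctx) {
    const std::string prompt = "USER: <img_placement>\nwhat did you see?\nASSISTANT:";
    std::vector<llama_token> toks = llama_tokenize_with_img(ctx, prompt, /*add_special =*/ true);
    for (llama_token t : toks) {
        if (t == TOKEN_IMG_PLACEMENT) {
            // an image will occupy llama_img_n_tokens(ctx, img) positions at this point;
            // the caller injects the image embeddings here instead of a text token
        }
    }
}
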
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 50f2ff4ea..0a28f9bf6 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -15,7 +15,9 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    params.prompt = "Hello my name is";
+    //params.prompt = "Hello my name is";
+    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
+        "USER:<img_placement>\nwhat did you see?\nASSISTANT:";
     params.n_predict = 32;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
@@ -62,52 +64,10 @@ int main(int argc, char ** argv) {
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-
-
-
-    // TODO: this is for testing; DELETE ME
-    int n_cur = 0;
-    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
-    {
-        llama_img_batch ibatch;
-        ibatch.n_imgs = 1;
-        ibatch.imgs = (llama_img **) malloc(1024);
-        ibatch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
-        llama_vision_encode(ctx, &ibatch);
-
-        auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-        int n_imgs    = ibatch.n_imgs;
-        int n_embd    = llama_n_embd(model);
-        int n_patches = llama_vision_n_patches(ctx);
-        printf("n_embd = %d ; n_patches = %d \n", n_embd, n_patches);
-        float * output_img = llama_vision_get_embeddings(ctx, 0);
-
-        n_cur += tokens.size();
-        llama_batch batch = llama_batch_init(512, 0, 1);
-        llama_batch_clear(batch);
-        for (auto t : tokens) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
-        if (llama_decode(ctx, batch) != 0) {
-            LOG("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-
-        // for (int k = 0; k < 10; k++) printf("%f\n", output_img[k]);
-        llama_batch_clear(batch);
-        batch = {int32_t(n_patches*n_imgs), nullptr, output_img, nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
-        if (llama_decode(ctx, batch) != 0) {
-            LOG("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-        n_cur += n_embd*n_imgs;
-    }
-    params.prompt = "\nwhat did you see?\nASSISTANT:";
-
-
-
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
 
     const int n_ctx    = llama_n_ctx(ctx);
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
@@ -127,25 +87,75 @@ int main(int argc, char ** argv) {
 
     LOG("\n");
     for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        if (id == TOKEN_IMG_PLACEMENT) {
+            LOG("<img_placement>");
+        } else {
+            LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        }
     }
+    LOG("\n\n");
+
+    // load image
+    llama_batch_img img_batch = llama_batch_img_init(1);
+    img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
 
     // create a llama_batch with size 512
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(512, 0, 1);
 
     // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        //llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
-        if (i == 0) continue;
-        llama_batch_add(batch, tokens_list[i], n_cur, { 0 }, false);
-        n_cur++;
+    int n_cur = 0;
+    int i_img = 0;
+    for (auto id : tokens_list) {
+        if (id == TOKEN_IMG_PLACEMENT) {
+            img_batch.pos[i_img] = n_cur;
+            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
+            i_img++;
+        } else {
+            llama_batch_add(batch, id, n_cur, { 0 }, false);
+            printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
+            n_cur++;
+        }
     }
 
     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
+    if (llama_encode_vision(ctx, img_batch) != 0) {
+        LOG("%s: llama_encode_vision() failed\n", __func__);
+        return 1;
+    }
+
+    n_cur = 0;
+    {
+        auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
+        auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
+        t1.insert(t1.begin(), 1);
+
+        n_cur = 0;
+        llama_batch_clear(batch);
+        llama_batch_add(batch, 1, 0, { 0 }, false);
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size();
+        llama_batch_clear(batch);
+        llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
+        llama_decode(ctx, batch0);
+
+        n_cur = 0;
+        llama_batch_clear(batch);
+        for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size() + 576;
+        llama_batch_clear(batch);
+        printf("pos %d\n", n_cur);
+        for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        batch.logits[batch.n_tokens - 1] = true;
+    }
+
     if (llama_decode(ctx, batch) != 0) {
         LOG("%s: llama_decode() failed\n", __func__);
         return 1;
@@ -153,7 +163,6 @@ int main(int argc, char ** argv) {
 
     // main loop
 
-    //int n_cur = batch.n_tokens;
     int n_decode = 0;
 
     const auto t_main_start = ggml_time_us();
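
Note on the hard-coded 576 in the test block above: assuming a LLaVA-1.5-style CLIP ViT-L/14 encoder with 336x336 input (an assumption, not stated in this patch), the patch count works out as below. In the non-test path this value should come from llama_img_n_tokens() rather than a literal.

// assumption: 336x336 input images, 14x14 pixel patches (CLIP ViT-L/14 as used by LLaVA-1.5)
constexpr int image_size = 336;
constexpr int patch_size = 14;
constexpr int n_patches  = (image_size / patch_size) * (image_size / patch_size); // 24 * 24 = 576
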
diff --git a/include/llama.h b/include/llama.h
index ed43796b1..e66dd0da1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -233,10 +233,11 @@ extern "C" {
     } llama_img;
 
     // Input data for llama_vision_decode
-    typedef struct llama_img_batch {
+    typedef struct llama_batch_img {
         int32_t      n_imgs;
         llama_img ** imgs;
-    } llama_img_batch;
+        llama_pos *  pos;
+    } llama_batch_img;
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
@@ -894,16 +895,18 @@ extern "C" {
     //
 
     // create new RGB image for input
-    LLAMA_API llama_img * llama_img_alloc(int width, int height);
-    LLAMA_API void        llama_img_free(llama_img * img);
+    LLAMA_API llama_img * llama_img_init(int width, int height);
+    LLAMA_API void        llama_img_free(llama_img * img);
 
-    // encode image into embeddings
-    LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch);
+    // get number of tokens that an image occupies, used to determine the position in the batch
+    LLAMA_API int32_t llama_img_n_tokens(struct llama_context * ctx, llama_img * img);
 
-    // get output embeddings, to be put into language batch
-    LLAMA_API float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx);
+    // create new image batch
+    LLAMA_API llama_batch_img llama_batch_img_init(int n_imgs);
+    LLAMA_API void            llama_batch_img_free(llama_batch_img batch);
 
-    LLAMA_API int32_t llama_vision_n_patches(struct llama_context * ctx);
+    // encode the input image batch
+    LLAMA_API int32_t llama_encode_vision(struct llama_context * ctx, llama_batch_img batch);
 
     //
     // Vocab
@@ -1237,6 +1240,7 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
+    LLAMA_API float * _test_get_img_embd(struct llama_context * ctx);
 
 #ifdef __cplusplus
 }
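
Note: a minimal sketch (not part of the patch) of the new image-batch API declared above; it assumes ctx has a vision model loaded and img is an already-decoded llama_img, e.g. from load_image_from_file() in common/vision.cpp.

// returns the number of positions the image occupies, or -1 on failure (sketch only)
static int32_t demo_encode_one_image(llama_context * ctx, llama_img * img, llama_pos pos) {
    llama_batch_img ib = llama_batch_img_init(1);
    ib.imgs[0] = img;  // image to encode
    ib.pos [0] = pos;  // first sequence position the image embeddings will occupy
    const int32_t n_img_tokens = llama_img_n_tokens(ctx, img);
    const int32_t status = llama_encode_vision(ctx, ib); // embeddings are buffered inside ctx
    llama_batch_img_free(ib);
    return status == 0 ? n_img_tokens : -1;
}
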
diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp
index 2950579ee..93f9e4b52 100644
--- a/src/llama-vision.cpp
+++ b/src/llama-vision.cpp
@@ -78,16 +78,12 @@ int clip_n_patches(const clip_context & ctx) {
 
 int clip_n_mmproj_embd(const clip_context & ctx) {
     if (ctx.model->hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
-        return ctx.model->mm_b_b->ne[0];
+        return ctx.model->mm_2_b->ne[0];
     } else {
         GGML_ASSERT(false && "invalid proj type");
     }
 }
 
-int clip_n_embd(const clip_context & ctx) {
-    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx);
-}
-
 /**
  * Selects the best resolution from a list of possible resolutions based on the original size.
  *
@@ -575,12 +571,12 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
     embeddings = ggml_get_rows(ctx0, embeddings, patches);
 
     if (hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
-        embeddings = ggml_mul_mat(ctx0, model.mm_a_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_a_b);
+        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
 
         embeddings = ggml_gelu(ctx0, embeddings);
-        embeddings = ggml_mul_mat(ctx0, model.mm_b_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_b_b);
+        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
     } else {
         GGML_ASSERT(false && "unsupported proj type");
     }
@@ -681,7 +677,9 @@ static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_
     ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(ctx.sched, embeddings);
 
     // copy the embeddings to the location passed by the user
-    output.resize(clip_n_embd(ctx));
+    size_t out_nbytes = clip_n_patches(ctx)*clip_n_mmproj_embd(ctx)*sizeof(float);
+    GGML_ASSERT(out_nbytes == ggml_nbytes(embeddings));
+    output.resize(out_nbytes);
     ggml_backend_tensor_get_async(backend_embd, embeddings, output.data(), 0, ggml_nbytes(embeddings));
 
     ggml_backend_sched_synchronize(ctx.sched);
@@ -731,15 +729,18 @@ static int32_t encode_image_with_clip(clip_context & ctx, const llama_img img, s
 ////////////////////////////////////////////////////////////////////////////////////////
 // public API
 
-int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch) {
+int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch) {
     if (batch->n_imgs == 0) {
         return 0;
     }
 
     // TODO: batching is not working atm, should be fixed later
-    const int n_embd = clip_n_embd(ctx);
-    ctx.output.resize(n_embd * batch->n_imgs);
-    ctx.n_output = batch->n_imgs;
+    const int n_embd           = clip_n_mmproj_embd(ctx);
+    const int n_tokens_per_img = clip_n_patches(ctx);
+    const int n_pos            = n_tokens_per_img*batch->n_imgs;
+
+    ctx.out_embd.resize(n_embd*n_pos);
+    ctx.out_pos.resize(n_pos);
 
     for (int i = 0; i < batch->n_imgs; i++) {
         std::vector<float> output_single;
@@ -748,14 +749,23 @@ int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch
             return status;
         }
         // copy output embeddings to result
-        for (int k = 0; k < n_embd; k++) {
-            ctx.output[n_embd*i + k] = output_single[k];
+        for (int k = 0; k < n_embd*n_tokens_per_img; k++) {
+            ctx.out_embd[n_embd*n_tokens_per_img*i + k] = output_single[k];
+        }
+        // fill position for all output tokens
+        for (int p = 0; p < n_tokens_per_img; p++) {
+            ctx.out_pos[n_tokens_per_img*i + p] = batch->pos[i] + p;
         }
     }
 
     return 0;
 }
 
+void llama_vision_clear_output(clip_context & ctx) {
+    ctx.out_embd.clear();
+    ctx.out_pos.clear();
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////
 // for debugging
 #ifndef NDEBUG
diff --git a/src/llama-vision.h b/src/llama-vision.h
index 1b2dbf5a4..950f497c8 100644
--- a/src/llama-vision.h
+++ b/src/llama-vision.h
@@ -95,10 +95,10 @@ struct clip_vision_model {
     struct ggml_tensor * projection = NULL;
 
     // LLaVA projection
-    struct ggml_tensor * mm_a_w = NULL;
-    struct ggml_tensor * mm_a_b = NULL;
-    struct ggml_tensor * mm_b_w = NULL;
-    struct ggml_tensor * mm_b_b = NULL;
+    struct ggml_tensor * mm_1_w = NULL;
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
 
     struct ggml_tensor * image_newline = NULL;
 };
@@ -110,15 +110,15 @@ struct clip_context {
 
     const clip_vision_model * model;
 
-    // temporary output data
-    int n_output;
-    std::vector<float> output; // size == n_output * n_embd
+    // temporary output data, to be picked up by llama_decode()
+    std::vector<float>     out_embd; // size == n_tokens * n_embd
+    std::vector<llama_pos> out_pos;  // position of each token
 };
 
 mm_patch_merge mm_patch_merge_from_name(std::string & name);
 clip_projector_type projector_type_from_name(std::string & name);
 int clip_n_patches(const clip_context & ctx);
 int clip_n_mmproj_embd(const clip_context & ctx);
-int clip_n_embd(const clip_context & ctx);
 
-int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch);
+int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch);
+void llama_vision_clear_output(clip_context & ctx);
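
Note: for clarity, this is the buffer layout that llama_encode_vision_internal() above writes into clip_context::out_embd / out_pos, expressed as a small accessor. Sketch only, not part of the patch; it uses the internal clip_context type from llama-vision.h.

// for image i and patch token t (0 <= t < n_tokens_per_img):
//   out_pos [i*n_tokens_per_img + t]                        == batch.pos[i] + t
//   out_embd[(i*n_tokens_per_img + t)*n_embd ... + n_embd)  == projected embedding of patch t of image i
static inline const float * clip_token_embd(const clip_context & ctx, int n_embd, int n_tokens_per_img, int i, int t) {
    return ctx.out_embd.data() + ((size_t) i*n_tokens_per_img + t)*n_embd;
}
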
diff --git a/src/llama.cpp b/src/llama.cpp
index 0dd60cd81..b1b44aaca 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1600,7 +1600,7 @@ static const std::map<vision_arch, std::map<vision_tensor, const char *>> VISION_
     {
         VISION_ARCH_LLAVA,
         {
-            { VISION_TENSOR_MMPROJ,         "v.mmproj" },
+            { VISION_TENSOR_MMPROJ,         "v.mmproj_%d" },
            { VISION_TENSOR_ENC_EMBD_CLS,   "v.enc.embd.cls" },
            { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
            { VISION_TENSOR_ENC_EMBD_POS,   "v.enc.embd.pos" },
@@ -8990,10 +8990,10 @@ static bool llm_load_tensors(
                 switch (vparams.arch) {
                     case VISION_ARCH_LLAVA:
                         {
-                            model.clip.mm_a_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
-                            model.clip.mm_a_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias"  , 1), {n_ff});
-                            model.clip.mm_b_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
-                            model.clip.mm_b_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias"  , 2), {n_ff});
+                            model.clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
+                            model.clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias"  , 1), {n_ff});
+                            model.clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
+                            model.clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias"  , 2), {n_ff});
 
                             model.clip.class_embedding  = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS  ), {n_embd});
                             model.clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd});
@@ -9001,8 +9001,8 @@ static bool llm_load_tensors(
                             model.clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd});
                             model.clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias"  ), {n_embd});
 
-                            // model.clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd});
-                            // model.clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias"  ), {n_embd});
+                            model.clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            model.clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias"  ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                             for (int i = 0; i < n_layer; ++i) {
                                 ggml_context * ctx_layer = ctx_for_layer(i);
@@ -20110,6 +20110,8 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
 
 void llama_kv_cache_clear(struct llama_context * ctx) {
     llama_kv_cache_clear(ctx->kv_self);
+    // clear vision embeddings output
+    llama_vision_clear_output(ctx->clip);
 }
 
 bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
@@ -21226,9 +21228,48 @@ int32_t llama_encode(
     return ret;
 }
 
+float * _test_get_img_embd(struct llama_context * ctx) { return ctx->clip.out_embd.data(); }
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
+    // hacky vision implementation, for testing only
+    if (!ctx->clip.out_embd.empty()) {
+        // int8_t       *  logits   = new int8_t        [ctx->clip.out_pos.size()];
+        // int32_t      *  n_seq_id = new int32_t       [ctx->clip.out_pos.size()];
+        // llama_seq_id ** seq_id   = new llama_seq_id *[ctx->clip.out_pos.size()];
+        // llama_seq_id seq_id_0 = 0;
+        // printf("out_pos %d\n", ctx->clip.out_pos.size());
+        // llama_batch ibatch = {
+        //     /*n_tokens   =*/ static_cast<int32_t>(ctx->clip.out_pos.size()),
+        //     /*tokens     =*/ nullptr,
+        //     /*embd       =*/ ctx->clip.out_embd.data(),
+        //     /*pos        =*/ ctx->clip.out_pos.data(),
+        //     /*n_seq_id   =*/ n_seq_id,
+        //     /*seq_id     =*/ seq_id,
+        //     /*logits     =*/ logits,
+        //     /*all_pos_0  =*/ 0,
+        //     /*all_pos_1  =*/ 0,
+        //     /*all_seq_id =*/ 0,
+        // };
+        // for (size_t i = 0; i < ctx->clip.out_pos.size(); i++) {
+        //     ibatch.n_seq_id[i] = 1;
+        //     ibatch.seq_id  [i] = &seq_id_0;
+        //     ibatch.logits  [i] = 0;
+        // }
+        // llama_decode_internal(*ctx, ibatch);
+        // delete[] logits;
+        // delete[] n_seq_id;
+        // delete[] seq_id;
+        // llama_vision_clear_output(ctx->clip);
+
+        //int n_eval = ctx->clip.out_pos.size();
+        //int n_past = ctx->clip.out_pos[0];
+        //printf("n_eval %d, n_past %d\n", n_eval, n_past);
+        //llama_batch ibatch = {int32_t(n_eval), nullptr, ctx->clip.out_embd.data(), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
+        //llama_decode_internal(*ctx, ibatch);
+        //llama_vision_clear_output(ctx->clip);
+    }
+
     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
@@ -21808,30 +21849,45 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
 // vision
 //
 
-llama_img * llama_img_alloc(int width, int height) {
+llama_img * llama_img_init(int width, int height) {
     llama_img * img = new llama_img();
     img->nx = width;
     img->ny = height;
-    img->data = (unsigned char *)malloc(width*height*3);
+    if (width > 0 && height > 0) {
+        img->data = (unsigned char *)malloc(width*height*3);
+    }
     return img;
 }
+
 void llama_img_free(llama_img * img) {
-    free(img->data);
+    if (img->data) free(img->data);
     delete img;
 }
 
-int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch) {
-    return llama_vision_encode_internal(ctx->clip, batch);
-}
-
-float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx) {
-    return ctx->clip.output.data();
-}
-
-int32_t llama_vision_n_patches(struct llama_context * ctx) {
+int32_t llama_img_n_tokens(struct llama_context * ctx, llama_img * img) {
+    GGML_UNUSED(img); // reserved for future usage
     return clip_n_patches(ctx->clip);
 }
 
+llama_batch_img llama_batch_img_init(int n_imgs) {
+    llama_batch_img batch;
+    batch.n_imgs = n_imgs;
+    if (n_imgs > 0) {
+        batch.imgs = (llama_img **)malloc(n_imgs*sizeof(llama_img *));
+        batch.pos  = (llama_pos * )malloc(n_imgs*sizeof(llama_pos  ));
+    }
+    return batch;
+}
+
+void llama_batch_img_free(llama_batch_img batch) {
+    if (batch.imgs) free(batch.imgs);
+    if (batch.pos ) free(batch.pos );
+}
+
+int32_t llama_encode_vision(struct llama_context * ctx, llama_batch_img batch) {
+    return llama_encode_vision_internal(ctx->clip, &batch);
+}
+
 //
 // model split
 //
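
Note: a sketch of how the buffered vision embeddings could be consumed, mirroring the commented-out experiment inside llama_decode() above. It reuses internal symbols from this patch (llama_context::clip, llama_decode_internal, llama_vision_clear_output) and assumes the llama_batch field order shown in that commented block; it is not part of the patch.

// feed the cached image embeddings as an embeddings-only batch, then drop them
static int32_t decode_buffered_vision_embd(llama_context & lctx) {
    clip_context & clip = lctx.clip;
    if (clip.out_embd.empty()) {
        return 0; // nothing buffered by llama_encode_vision()
    }
    const int32_t   n_eval = (int32_t) clip.out_pos.size(); // n_imgs * n_tokens_per_img
    const llama_pos n_past = clip.out_pos[0];               // first position the image occupies
    llama_batch ibatch = { n_eval, nullptr, clip.out_embd.data(),
                           nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
    const int32_t ret = llama_decode_internal(lctx, ibatch);
    llama_vision_clear_output(clip);
    return ret;
}
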