Xuan Son Nguyen 2024-10-03 20:15:36 +02:00
parent 49e7304cdf
commit a88c0d5f26
8 changed files with 230 additions and 105 deletions

View file

@@ -1474,6 +1474,27 @@ std::vector<llama_token> llama_tokenize(
return result;
}
// TODO: this function is hacky, needs to be improved
std::vector<llama_token> llama_tokenize_with_img(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special) {
static const std::string IMG_PLACEMENT = "<img_placement>";
std::vector<std::string> parts = string_split(text, IMG_PLACEMENT);
std::vector<llama_token> output;
for (const auto & part : parts) {
bool add_bos = &parts.front() == &part; // only the first part may get BOS
auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
output.insert(output.end(), tokens.begin(), tokens.end());
if (&parts.back() != &part) {
// add the image placeholder token between the two parts
output.push_back(TOKEN_IMG_PLACEMENT);
}
}
return output;
}
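For reference, a minimal usage sketch of the helper above (the prompt string and surrounding setup are illustrative, not part of this commit; it assumes a llama_context `ctx` created from a LLaVA-style model):

    // split on "<img_placement>" and tokenize each part; the placeholder itself
    // becomes the sentinel TOKEN_IMG_PLACEMENT rather than a real vocab token
    std::string prompt = "USER:<img_placement>\nwhat did you see?\nASSISTANT:";
    std::vector<llama_token> toks = llama_tokenize_with_img(ctx, prompt, /*add_special=*/true, /*parse_special=*/true);
    for (llama_token t : toks) {
        if (t == TOKEN_IMG_PLACEMENT) {
            // reserve positions here for the image embeddings before decoding
        }
    }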
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'

View file

@@ -378,6 +378,20 @@ static std::vector<T> string_split(const std::string & str, char delim) {
return values;
}
// split string by a `std::string delim` instead of `char delim`
static std::vector<std::string> string_split(std::string s, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t pos = 0;
std::string token;
while ((pos = s.find(delimiter)) != std::string::npos) {
token = s.substr(0, pos);
tokens.push_back(token);
s.erase(0, pos + delimiter.length());
}
tokens.push_back(s);
return tokens;
}
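A quick sanity check of this overload (a sketch; the expected values follow from the implementation above, including the empty part produced by a leading delimiter):

    std::vector<std::string> parts = string_split("a<img>b<img>c", "<img>");
    // parts == { "a", "b", "c" }
    parts = string_split("<img>tail", "<img>");
    // parts == { "", "tail" }   (an empty first part is kept)
    parts = string_split("no delimiter here", "<img>");
    // parts == { "no delimiter here" }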
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@@ -447,6 +461,17 @@ std::vector<llama_token> llama_tokenize(
bool add_special,
bool parse_special = false);
const llama_token TOKEN_IMG_PLACEMENT = -1000;
// tokenize with "placeholder" for image embedding tokens
// "<img_placement>" will be replaced with TOKEN_IMG_PLACEMENT
// TODO: this function is hacky, needs to be improved
std::vector<llama_token> llama_tokenize_with_img(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);
// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(

View file

@@ -31,7 +31,7 @@ llama_img * load_image_from_file(const char * fname) {
// printf("\n");
// }
// printf("\n");
llama_img * result = llama_img_alloc(nx, ny);
llama_img * result = llama_img_init(nx, ny);
memcpy(result->data, img, nx*ny*3);
stbi_image_free(img);
return result;

View file

@@ -15,7 +15,9 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
gpt_params params;
params.prompt = "Hello my name is";
//params.prompt = "Hello my name is";
params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
"USER:<img_placement>\nwhat did you see?\nASSISTANT:";
params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
@@ -62,52 +64,10 @@ int main(int argc, char ** argv) {
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// TODO: this is for testing; DELETE ME
int n_cur = 0;
params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
{
llama_img_batch ibatch;
ibatch.n_imgs = 1;
ibatch.imgs = (llama_img **) malloc(1024);
ibatch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
llama_vision_encode(ctx, &ibatch);
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int n_imgs = ibatch.n_imgs;
int n_embd = llama_n_embd(model);
int n_patches = llama_vision_n_patches(ctx);
printf("n_embd = %d ; n_patches = %d \n", n_embd, n_patches);
float * output_img = llama_vision_get_embeddings(ctx, 0);
n_cur += tokens.size();
llama_batch batch = llama_batch_init(512, 0, 1);
llama_batch_clear(batch);
for (auto t : tokens) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
// for (int k = 0; k < 10; k++) printf("%f\n", output_img[k]);
llama_batch_clear(batch);
batch = {int32_t(n_patches*n_imgs), nullptr, output_img, nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
n_cur += n_embd*n_imgs;
}
params.prompt = "\nwhat did you see?\nASSISTANT:";
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
@@ -127,25 +87,75 @@ int main(int argc, char ** argv) {
LOG("\n");
for (auto id : tokens_list) {
LOG("%s", llama_token_to_piece(ctx, id).c_str());
if (id == TOKEN_IMG_PLACEMENT) {
LOG("<img_placement>");
} else {
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
}
LOG("\n\n");
// load image
llama_batch_img img_batch = llama_batch_img_init(1);
img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
// create a llama_batch with size 512
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(512, 0, 1);
// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); i++) {
//llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
if (i == 0) continue;
llama_batch_add(batch, tokens_list[i], n_cur, { 0 }, false);
n_cur++;
int n_cur = 0;
int i_img = 0;
for (auto id : tokens_list) {
if (id == TOKEN_IMG_PLACEMENT) {
img_batch.pos[i_img] = n_cur;
n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
i_img++;
} else {
llama_batch_add(batch, id, n_cur, { 0 }, false);
printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
n_cur++;
}
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
if (llama_encode_vision(ctx, img_batch) != 0) {
LOG("%s: llama_encode_vision() failed\n", __func__);
return 1;
}
n_cur = 0;
{
auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
t1.insert(t1.begin(), 1); // prepend BOS (token id 1)
n_cur = 0;
llama_batch_clear(batch);
llama_batch_add(batch, 1, 0, { 0 }, false);
llama_decode(ctx, batch);
n_cur = t1.size();
llama_batch_clear(batch);
llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, }; // 576 = hard-coded number of image embedding tokens for this test
llama_decode(ctx, batch0);
n_cur = 0;
llama_batch_clear(batch);
for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
llama_decode(ctx, batch);
n_cur = t1.size() + 576;
llama_batch_clear(batch);
printf("pos %d\n", n_cur);
for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
batch.logits[batch.n_tokens - 1] = true;
}
if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
@@ -153,7 +163,6 @@ int main(int argc, char ** argv) {
// main loop
//int n_cur = batch.n_tokens;
int n_decode = 0;
const auto t_main_start = ggml_time_us();

View file

@@ -233,10 +233,11 @@ extern "C" {
} llama_img;
// Input data for llama_vision_decode
typedef struct llama_img_batch {
typedef struct llama_batch_img {
int32_t n_imgs;
llama_img ** imgs;
} llama_img_batch;
llama_pos * pos;
} llama_batch_img;
// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
@@ -894,16 +895,18 @@ extern "C" {
//
// create new RGB image for input
LLAMA_API llama_img * llama_img_alloc(int width, int height);
LLAMA_API void llama_img_free(llama_img * img);
LLAMA_API llama_img * llama_img_init(int width, int height);
LLAMA_API void llama_img_free(llama_img * img);
// encode image into embeddings
LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch);
// get number of tokens that an image occupies, used to determine the position in the batch
LLAMA_API int32_t llama_img_n_tokens(struct llama_context * ctx, llama_img * img);
// get output embeddings, to be put into language batch
LLAMA_API float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx);
// create new image batch
LLAMA_API llama_batch_img llama_batch_img_init(int n_imgs);
LLAMA_API void llama_batch_img_free(llama_batch_img batch);
LLAMA_API int32_t llama_vision_n_patches(struct llama_context * ctx);
// encode the input image batch
LLAMA_API int32_t llama_encode_vision(struct llama_context * ctx, llama_batch_img batch);
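A hedged sketch of the intended call sequence for this API (ctx/model setup omitted; load_image_from_file is the example helper shown earlier in this commit, and the file name and position are made up):

    llama_batch_img img_batch = llama_batch_img_init(1);
    img_batch.imgs[0] = load_image_from_file("image.jpg");
    img_batch.pos[0]  = 10; // sequence position where the image tokens will sit
    int32_t n_img_tokens = llama_img_n_tokens(ctx, img_batch.imgs[0]);
    if (llama_encode_vision(ctx, img_batch) != 0) {
        // handle error
    }
    // decode the surrounding text with llama_decode(), leaving positions
    // [10, 10 + n_img_tokens) for the image embeddings
    llama_img_free(img_batch.imgs[0]);
    llama_batch_img_free(img_batch);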
//
// Vocab
@@ -1237,6 +1240,7 @@ extern "C" {
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
LLAMA_API float * _test_get_img_embd(struct llama_context * ctx);
#ifdef __cplusplus
}

View file

@@ -78,16 +78,12 @@ int clip_n_patches(const clip_context & ctx) {
int clip_n_mmproj_embd(const clip_context & ctx) {
if (ctx.model->hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
return ctx.model->mm_b_b->ne[0];
return ctx.model->mm_2_b->ne[0];
} else {
GGML_ASSERT(false && "invalid proj type");
}
}
int clip_n_embd(const clip_context & ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx);
}
/**
* Selects the best resolution from a list of possible resolutions based on the original size.
*
@@ -575,12 +571,12 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
embeddings = ggml_get_rows(ctx0, embeddings, patches);
if (hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_a_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_a_b);
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_b_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_b_b);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
} else {
GGML_ASSERT(false && "unsupported proj type");
}
@@ -681,7 +677,9 @@ static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(ctx.sched, embeddings);
// copy the embeddings to the location passed by the user
output.resize(clip_n_embd(ctx));
size_t out_nbytes = clip_n_patches(ctx)*clip_n_mmproj_embd(ctx)*sizeof(float);
GGML_ASSERT(out_nbytes == ggml_nbytes(embeddings));
output.resize(out_nbytes);
ggml_backend_tensor_get_async(backend_embd, embeddings, output.data(), 0, ggml_nbytes(embeddings));
ggml_backend_sched_synchronize(ctx.sched);
@@ -731,15 +729,18 @@ static int32_t encode_image_with_clip(clip_context & ctx, const llama_img img, s
////////////////////////////////////////////////////////////////////////////////////////
// public API
int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch) {
int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch) {
if (batch->n_imgs == 0) {
return 0;
}
// TODO: batching is not working atm, should be fixed later
const int n_embd = clip_n_embd(ctx);
ctx.output.resize(n_embd * batch->n_imgs);
ctx.n_output = batch->n_imgs;
const int n_embd = clip_n_mmproj_embd(ctx);
const int n_tokens_per_img = clip_n_patches(ctx);
const int n_pos = n_tokens_per_img*batch->n_imgs;
ctx.out_embd.resize(n_embd*n_pos);
ctx.out_pos.resize(n_pos);
for (int i = 0; i < batch->n_imgs; i++) {
std::vector<float> output_single;
@@ -748,14 +749,23 @@ int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch
return status;
}
// copy output embeddings to result
for (int k = 0; k < n_embd; k++) {
ctx.output[n_embd*i + k] = output_single[k];
for (int k = 0; k < n_embd*n_tokens_per_img; k++) {
ctx.out_embd[n_embd*n_tokens_per_img*i + k] = output_single[k];
}
// fill positions for all output tokens
for (int p = 0; p < n_tokens_per_img; p++) {
ctx.out_pos[n_tokens_per_img*i + p] = batch->pos[i] + p;
}
}
return 0;
}
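To make the buffer layout above concrete, a worked example with assumed LLaVA-1.5 sizes (clip_n_mmproj_embd = 4096, clip_n_patches = 576; neither value is asserted by this hunk):

    // two images placed at sequence positions 5 and 700:
    //   out_embd.size() == 4096 * 576 * 2   (image 0 fills the first 4096*576 floats)
    //   out_pos         == { 5, 6, ..., 580, 700, 701, ..., 1275 }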
void llama_vision_clear_output(clip_context & ctx) {
ctx.out_embd.clear();
ctx.out_pos.clear();
}
////////////////////////////////////////////////////////////////////////////////////////
// for debugging
#ifndef NDEBUG

View file

@@ -95,10 +95,10 @@ struct clip_vision_model {
struct ggml_tensor * projection = NULL;
// LLaVA projection
struct ggml_tensor * mm_a_w = NULL;
struct ggml_tensor * mm_a_b = NULL;
struct ggml_tensor * mm_b_w = NULL;
struct ggml_tensor * mm_b_b = NULL;
struct ggml_tensor * mm_1_w = NULL;
struct ggml_tensor * mm_1_b = NULL;
struct ggml_tensor * mm_2_w = NULL;
struct ggml_tensor * mm_2_b = NULL;
struct ggml_tensor * image_newline = NULL;
};
@@ -110,15 +110,15 @@ struct clip_context {
const clip_vision_model * model;
// temporary output data
int n_output;
std::vector<float> output; // size == n_output * n_embd
// temporary output data, to be picked up by llama_decode()
std::vector<float> out_embd; // size == n_tokens * n_embd
std::vector<llama_pos> out_pos; // position of each token
};
mm_patch_merge mm_patch_merge_from_name(std::string & name);
clip_projector_type projector_type_from_name(std::string & name);
int clip_n_patches(const clip_context & ctx);
int clip_n_mmproj_embd(const clip_context & ctx);
int clip_n_embd(const clip_context & ctx);
int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch);
int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch);
void llama_vision_clear_output(clip_context & ctx);

View file

@@ -1600,7 +1600,7 @@ static const std::map<vision_arch, std::map<vision_tensor, std::string>> VISION_
{
VISION_ARCH_LLAVA,
{
{ VISION_TENSOR_MMPROJ, "v.mmproj" },
{ VISION_TENSOR_MMPROJ, "v.mmproj_%d" },
{ VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" },
@@ -8990,10 +8990,10 @@ static bool llm_load_tensors(
switch (vparams.arch) {
case VISION_ARCH_LLAVA:
{
model.clip.mm_a_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
model.clip.mm_a_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff});
model.clip.mm_b_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
model.clip.mm_b_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff});
model.clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
model.clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff});
model.clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
model.clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff});
model.clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_embd});
model.clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd});
@@ -9001,8 +9001,8 @@ static bool llm_load_tensors(
model.clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd});
model.clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd});
// model.clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd});
// model.clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd});
model.clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
model.clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
@@ -20110,6 +20110,8 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
void llama_kv_cache_clear(struct llama_context * ctx) {
llama_kv_cache_clear(ctx->kv_self);
// clear vision embeddings output
llama_vision_clear_output(ctx->clip);
}
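A short sketch of the contract this adds (names from this commit; error handling omitted): clearing the KV cache now also drops any pending image embeddings, so they must be re-encoded before the next decode.

    llama_encode_vision(ctx, img_batch);   // fills the pending out_embd / out_pos buffers
    llama_kv_cache_clear(ctx);             // now also calls llama_vision_clear_output()
    // the pending image embeddings are gone; call llama_encode_vision() again if needed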
bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
@@ -21226,9 +21228,48 @@ int32_t llama_encode(
return ret;
}
float * _test_get_img_embd(struct llama_context * ctx) { return ctx->clip.out_embd.data(); }
int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch) {
// hacky vision implementation, for testing only
if (!ctx->clip.out_embd.empty()) {
// int8_t * logits = new int8_t [ctx->clip.out_pos.size()];
// int32_t * n_seq_id = new int32_t [ctx->clip.out_pos.size()];
// llama_seq_id ** seq_id = new llama_seq_id *[ctx->clip.out_pos.size()];
// llama_seq_id seq_id_0 = 0;
// printf("out_pos %d\n", ctx->clip.out_pos.size());
// llama_batch ibatch = {
// /*n_tokens =*/ static_cast<int32_t>(ctx->clip.out_pos.size()),
// /*tokens =*/ nullptr,
// /*embd =*/ ctx->clip.out_embd.data(),
// /*pos =*/ ctx->clip.out_pos.data(),
// /*n_seq_id =*/ n_seq_id,
// /*seq_id =*/ seq_id,
// /*logits =*/ logits,
// /*all_pos_0 =*/ 0,
// /*all_pos_1 =*/ 0,
// /*all_seq_id =*/ 0,
// };
// for (size_t i = 0; i < ctx->clip.out_pos.size(); i++) {
// ibatch.n_seq_id[i] = 1;
// ibatch.seq_id [i] = &seq_id_0;
// ibatch.logits [i] = 0;
// }
// llama_decode_internal(*ctx, ibatch);
// delete[] logits;
// delete[] n_seq_id;
// delete[] seq_id;
// llama_vision_clear_output(ctx->clip);
//int n_eval = ctx->clip.out_pos.size();
//int n_past = ctx->clip.out_pos[0];
//printf("n_eval %d, n_past %d\n", n_eval, n_past);
//llama_batch ibatch = {int32_t(n_eval), nullptr, ctx->clip.out_embd.data(), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
//llama_decode_internal(*ctx, ibatch);
//llama_vision_clear_output(ctx->clip);
}
const int ret = llama_decode_internal(*ctx, batch);
if (ret < 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
@@ -21808,30 +21849,45 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
// vision
//
llama_img * llama_img_alloc(int width, int height) {
llama_img * llama_img_init(int width, int height) {
llama_img * img = new llama_img();
img->nx = width;
img->ny = height;
img->data = (unsigned char *)malloc(width*height*3);
if (width > 0 && height > 0) {
img->data = (unsigned char *)malloc(width*height*3);
}
return img;
}
void llama_img_free(llama_img * img) {
free(img->data);
if (img->data) free(img->data);
delete img;
}
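A minimal allocation round-trip with the renamed helpers (a sketch; the packed-RGB size width*height*3 matches the memcpy in the image loader earlier in this commit, and `pixels` is an assumed buffer of decoded RGB data):

    llama_img * img = llama_img_init(336, 336);
    // img->data now has room for 336*336*3 bytes of packed RGB
    memcpy(img->data, pixels, 336*336*3);
    llama_img_free(img);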
int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch) {
return llama_vision_encode_internal(ctx->clip, batch);
}
float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx) {
return ctx->clip.output.data();
}
int32_t llama_vision_n_patches(struct llama_context * ctx) {
int32_t llama_img_n_tokens(struct llama_context * ctx, llama_img * img) {
GGML_UNUSED(img); // reserved for future usage
return clip_n_patches(ctx->clip);
}
llama_batch_img llama_batch_img_init(int n_imgs) {
llama_batch_img batch;
batch.n_imgs = n_imgs;
if (n_imgs > 0) {
batch.imgs = (llama_img **)malloc(n_imgs*sizeof(llama_img *));
batch.pos = (llama_pos * )malloc(n_imgs*sizeof(llama_pos ));
}
return batch;
}
void llama_batch_img_free(llama_batch_img batch) {
if (batch.imgs) free(batch.imgs);
if (batch.pos ) free(batch.pos );
}
int32_t llama_encode_vision(struct llama_context * ctx, llama_batch_img batch) {
return llama_encode_vision_internal(ctx->clip, &batch);
}
//
// model split
//