From a88c0d5f2626bc65395b678e8758eb855ffc8f67 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 3 Oct 2024 20:15:36 +0200
Subject: [PATCH] wip

---
 common/common.cpp          |  21 +++++++
 common/common.h            |  25 +++++++++
 common/vision.cpp          |   2 +-
 examples/simple/simple.cpp | 111 ++++++++++++++++++++-----------
 include/llama.h            |  22 +++++---
 src/llama-vision.cpp       |  42 ++++++++------
 src/llama-vision.h         |  18 +++---
 src/llama.cpp              |  94 ++++++++++++++++++++++++-------
 8 files changed, 230 insertions(+), 105 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 8d0ed4f95..921928d97 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1474,6 +1474,27 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
+// TODO: this function is hacky, need to be improved
+std::vector<llama_token> llama_tokenize_with_img(
+    const struct llama_context * ctx,
+    const std::string & text,
+    bool   add_special,
+    bool   parse_special) {
+    static const std::string IMG_PLACEMENT = "<img_placement>";
+    std::vector<std::string> parts = string_split(text, IMG_PLACEMENT);
+    std::vector<llama_token> output;
+    for (const auto & part : parts) {
+        bool add_bos = &parts.front() == &part;
+        auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
+        output.insert(output.end(), tokens.begin(), tokens.end());
+        if (&parts.back() != &part) {
+            // add image token to middle of 2 parts
+            output.push_back(TOKEN_IMG_PLACEMENT);
+        }
+    }
+    return output;
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
diff --git a/common/common.h b/common/common.h
index cb87c4479..e6fa1c2d4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -378,6 +378,20 @@ static std::vector<std::string> string_split(const std::string & str, char delim) {
     return values;
 }
 
+// split string by a `std::string delim` instead of `char delim`
+static std::vector<std::string> string_split(std::string s, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    size_t pos = 0;
+    std::string token;
+    while ((pos = s.find(delimiter)) != std::string::npos) {
+        token = s.substr(0, pos);
+        tokens.push_back(token);
+        s.erase(0, pos + delimiter.length());
+    }
+    tokens.push_back(s);
+    return tokens;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 
 void string_process_escapes(std::string & input);
@@ -447,6 +461,17 @@ std::vector<llama_token> llama_tokenize(
     bool   add_special,
     bool   parse_special = false);
 
+const llama_token TOKEN_IMG_PLACEMENT = -1000;
+
+// tokenize with "placeholder" for image embedding tokens
+// "<img_placement>" will be replaced with TOKEN_IMG_PLACEMENT
+// TODO: this function is hacky, need to be improved
+std::vector<llama_token> llama_tokenize_with_img(
+    const struct llama_context * ctx,
+    const std::string & text,
+    bool   add_special,
+    bool   parse_special = false);
+
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
diff --git a/common/vision.cpp b/common/vision.cpp
index 5b003654a..2b37ded16 100644
--- a/common/vision.cpp
+++ b/common/vision.cpp
@@ -31,7 +31,7 @@ llama_img * load_image_from_file(const char * fname) {
     //     printf("\n");
     // }
     // printf("\n");
-    llama_img * result = llama_img_alloc(nx, ny);
+    llama_img * result = llama_img_init(nx, ny);
     memcpy(result->data, img, nx*ny*3);
     stbi_image_free(img);
     return result;
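
Note: the following is a minimal usage sketch, not part of the patch. It shows how the new llama_tokenize_with_img() helper is meant to be consumed, assuming an already-initialized llama_context and the "<img_placement>" literal from common.cpp above.

// sketch only; needs common.h (llama_tokenize_with_img, TOKEN_IMG_PLACEMENT)
static void demo_tokenize_with_img(llama_context * ctx) {
    const std::string prompt = "USER: <img_placement>\nwhat did you see?\nASSISTANT:";
    std::vector<llama_token> toks = llama_tokenize_with_img(ctx, prompt, /*add_special =*/ true);
    for (llama_token t : toks) {
        if (t == TOKEN_IMG_PLACEMENT) {
            // an image will occupy llama_img_n_tokens(ctx, img) positions at this point;
            // the caller injects the image embeddings here instead of a text token
        }
    }
}
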
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 50f2ff4ea..0a28f9bf6 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -15,7 +15,9 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    params.prompt = "Hello my name is";
+    //params.prompt = "Hello my name is";
+    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
+        "USER:<img_placement>\nwhat did you see?\nASSISTANT:";
     params.n_predict = 32;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
@@ -62,52 +64,10 @@ int main(int argc, char ** argv) {
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-
-
-
-    // TODO: this is for testing; DELETE ME
-    int n_cur = 0;
-    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
-    {
-        llama_img_batch ibatch;
-        ibatch.n_imgs = 1;
-        ibatch.imgs = (llama_img **) malloc(1024);
-        ibatch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
-        llama_vision_encode(ctx, &ibatch);
-
-        auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-        int n_imgs    = ibatch.n_imgs;
-        int n_embd    = llama_n_embd(model);
-        int n_patches = llama_vision_n_patches(ctx);
-        printf("n_embd = %d ; n_patches = %d \n", n_embd, n_patches);
-        float * output_img = llama_vision_get_embeddings(ctx, 0);
-
-        n_cur += tokens.size();
-        llama_batch batch = llama_batch_init(512, 0, 1);
-        llama_batch_clear(batch);
-        for (auto t : tokens) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
-        if (llama_decode(ctx, batch) != 0) {
-            LOG("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-
-        // for (int k = 0; k < 10; k++) printf("%f\n", output_img[k]);
-        llama_batch_clear(batch);
-        batch = {int32_t(n_patches*n_imgs), nullptr, output_img, nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
-        if (llama_decode(ctx, batch) != 0) {
-            LOG("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-        n_cur += n_embd*n_imgs;
-    }
-    params.prompt = "\nwhat did you see?\nASSISTANT:";
-
-
-
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
 
     const int n_ctx    = llama_n_ctx(ctx);
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
@@ -127,25 +87,75 @@ int main(int argc, char ** argv) {
 
     LOG("\n");
     for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        if (id == TOKEN_IMG_PLACEMENT) {
+            LOG("<img_placement>");
+        } else {
+            LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        }
     }
+    LOG("\n\n");
+
+    // load image
+    llama_batch_img img_batch = llama_batch_img_init(1);
+    img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
 
     // create a llama_batch with size 512
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(512, 0, 1);
 
     // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        //llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
-        if (i == 0) continue;
-        llama_batch_add(batch, tokens_list[i], n_cur, { 0 }, false);
-        n_cur++;
+    int n_cur = 0;
+    int i_img = 0;
+    for (auto id : tokens_list) {
+        if (id == TOKEN_IMG_PLACEMENT) {
+            img_batch.pos[i_img] = n_cur;
+            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
+            i_img++;
+        } else {
+            llama_batch_add(batch, id, n_cur, { 0 }, false);
+            printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
+            n_cur++;
+        }
     }
 
     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
+    if (llama_encode_vision(ctx, img_batch) != 0) {
+        LOG("%s: llama_encode_vision() failed\n", __func__);
+        return 1;
+    }
+
+    n_cur = 0;
+    {
+        auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
+        auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
+        t1.insert(t1.begin(), 1);
+
+        n_cur = 0;
+        llama_batch_clear(batch);
+        llama_batch_add(batch, 1, 0, { 0 }, false);
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size();
+        llama_batch_clear(batch);
+        llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
+        llama_decode(ctx, batch0);
+
+        n_cur = 0;
+        llama_batch_clear(batch);
+        for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size() + 576;
+        llama_batch_clear(batch);
+        printf("pos %d\n", n_cur);
+        for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        batch.logits[batch.n_tokens - 1] = true;
+    }
+
     if (llama_decode(ctx, batch) != 0) {
         LOG("%s: llama_decode() failed\n", __func__);
         return 1;
@@ -153,7 +163,6 @@ int main(int argc, char ** argv) {
 
     // main loop
 
-    //int n_cur = batch.n_tokens;
     int n_decode = 0;
 
     const auto t_main_start = ggml_time_us();
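
Note on the hard-coded 576 in the test block above: assuming a LLaVA-1.5-style CLIP ViT-L/14 encoder with 336x336 input (an assumption, not stated in this patch), the patch count works out as below. In the non-test path this value should come from llama_img_n_tokens() rather than a literal.

// assumption: 336x336 input images, 14x14 pixel patches (CLIP ViT-L/14 as used by LLaVA-1.5)
constexpr int image_size = 336;
constexpr int patch_size = 14;
constexpr int n_patches  = (image_size / patch_size) * (image_size / patch_size); // 24 * 24 = 576
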
diff --git a/include/llama.h b/include/llama.h
index ed43796b1..e66dd0da1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -233,10 +233,11 @@ extern "C" {
     } llama_img;
 
     // Input data for llama_vision_decode
-    typedef struct llama_img_batch {
+    typedef struct llama_batch_img {
         int32_t      n_imgs;
         llama_img ** imgs;
-    } llama_img_batch;
+        llama_pos *  pos;
+    } llama_batch_img;
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
@@ -894,16 +895,18 @@ extern "C" {
     //
 
     // create new RGB image for input
-    LLAMA_API llama_img * llama_img_alloc(int width, int height);
-    LLAMA_API void        llama_img_free(llama_img * img);
+    LLAMA_API llama_img * llama_img_init(int width, int height);
+    LLAMA_API void        llama_img_free(llama_img * img);
 
-    // encode image into embeddings
-    LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch);
+    // get number of tokens that an image occupies, used to determine the position in the batch
+    LLAMA_API int32_t llama_img_n_tokens(struct llama_context * ctx, llama_img * img);
 
-    // get output embeddings, to be put into language batch
-    LLAMA_API float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx);
+    // create new image batch
+    LLAMA_API llama_batch_img llama_batch_img_init(int n_imgs);
+    LLAMA_API void            llama_batch_img_free(llama_batch_img batch);
 
-    LLAMA_API int32_t llama_vision_n_patches(struct llama_context * ctx);
+    // encode the input image batch
+    LLAMA_API int32_t llama_encode_vision(struct llama_context * ctx, llama_batch_img batch);
 
     //
     // Vocab
@@ -1237,6 +1240,7 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
+    LLAMA_API float * _test_get_img_embd(struct llama_context * ctx);
 
 #ifdef __cplusplus
 }
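
Note: a minimal sketch (not part of the patch) of the new image-batch API declared above; it assumes ctx has a vision model loaded and img is an already-decoded llama_img, e.g. from load_image_from_file() in common/vision.cpp.

// returns the number of positions the image occupies, or -1 on failure (sketch only)
static int32_t demo_encode_one_image(llama_context * ctx, llama_img * img, llama_pos pos) {
    llama_batch_img ib = llama_batch_img_init(1);
    ib.imgs[0] = img;  // image to encode
    ib.pos [0] = pos;  // first sequence position the image embeddings will occupy
    const int32_t n_img_tokens = llama_img_n_tokens(ctx, img);
    const int32_t status = llama_encode_vision(ctx, ib); // embeddings are buffered inside ctx
    llama_batch_img_free(ib);
    return status == 0 ? n_img_tokens : -1;
}
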
diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp
index 2950579ee..93f9e4b52 100644
--- a/src/llama-vision.cpp
+++ b/src/llama-vision.cpp
@@ -78,16 +78,12 @@ int clip_n_patches(const clip_context & ctx) {
 
 int clip_n_mmproj_embd(const clip_context & ctx) {
     if (ctx.model->hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
-        return ctx.model->mm_b_b->ne[0];
+        return ctx.model->mm_2_b->ne[0];
     } else {
         GGML_ASSERT(false && "invalid proj type");
     }
 }
 
-int clip_n_embd(const clip_context & ctx) {
-    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx);
-}
-
 /**
  * Selects the best resolution from a list of possible resolutions based on the original size.
  *
@@ -575,12 +571,12 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
     embeddings = ggml_get_rows(ctx0, embeddings, patches);
 
     if (hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
-        embeddings = ggml_mul_mat(ctx0, model.mm_a_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_a_b);
+        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
 
         embeddings = ggml_gelu(ctx0, embeddings);
-        embeddings = ggml_mul_mat(ctx0, model.mm_b_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_b_b);
+        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
     } else {
         GGML_ASSERT(false && "unsupported proj type");
     }
@@ -681,7 +677,9 @@ static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_
     ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(ctx.sched, embeddings);
 
     // copy the embeddings to the location passed by the user
-    output.resize(clip_n_embd(ctx));
+    size_t out_nbytes = clip_n_patches(ctx)*clip_n_mmproj_embd(ctx)*sizeof(float);
+    GGML_ASSERT(out_nbytes == ggml_nbytes(embeddings));
+    output.resize(out_nbytes);
     ggml_backend_tensor_get_async(backend_embd, embeddings, output.data(), 0, ggml_nbytes(embeddings));
 
     ggml_backend_sched_synchronize(ctx.sched);
@@ -731,15 +729,18 @@ static int32_t encode_image_with_clip(clip_context & ctx, const llama_img img, s
 ////////////////////////////////////////////////////////////////////////////////////////
 // public API
 
-int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch) {
+int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch) {
     if (batch->n_imgs == 0) {
         return 0;
     }
 
     // TODO: batching is not working atm, should be fixed later
-    const int n_embd = clip_n_embd(ctx);
-    ctx.output.resize(n_embd * batch->n_imgs);
-    ctx.n_output = batch->n_imgs;
+    const int n_embd           = clip_n_mmproj_embd(ctx);
+    const int n_tokens_per_img = clip_n_patches(ctx);
+    const int n_pos            = n_tokens_per_img*batch->n_imgs;
+
+    ctx.out_embd.resize(n_embd*n_pos);
+    ctx.out_pos.resize(n_pos);
 
     for (int i = 0; i < batch->n_imgs; i++) {
         std::vector<float> output_single;
@@ -748,14 +749,23 @@ int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch
             return status;
         }
         // copy output embeddings to result
-        for (int k = 0; k < n_embd; k++) {
-            ctx.output[n_embd*i + k] = output_single[k];
+        for (int k = 0; k < n_embd*n_tokens_per_img; k++) {
+            ctx.out_embd[n_embd*n_tokens_per_img*i + k] = output_single[k];
+        }
+        // fill position for all output tokens
+        for (int p = 0; p < n_tokens_per_img; p++) {
+            ctx.out_pos[n_tokens_per_img*i + p] = batch->pos[i] + p;
         }
     }
 
     return 0;
 }
 
+void llama_vision_clear_output(clip_context & ctx) {
+    ctx.out_embd.clear();
+    ctx.out_pos.clear();
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////
 // for debugging
 #ifndef NDEBUG
diff --git a/src/llama-vision.h b/src/llama-vision.h
index 1b2dbf5a4..950f497c8 100644
--- a/src/llama-vision.h
+++ b/src/llama-vision.h
@@ -95,10 +95,10 @@ struct clip_vision_model {
     struct ggml_tensor * projection = NULL;
 
     // LLaVA projection
-    struct ggml_tensor * mm_a_w = NULL;
-    struct ggml_tensor * mm_a_b = NULL;
-    struct ggml_tensor * mm_b_w = NULL;
-    struct ggml_tensor * mm_b_b = NULL;
+    struct ggml_tensor * mm_1_w = NULL;
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
 
     struct ggml_tensor * image_newline = NULL;
 };
@@ -110,15 +110,15 @@ struct clip_context {
 
     const clip_vision_model * model;
 
-    // temporary output data
-    int n_output;
-    std::vector<float> output; // size == n_output * n_embd
+    // temporary output data, to be picked up by llama_decode()
+    std::vector<float>     out_embd; // size == n_tokens * n_embd
+    std::vector<llama_pos> out_pos;  // position of each token
 };
 
 mm_patch_merge mm_patch_merge_from_name(std::string & name);
 clip_projector_type projector_type_from_name(std::string & name);
 int clip_n_patches(const clip_context & ctx);
 int clip_n_mmproj_embd(const clip_context & ctx);
-int clip_n_embd(const clip_context & ctx);
 
-int32_t llama_vision_encode_internal(clip_context & ctx, llama_img_batch * batch);
+int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch);
+void llama_vision_clear_output(clip_context & ctx);
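
Note: for clarity, this is the buffer layout that llama_encode_vision_internal() above writes into clip_context::out_embd / out_pos, expressed as a small accessor. Sketch only, not part of the patch; it uses the internal clip_context type from llama-vision.h.

// for image i and patch token t (0 <= t < n_tokens_per_img):
//   out_pos [i*n_tokens_per_img + t]                        == batch.pos[i] + t
//   out_embd[(i*n_tokens_per_img + t)*n_embd ... + n_embd)  == projected embedding of patch t of image i
static inline const float * clip_token_embd(const clip_context & ctx, int n_embd, int n_tokens_per_img, int i, int t) {
    return ctx.out_embd.data() + ((size_t) i*n_tokens_per_img + t)*n_embd;
}
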
diff --git a/src/llama.cpp b/src/llama.cpp
index 0dd60cd81..b1b44aaca 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1600,7 +1600,7 @@ static const std::map<vision_arch, std::map<vision_tensor, const char *>> VISION_
     {
         VISION_ARCH_LLAVA,
         {
-            { VISION_TENSOR_MMPROJ,         "v.mmproj" },
+            { VISION_TENSOR_MMPROJ,         "v.mmproj_%d" },
            { VISION_TENSOR_ENC_EMBD_CLS,   "v.enc.embd.cls" },
            { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
            { VISION_TENSOR_ENC_EMBD_POS,   "v.enc.embd.pos" },
@@ -8990,10 +8990,10 @@ static bool llm_load_tensors(
                 switch (vparams.arch) {
                     case VISION_ARCH_LLAVA:
                         {
-                            model.clip.mm_a_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
-                            model.clip.mm_a_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias"  , 1), {n_ff});
-                            model.clip.mm_b_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
-                            model.clip.mm_b_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias"  , 2), {n_ff});
+                            model.clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
+                            model.clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias"  , 1), {n_ff});
+                            model.clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
+                            model.clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias"  , 2), {n_ff});
 
                             model.clip.class_embedding  = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS  ), {n_embd});
                             model.clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd});
@@ -9001,8 +9001,8 @@ static bool llm_load_tensors(
                             model.clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd});
                             model.clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias"  ), {n_embd});
 
-                            // model.clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd});
-                            // model.clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias"  ), {n_embd});
+                            model.clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            model.clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias"  ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                             for (int i = 0; i < n_layer; ++i) {
                                 ggml_context * ctx_layer = ctx_for_layer(i);
@@ -20110,6 +20110,8 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
 
 void llama_kv_cache_clear(struct llama_context * ctx) {
     llama_kv_cache_clear(ctx->kv_self);
+    // clear vision embeddings output
+    llama_vision_clear_output(ctx->clip);
 }
 
 bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
@@ -21226,9 +21228,48 @@ int32_t llama_encode(
     return ret;
 }
 
+float * _test_get_img_embd(struct llama_context * ctx) { return ctx->clip.out_embd.data(); }
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
+    // hacky vision implementation, for testing only
+    if (!ctx->clip.out_embd.empty()) {
+        // int8_t       *  logits   = new int8_t        [ctx->clip.out_pos.size()];
+        // int32_t      *  n_seq_id = new int32_t       [ctx->clip.out_pos.size()];
+        // llama_seq_id ** seq_id   = new llama_seq_id *[ctx->clip.out_pos.size()];
+        // llama_seq_id seq_id_0 = 0;
+        // printf("out_pos %d\n", ctx->clip.out_pos.size());
+        // llama_batch ibatch = {
+        //     /*n_tokens   =*/ static_cast<int32_t>(ctx->clip.out_pos.size()),
+        //     /*tokens     =*/ nullptr,
+        //     /*embd       =*/ ctx->clip.out_embd.data(),
+        //     /*pos        =*/ ctx->clip.out_pos.data(),
+        //     /*n_seq_id   =*/ n_seq_id,
+        //     /*seq_id     =*/ seq_id,
+        //     /*logits     =*/ logits,
+        //     /*all_pos_0  =*/ 0,
+        //     /*all_pos_1  =*/ 0,
+        //     /*all_seq_id =*/ 0,
+        // };
+        // for (size_t i = 0; i < ctx->clip.out_pos.size(); i++) {
+        //     ibatch.n_seq_id[i] = 1;
+        //     ibatch.seq_id  [i] = &seq_id_0;
+        //     ibatch.logits  [i] = 0;
+        // }
+        // llama_decode_internal(*ctx, ibatch);
+        // delete[] logits;
+        // delete[] n_seq_id;
+        // delete[] seq_id;
+        // llama_vision_clear_output(ctx->clip);
+
+        //int n_eval = ctx->clip.out_pos.size();
+        //int n_past = ctx->clip.out_pos[0];
+        //printf("n_eval %d, n_past %d\n", n_eval, n_past);
+        //llama_batch ibatch = {int32_t(n_eval), nullptr, ctx->clip.out_embd.data(), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
+        //llama_decode_internal(*ctx, ibatch);
+        //llama_vision_clear_output(ctx->clip);
+    }
+
     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
@@ -21808,30 +21849,45 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
 // vision
 //
 
-llama_img * llama_img_alloc(int width, int height) {
+llama_img * llama_img_init(int width, int height) {
     llama_img * img = new llama_img();
     img->nx = width;
     img->ny = height;
-    img->data = (unsigned char *)malloc(width*height*3);
+    if (width > 0 && height > 0) {
+        img->data = (unsigned char *)malloc(width*height*3);
+    }
     return img;
 }
+
 void llama_img_free(llama_img * img) {
-    free(img->data);
+    if (img->data) free(img->data);
     delete img;
 }
 
-int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch) {
-    return llama_vision_encode_internal(ctx->clip, batch);
-}
-
-float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx) {
-    return ctx->clip.output.data();
-}
-
-int32_t llama_vision_n_patches(struct llama_context * ctx) {
+int32_t llama_img_n_tokens(struct llama_context * ctx, llama_img * img) {
+    GGML_UNUSED(img); // reserved for future usage
     return clip_n_patches(ctx->clip);
 }
 
+llama_batch_img llama_batch_img_init(int n_imgs) {
+    llama_batch_img batch;
+    batch.n_imgs = n_imgs;
+    if (n_imgs > 0) {
+        batch.imgs = (llama_img **)malloc(n_imgs*sizeof(llama_img *));
+        batch.pos  = (llama_pos * )malloc(n_imgs*sizeof(llama_pos  ));
+    }
+    return batch;
+}
+
+void llama_batch_img_free(llama_batch_img batch) {
+    if (batch.imgs) free(batch.imgs);
+    if (batch.pos ) free(batch.pos );
+}
+
+int32_t llama_encode_vision(struct llama_context * ctx, llama_batch_img batch) {
+    return llama_encode_vision_internal(ctx->clip, &batch);
+}
+
 //
 // model split
 //
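
Note: a sketch of how the buffered vision embeddings could be consumed, mirroring the commented-out experiment inside llama_decode() above. It reuses internal symbols from this patch (llama_context::clip, llama_decode_internal, llama_vision_clear_output) and assumes the llama_batch field order shown in that commented block; it is not part of the patch.

// feed the cached image embeddings as an embeddings-only batch, then drop them
static int32_t decode_buffered_vision_embd(llama_context & lctx) {
    clip_context & clip = lctx.clip;
    if (clip.out_embd.empty()) {
        return 0; // nothing buffered by llama_encode_vision()
    }
    const int32_t   n_eval = (int32_t) clip.out_pos.size(); // n_imgs * n_tokens_per_img
    const llama_pos n_past = clip.out_pos[0];               // first position the image occupies
    llama_batch ibatch = { n_eval, nullptr, clip.out_embd.data(),
                           nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
    const int32_t ret = llama_decode_internal(lctx, ibatch);
    llama_vision_clear_output(clip);
    return ret;
}
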