From 997dd1fdf7c367dcfc92758d5b4f61de7546125f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 20:40:01 +0200 Subject: [PATCH] llava : style --- examples/llava/clip.cpp | 230 ++++++++++++++++++++------------------- examples/llava/clip.h | 34 +++--- examples/llava/llava.cpp | 11 +- examples/llava/llava.h | 2 - 4 files changed, 140 insertions(+), 137 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index a7562eb47..1cdb2be74 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -70,29 +70,29 @@ static std::string format(const char * fmt, ...) { // key constants // -#define KEY_FTYPE "general.file_type" -#define KEY_NAME "general.name" -#define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_USE_GELU "clip.use_gelu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" -#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" -#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" @@ -100,26 +100,26 @@ static std::string format(const char * fmt, ...) { // tensor name constants // -#define TN_TOKEN_EMBD "%s.token_embd.weight" -#define TN_POS_EMBD "%s.position_embd.weight" -#define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" -#define TN_ATTN_K "%s.blk.%d.attn_k.%s" -#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" -#define TN_ATTN_V "%s.blk.%d.attn_v.%s" -#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" -#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" -#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" -#define TN_LN_1 "%s.blk.%d.ln1.%s" -#define TN_LN_2 "%s.blk.%d.ln2.%s" -#define TN_LN_PRE "%s.pre_ln.%s" -#define TN_LN_POST "%s.post_ln.%s" -#define TN_TEXT_PROJ "text_projection.weight" -#define TN_VIS_PROJ "visual_projection.weight" -#define TN_LLAVA_PROJ "mm.%d.%s" -#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_TOKEN_EMBD "%s.token_embd.weight" +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" +#define TN_LN_2 "%s.blk.%d.ln2.%s" +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_TEXT_PROJ "text_projection.weight" +#define TN_VIS_PROJ "visual_projection.weight" +#define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" -#define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_IMAGE_NEWLINE "model.image_newline" enum projector_type { @@ -130,8 +130,8 @@ enum projector_type { }; static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, }; @@ -191,7 +191,6 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int } } - static void replace_all(std::string & s, const std::string & search, const std::string & replace) { std::string result; for (size_t pos = 0; ; pos += search.length()) { @@ -279,7 +278,6 @@ struct clip_hparams { int32_t image_grid_pinpoints[32]; int32_t image_crop_resolution; - }; struct clip_layer { @@ -333,6 +331,7 @@ struct clip_vision_model { struct ggml_tensor * mm_0_b = NULL; struct ggml_tensor * mm_2_w = NULL; struct ggml_tensor * mm_2_b = NULL; + struct ggml_tensor * image_newline = NULL; // Yi type models with mlp+normalization projection @@ -389,9 +388,10 @@ struct clip_ctx { std::vector buf_compute_meta; // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = NULL; + ggml_backend_buffer_t params_buffer = NULL; ggml_backend_buffer_t compute_buffer = NULL; - ggml_backend_t backend = NULL; + + ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; }; @@ -404,19 +404,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); - const int num_positions = num_patches + 1; - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - //const int n_intermediate = hparams.n_intermediate; - //const int projection_dim = hparams.projection_dim; - const float eps = hparams.eps; - int batch_size = imgs->size; + const int num_positions = num_patches + 1; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + const int batch_size = imgs->size; + if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); } @@ -816,10 +816,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (idx != -1) { const std::string proj_type = gguf_get_val_str(ctx, idx); new_clip->proj_type = clip_projector_type_from_string(proj_type); - } - else { + } else { new_clip->proj_type = PROJECTOR_TYPE_MLP; } + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; @@ -944,6 +944,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + try { int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); int n = gguf_get_arr_n(ctx, idx); @@ -956,23 +957,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & e) { hparams.image_grid_pinpoints[0]=0; } + try { int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); } catch (std::runtime_error & e) { strcpy(hparams.mm_patch_merge_type, "flat"); } + try { hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 - } - catch(const std::exception& e) { + } catch(const std::exception& e) { hparams.image_crop_resolution = hparams.image_size; } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); + for (int i = 0; i < 3; ++i) { new_clip->image_mean[i] = mean_data[i]; new_clip->image_std[i] = std_data[i]; @@ -998,16 +1002,14 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } - try - { + + try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); - } - catch(const std::exception& e) - { + } catch(const std::exception& e) { fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); } @@ -1039,40 +1041,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & e) { } - } - else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection - vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); - vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); - vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); - vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); - vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); - vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); - vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); - vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); - vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); - vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); - vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); - vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); - vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); - vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); - vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); - vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); - } - else { + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } vision_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { auto & layer = vision_model.layers[il]; layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight")); @@ -1412,7 +1413,6 @@ static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &imag image_output = std::move(padded_image); } - /** * Selects the best resolution from a list of possible resolutions based on the original size. * @@ -1446,7 +1446,6 @@ static std::pair select_best_resolution(const std::pair & or return best_fit; } - static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { std::vector patches; int width = image.nx; @@ -1472,7 +1471,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im #ifdef CLIP_DEBUG_FUNCTIONS // debug function to convert f32 to u8 -void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { dst.nx = src.nx; dst.ny = src.ny; dst.buf.resize(3 * src.nx * src.ny); @@ -1532,8 +1531,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } } } else { - if (params.image_grid_pinpoints[0] != 0) - { + if (params.image_grid_pinpoints[0] != 0) { // "spatial_unpad" with "anyres" processing for llava-1.6 std::vector> possible_resolutions; for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { @@ -1656,6 +1654,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli return true; } +ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx_data); gguf_free(ctx->ctx_gguf); @@ -1687,6 +1689,18 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_grid_pinpoints; } +int clip_n_patches(const struct clip_ctx * ctx) { + const auto & params = ctx->vision_model.hparams; + + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + n_patches /= 4; + } + + return n_patches; +} + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1706,7 +1720,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } int batch_size = imgs->size; - if(ctx->has_llava_projector) { + if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } @@ -1717,9 +1731,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // set inputs const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_positions = num_patches + 1; { @@ -1794,11 +1809,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + return true; } bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { - ggml_type type = GGML_TYPE_Q4_1; assert(itype < GGML_TYPE_COUNT); @@ -1987,26 +2002,13 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_LDP) { return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; } - else if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { return ctx->vision_model.mm_2_b->ne[0]; - } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + } + if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { return ctx->vision_model.mm_3_b->ne[0]; } - else { - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); - } -} -ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->vision_model.image_newline; -} - -int clip_n_patches(const struct clip_ctx * ctx) { - auto & params = ctx->vision_model.hparams; - int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - n_patches /= 4; - } - return n_patches; + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 5e0b5c64b..cd9a4022f 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -26,7 +26,17 @@ extern "C" { struct clip_ctx; -CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; +}; + +CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API void clip_free(struct clip_ctx * ctx); @@ -45,33 +55,21 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); -struct clip_image_u8_batch { - struct clip_image_u8 * data; - size_t size; -}; - -struct clip_image_f32_batch { - struct clip_image_f32 * data; - size_t size; -}; -CLIP_API struct clip_image_grid_shape { - int first; - int second; -}; - CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); -CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs ); -CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx); +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ); + +CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index ea956ac00..22953417f 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -26,6 +26,11 @@ struct clip_image_f32 { std::vector buf; }; +struct clip_image_grid_shape { + int first; + int second; +}; + /** * Selects the best resolution from a list of possible resolutions based on the original size. * @@ -344,7 +349,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ return true; } -LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { +struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); @@ -401,7 +406,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long return true; } -LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { +struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { unsigned char* image_bytes; long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); @@ -416,7 +421,7 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct return embed; } -LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) { +void llava_image_embed_free(struct llava_image_embed * embed) { free(embed->embed); free(embed); } diff --git a/examples/llava/llava.h b/examples/llava/llava.h index e08ce7883..9e9466a5d 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -3,7 +3,6 @@ #include "ggml.h" - #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD @@ -42,7 +41,6 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); - #ifdef __cplusplus } #endif