diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 8193945ee..7a7374cd8 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1,7 +1,6 @@ // NOTE: This is modified from clip.cpp only for LLaVA, // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it - #include "clip.h" #include "ggml.h" #include "ggml-alloc.h" @@ -965,7 +964,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { hparams.image_grid_pinpoints[i] = pinpoints[i]; } hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & e) { + } catch (std::runtime_error & e) { hparams.image_grid_pinpoints[0]=0; } try { @@ -979,7 +978,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch(const std::exception& e) { hparams.image_crop_resolution = hparams.image_size; - } + } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); @@ -1022,7 +1021,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { { fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); } - + // LLaVA projection if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); @@ -1270,12 +1269,12 @@ void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filenam inline float lerp(float s, float e, float t) { return s + (e - s) * t; } -// Bilinear resize function +// Bilinear resize function void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); - + float x_ratio = static_cast(src.nx - 1) / target_width; float y_ratio = static_cast(src.ny - 1) / target_height; @@ -1343,11 +1342,11 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co dst->nx = src->nx; dst->ny = src->ny; dst->buf.resize(src->buf.size()); - + for (size_t i = 0; i < src->buf.size(); ++i) { int c = i % 3; // rgb dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; - + if (replicate_float16) { dst->buf[i] = simulateFloat16Precision(dst->buf[i]); } @@ -1546,15 +1545,15 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) /** * @brief Get the anyres image grid shape object - * - * @param image_size - * @param grid_pinpoints - * @param image_patch_size - * @return + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return */ struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { /** - Conversion from gguf flat array to vector: + Conversion from gguf flat array to vector: std::vector> possible_resolutions; for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); @@ -1628,7 +1627,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 // clip_image_save_to_bmp(*temp, "resized.bmp"); // visually verify normalized image: - // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); // { // clip_image_u8 * temp2 = clip_image_u8_init(); // clip_image_convert_f32_to_u8(*res, *temp2); @@ -1638,7 +1637,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size); - + clip_image_u8 *image_original_resize = clip_image_u8_init(); // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? @@ -1655,9 +1654,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } - + clip_image_u8_free(temp); - + return true; } else { temp->nx = img->nx; @@ -1802,7 +1801,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i type = static_cast(itype); auto * ctx_clip = clip_model_load(fname_inp, 2); - const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 09346b603..c1981bb5d 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -38,7 +38,7 @@ struct clip_vision_hparams { float eps; char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) - int32_t image_grid_pinpoints[32]; + int32_t image_grid_pinpoints[32]; int32_t image_crop_resolution; }; diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index ea331f2fe..61a147037 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -234,7 +234,7 @@ if has_vision_encoder: # 1008, 336, # 336, 1008 # ] - # * + # * # */ if "image_grid_pinpoints" in v_hparams: # flatten it diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index 6b4fac80d..e94d10a55 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -4,7 +4,6 @@ import os import torch from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file - # Function to determine if file is a SafeTensor file def is_safetensor_file(file_path): return file_path.endswith('.safetensors') @@ -40,12 +39,12 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): model_path = os.path.dirname(checkpoint_path) print(f"Searching for vision tower tensors in {checkpoint_path}") clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ] - + if len(clip_tensors) > 0: print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") # Adapted for file type clip_path = os.path.join(model_path, "llava.clip") - + if os.path.exists(clip_path): existing_clip, _ = load_model(clip_path) else: @@ -142,7 +141,7 @@ for name in mm_tensors: projector[name] = last_checkpoint[name].float() for name in first_mm_tensors: projector[name] = first_checkpoint[name].float() - + save_model(projector, f"{args.model}/llava.projector", 'pytorch') for name in mm_tensors: diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 3a0c4a8a4..5ba9d072d 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -14,21 +14,21 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { struct temp_model { struct ggml_tensor *newline; - struct ggml_context * ctx; + struct ggml_context * ctx; } model; auto & vparams = clip_get_vision_hparams(ctx_clip); auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 1-4 - + // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; { ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features - ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); // + ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); } - + struct ggml_init_params params { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, @@ -47,7 +47,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb // ), dim=-1) // image_feature = image_feature.flatten(1, 2).transpose(0, 1) // image_feature = torch.cat((base_image_feature, image_feature), dim=0) - + // embeddings -> tokens -> 24 x 24 /** * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval @@ -66,13 +66,13 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb image_feature = image_feature.view(2, 2, 24, 24*4096) image_feature = image_feature.permute(0, 2, 1, 3).contiguous() image_feature = image_feature.view(-1, 4096) - * + * */ model.ctx = ggml_init(params); - + ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1)); - + ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); if (newline_tmp->backend != GGML_BACKEND_CPU) { @@ -112,28 +112,28 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb size_t size_ele = ggml_type_size(GGML_TYPE_F32); // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip)); - struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, - num_patches_height, // nb0 : 4 byte für jedes - num_patches_width, - num_patches_per_side * num_patches_per_side, - clip_n_mmproj_embd(ctx_clip), + struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, + num_patches_height, + num_patches_width, + num_patches_per_side * num_patches_per_side, + clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height, size_ele * num_patches_height * num_patches_width, size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); - - struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, - num_patches_height, - num_patches_width, - num_patches_per_side, + + struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, + num_patches_height, + num_patches_width, + num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip), - + size_ele * num_patches_height, size_ele * num_patches_height * num_patches_width, size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); - struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0); @@ -172,9 +172,8 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_emb // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context // *n_img_pos_out=576; - - ggml_free(model.ctx); + ggml_free(model.ctx); return true; } @@ -205,7 +204,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // } // } - if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) + if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); @@ -233,7 +232,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); std::vector> grid_pinpoints; @@ -260,7 +259,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); - + const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;