From 37a147ebf9c492af646bba349ee0d26e76bd6035 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Thu, 8 Feb 2024 07:42:49 +0100 Subject: [PATCH] Clip: Bugfix for normalization (it did not load the 3 std and mean values) Clip: bicubic resize function Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6) Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final convert-image-encoder: fixed image-grid flattening --- examples/llava/clip.cpp | 581 ++++++++++++++++-- examples/llava/clip.h | 41 +- .../llava/convert-image-encoder-to-gguf.py | 3 +- examples/llava/llava-surgery-v2.py | 5 +- examples/llava/llava.cpp | 264 +++++++- examples/server/server.cpp | 5 +- 6 files changed, 841 insertions(+), 58 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9129052a2..8193945ee 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -71,6 +71,11 @@ static std::string format(const char * fmt, ...) { #define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" + + // // tensor name constants // @@ -94,6 +99,7 @@ static std::string format(const char * fmt, ...) 
{ #define TN_LLAVA_PROJ "mm.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" +#define TN_IMAGE_NEWLINE "model.image_newline" enum projector_type { @@ -233,26 +239,6 @@ static projector_type clip_projector_type_from_string(const std::string & name) return PROJECTOR_TYPE_UNKNOWN; } -// -// image data -// - -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; // // clip layers @@ -309,6 +295,7 @@ struct clip_vision_model { struct ggml_tensor * mm_0_b = NULL; struct ggml_tensor * mm_2_w = NULL; struct ggml_tensor * mm_2_b = NULL; + struct ggml_tensor * image_newline = NULL; // Yi type models with mlp+normalization projection struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 @@ -370,6 +357,10 @@ struct clip_ctx { ggml_allocr * compute_alloc = NULL; }; +const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams; +} + static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -382,6 +373,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const int image_size = hparams.image_size; const int patch_size = hparams.patch_size; const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_patches_per_side = image_size / patch_size; const int num_positions = num_patches + 1; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; @@ -582,7 +574,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_gelu(ctx0, embeddings); - embeddings = 
ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); @@ -966,12 +957,37 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + try { + int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); + int n = gguf_get_arr_n(ctx, idx); + const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); + for (int i = 0; i < 32 && pinpoints[i] != 0; ++i) { + hparams.image_grid_pinpoints[i] = pinpoints[i]; + } + hparams.image_grid_pinpoints[n] = 0; + } catch (std::runtime_error & e) { + hparams.image_grid_pinpoints[0]=0; + } + try { + int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); + strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); + } catch (std::runtime_error & e) { + strcpy(hparams.mm_patch_merge_type, "flat"); + } + try { + hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 + } + catch(const std::exception& e) { + hparams.image_crop_resolution = hparams.image_size; + } int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); + const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); for (int i = 0; i < 3; ++i) { - new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean)); - new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std)); + new_clip->image_mean[i] = mean_data[i]; + new_clip->image_std[i] = std_data[i]; } if (verbosity >= 2) { @@ -983,14 +999,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("v_projection_dim %d\n", hparams.projection_dim); printf("v_n_head %d\n", hparams.n_head); printf("v_n_layer %d\n", hparams.n_layer); + 
printf("v_eps %f\n", hparams.eps); + printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + printf("v_image_grid_pinpoints: "); + for (int i = 0; i < 32 & hparams.image_grid_pinpoints[i]!=0; ++i) { + printf("%d ", hparams.image_grid_pinpoints[i]); + } + printf("\n"); + printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + } - - vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); - vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); - vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); - vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); - + try + { + vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); + vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + } + catch(const std::exception& e) + { + fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); + } + // LLaVA projection if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); @@ -1015,6 +1047,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, 
"bias")); } catch (std::runtime_error & e) { } + try { + vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); + // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); + } catch (std::runtime_error & e) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1134,13 +1170,423 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length return true; } + +void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} +void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned 
char)(fileSize >> 24); + + // Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// Linear interpolation between two points +inline float lerp(float s, float e, float t) { + return s + (e - s) * t; +} +// Bilinear resize function +void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + dst.nx = 
target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast(src.nx - 1) / target_width; + float y_ratio = static_cast(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast(px); + int y_floor = static_cast(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = lerp( + static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = lerp( + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); + } + } + } +} + +// for replication purposes `.to(model.device, dtype=torch.float16)` +// converts a float to half precision and back to float +float simulateFloat16Precision(float value) { + // Convert float32 to float16 + uint32_t f32 = *reinterpret_cast(&value); + uint32_t sign = (f32 >> 16) & 0x8000; // Top bit (sign bit) + uint32_t exponent = ((f32 >> 23) & 0xFF) - 112; // Adjust bias (112 is bias of float16, 127 is bias of float32) + uint32_t mantissa = (f32 >> 13) & 0x3FF; // Keep top 10 bits (10 bits of precision in float16, 23 in float32) + + // Handle overflow/underflow + if ((f32 & 0x7FFFFFFF) > 0x477FE000) { // Not representable + exponent = 0x1F; + mantissa = 0; + } else if ((f32 & 0x7FFFFFFF) < 0x38800000) { // Too small for normal half precision + exponent = 0; + mantissa = 0; + } + + uint16_t f16 = sign | (exponent << 10) | mantissa; + + // Convert back to float32 + uint32_t sign32 = (f16 & 0x8000) << 16; + uint32_t exponent32 = ((f16 >> 10) & 0x1F); + uint32_t mantissa32 = (f16 & 0x3FF) << 13; + + // Adjust bias back + 
exponent32 = exponent32 == 0 ? 0 : exponent32 + 112; + + uint32_t f32Result = sign32 | (exponent32 << 23) | mantissa32; + float result = *reinterpret_cast(&f32Result); + + return result; +} +// Normalize image to float32 - supports float16 replication as in pytorch .to(model.device, dtype=torch.float16) +void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3], bool replicate_float16) { + dst->nx = src->nx; + dst->ny = src->ny; + dst->buf.resize(src->buf.size()); + + for (size_t i = 0; i < src->buf.size(); ++i) { + int c = i % 3; // rgb + dst->buf[i] = (static_cast(src->buf[i]) / 255.0f - mean[c]) / std[c]; + + if (replicate_float16) { + dst->buf[i] = simulateFloat16Precision(dst->buf[i]); + } + } +} +inline float clip(float x, float lower, float upper) +{ + return std::max(lower, std::min(x, upper)); +} +bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) +{ + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + int a, b, c, d, index; + float Ca, Cb, Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, ii, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + float scale = std::max(tx, ty); + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) + { + for (j = 0; j < target_width; j++) + { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + index = (y * nx + x) * 3; + a = (y * nx + (x + 1)) * 3; + b = ((y + 1) * nx + x) * 3; + c = ((y + 1) * nx + (x + 1)) * 3; + + for (k = 0; k < 3; k++) + { + for (jj = 0; jj <= 
3; jj++) + { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; +} + +// llava-1.6 type of resize_and_pad (black) +void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair& target_resolution) { + int target_width = target_resolution.first; + int target_height = target_resolution.second; + + float scale_w = static_cast(target_width) / image.nx; + float scale_h = static_cast(target_height) / image.ny; + + int new_width, new_height; + + if (scale_w < scale_h) { + new_width = target_width; + new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); + } else { + new_height = target_height; + new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); + } + + clip_image_u8 resized_image; + // bilinear_resize(image, resized_image, new_width, new_height); + bicubic_resize(image, 
resized_image, new_width, new_height); + + clip_image_u8 padded_image; + padded_image.nx = target_width; + padded_image.ny = target_height; + padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black + + // Calculate padding offsets + int pad_x = (target_width - new_width) / 2; + int pad_y = (target_height - new_height) / 2; + + // Copy the resized image into the center of the padded buffer + for (int y = 0; y < new_height; ++y) { + for (int x = 0; x < new_width; ++x) { + for (int c = 0; c < 3; ++c) { + padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; + } + } + } + + image_output = std::move(padded_image); +} + + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
+ */ +static std::pair select_best_resolution(const std::pair& original_size, const std::vector>& possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + std::pair best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); + int downscaled_width = static_cast(original_width * scale); + int downscaled_height = static_cast(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} + + +std::vector divide_to_patches_u8(const clip_image_u8& image, int patch_size) { + std::vector patches; + int width = image.nx; + int height = image.ny; + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + clip_image_u8 *patch = clip_image_u8_init(); + patch->nx = std::min(patch_size, width - j); + patch->ny = std::min(patch_size, height - i); + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = 0; y < patch->ny; ++y) { + for (int x = 0; x < patch->nx; ++x) { + for (int c = 0; c < 3; ++c) { + patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width 
+ (j + x)) + c]; + } + } + } + patches.push_back(patch); + } + } + return patches; +} + + +// debug function to convert f32 to u8 +void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} + +/** + * @brief Get the anyres image grid shape object + * + * @param image_size + * @param grid_pinpoints + * @param image_patch_size + * @return + */ +struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size) { + /** + Conversion from gguf flat array to vector: + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + */ + auto best_resolution = select_best_resolution(image_size, grid_pinpoints); + return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; +} + + // normalize: x = (x - mean) / std // TODO: implement bicubic interpolation instead of linear. 
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) { +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patche tensors as a vector +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square) { if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); return false; } + auto & params = ctx->vision_model.hparams; + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) { + pad2square = false; + } else { + // pad2square = true; // todo: consider automatic decisions on that options for all models + } + // free the previous res_tensor + if (res_tensor.size() > 0) { + for (size_t i = 0; i < res_tensor.size(); i++) { + clip_image_f32_free(res_tensor[i]); + } + res_tensor.clear(); + } // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -1151,7 +1597,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli temp->nx = longer_side; temp->ny = longer_side; temp->buf.resize(3 * longer_side * longer_side); - const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA + const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) // fill with background color for (size_t i = 0; i < temp->buf.size(); i++) { @@ -1169,18 +1615,65 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } } } else { - temp->nx = img->nx; - temp->ny = img->ny; - temp->buf.resize(img->buf.size()); - 
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); + if (params.image_grid_pinpoints[0] != 0) + { + // "spatial_unpad" with "anyres" processing for llava-1.6 + std::vector> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + std::pair best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); + // fprintf(stderr, "%s - Working with resolution: %d %d\n", __func__, best_resolution.first, best_resolution.second); + // clip_image_save_to_bmp(*img, "input.bmp"); + resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 + // clip_image_save_to_bmp(*temp, "resized.bmp"); + // visually verify normalized image: + // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + // { + // clip_image_u8 * temp2 = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*res, *temp2); + // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp"); + // clip_image_u8_free(temp2); + // } + + std::vector patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) + // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size); + + clip_image_u8 *image_original_resize = clip_image_u8_init(); + // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? + bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ? 
+ patches.insert(patches.begin(), image_original_resize); + + res_tensor.clear(); + for (auto& patch : patches) { + clip_image_f32 *temp_image_f32 = clip_image_f32_init(); + normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, true); + res_tensor.push_back(temp_image_f32); + } + + for (size_t i = 0; i < patches.size(); i++) { + // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + clip_image_u8_free(patches[i]); + } + + clip_image_u8_free(temp); + + return true; + } else { + temp->nx = img->nx; + temp->ny = img->ny; + temp->buf.resize(img->buf.size()); + memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); + } } const int nx = temp->nx; const int ny = temp->ny; + // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp"); const int nx2 = ctx->vision_model.hparams.image_size; const int ny2 = ctx->vision_model.hparams.image_size; - + clip_image_f32 * res = clip_image_f32_init(); res->nx = nx2; res->ny = ny2; res->buf.resize(3 * nx2 * ny2); @@ -1234,6 +1727,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } clip_image_u8_free(temp); + // { + // clip_image_u8 * temp2 = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*res, *temp2); + // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp"); + // clip_image_u8_free(temp2); + // } + res_tensor.push_back(res); return true; } @@ -1302,6 +1802,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i type = static_cast(itype); auto * ctx_clip = clip_model_load(fname_inp, 2); + const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; @@ -1495,6 +1996,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } } +ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + int clip_n_patches(const struct clip_ctx * ctx) { auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / 
params.patch_size) * (params.image_size / params.patch_size); @@ -1506,4 +2011,4 @@ int clip_n_patches(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) { return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); -} +} \ No newline at end of file diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 458a256a1..09346b603 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -3,6 +3,8 @@ #include #include +#include +#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) @@ -32,10 +34,20 @@ struct clip_vision_hparams { int32_t projection_dim; int32_t n_head; int32_t n_layer; + float eps; + + char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default) + int32_t image_grid_pinpoints[32]; + int32_t image_crop_resolution; + }; +struct clip_ctx; +CLIP_API const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx); + CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); +CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API void clip_free(struct clip_ctx * ctx); @@ -44,6 +56,24 @@ CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); +// RGB uint8 image +CLIP_API struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... 
+ CLIP_API struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + + struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -53,6 +83,10 @@ struct clip_image_f32_batch { struct clip_image_f32 * data; size_t size; }; +CLIP_API struct clip_image_grid_shape { + int first; + int second; +}; CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); @@ -61,11 +95,16 @@ CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +CLIP_API void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename); +CLIP_API void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); +/** preprocess img and store the result in res_tensor, pad2square may be overriden to false depending on model configuration */ +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector& res_tensor, bool pad2square); +CLIP_API struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair& image_size, const std::vector>& grid_pinpoints, int image_patch_size); +CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx); -CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); diff --git 
a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 115b6b35b..ea331f2fe 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -240,7 +240,8 @@ if has_vision_encoder: # flatten it image_grid_pinpoints = [] for pinpoint in v_hparams["image_grid_pinpoints"]: - image_grid_pinpoints.extend(pinpoint) + for p in pinpoint: + image_grid_pinpoints.append(p) fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) if "image_crop_resolution" in v_hparams: fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py index a5850b96e..6b4fac80d 100644 --- a/examples/llava/llava-surgery-v2.py +++ b/examples/llava/llava-surgery-v2.py @@ -13,11 +13,12 @@ def is_safetensor_file(file_path): # Unified loading function def load_model(file_path): if is_safetensor_file(file_path): - # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor' tensors = {} with safe_open(file_path, framework="pt", device="cpu") as f: for key in f.keys(): tensors[key] = f.get_tensor(key).clone() + # output shape + print(f"{key} : {tensors[key].shape}") return tensors, 'safetensor' else: return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' @@ -156,4 +157,4 @@ if len(first_mm_tensors) > 0: print("Done!") print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") -print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") \ No newline at end of file diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index d42e7582e..3a0c4a8a4 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -6,27 +6,261 @@ #include #include #include +#include #include "base64.hpp" -static bool 
encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { - clip_image_f32 * img_res = clip_image_f32_init(); - if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) { - fprintf(stderr, "%s: unable to preprocess image\n", __func__); - clip_image_f32_free(img_res); - return false; +// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) +static bool handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { + struct temp_model { + struct ggml_tensor *newline; + struct ggml_context * ctx; + } model; + + auto & vparams = clip_get_vision_hparams(ctx_clip); + auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) + int num_patches_width = grid_shape.first; // grid 1-4 + int num_patches_height = grid_shape.second; // grid 1-4 + + // TODO: size calculation is not calculated - it's only tens of MB + size_t ctx_size = 0; + { + ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features + ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); // + } + + struct ggml_init_params params { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API + }; + + // Python reference for full unpad: + // base_image_feature = image_feature[0] + // image_feature = image_feature[1:] + // image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + // image_feature = image_feature.flatten(1, 2).flatten(2, 3) + // image_feature = unpad_image(image_feature, image_sizes[image_idx]) + // image_feature = torch.cat(( + // image_feature, + // self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) + // ), dim=-1) + 
// image_feature = image_feature.flatten(1, 2).transpose(0, 1) + // image_feature = torch.cat((base_image_feature, image_feature), dim=0) + + // embeddings -> tokens -> 24 x 24 + /** + * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval + * In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet + * Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. + * Once all images are processed to prepended the base_image_features without any changes. + */ + /** + Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) + # image_feature = image_feature.view(2, 2, 24, 24, 4096) + # image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + # image_feature = image_feature.view(2, 24, 2, 24, 4096) + # image_feature = image_feature.flatten(0, 3) + + # Reshape to 4D tensor by merging the last two dimensions + image_feature = image_feature.view(2, 2, 24, 24*4096) + image_feature = image_feature.permute(0, 2, 1, 3).contiguous() + image_feature = image_feature.view(-1, 4096) + * + */ + model.ctx = ggml_init(params); + + ggml_context *ctx_noalloc = ggml_init({2048, NULL, true}); + // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1)); + + ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip); + model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); + if (newline_tmp->backend != GGML_BACKEND_CPU) { + if (newline_tmp->buffer == NULL) { + printf("newline_tmp tensor buffer is NULL\n"); + } + ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp)); + } else + { + model.newline->data = newline_tmp->data; + if (model.newline->data == NULL) 
{ + printf("newline_tmp tensor data is NULL\n"); + } } - *n_img_pos = clip_n_patches(ctx_clip); + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, image_embd_v.size() - 1, clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip)); + // fill it with the image embeddings, ignoring the first + for (int i = 1; i < image_embd_v.size(); i++) + { + // printf("Copying image_embd_v[%d] to image_features tensor\n", i); + size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); + + // for debugging we now try and set the entire tensor row to 0.0001f,0.0002f,0.0003f,0.0004f etc: + // float *floatPtr = static_cast(image_embd_v[i]); + // for (int j = 0; j < clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip); j++) + // { + // // floatPtr[j] = (j + 1) / 10000.0f; + // int feature = j % clip_n_mmproj_embd(ctx_clip) + 1; + // floatPtr[j] = i + feature / 10000.0f; + // } + memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); + } + // printf("image_features size = %d\n", clip_embd_nbytes(ctx_clip) * (image_embd_v.size() - 1)); + + struct ggml_cgraph * gf = ggml_new_graph(model.ctx); + // image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + size_t size_ele = ggml_type_size(GGML_TYPE_F32); + // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip)); + + struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features, + num_patches_height, // nb0 : 4 byte für jedes + num_patches_width, + num_patches_per_side * num_patches_per_side, + clip_n_mmproj_embd(ctx_clip), + + size_ele * num_patches_height, + size_ele * num_patches_height * num_patches_width, + size_ele * num_patches_height * num_patches_width * num_patches_per_side, + 0); + + struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, + 
num_patches_height, + num_patches_width, + num_patches_per_side, + num_patches_per_side * clip_n_mmproj_embd(ctx_clip), + + size_ele * num_patches_height, + size_ele * num_patches_height * num_patches_width, + size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0); + + struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); + permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug + + struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0); + struct ggml_tensor *prepared_cont = ggml_cont(model.ctx, prepared); // not needed + // struct ggml_tensor *prepared_cont = prepared; // the view only flattens + + ggml_build_forward_expand(gf, prepared_cont); + + ggml_graph_compute_with_ctx(model.ctx, gf, 1); + + struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + // ggml_tensor_printf(image_features,"image_features",__LINE__,false,true); + // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,true); + // ggml_tensor_printf(prepared_cont,"prepared_cont",__LINE__,false,true); + + memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context + // append without newline tokens: + // memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches + // append with newline tokens: + for (size_t i = 0; i < image_embd_v.size() - 1; ++i) { + // we append with +1 offset (base image is prepended) + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+1) + model.newline->ne[0] 
* i, + (float*)prepared_cont->data + i * clip_n_mmproj_embd(ctx_clip) * clip_n_patches(ctx_clip), + clip_embd_nbytes(ctx_clip)); + memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+2) + model.newline->ne[0] * i , + (float*)model.newline->data, + ggml_nbytes(model.newline)); + } + + size_t newline_tokens = image_embd_v.size()-1; + *n_img_pos_out = prepared_cont->ne[0]+clip_n_patches(ctx_clip) + newline_tokens; + + // Debug: Test single segments + // Current findings: sending base image, sending a segment embedding all works similar to python + // However, permuted embeddings do not work yet (stride issue?) + // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context + // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context + // *n_img_pos_out=576; + + ggml_free(model.ctx); + + return true; +} + + +static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { + std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 + if (!clip_image_preprocess(ctx_clip, img, img_res_v, /*pad2square =*/ true)) { + fprintf(stderr, "%s: unable to preprocess image\n", __func__); + for (auto img_res : img_res_v) { + clip_image_f32_free(img_res); + } + return false; + } const int64_t t_img_enc_start_us = ggml_time_us(); - bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); - clip_image_f32_free(img_res); - if (!encoded) { - fprintf(stderr, "Unable to encode image\n"); + auto & vparams = clip_get_vision_hparams(ctx_clip); + // DEBUG print the "shape" and the first 10 rows and 10 cols of img_res_v in exp format + // for (int i = 0; i < img_res_v.size(); i++) + // { + // printf("img_res_v[%d] shape: %d x %d\n", i, img_res_v[i]->nx, img_res_v[i]->ny); + // for (int j = 
0; j < 10; j++) + // { + // for (int k = 0; k < 10; k++) + // { + // printf("%e ", img_res_v[i]->buf[j*img_res_v[i]->ny + k]); + // } + // printf("\n"); + // } + // } + + if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0) + { + // flat / default llava-1.5 type embedding + *n_img_pos = clip_n_patches(ctx_clip); + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[0], image_embd); // image_embd shape is 576 x 4096 + clip_image_f32_free(img_res_v[0]); + if (!encoded) { + fprintf(stderr, "Unable to encode image\n"); + + return false; + } + } else + { + // spatial_unpad llava-1.6 type embedding + // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working + std::vector image_embd_v; + image_embd_v.resize(img_res_v.size()); + for (int i = 0; i < img_res_v.size(); i++) + { + image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 + bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside + clip_image_f32_free(img_res_v[i]); + if (!encoded) { + fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, img_res_v.size()); + return false; + } + } + const int64_t t_img_enc_batch_us = ggml_time_us(); + printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + + + std::vector> grid_pinpoints; + for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) { + grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]}); + } + img_res_v.clear(); + struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size); + + int n_img_pos_out; + handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, 
&n_img_pos_out); + *n_img_pos = n_img_pos_out; + + for (int i = 0; i < image_embd_v.size(); i++) + { + free(image_embd_v[i]); + } + image_embd_v.clear(); + + // debug image/segment/normalization content: + // clip_image_u8 * tmp = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*image_feature, *tmp); + // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); - return false; } + printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; @@ -36,6 +270,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return true; } + + bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { // make sure that the correct mmproj was used, i.e., compare apples to apples int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); @@ -48,7 +284,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * } static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)); + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { fprintf(stderr, "Unable to allocate memory for image embeddings\n"); free(image_embd); @@ -151,7 +387,7 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct return NULL; } - auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); + llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); free(image_bytes); return embed; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ea77125ea..353bd8976 100644 --- a/examples/server/server.cpp +++ 
b/examples/server/server.cpp @@ -943,13 +943,14 @@ struct llama_server_context { continue; } - clip_image_f32 * img_res = clip_image_f32_init(); - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true)) + std::vector img_res_v; + if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v, /*pad2square =*/ true)) { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); return false; } + clip_image_f32 * img_res = img_res_v[0]; img.image_tokens = clip_n_patches(clp_ctx); img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); if (!img.image_embedding)