moved llava functions to llava.cpp, made clip.h a C-compatible API, replaced vector-style functions with pointer/size based ones, added a debug define (CLIP_DEBUG_FUNCTIONS) to exclude debug-only functions from compilation when not needed

This commit is contained in:
John 2024-02-13 00:29:17 +01:00
parent 0dd6c9da2a
commit 3a72267869
4 changed files with 184 additions and 149 deletions
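At the heart of the change is the replacement of std::vector<clip_image_f32*> with the C-style clip_image_f32_batch (a raw data pointer plus a size). A minimal caller-side sketch of the new flow, mirroring how llava.cpp and server.cpp use it in the hunks below — the model and image file names are placeholders, error handling is abbreviated, and the init/free declarations are assumed to remain in clip.h:

#include "clip.h"

#include <cstdio>

int main() {
    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity =*/ 1); // placeholder path
    struct clip_image_u8 * img = clip_image_u8_init();
    if (!ctx || !clip_image_load_from_file("input.png", img)) { // placeholder image
        return 1;
    }

    clip_image_f32_batch res_imgs;
    res_imgs.size = 0;       // zero-init: preprocessing frees any previous allocation it finds
    res_imgs.data = nullptr;
    if (!clip_image_preprocess(ctx, img, res_imgs)) {
        fprintf(stderr, "unable to preprocess image\n");
        return 1;
    }

    // llava-1.5 (pad_to_square) yields res_imgs.size == 1; llava-1.6 (spatial_unpad)
    // yields the base resize in data[0] plus the grid patches - encode each one with
    // clip_image_encode(ctx, n_threads, &res_imgs.data[i], embedding_buffer)

    delete[] res_imgs.data; // the caller owns the batch storage after preprocessing
    clip_image_u8_free(img);
    clip_free(ctx);
    return 0;
}

Note the ownership convention this commit establishes: clip_image_preprocess() allocates (and, if needed, frees) the data array, while the caller releases it with delete[] when done.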

View file

@@ -31,6 +31,25 @@
 #include <sstream>
 #include <cinttypes>

+// #define CLIP_DEBUG_FUNCTIONS
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 static std::string format(const char * fmt, ...) {
     va_list ap;
     va_list ap2;
@@ -961,9 +980,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
             int n = gguf_get_arr_n(ctx, idx);
             const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-            for (int i = 0; i < 32 && pinpoints[i] != 0; ++i) {
+            for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
                 hparams.image_grid_pinpoints[i] = pinpoints[i];
             }
+            if (n < 32)
                 hparams.image_grid_pinpoints[n] = 0;
         } catch (std::runtime_error & e) {
             hparams.image_grid_pinpoints[0]=0;
@@ -1170,7 +1190,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     return true;
 }

-
+#ifdef CLIP_DEBUG_FUNCTIONS
 void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
@@ -1265,6 +1285,7 @@ void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filenam
     file.close();
 }
+#endif

 // Linear interpolation between two points
 inline float lerp(float s, float e, float t) {
@@ -1305,41 +1326,8 @@ void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_wi
     }
 }

-// for replication purposes `.to(model.device, dtype=torch.float16)`
-// converts a float to half precision and back to float
-float simulateFloat16Precision(float value) {
-    // Convert float32 to float16
-    uint32_t f32 = *reinterpret_cast<uint32_t*>(&value);
-    uint32_t sign = (f32 >> 16) & 0x8000; // Top bit (sign bit)
-    uint32_t exponent = ((f32 >> 23) & 0xFF) - 112; // Adjust bias (112 is bias of float16, 127 is bias of float32)
-    uint32_t mantissa = (f32 >> 13) & 0x3FF; // Keep top 10 bits (10 bits of precision in float16, 23 in float32)
-
-    // Handle overflow/underflow
-    if ((f32 & 0x7FFFFFFF) > 0x477FE000) { // Not representable
-        exponent = 0x1F;
-        mantissa = 0;
-    } else if ((f32 & 0x7FFFFFFF) < 0x38800000) { // Too small for normal half precision
-        exponent = 0;
-        mantissa = 0;
-    }
-
-    uint16_t f16 = sign | (exponent << 10) | mantissa;
-
-    // Convert back to float32
-    uint32_t sign32 = (f16 & 0x8000) << 16;
-    uint32_t exponent32 = ((f16 >> 10) & 0x1F);
-    uint32_t mantissa32 = (f16 & 0x3FF) << 13;
-
-    // Adjust bias back
-    exponent32 = exponent32 == 0 ? 0 : exponent32 + 112;
-
-    uint32_t f32Result = sign32 | (exponent32 << 23) | mantissa32;
-    float result = *reinterpret_cast<float*>(&f32Result);
-    return result;
-}
-
-// Normalize image to float32 - supports float16 replication as in pytorch .to(model.device, dtype=torch.float16)
-void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3], bool replicate_float16) {
+// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
+void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
     dst->nx = src->nx;
     dst->ny = src->ny;
     dst->buf.resize(src->buf.size());
@@ -1347,12 +1335,9 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co
     for (size_t i = 0; i < src->buf.size(); ++i) {
         int c = i % 3; // rgb
         dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
-
-        if (replicate_float16) {
-            dst->buf[i] = simulateFloat16Precision(dst->buf[i]);
-        }
     }
 }

 inline float clip(float x, float lower, float upper)
 {
     return std::max(lower, std::min(x, upper));
@@ -1471,7 +1456,6 @@ void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_outpu
             }
         }
     }
-
    image_output = std::move(padded_image);
 }
@@ -1533,7 +1517,7 @@ std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8& image, int
     return patches;
 }

-
+#ifdef CLIP_DEBUG_FUNCTIONS
 // debug function to convert f32 to u8
 void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
     dst.nx = src.nx;
@@ -1543,32 +1527,12 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst)
         dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
     }
 }
+#endif

-/**
- * @brief Get the anyres image grid shape object
- *
- * @param image_size
- * @param grid_pinpoints
- * @param image_patch_size
- * @return <int, int>
- */
-struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size, const std::vector<std::pair<int, int>>& grid_pinpoints, int image_patch_size) {
-    /**
-        Conversion from gguf flat array to vector:
-        std::vector<std::pair<int, int>> possible_resolutions;
-        for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
-            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
-        }
-     */
-    auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
-    return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
-}
-
-// normalize: x = (x - mean) / std
-// TODO: implement bicubic interpolation instead of linear.
-// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patche tensors as a vector
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector<clip_image_f32*>& res_tensor, bool pad2square) {
+// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
+// res_imgs memory is being allocated here, previous allocations will be freed if found
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs) {
+    bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
@@ -1576,23 +1540,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
     auto & params = ctx->vision_model.hparams;
     // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
     if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
-        pad2square = false;
-    } else {
-        // pad2square = true; // todo: consider automatic decisions on that options for all models
+        pad_to_square = false;
     }
-    // free the previous res_tensor
-    if (res_tensor.size() > 0) {
-        for (size_t i = 0; i < res_tensor.size(); i++) {
-            clip_image_f32_free(res_tensor[i]);
+    // free the previous res_imgs if any set
+    if (res_imgs.size > 0 && res_imgs.size < 100) {
+        for (size_t i = 0; i < res_imgs.size; i++) {
+            clip_image_f32_free(&(res_imgs.data[i]));
         }
-        res_tensor.clear();
+        delete[] res_imgs.data;
     }
+    res_imgs.data = nullptr;
+    res_imgs.size = 0;

     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156

     clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
-    if (pad2square && img->nx != img->ny) {
+    if (pad_to_square && img->nx != img->ny) {
         int longer_side = std::max(img->nx, img->ny);
         temp->nx = longer_side;
         temp->ny = longer_side;
@@ -1636,18 +1600,18 @@
         // }

         std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
+        // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size);

         clip_image_u8 *image_original_resize = clip_image_u8_init();
-        // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ?
-        bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ?
+        // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
+        bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
         patches.insert(patches.begin(), image_original_resize);

-        res_tensor.clear();
+        // clip_image_f32_batch_init(patches.size());
+        res_imgs.size = patches.size();
+        res_imgs.data = new clip_image_f32[res_imgs.size];
+        int num = 0;
         for (auto& patch : patches) {
-            clip_image_f32 *temp_image_f32 = clip_image_f32_init();
-            normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, false); // set to true for pytorch fp16 value replication
-            res_tensor.push_back(temp_image_f32);
+            normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
+            num++;
         }

         for (size_t i = 0; i < patches.size(); i++) {
@@ -1732,7 +1696,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
     //     clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
     //     clip_image_u8_free(temp2);
     // }

-    res_tensor.push_back(res);
+    // res_imgs.push_back(res);
+    res_imgs.size = 1;
+    res_imgs.data = new clip_image_f32[res_imgs.size];
+    res_imgs.data[0] = std::move(*res);
    return true;
 }

View file

@@ -3,8 +3,6 @@

 #include <stddef.h>
 #include <stdint.h>
-#include <string>
-#include <vector>

 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -56,24 +54,6 @@ CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);

-// RGB uint8 image
-CLIP_API struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-CLIP_API struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -95,14 +75,11 @@ CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);

 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
-CLIP_API void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename);
-CLIP_API void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst);

 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

-/** preprocess img and store the result in res_tensor, pad2square may be overriden to false depending on model configuration */
-CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector<clip_image_f32*>& res_tensor, bool pad2square);
-CLIP_API struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size, const std::vector<std::pair<int, int>>& grid_pinpoints, int image_patch_size);
+/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
+CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch& res_imgs);

 CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx);

 CLIP_API bool clip_image_encode    (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);

View file

@@ -10,8 +10,78 @@

 #include "base64.hpp"

+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
+/**
+ * Selects the best resolution from a list of possible resolutions based on the original size.
+ *
+ * @param original_size The original size of the image in the format (width, height).
+ * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+ * @return The best fit resolution in the format (width, height).
+ */
+static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
+    int original_width = original_size.first;
+    int original_height = original_size.second;
+    std::pair<int, int> best_fit;
+    int max_effective_resolution = 0;
+    int min_wasted_resolution = std::numeric_limits<int>::max();
+
+    for (const auto& resolution : possible_resolutions) {
+        int width = resolution.first;
+        int height = resolution.second;
+        float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
+        int downscaled_width = static_cast<int>(original_width * scale);
+        int downscaled_height = static_cast<int>(original_height * scale);
+        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
+        int wasted_resolution = (width * height) - effective_resolution;
+        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+            max_effective_resolution = effective_resolution;
+            min_wasted_resolution = wasted_resolution;
+            best_fit = resolution;
+        }
+    }
+
+    return best_fit;
+}
+
+/**
+ * @brief Get the anyres image grid shape object
+ *
+ * @param image_size
+ * @param grid_pinpoints
+ * @param image_patch_size
+ * @return <int, int>
+ */
+struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size, const std::vector<std::pair<int, int>>& grid_pinpoints, int image_patch_size) {
+    /**
+        Conversion from gguf flat array to vector:
+        std::vector<std::pair<int, int>> possible_resolutions;
+        for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
+        }
+     */
+    auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
+    return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
+}
+
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
     struct temp_model {
         struct ggml_tensor *newline;
         struct ggml_context * ctx;
@@ -21,11 +91,12 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
     auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
     int num_patches_width = grid_shape.first; // grid 1-4
     int num_patches_height = grid_shape.second; // grid 1-4
+    const size_t num_images = num_patches_width + num_patches_height + 1;

     // TODO: size calculation is not calculated - it's only tens of MB
     size_t ctx_size = 0;

     {
-        ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features
+        ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
         ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
     }
@@ -84,10 +155,10 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
         }
     }

-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), image_embd_v.size() - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
-    for (int i = 1; i < image_embd_v.size(); i++)
+    for (int i = 1; i < num_images; i++)
     {
         size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
         memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
@@ -106,6 +177,15 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
                                                                 size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
     // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
     struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
+    /**
+     At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
+         image_feature = torch.cat((
+             image_feature,
+             self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
+         ), dim=-1)
+     *
+     */
+
     // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
@@ -115,7 +195,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches
+    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
     *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));

     // Debug: Test single segments
@@ -131,37 +211,25 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v, /*pad2square =*/ true)) {
+    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
+    clip_image_f32_batch img_res_v;
+    img_res_v.size = 0;
+    img_res_v.data = nullptr;
+    if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
         fprintf(stderr, "%s: unable to preprocess image\n", __func__);
-        for (auto img_res : img_res_v) {
-            clip_image_f32_free(img_res);
-        }
+        delete[] img_res_v.data;
         return false;
     }

     const int64_t t_img_enc_start_us = ggml_time_us();

     auto & vparams = clip_get_vision_hparams(ctx_clip);

-    // DEBUG print the "shape" and the first 10 rows and 10 cols of img_res_v in exp format
-    // for (int i = 0; i < img_res_v.size(); i++)
-    // {
-    //     printf("img_res_v[%d] shape: %d x %d\n", i, img_res_v[i]->nx, img_res_v[i]->ny);
-    //     for (int j = 0; j < 10; j++)
-    //     {
-    //         for (int k = 0; k < 10; k++)
-    //         {
-    //             printf("%e ", img_res_v[i]->buf[j*img_res_v[i]->ny + k]);
-    //         }
-    //         printf("\n");
-    //     }
-    // }
-
     if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0)
     {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);
-        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[0], image_embd); // image_embd shape is 576 x 4096
-        clip_image_f32_free(img_res_v[0]);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
+        delete[] img_res_v.data;
         if (!encoded) {
             fprintf(stderr, "Unable to encode image\n");
@@ -172,30 +240,32 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     // spatial_unpad llava-1.6 type embedding
     // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
     std::vector<float *> image_embd_v;
-    image_embd_v.resize(img_res_v.size());
-    for (int i = 0; i < img_res_v.size(); i++)
+    image_embd_v.resize(img_res_v.size);
+    for (int i = 0; i < img_res_v.size; i++)
     {
         image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
-        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
-        clip_image_f32_free(img_res_v[i]);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
         if (!encoded) {
-            fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size());
+            fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", i+1, (int)img_res_v.size);
             return false;
         }
     }
     const int64_t t_img_enc_batch_us = ggml_time_us();
-    printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+    printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

     std::vector<std::pair<int, int>> grid_pinpoints;
     for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) {
         grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]});
     }
-    img_res_v.clear();
+
+    // free all img_res_v - not needed anymore
+    delete[] img_res_v.data;
+    img_res_v.size = 0;
+    img_res_v.data = nullptr;

     struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size);

     int n_img_pos_out;
-    handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+    clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
     *n_img_pos = n_img_pos_out;

     for (int i = 0; i < image_embd_v.size(); i++)
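To make the anyres grid math above concrete, a worked example (hypothetical values: a llava-1.6 style pinpoint list, a 1024x768 input, image/patch size 336), using get_anyres_image_grid_shape() as defined in this file and assuming clip_image_grid_shape exposes first/second as used above:

std::vector<std::pair<int, int>> grid_pinpoints = {
    {336, 672}, {672, 336}, {672, 672}, {1008, 336}, {336, 1008}, // hypothetical pinpoint list
};
// select_best_resolution() picks (672, 672): its scale = min(672/1024, 672/768) = 0.656
// downscales 1024x768 to 672x504, an effective resolution of 338688 pixels - the highest
// of the five candidates - with 451584 - 338688 = 112896 wasted pixels.
struct clip_image_grid_shape shape = get_anyres_image_grid_shape({1024, 768}, grid_pinpoints, 336);
// shape = {672/336, 672/336} = {2, 2}: a 2x2 grid of 336x336 patches plus the base image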

View file

@@ -31,6 +31,23 @@

 using json = nlohmann::json;

+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 struct server_params
 {
     std::string hostname = "127.0.0.1";
@@ -943,14 +960,17 @@ struct llama_server_context
                 {
                     continue;
                 }
-                std::vector<clip_image_f32*> img_res_v;
-                if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v, /*pad2square =*/ true))
+                clip_image_f32_batch img_res_v;
+                img_res_v.size = 0;
+                img_res_v.data = nullptr;
+                if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
                 {
                     LOG_TEE("Error processing the given image");
                     clip_free(clp_ctx);
+                    delete[] img_res_v.data;
                     return false;
                 }
-                clip_image_f32 * img_res = img_res_v[0];
+                clip_image_f32 * img_res = &img_res_v.data[0];
                 img.image_tokens = clip_n_patches(clp_ctx);
                 img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
                 if (!img.image_embedding)
@@ -965,7 +985,8 @@ struct llama_server_context
                     LOG_TEE("Unable to encode image\n");
                     return false;
                 }
-                clip_image_f32_free(img_res);
+                // clip_image_f32_free(img_res);
+                delete[] img_res_v.data;

                 img.request_encode_image = false;
             }