llava : support v1.6 (#5267)
* Create llava-survery-v2.py * Update convert-image-encoder-to-gguf.py * Update convert-image-encoder-to-gguf.py * Rename llava-survery-v2.py to llava-surgery-v2.py * Update convert-image-encoder-to-gguf.py will now search for projector * Update convert-image-encoder-to-gguf.py whoops * Update llava-surgery-v2.py * Clip: Bugfix for normalization (it did not loat the 3 std and mean values) Clip: bicubic resize function Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images Clip: added normalization with FP16 precision simulation (image tensors match HF implementation, can be switched off, only used for llava-1.6) Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints Clip: clip_image_preprocess now returns a float * vector instead of float, this way llava 1.5 and 1.6 is supported llava: added ggml cpu graph for embedding patching, added spatial_unpad preliminary support, added a lot of comments that need to be cleaned when all is final convert-image-encoder: fixed image-grid flattening * whitespace corrections * ws * Tensors are now properly permuted. Before the embeddings were inserted 1:1, now they are split into the 24x24 patches as in reference. * ws * added verbose_prompt support into cli added stopwords for llava-1.6 into cli * moved llava functions to llava.cpp, made clip.h C compatible API, replaced vector style functions with pointers, added a debug define to remove functions from compilation while not needed * ws * convert : skip unknown tensors (need for LLaVA) * llava : update readme * llava : fix compile warnings * llava : style * convert : add --skip-unknown CLI arg * server : remove clip structs * bugfix for non llava-1.6 It should now work with llava-1.5 as well * clip : minor code rearrange * llava : update readme a bit --------- Co-authored-by: John <cmt-nct@users.noreply.github.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
f5ca054855
commit
aa23412989
10 changed files with 1229 additions and 205 deletions
|
@ -1,7 +1,7 @@
|
|||
// NOTE: This is modified from clip.cpp only for LLaVA,
|
||||
// so there might be still unnecessary artifacts hanging around
|
||||
// I'll gradually clean and extend it
|
||||
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
@ -30,6 +30,26 @@
|
|||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <cinttypes>
|
||||
#include <limits>
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<uint8_t> buf;
|
||||
};
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<float> buf;
|
||||
};
|
||||
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
|
@ -50,50 +70,56 @@ static std::string format(const char * fmt, ...) {
|
|||
// key constants
|
||||
//
|
||||
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.%s.block_count"
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.%s.block_count"
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
||||
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
||||
#define KEY_N_POSITIONS "clip.text.context_length"
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
||||
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
||||
#define KEY_N_POSITIONS "clip.text.context_length"
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
|
||||
|
||||
//
|
||||
// tensor name constants
|
||||
//
|
||||
|
||||
#define TN_TOKEN_EMBD "%s.token_embd.weight"
|
||||
#define TN_POS_EMBD "%s.position_embd.weight"
|
||||
#define TN_CLASS_EMBD "v.class_embd"
|
||||
#define TN_PATCH_EMBD "v.patch_embd.weight"
|
||||
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_TEXT_PROJ "text_projection.weight"
|
||||
#define TN_VIS_PROJ "visual_projection.weight"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_TOKEN_EMBD "%s.token_embd.weight"
|
||||
#define TN_POS_EMBD "%s.position_embd.weight"
|
||||
#define TN_CLASS_EMBD "v.class_embd"
|
||||
#define TN_PATCH_EMBD "v.patch_embd.weight"
|
||||
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_TEXT_PROJ "text_projection.weight"
|
||||
#define TN_VIS_PROJ "visual_projection.weight"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
|
||||
|
||||
enum projector_type {
|
||||
|
@ -104,8 +130,8 @@ enum projector_type {
|
|||
};
|
||||
|
||||
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
};
|
||||
|
||||
|
||||
|
@ -165,7 +191,6 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
std::string result;
|
||||
for (size_t pos = 0; ; pos += search.length()) {
|
||||
|
@ -217,7 +242,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|||
}
|
||||
}
|
||||
|
||||
static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
|
||||
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
||||
|
@ -233,31 +258,136 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
|||
return PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
//
|
||||
// image data
|
||||
//
|
||||
#ifdef CLIP_DEBUG_FUNCTIONS
|
||||
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
||||
std::ofstream file(filename, std::ios::binary);
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "Failed to open file for writing: " << filename << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
// PPM header: P6 format, width, height, and max color value
|
||||
file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
|
||||
|
||||
std::vector<uint8_t> buf;
|
||||
};
|
||||
// Write pixel data
|
||||
for (size_t i = 0; i < img.buf.size(); i += 3) {
|
||||
// PPM expects binary data in RGB format, which matches our image buffer
|
||||
file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
|
||||
}
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
file.close();
|
||||
}
|
||||
|
||||
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
||||
std::ofstream file(filename, std::ios::binary);
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "Failed to open file for writing: " << filename << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
|
||||
int bytesPerPixel = 3;
|
||||
int widthInBytes = img.nx * bytesPerPixel;
|
||||
int paddingAmount = (4 - (widthInBytes % 4)) % 4;
|
||||
int stride = widthInBytes + paddingAmount;
|
||||
|
||||
// Bitmap file header
|
||||
unsigned char fileHeader[14] = {
|
||||
'B','M', // Signature
|
||||
0,0,0,0, // Image file size in bytes
|
||||
0,0,0,0, // Reserved
|
||||
54,0,0,0 // Start of pixel array
|
||||
};
|
||||
|
||||
// Total file size
|
||||
fileSize = 54 + (stride * img.ny);
|
||||
fileHeader[2] = (unsigned char)(fileSize);
|
||||
fileHeader[3] = (unsigned char)(fileSize >> 8);
|
||||
fileHeader[4] = (unsigned char)(fileSize >> 16);
|
||||
fileHeader[5] = (unsigned char)(fileSize >> 24);
|
||||
|
||||
// Bitmap information header (BITMAPINFOHEADER)
|
||||
unsigned char infoHeader[40] = {
|
||||
40,0,0,0, // Size of this header (40 bytes)
|
||||
0,0,0,0, // Image width
|
||||
0,0,0,0, // Image height
|
||||
1,0, // Number of color planes
|
||||
24,0, // Bits per pixel
|
||||
0,0,0,0, // No compression
|
||||
0,0,0,0, // Image size (can be 0 for no compression)
|
||||
0,0,0,0, // X pixels per meter (not specified)
|
||||
0,0,0,0, // Y pixels per meter (not specified)
|
||||
0,0,0,0, // Total colors (color table not used)
|
||||
0,0,0,0 // Important colors (all are important)
|
||||
};
|
||||
|
||||
// Width and height in the information header
|
||||
infoHeader[4] = (unsigned char)(img.nx);
|
||||
infoHeader[5] = (unsigned char)(img.nx >> 8);
|
||||
infoHeader[6] = (unsigned char)(img.nx >> 16);
|
||||
infoHeader[7] = (unsigned char)(img.nx >> 24);
|
||||
infoHeader[8] = (unsigned char)(img.ny);
|
||||
infoHeader[9] = (unsigned char)(img.ny >> 8);
|
||||
infoHeader[10] = (unsigned char)(img.ny >> 16);
|
||||
infoHeader[11] = (unsigned char)(img.ny >> 24);
|
||||
|
||||
// Write file headers
|
||||
file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
|
||||
file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
|
||||
|
||||
// Pixel data
|
||||
std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
|
||||
for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
|
||||
for (int x = 0; x < img.nx; ++x) {
|
||||
// Each pixel
|
||||
size_t pixelIndex = (y * img.nx + x) * 3;
|
||||
unsigned char pixel[3] = {
|
||||
img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
|
||||
img.buf[pixelIndex + 1],
|
||||
img.buf[pixelIndex]
|
||||
};
|
||||
file.write(reinterpret_cast<char*>(pixel), 3);
|
||||
}
|
||||
// Write padding for the row
|
||||
file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
|
||||
}
|
||||
|
||||
file.close();
|
||||
}
|
||||
|
||||
// debug function to convert f32 to u8
|
||||
static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
|
||||
dst.nx = src.nx;
|
||||
dst.ny = src.ny;
|
||||
dst.buf.resize(3 * src.nx * src.ny);
|
||||
for (size_t i = 0; i < src.buf.size(); ++i) {
|
||||
dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
std::vector<float> buf;
|
||||
};
|
||||
|
||||
//
|
||||
// clip layers
|
||||
//
|
||||
|
||||
struct clip_hparams {
|
||||
int32_t image_size;
|
||||
int32_t patch_size;
|
||||
int32_t hidden_size;
|
||||
int32_t n_intermediate;
|
||||
int32_t projection_dim;
|
||||
int32_t n_head;
|
||||
int32_t n_layer;
|
||||
|
||||
float eps;
|
||||
|
||||
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
|
||||
|
||||
int32_t image_grid_pinpoints[32];
|
||||
int32_t image_crop_resolution;
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
// attention
|
||||
struct ggml_tensor * k_w;
|
||||
|
@ -287,7 +417,7 @@ struct clip_layer {
|
|||
};
|
||||
|
||||
struct clip_vision_model {
|
||||
struct clip_vision_hparams hparams;
|
||||
struct clip_hparams hparams;
|
||||
|
||||
// embeddings
|
||||
struct ggml_tensor * class_embedding;
|
||||
|
@ -310,6 +440,8 @@ struct clip_vision_model {
|
|||
struct ggml_tensor * mm_2_w = NULL;
|
||||
struct ggml_tensor * mm_2_b = NULL;
|
||||
|
||||
struct ggml_tensor * image_newline = NULL;
|
||||
|
||||
// Yi type models with mlp+normalization projection
|
||||
struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
|
||||
struct ggml_tensor * mm_1_b = NULL;
|
||||
|
@ -364,9 +496,10 @@ struct clip_ctx {
|
|||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
ggml_backend_buffer_t compute_buffer = NULL;
|
||||
ggml_backend_t backend = NULL;
|
||||
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_gallocr_t compute_alloc = NULL;
|
||||
};
|
||||
|
||||
|
@ -379,18 +512,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
const auto & model = ctx->vision_model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
||||
const int num_positions = num_patches + 1;
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
const int n_layer = hparams.n_layer;
|
||||
//const int n_intermediate = hparams.n_intermediate;
|
||||
//const int projection_dim = hparams.projection_dim;
|
||||
const float eps = hparams.eps;
|
||||
int batch_size = imgs->size;
|
||||
const int image_size = hparams.image_size;
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
||||
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
|
||||
const int num_positions = num_patches + 1;
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const float eps = hparams.eps;
|
||||
|
||||
const int batch_size = imgs->size;
|
||||
|
||||
if (ctx->has_llava_projector) {
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
|
@ -540,7 +674,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
|
||||
|
@ -791,10 +924,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
if (idx != -1) {
|
||||
const std::string proj_type = gguf_get_val_str(ctx, idx);
|
||||
new_clip->proj_type = clip_projector_type_from_string(proj_type);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
new_clip->proj_type = PROJECTOR_TYPE_MLP;
|
||||
}
|
||||
|
||||
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
|
||||
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
|
||||
|
@ -920,11 +1053,41 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
|
||||
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
|
||||
|
||||
try {
|
||||
int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
|
||||
int n = gguf_get_arr_n(ctx, idx);
|
||||
const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
|
||||
for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
|
||||
hparams.image_grid_pinpoints[i] = pinpoints[i];
|
||||
}
|
||||
if (n < 32)
|
||||
hparams.image_grid_pinpoints[n] = 0;
|
||||
} catch (std::runtime_error & e) {
|
||||
hparams.image_grid_pinpoints[0]=0;
|
||||
}
|
||||
|
||||
try {
|
||||
int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
|
||||
strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
|
||||
} catch (std::runtime_error & e) {
|
||||
strcpy(hparams.mm_patch_merge_type, "flat");
|
||||
}
|
||||
|
||||
try {
|
||||
hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
|
||||
} catch(const std::exception& e) {
|
||||
hparams.image_crop_resolution = hparams.image_size;
|
||||
}
|
||||
|
||||
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
|
||||
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
||||
|
||||
const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean);
|
||||
const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std);
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
|
||||
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
|
||||
new_clip->image_mean[i] = mean_data[i];
|
||||
new_clip->image_std[i] = std_data[i];
|
||||
}
|
||||
|
||||
if (verbosity >= 2) {
|
||||
|
@ -936,13 +1099,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
printf("v_projection_dim %d\n", hparams.projection_dim);
|
||||
printf("v_n_head %d\n", hparams.n_head);
|
||||
printf("v_n_layer %d\n", hparams.n_layer);
|
||||
printf("v_eps %f\n", hparams.eps);
|
||||
printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||
printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||
printf("v_image_grid_pinpoints: ");
|
||||
for (int i = 0; i < 32 & hparams.image_grid_pinpoints[i]!=0; ++i) {
|
||||
printf("%d ", hparams.image_grid_pinpoints[i]);
|
||||
}
|
||||
printf("\n");
|
||||
printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||
|
||||
}
|
||||
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
try {
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
} catch(const std::exception& e) {
|
||||
fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
|
||||
}
|
||||
|
||||
// LLaVA projection
|
||||
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
|
@ -968,40 +1145,43 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
|
||||
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
}
|
||||
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
try {
|
||||
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
||||
// fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||
} catch (std::runtime_error & e) { }
|
||||
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projection
|
||||
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
||||
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
|
||||
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
||||
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
||||
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
|
||||
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
|
||||
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
|
||||
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
||||
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
|
||||
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
||||
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
||||
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
|
||||
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
|
||||
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
|
||||
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
|
||||
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
|
||||
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
|
||||
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
|
||||
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
|
||||
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
|
||||
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
|
||||
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
|
||||
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
|
||||
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
|
||||
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
|
||||
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
|
||||
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
|
||||
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
|
||||
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
|
||||
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
|
||||
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
|
||||
}
|
||||
else {
|
||||
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
|
||||
} else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
|
||||
vision_model.layers.resize(hparams.n_layer);
|
||||
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
auto & layer = vision_model.layers[il];
|
||||
layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
|
||||
|
@ -1084,24 +1264,255 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|||
return true;
|
||||
}
|
||||
|
||||
// normalize: x = (x - mean) / std
|
||||
// TODO: implement bicubic interpolation instead of linear.
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
||||
// Linear interpolation between two points
|
||||
inline float lerp(float s, float e, float t) {
|
||||
return s + (e - s) * t;
|
||||
}
|
||||
// Bilinear resize function
|
||||
static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
|
||||
float x_ratio = static_cast<float>(src.nx - 1) / target_width;
|
||||
float y_ratio = static_cast<float>(src.ny - 1) / target_height;
|
||||
|
||||
for (int y = 0; y < target_height; y++) {
|
||||
for (int x = 0; x < target_width; x++) {
|
||||
float px = x_ratio * x;
|
||||
float py = y_ratio * y;
|
||||
int x_floor = static_cast<int>(px);
|
||||
int y_floor = static_cast<int>(py);
|
||||
float x_lerp = px - x_floor;
|
||||
float y_lerp = py - y_floor;
|
||||
|
||||
for (int c = 0; c < 3; c++) {
|
||||
float top = lerp(
|
||||
static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
|
||||
static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
|
||||
x_lerp
|
||||
);
|
||||
float bottom = lerp(
|
||||
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
|
||||
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
|
||||
x_lerp
|
||||
);
|
||||
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
|
||||
static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
|
||||
dst->nx = src->nx;
|
||||
dst->ny = src->ny;
|
||||
dst->buf.resize(src->buf.size());
|
||||
|
||||
for (size_t i = 0; i < src->buf.size(); ++i) {
|
||||
int c = i % 3; // rgb
|
||||
dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
|
||||
}
|
||||
}
|
||||
|
||||
inline float clip(float x, float lower, float upper) {
|
||||
return std::max(lower, std::min(x, upper));
|
||||
}
|
||||
|
||||
static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
|
||||
const int nx = img.nx;
|
||||
const int ny = img.ny;
|
||||
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
|
||||
float Cc;
|
||||
float C[5];
|
||||
float d0, d2, d3, a0, a1, a2, a3;
|
||||
int i, j, k, jj;
|
||||
int x, y;
|
||||
float dx, dy;
|
||||
float tx, ty;
|
||||
|
||||
tx = (float)nx / (float)target_width;
|
||||
ty = (float)ny / (float)target_height;
|
||||
|
||||
// Bicubic interpolation; adapted from ViT.cpp, inspired from :
|
||||
// -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
|
||||
// -> https://en.wikipedia.org/wiki/Bicubic_interpolation
|
||||
|
||||
for (i = 0; i < target_height; i++) {
|
||||
for (j = 0; j < target_width; j++) {
|
||||
x = (int)(tx * j);
|
||||
y = (int)(ty * i);
|
||||
|
||||
dx = tx * j - x;
|
||||
dy = ty * i - y;
|
||||
|
||||
for (k = 0; k < 3; k++) {
|
||||
for (jj = 0; jj <= 3; jj++) {
|
||||
d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||
d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||
d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||
a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||
|
||||
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
||||
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
||||
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
|
||||
|
||||
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
|
||||
|
||||
d0 = C[0] - C[1];
|
||||
d2 = C[2] - C[1];
|
||||
d3 = C[3] - C[1];
|
||||
a0 = C[1];
|
||||
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
||||
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
||||
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
|
||||
Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
|
||||
|
||||
const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
|
||||
dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// llava-1.6 type of resize_and_pad (black)
|
||||
static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) {
|
||||
int target_width = target_resolution.first;
|
||||
int target_height = target_resolution.second;
|
||||
|
||||
float scale_w = static_cast<float>(target_width) / image.nx;
|
||||
float scale_h = static_cast<float>(target_height) / image.ny;
|
||||
|
||||
int new_width, new_height;
|
||||
|
||||
if (scale_w < scale_h) {
|
||||
new_width = target_width;
|
||||
new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
|
||||
} else {
|
||||
new_height = target_height;
|
||||
new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
|
||||
}
|
||||
|
||||
clip_image_u8 resized_image;
|
||||
// bilinear_resize(image, resized_image, new_width, new_height);
|
||||
bicubic_resize(image, resized_image, new_width, new_height);
|
||||
|
||||
clip_image_u8 padded_image;
|
||||
padded_image.nx = target_width;
|
||||
padded_image.ny = target_height;
|
||||
padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
|
||||
|
||||
// Calculate padding offsets
|
||||
int pad_x = (target_width - new_width) / 2;
|
||||
int pad_y = (target_height - new_height) / 2;
|
||||
|
||||
// Copy the resized image into the center of the padded buffer
|
||||
for (int y = 0; y < new_height; ++y) {
|
||||
for (int x = 0; x < new_width; ++x) {
|
||||
for (int c = 0; c < 3; ++c) {
|
||||
padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
|
||||
}
|
||||
}
|
||||
}
|
||||
image_output = std::move(padded_image);
|
||||
}
|
||||
|
||||
/**
|
||||
* Selects the best resolution from a list of possible resolutions based on the original size.
|
||||
*
|
||||
* @param original_size The original size of the image in the format (width, height).
|
||||
* @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
|
||||
* @return The best fit resolution in the format (width, height).
|
||||
*/
|
||||
static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) {
|
||||
int original_width = original_size.first;
|
||||
int original_height = original_size.second;
|
||||
std::pair<int, int> best_fit;
|
||||
int max_effective_resolution = 0;
|
||||
int min_wasted_resolution = std::numeric_limits<int>::max();
|
||||
|
||||
for (const auto& resolution : possible_resolutions) {
|
||||
int width = resolution.first;
|
||||
int height = resolution.second;
|
||||
float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
|
||||
int downscaled_width = static_cast<int>(original_width * scale);
|
||||
int downscaled_height = static_cast<int>(original_height * scale);
|
||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||
int wasted_resolution = (width * height) - effective_resolution;
|
||||
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||
max_effective_resolution = effective_resolution;
|
||||
min_wasted_resolution = wasted_resolution;
|
||||
best_fit = resolution;
|
||||
}
|
||||
}
|
||||
|
||||
return best_fit;
|
||||
}
|
||||
|
||||
static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
|
||||
std::vector<clip_image_u8*> patches;
|
||||
int width = image.nx;
|
||||
int height = image.ny;
|
||||
for (int i = 0; i < height; i += patch_size) {
|
||||
for (int j = 0; j < width; j += patch_size) {
|
||||
clip_image_u8 *patch = clip_image_u8_init();
|
||||
patch->nx = std::min(patch_size, width - j);
|
||||
patch->ny = std::min(patch_size, height - i);
|
||||
patch->buf.resize(3 * patch->nx * patch->ny);
|
||||
for (int y = 0; y < patch->ny; ++y) {
|
||||
for (int x = 0; x < patch->nx; ++x) {
|
||||
for (int c = 0; c < 3; ++c) {
|
||||
patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c];
|
||||
}
|
||||
}
|
||||
}
|
||||
patches.push_back(patch);
|
||||
}
|
||||
}
|
||||
return patches;
|
||||
}
|
||||
|
||||
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
||||
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
|
||||
bool pad_to_square = true;
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
}
|
||||
auto & params = ctx->vision_model.hparams;
|
||||
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
|
||||
if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
|
||||
pad_to_square = false;
|
||||
}
|
||||
// free the previous res_imgs if any set
|
||||
if (res_imgs.size > 0 && res_imgs.size < 100) {
|
||||
for (size_t i = 0; i < res_imgs.size; i++) {
|
||||
clip_image_f32_free(&(res_imgs.data[i]));
|
||||
}
|
||||
delete[] res_imgs.data;
|
||||
}
|
||||
res_imgs.data = nullptr;
|
||||
res_imgs.size = 0;
|
||||
|
||||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
||||
|
||||
clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
|
||||
if (pad2square && img->nx != img->ny) {
|
||||
if (pad_to_square && img->nx != img->ny) {
|
||||
int longer_side = std::max(img->nx, img->ny);
|
||||
temp->nx = longer_side;
|
||||
temp->ny = longer_side;
|
||||
temp->buf.resize(3 * longer_side * longer_side);
|
||||
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
|
||||
|
||||
// fill with background color
|
||||
for (size_t i = 0; i < temp->buf.size(); i++) {
|
||||
|
@ -1119,18 +1530,63 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||
}
|
||||
}
|
||||
} else {
|
||||
temp->nx = img->nx;
|
||||
temp->ny = img->ny;
|
||||
temp->buf.resize(img->buf.size());
|
||||
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
|
||||
if (params.image_grid_pinpoints[0] != 0) {
|
||||
// "spatial_unpad" with "anyres" processing for llava-1.6
|
||||
std::vector<std::pair<int, int>> possible_resolutions;
|
||||
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
|
||||
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
||||
}
|
||||
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
|
||||
// clip_image_save_to_bmp(*img, "input.bmp");
|
||||
resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
|
||||
// clip_image_save_to_bmp(*temp, "resized.bmp");
|
||||
// visually verify normalized image:
|
||||
// normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
|
||||
// {
|
||||
// clip_image_u8 * temp2 = clip_image_u8_init();
|
||||
// clip_image_convert_f32_to_u8(*res, *temp2);
|
||||
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
|
||||
// clip_image_u8_free(temp2);
|
||||
// }
|
||||
|
||||
std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
|
||||
|
||||
clip_image_u8 *image_original_resize = clip_image_u8_init();
|
||||
// bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
|
||||
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
|
||||
patches.insert(patches.begin(), image_original_resize);
|
||||
// clip_image_f32_batch_init(patches.size());
|
||||
res_imgs.size = patches.size();
|
||||
res_imgs.data = new clip_image_f32[res_imgs.size];
|
||||
int num=0;
|
||||
for (auto& patch : patches) {
|
||||
normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
|
||||
num++;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < patches.size(); i++) {
|
||||
// printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||
clip_image_u8_free(patches[i]);
|
||||
}
|
||||
|
||||
clip_image_u8_free(temp);
|
||||
|
||||
return true;
|
||||
} else {
|
||||
temp->nx = img->nx;
|
||||
temp->ny = img->ny;
|
||||
temp->buf.resize(img->buf.size());
|
||||
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
|
||||
}
|
||||
}
|
||||
|
||||
const int nx = temp->nx;
|
||||
const int ny = temp->ny;
|
||||
// clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
|
||||
|
||||
const int nx2 = ctx->vision_model.hparams.image_size;
|
||||
const int ny2 = ctx->vision_model.hparams.image_size;
|
||||
|
||||
clip_image_f32 * res = clip_image_f32_init();
|
||||
res->nx = nx2;
|
||||
res->ny = ny2;
|
||||
res->buf.resize(3 * nx2 * ny2);
|
||||
|
@ -1184,9 +1640,25 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||
}
|
||||
clip_image_u8_free(temp);
|
||||
|
||||
// {
|
||||
// clip_image_u8 * temp2 = clip_image_u8_init();
|
||||
// clip_image_convert_f32_to_u8(*res, *temp2);
|
||||
// clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
|
||||
// clip_image_u8_free(temp2);
|
||||
// }
|
||||
// res_imgs.push_back(res);
|
||||
|
||||
res_imgs.size = 1;
|
||||
res_imgs.data = new clip_image_f32[res_imgs.size];
|
||||
res_imgs.data[0] = std::move(*res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.image_newline;
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
@ -1194,6 +1666,42 @@ void clip_free(clip_ctx * ctx) {
|
|||
delete ctx;
|
||||
}
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
||||
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||
}
|
||||
|
||||
int32_t clip_image_size(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.image_size;
|
||||
}
|
||||
|
||||
int32_t clip_patch_size(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.patch_size;
|
||||
}
|
||||
|
||||
int32_t clip_hidden_size(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.hidden_size;
|
||||
}
|
||||
|
||||
const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.mm_patch_merge_type;
|
||||
}
|
||||
|
||||
const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.image_grid_pinpoints;
|
||||
}
|
||||
|
||||
int clip_n_patches(const struct clip_ctx * ctx) {
|
||||
const auto & params = ctx->vision_model.hparams;
|
||||
|
||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
n_patches /= 4;
|
||||
}
|
||||
|
||||
return n_patches;
|
||||
}
|
||||
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
|
@ -1213,7 +1721,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
}
|
||||
|
||||
int batch_size = imgs->size;
|
||||
if(ctx->has_llava_projector) {
|
||||
if (ctx->has_llava_projector) {
|
||||
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
||||
}
|
||||
|
||||
|
@ -1224,9 +1732,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
// set inputs
|
||||
const auto & model = ctx->vision_model;
|
||||
const auto & hparams = model.hparams;
|
||||
const int image_size = hparams.image_size;
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
||||
const int num_positions = num_patches + 1;
|
||||
|
||||
{
|
||||
|
@ -1301,11 +1810,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
|
||||
// copy the embeddings to the location passed by the user
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
|
||||
|
||||
ggml_type type = GGML_TYPE_Q4_1;
|
||||
|
||||
assert(itype < GGML_TYPE_COUNT);
|
||||
|
@ -1494,26 +2003,13 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
return ctx->vision_model.mm_2_b->ne[0];
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
return ctx->vision_model.mm_3_b->ne[0];
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
int clip_n_patches(const struct clip_ctx * ctx) {
|
||||
auto & params = ctx->vision_model.hparams;
|
||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
n_patches /= 4;
|
||||
}
|
||||
return n_patches;
|
||||
}
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
||||
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue