WIP: minicpmv — add MiniCPM-V vision architecture (arch/tensor names, resampler weights, UHD image preprocessing)
This commit is contained in:
parent
d0068ef0ed
commit
4a7ab89d75
9 changed files with 491 additions and 77 deletions
|
@ -3,6 +3,7 @@
|
|||
#include "llama-impl.h"
|
||||
|
||||
#include <map>
|
||||
#include <exception>
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
|
@ -65,12 +66,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
static const std::map<vision_arch, const char *> VISION_ARCH_NAMES = {
|
||||
{ VISION_ARCH_LLAVA, "llava" },
|
||||
{ VISION_ARCH_MOBILEVLM, "mobilevlm" },
|
||||
{ VISION_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_GENERAL_TYPE, "general.type" },
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
|
@ -1367,6 +1362,30 @@ static const std::map<vision_arch, std::map<vision_tensor, const char *>> VISION
|
|||
{ VISION_TENSOR_POST_NORM, "v.post_norm" },
|
||||
}
|
||||
},
|
||||
{
|
||||
VISION_ARCH_MINICPMV,
|
||||
{
|
||||
{ VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
|
||||
{ VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" },
|
||||
{ VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
|
||||
{ VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
|
||||
{ VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
|
||||
{ VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
|
||||
{ VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" },
|
||||
{ VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
|
||||
{ VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
|
||||
{ VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
|
||||
{ VISION_TENSOR_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" },
|
||||
{ VISION_TENSOR_RESMPL_ATTN_IN, "v.resmpl.attn_in" },
|
||||
{ VISION_TENSOR_RESMPL_ATTN_OUT, "v.resmpl.attn_out" },
|
||||
{ VISION_TENSOR_RESMPL_KV_PROJ, "v.resmpl.kv_proj" },
|
||||
{ VISION_TENSOR_RESMPL_NORM_POST, "v.resmpl.norm_post" },
|
||||
{ VISION_TENSOR_RESMPL_NORM_KV, "v.resmpl.norm_kv" },
|
||||
{ VISION_TENSOR_RESMPL_NORM_Q, "v.resmpl.norm_q" },
|
||||
{ VISION_TENSOR_RESMPL_PROJ, "v.resmpl.proj" },
|
||||
{ VISION_TENSOR_RESMPL_QUERY, "v.resmpl.query" },
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
|
@ -1576,16 +1595,6 @@ llm_arch llm_arch_from_string(const std::string & name) {
|
|||
return LLM_ARCH_UNKNOWN;
|
||||
}
|
||||
|
||||
// Resolve a vision architecture enum from its GGUF name string.
// Returns VISION_ARCH_UNKNOWN when the name is not registered in
// VISION_ARCH_NAMES.
vision_arch vision_arch_from_string(const std::string & name) {
    for (const auto & [arch, arch_name] : VISION_ARCH_NAMES) { // NOLINT
        if (name == arch_name) {
            return arch;
        }
    }

    return VISION_ARCH_UNKNOWN;
}
|
||||
|
||||
// Return the static metadata entry registered for `tensor`.
// Throws std::out_of_range when the tensor has no entry in
// LLM_TENSOR_INFOS (a programming error, not a model error).
const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
    const auto & info = LLM_TENSOR_INFOS.at(tensor);
    return info;
}
|
||||
|
|
|
@ -73,6 +73,7 @@ enum vision_arch {
|
|||
VISION_ARCH_UNKNOWN,
|
||||
VISION_ARCH_LLAVA,
|
||||
VISION_ARCH_MOBILEVLM,
|
||||
VISION_ARCH_MINICPMV,
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
|
@ -372,6 +373,16 @@ enum vision_tensor {
|
|||
VISION_TENSOR_ENC_FFN_DOWN,
|
||||
VISION_TENSOR_PRE_NORM,
|
||||
VISION_TENSOR_POST_NORM,
|
||||
// minicpmv
|
||||
VISION_TENSOR_RESMPL_POS_EMBD_K,
|
||||
VISION_TENSOR_RESMPL_ATTN_IN,
|
||||
VISION_TENSOR_RESMPL_ATTN_OUT,
|
||||
VISION_TENSOR_RESMPL_KV_PROJ,
|
||||
VISION_TENSOR_RESMPL_NORM_POST,
|
||||
VISION_TENSOR_RESMPL_NORM_KV,
|
||||
VISION_TENSOR_RESMPL_NORM_Q,
|
||||
VISION_TENSOR_RESMPL_PROJ,
|
||||
VISION_TENSOR_RESMPL_QUERY,
|
||||
};
|
||||
|
||||
enum llm_tensor_layer {
|
||||
|
|
|
@ -96,7 +96,7 @@ struct llama_hparams {
|
|||
float f_max_alibi_bias = 0.0f;
|
||||
float f_logit_scale = 0.0f;
|
||||
|
||||
// Additional scale factors (Granite/Granite MoE)
|
||||
// Additional scale factors (Granite/Granite MoE/MiniCPM)
|
||||
float f_residual_scale = 0.0f;
|
||||
float f_embedding_scale = 0.0f;
|
||||
float f_attention_scale = 0.0f;
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-vision.h"
|
||||
#include "llama-model-loader.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
@ -1263,6 +1264,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true);
|
||||
ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true);
|
||||
{
|
||||
std::string name;
|
||||
ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
|
||||
|
@ -1289,14 +1291,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
|
||||
// arch-specific CLIP hparams
|
||||
switch (vparams.arch) {
|
||||
case VISION_ARCH_LLAVA:
|
||||
case VISION_ARCH_MOBILEVLM:
|
||||
{
|
||||
ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true);
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
// switch (vparams.arch) {
|
||||
// case VISION_ARCH_LLAVA:
|
||||
// default: (void)0;
|
||||
// }
|
||||
}
|
||||
|
||||
void llama_model::load_vocab(llama_model_loader & ml) {
|
||||
|
@ -3457,6 +3455,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
|
||||
for (int i = 0; i < n_vlayer; ++i) {
|
||||
auto & layer = clip.layers[i];
|
||||
|
||||
layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd});
|
||||
layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_vembd});
|
||||
layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd});
|
||||
layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_vembd});
|
||||
layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd});
|
||||
layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_vembd});
|
||||
|
||||
layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff});
|
||||
layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_vff});
|
||||
layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd});
|
||||
layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_vembd});
|
||||
|
||||
layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd});
|
||||
layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_vembd});
|
||||
layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd});
|
||||
layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_vembd});
|
||||
|
||||
layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd});
|
||||
layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_vembd});
|
||||
}
|
||||
} break;
|
||||
case VISION_ARCH_MINICPMV:
|
||||
{
|
||||
clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
|
||||
clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
|
||||
|
||||
// TODO: load all resampler tensors
|
||||
|
||||
for (int i = 0; i < n_vlayer; ++i) {
|
||||
auto & layer = clip.layers[i];
|
||||
|
||||
|
|
|
@ -63,6 +63,10 @@ uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) {
|
|||
return clip_model.mm_2_b->ne[0];
|
||||
} else if (proj_type == CLIP_PROJECTOR_TYPE_LDPV2) {
|
||||
return clip_model.mm_model_peg_0_b->ne[0];
|
||||
} else if (proj_type == CLIP_PROJECTOR_TYPE_MINICPMV_2_5) {
|
||||
return 4096;
|
||||
} else if (proj_type == CLIP_PROJECTOR_TYPE_MINICPMV_2_6) {
|
||||
return 3584;
|
||||
} else {
|
||||
GGML_ASSERT(false && "invalid proj type");
|
||||
}
|
||||
|
@ -243,6 +247,173 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, std::vector<flo
|
|||
}
|
||||
}
|
||||
|
||||
#define LLAMA_LOG_DEBUG LLAMA_LOG_INFO
|
||||
|
||||
// minicpmv preprocessor
|
||||
struct minicpmv_preprocessor {
|
||||
int ensure_divide(int length, int patch_size) {
|
||||
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
|
||||
}
|
||||
|
||||
std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
||||
int width = original_size.first;
|
||||
int height = original_size.second;
|
||||
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
|
||||
float r = static_cast<float>(width) / height;
|
||||
height = static_cast<int>(scale_resolution / std::sqrt(r));
|
||||
width = static_cast<int>(height * r);
|
||||
}
|
||||
int best_width = ensure_divide(width, patch_size);
|
||||
int best_height = ensure_divide(height, patch_size);
|
||||
return std::make_pair(best_width, best_height);
|
||||
}
|
||||
|
||||
std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
||||
int width, height;
|
||||
std::tie(width, height) = original_size;
|
||||
int grid_x, grid_y;
|
||||
std::tie(grid_x, grid_y) = grid;
|
||||
|
||||
int refine_width = ensure_divide(width, grid_x);
|
||||
int refine_height = ensure_divide(height, grid_y);
|
||||
|
||||
int grid_width = refine_width / grid_x;
|
||||
int grid_height = refine_height / grid_y;
|
||||
|
||||
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
|
||||
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
|
||||
int best_grid_width, best_grid_height;
|
||||
std::tie(best_grid_width, best_grid_height) = best_grid_size;
|
||||
|
||||
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
|
||||
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
|
||||
return refine_size;
|
||||
}
|
||||
|
||||
std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
||||
std::vector<int> candidate_split_grids_nums;
|
||||
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
||||
if (i == 1 || i > max_slice_nums) {
|
||||
continue;
|
||||
}
|
||||
candidate_split_grids_nums.push_back(i);
|
||||
}
|
||||
|
||||
std::vector<std::pair<int, int>> candidate_grids;
|
||||
for (int split_grids_nums : candidate_split_grids_nums) {
|
||||
int m = 1;
|
||||
while (m <= split_grids_nums) {
|
||||
if (split_grids_nums % m == 0) {
|
||||
candidate_grids.emplace_back(m, split_grids_nums / m);
|
||||
}
|
||||
++m;
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<int, int> best_grid{1, 1};
|
||||
float min_error = std::numeric_limits<float>::infinity();
|
||||
for (const auto& grid : candidate_grids) {
|
||||
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
|
||||
if (error < min_error) {
|
||||
best_grid = grid;
|
||||
min_error = error;
|
||||
}
|
||||
}
|
||||
return best_grid;
|
||||
}
|
||||
|
||||
std::vector<std::vector<clip_image_u8>> uhd_slice_image(
|
||||
const clip_image_u8 & img,
|
||||
const int max_slice_nums = 9,
|
||||
const int scale_resolution = 448,
|
||||
const int patch_size = 14) {
|
||||
const std::pair<int, int> original_size={img.nx,img.ny};
|
||||
const int original_width = img.nx;
|
||||
const int original_height = img.ny;
|
||||
const float log_ratio = log(1.0*original_width/original_height);
|
||||
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||
|
||||
std::vector<std::vector<clip_image_u8>> images;
|
||||
LLAMA_LOG_DEBUG("%s: multiple %d\n", __func__, multiple);
|
||||
images.push_back(std::vector<clip_image_u8>());
|
||||
|
||||
if (multiple <= 1) {
|
||||
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
|
||||
clip_image_u8 source_image;
|
||||
bicubic_resize(img, source_image, best_size.first, best_size.second);
|
||||
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
|
||||
images[images.size()-1].push_back(source_image);
|
||||
}
|
||||
else if (multiple > 1) {
|
||||
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
|
||||
clip_image_u8 source_image;
|
||||
bicubic_resize(img, source_image, best_size.first, best_size.second);
|
||||
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
||||
LLAMA_LOG_DEBUG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img.nx, img.ny, best_size.first, best_size.second);
|
||||
images[images.size()-1].push_back(source_image);
|
||||
|
||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||
LLAMA_LOG_DEBUG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img.nx, img.ny, best_grid.first, best_grid.second);
|
||||
|
||||
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
||||
clip_image_u8 refine_image;
|
||||
bicubic_resize(img, refine_image, refine_size.first, refine_size.second);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image.nx, refine_image.ny, refine_size.first, refine_size.second);
|
||||
|
||||
// split_to_patches
|
||||
int width = refine_image.nx;
|
||||
int height = refine_image.ny;
|
||||
int grid_x = int(width / best_grid.first);
|
||||
int grid_y = int(height / best_grid.second);
|
||||
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
|
||||
images.push_back(std::vector<clip_image_u8>());
|
||||
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
|
||||
clip_image_u8 patch;
|
||||
patch.nx = grid_x;
|
||||
patch.ny = grid_y;
|
||||
patch.buf.resize(3 * patch.nx * patch.ny);
|
||||
for (int y = patches_i; y < patches_i + grid_y; ++y) {
|
||||
for (int x = patches_j; x < patches_j + grid_x; ++x) {
|
||||
const int i = 3 * (y * refine_image.nx + x);
|
||||
const int j = 3 * ((y-patches_i) * patch.nx + (x-patches_j));
|
||||
patch.buf[j] = refine_image.buf[i];
|
||||
patch.buf[j+1] = refine_image.buf[i+1];
|
||||
patch.buf[j+2] = refine_image.buf[i+2];
|
||||
}
|
||||
}
|
||||
images[images.size()-1].push_back(patch);
|
||||
}
|
||||
}
|
||||
}
|
||||
return images;
|
||||
}
|
||||
};
|
||||
|
||||
// Preprocess an image for MiniCPM-V: UHD-slice it into an overview image
// plus grid slices, then normalize each slice to float using the model's
// image mean/std. Returns the filled patch container.
static llama_vision_patches clip_image_preprocess_minicpmv(const clip_context & ctx, const clip_image_u8 & img) {
    auto & params = ctx.model->hparams;
    GGML_ASSERT(params.arch == VISION_ARCH_MINICPMV);

    static const int max_slice_nums = 9;
    minicpmv_preprocessor preprocessor;
    std::vector<std::vector<clip_image_u8>> imgs = preprocessor.uhd_slice_image(img, max_slice_nums);

    llama_vision_patches output_patches;
    output_patches.n_px = clip_n_patches_x(ctx);
    output_patches.n_py = clip_n_patches_y(ctx);
    output_patches.px = params.patch_size;
    output_patches.py = params.patch_size;

    for (size_t i = 0; i < imgs.size(); ++i) {
        for (size_t j = 0; j < imgs[i].size(); ++j) {
            std::vector<float> res;
            normalize_image_u8_to_f32(imgs[i][j], res, params.image_mean, params.image_std);
            output_patches.buf.push_back(std::move(res));
        }
    }

    // fix: the function previously fell off the end without returning,
    // which is undefined behavior for a non-void return type
    return output_patches;
}
|
||||
|
||||
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
||||
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
||||
static llama_vision_patches clip_image_preprocess(const clip_context & ctx, const clip_image_u8 & img) {
|
||||
|
@ -724,8 +895,10 @@ struct llama_vision_patches * llama_vision_patches_init(
|
|||
struct llama_context * ctx,
|
||||
llama_vision_bitmap * bmp) {
|
||||
clip_context & vctx = ctx->vctx;
|
||||
llama_vision_patches p = clip_image_preprocess(vctx, *bmp);
|
||||
return new llama_vision_patches(p);
|
||||
if (vctx.model->hparams.arch == VISION_ARCH_MINICPMV) {
|
||||
return new llama_vision_patches(clip_image_preprocess_minicpmv(vctx, *bmp));
|
||||
}
|
||||
return new llama_vision_patches(clip_image_preprocess(vctx, *bmp));
|
||||
}
|
||||
|
||||
void llama_vision_patches_free(llama_vision_patches * p) {
|
||||
|
|
|
@ -11,6 +11,8 @@ enum clip_projector_type {
|
|||
CLIP_PROJECTOR_TYPE_UNKNOWN,
|
||||
CLIP_PROJECTOR_TYPE_MLP,
|
||||
CLIP_PROJECTOR_TYPE_LDPV2,
|
||||
CLIP_PROJECTOR_TYPE_MINICPMV_2_5,
|
||||
CLIP_PROJECTOR_TYPE_MINICPMV_2_6,
|
||||
};
|
||||
|
||||
enum mm_patch_merge {
|
||||
|
@ -36,7 +38,7 @@ struct clip_hparams {
|
|||
float eps;
|
||||
|
||||
clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_UNKNOWN;
|
||||
mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_FLAT;
|
||||
mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_UNKNOWN;
|
||||
|
||||
std::array<float, 3> image_mean;
|
||||
std::array<float, 3> image_std;
|
||||
|
@ -107,6 +109,26 @@ struct clip_vision_model {
|
|||
struct ggml_tensor * mm_model_peg_0_w = nullptr;
|
||||
struct ggml_tensor * mm_model_peg_0_b = nullptr;
|
||||
|
||||
// MINICPMV projection (resampler weights)
// fix: initialized to nullptr like the surrounding projection tensors
// (e.g. mm_model_peg_0_w above), so a model that does not provide these
// tensors leaves well-defined null pointers instead of indeterminate values.
struct ggml_tensor * mm_model_pos_embed_k = nullptr;
struct ggml_tensor * mm_model_query       = nullptr;
struct ggml_tensor * mm_model_proj        = nullptr;
struct ggml_tensor * mm_model_kv_proj     = nullptr;
struct ggml_tensor * mm_model_attn_q_w    = nullptr;
struct ggml_tensor * mm_model_attn_q_b    = nullptr;
struct ggml_tensor * mm_model_attn_k_w    = nullptr;
struct ggml_tensor * mm_model_attn_k_b    = nullptr;
struct ggml_tensor * mm_model_attn_v_w    = nullptr;
struct ggml_tensor * mm_model_attn_v_b    = nullptr;
struct ggml_tensor * mm_model_attn_o_w    = nullptr;
struct ggml_tensor * mm_model_attn_o_b    = nullptr;
struct ggml_tensor * mm_model_ln_q_w      = nullptr;
struct ggml_tensor * mm_model_ln_q_b      = nullptr;
struct ggml_tensor * mm_model_ln_kv_w     = nullptr;
struct ggml_tensor * mm_model_ln_kv_b     = nullptr;
struct ggml_tensor * mm_model_ln_post_w   = nullptr;
struct ggml_tensor * mm_model_ln_post_b   = nullptr;

struct ggml_tensor * image_newline = nullptr;
|
||||
};
|
||||
|
||||
|
@ -135,6 +157,18 @@ struct llama_vision_patches {
|
|||
std::vector<std::vector<float>> buf; // preprocessed image data
|
||||
};
|
||||
|
||||
// Resolve a vision architecture from its GGUF name string.
// Unrecognized names map to VISION_ARCH_UNKNOWN.
inline vision_arch vision_arch_from_string(const std::string & name) {
    if (name == "minicpmv") {
        return VISION_ARCH_MINICPMV;
    }
    if (name == "mobilevlm") {
        return VISION_ARCH_MOBILEVLM;
    }
    if (name == "llava") {
        return VISION_ARCH_LLAVA;
    }
    return VISION_ARCH_UNKNOWN;
}
|
||||
|
||||
inline mm_patch_merge mm_patch_merge_from_name(std::string & name) {
|
||||
if (name == "flat") {
|
||||
return MM_PATCH_MERGE_FLAT;
|
||||
|
@ -149,6 +183,10 @@ inline clip_projector_type clip_projector_type_from_name(std::string & name) {
|
|||
return CLIP_PROJECTOR_TYPE_MLP;
|
||||
} else if (name == "ldpv2") {
|
||||
return CLIP_PROJECTOR_TYPE_LDPV2;
|
||||
} else if (name == "minicpmv-2.5") {
|
||||
return CLIP_PROJECTOR_TYPE_MINICPMV_2_5;
|
||||
} else if (name == "minicpmv-2.6") {
|
||||
return CLIP_PROJECTOR_TYPE_MINICPMV_2_6;
|
||||
}
|
||||
return CLIP_PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue