whitespace corrections
This commit is contained in:
parent
37a147ebf9
commit
7dcadb4ec3
5 changed files with 46 additions and 50 deletions
|
@ -1,7 +1,6 @@
|
||||||
// NOTE: This is modified from clip.cpp only for LLaVA,
|
// NOTE: This is modified from clip.cpp only for LLaVA,
|
||||||
// so there might still be unnecessary artifacts hanging around
|
// so there might still be unnecessary artifacts hanging around
|
||||||
// I'll gradually clean and extend it
|
// I'll gradually clean and extend it
|
||||||
|
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
|
@ -965,7 +964,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
hparams.image_grid_pinpoints[i] = pinpoints[i];
|
hparams.image_grid_pinpoints[i] = pinpoints[i];
|
||||||
}
|
}
|
||||||
hparams.image_grid_pinpoints[n] = 0;
|
hparams.image_grid_pinpoints[n] = 0;
|
||||||
} catch (std::runtime_error & e) {
|
} catch (std::runtime_error & e) {
|
||||||
hparams.image_grid_pinpoints[0]=0;
|
hparams.image_grid_pinpoints[0]=0;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
|
@ -979,7 +978,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
}
|
}
|
||||||
catch(const std::exception& e) {
|
catch(const std::exception& e) {
|
||||||
hparams.image_crop_resolution = hparams.image_size;
|
hparams.image_crop_resolution = hparams.image_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
|
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
|
||||||
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
||||||
|
@ -1022,7 +1021,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
{
|
{
|
||||||
fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
|
fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
// LLaVA projection
|
// LLaVA projection
|
||||||
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||||
|
@ -1270,12 +1269,12 @@ void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filenam
|
||||||
inline float lerp(float s, float e, float t) {
|
inline float lerp(float s, float e, float t) {
|
||||||
return s + (e - s) * t;
|
return s + (e - s) * t;
|
||||||
}
|
}
|
||||||
// Bilinear resize function
|
// Bilinear resize function
|
||||||
void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
|
void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
|
||||||
dst.nx = target_width;
|
dst.nx = target_width;
|
||||||
dst.ny = target_height;
|
dst.ny = target_height;
|
||||||
dst.buf.resize(3 * target_width * target_height);
|
dst.buf.resize(3 * target_width * target_height);
|
||||||
|
|
||||||
float x_ratio = static_cast<float>(src.nx - 1) / target_width;
|
float x_ratio = static_cast<float>(src.nx - 1) / target_width;
|
||||||
float y_ratio = static_cast<float>(src.ny - 1) / target_height;
|
float y_ratio = static_cast<float>(src.ny - 1) / target_height;
|
||||||
|
|
||||||
|
@ -1343,11 +1342,11 @@ void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, co
|
||||||
dst->nx = src->nx;
|
dst->nx = src->nx;
|
||||||
dst->ny = src->ny;
|
dst->ny = src->ny;
|
||||||
dst->buf.resize(src->buf.size());
|
dst->buf.resize(src->buf.size());
|
||||||
|
|
||||||
for (size_t i = 0; i < src->buf.size(); ++i) {
|
for (size_t i = 0; i < src->buf.size(); ++i) {
|
||||||
int c = i % 3; // rgb
|
int c = i % 3; // rgb
|
||||||
dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
|
dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
|
||||||
|
|
||||||
if (replicate_float16) {
|
if (replicate_float16) {
|
||||||
dst->buf[i] = simulateFloat16Precision(dst->buf[i]);
|
dst->buf[i] = simulateFloat16Precision(dst->buf[i]);
|
||||||
}
|
}
|
||||||
|
@ -1546,15 +1545,15 @@ void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Get the anyres image grid shape object
|
* @brief Get the anyres image grid shape object
|
||||||
*
|
*
|
||||||
* @param image_size
|
* @param image_size
|
||||||
* @param grid_pinpoints
|
* @param grid_pinpoints
|
||||||
* @param image_patch_size
|
* @param image_patch_size
|
||||||
* @return <int, int>
|
* @return <int, int>
|
||||||
*/
|
*/
|
||||||
struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size, const std::vector<std::pair<int, int>>& grid_pinpoints, int image_patch_size) {
|
struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size, const std::vector<std::pair<int, int>>& grid_pinpoints, int image_patch_size) {
|
||||||
/**
|
/**
|
||||||
Conversion from gguf flat array to vector:
|
Conversion from gguf flat array to vector:
|
||||||
std::vector<std::pair<int, int>> possible_resolutions;
|
std::vector<std::pair<int, int>> possible_resolutions;
|
||||||
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
|
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
|
||||||
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
||||||
|
@ -1628,7 +1627,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
|
||||||
resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
|
resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
|
||||||
// clip_image_save_to_bmp(*temp, "resized.bmp");
|
// clip_image_save_to_bmp(*temp, "resized.bmp");
|
||||||
// visually verify normalized image:
|
// visually verify normalized image:
|
||||||
// normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
|
// normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
|
||||||
// {
|
// {
|
||||||
// clip_image_u8 * temp2 = clip_image_u8_init();
|
// clip_image_u8 * temp2 = clip_image_u8_init();
|
||||||
// clip_image_convert_f32_to_u8(*res, *temp2);
|
// clip_image_convert_f32_to_u8(*res, *temp2);
|
||||||
|
@ -1638,7 +1637,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
|
||||||
|
|
||||||
std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
|
std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
|
||||||
// fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size);
|
// fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size);
|
||||||
|
|
||||||
clip_image_u8 *image_original_resize = clip_image_u8_init();
|
clip_image_u8 *image_original_resize = clip_image_u8_init();
|
||||||
// bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ?
|
// bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ?
|
||||||
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ?
|
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ?
|
||||||
|
@ -1655,9 +1654,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
|
||||||
// printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
// printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||||
clip_image_u8_free(patches[i]);
|
clip_image_u8_free(patches[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
clip_image_u8_free(temp);
|
clip_image_u8_free(temp);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
temp->nx = img->nx;
|
temp->nx = img->nx;
|
||||||
|
@ -1802,7 +1801,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
type = static_cast<ggml_type>(itype);
|
type = static_cast<ggml_type>(itype);
|
||||||
|
|
||||||
auto * ctx_clip = clip_model_load(fname_inp, 2);
|
auto * ctx_clip = clip_model_load(fname_inp, 2);
|
||||||
|
|
||||||
|
|
||||||
const auto & ctx_src = ctx_clip->ctx_gguf;
|
const auto & ctx_src = ctx_clip->ctx_gguf;
|
||||||
const auto & ctx_data = ctx_clip->ctx_data;
|
const auto & ctx_data = ctx_clip->ctx_data;
|
||||||
|
|
|
@ -38,7 +38,7 @@ struct clip_vision_hparams {
|
||||||
float eps;
|
float eps;
|
||||||
|
|
||||||
char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default)
|
char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default)
|
||||||
int32_t image_grid_pinpoints[32];
|
int32_t image_grid_pinpoints[32];
|
||||||
int32_t image_crop_resolution;
|
int32_t image_crop_resolution;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
|
@ -234,7 +234,7 @@ if has_vision_encoder:
|
||||||
# 1008, 336,
|
# 1008, 336,
|
||||||
# 336, 1008
|
# 336, 1008
|
||||||
# ]
|
# ]
|
||||||
# *
|
# *
|
||||||
# */
|
# */
|
||||||
if "image_grid_pinpoints" in v_hparams:
|
if "image_grid_pinpoints" in v_hparams:
|
||||||
# flatten it
|
# flatten it
|
||||||
|
|
|
@ -4,7 +4,6 @@ import os
|
||||||
import torch
|
import torch
|
||||||
from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
|
from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
|
||||||
|
|
||||||
|
|
||||||
# Function to determine if file is a SafeTensor file
|
# Function to determine if file is a SafeTensor file
|
||||||
def is_safetensor_file(file_path):
|
def is_safetensor_file(file_path):
|
||||||
return file_path.endswith('.safetensors')
|
return file_path.endswith('.safetensors')
|
||||||
|
@ -40,12 +39,12 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
|
||||||
model_path = os.path.dirname(checkpoint_path)
|
model_path = os.path.dirname(checkpoint_path)
|
||||||
print(f"Searching for vision tower tensors in {checkpoint_path}")
|
print(f"Searching for vision tower tensors in {checkpoint_path}")
|
||||||
clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ]
|
clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") ) ]
|
||||||
|
|
||||||
if len(clip_tensors) > 0:
|
if len(clip_tensors) > 0:
|
||||||
print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
|
print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
|
||||||
# Adapted for file type
|
# Adapted for file type
|
||||||
clip_path = os.path.join(model_path, "llava.clip")
|
clip_path = os.path.join(model_path, "llava.clip")
|
||||||
|
|
||||||
if os.path.exists(clip_path):
|
if os.path.exists(clip_path):
|
||||||
existing_clip, _ = load_model(clip_path)
|
existing_clip, _ = load_model(clip_path)
|
||||||
else:
|
else:
|
||||||
|
@ -142,7 +141,7 @@ for name in mm_tensors:
|
||||||
projector[name] = last_checkpoint[name].float()
|
projector[name] = last_checkpoint[name].float()
|
||||||
for name in first_mm_tensors:
|
for name in first_mm_tensors:
|
||||||
projector[name] = first_checkpoint[name].float()
|
projector[name] = first_checkpoint[name].float()
|
||||||
|
|
||||||
save_model(projector, f"{args.model}/llava.projector", 'pytorch')
|
save_model(projector, f"{args.model}/llava.projector", 'pytorch')
|
||||||
|
|
||||||
for name in mm_tensors:
|
for name in mm_tensors:
|
||||||
|
|
|
@ -14,21 +14,21 @@
|
||||||
static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
||||||
struct temp_model {
|
struct temp_model {
|
||||||
struct ggml_tensor *newline;
|
struct ggml_tensor *newline;
|
||||||
struct ggml_context * ctx;
|
struct ggml_context * ctx;
|
||||||
} model;
|
} model;
|
||||||
|
|
||||||
auto & vparams = clip_get_vision_hparams(ctx_clip);
|
auto & vparams = clip_get_vision_hparams(ctx_clip);
|
||||||
auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
|
auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
|
||||||
int num_patches_width = grid_shape.first; // grid 1-4
|
int num_patches_width = grid_shape.first; // grid 1-4
|
||||||
int num_patches_height = grid_shape.second; // grid 1-4
|
int num_patches_height = grid_shape.second; // grid 1-4
|
||||||
|
|
||||||
// TODO: the exact size is not calculated yet - it's only tens of MB
|
// TODO: the exact size is not calculated yet - it's only tens of MB
|
||||||
size_t ctx_size = 0;
|
size_t ctx_size = 0;
|
||||||
{
|
{
|
||||||
ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features
|
ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features
|
||||||
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); //
|
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_init_params params {
|
struct ggml_init_params params {
|
||||||
/*.mem_size =*/ ctx_size,
|
/*.mem_size =*/ ctx_size,
|
||||||
/*.mem_buffer =*/ NULL,
|
/*.mem_buffer =*/ NULL,
|
||||||
|
@ -47,7 +47,7 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
|
||||||
// ), dim=-1)
|
// ), dim=-1)
|
||||||
// image_feature = image_feature.flatten(1, 2).transpose(0, 1)
|
// image_feature = image_feature.flatten(1, 2).transpose(0, 1)
|
||||||
// image_feature = torch.cat((base_image_feature, image_feature), dim=0)
|
// image_feature = torch.cat((base_image_feature, image_feature), dim=0)
|
||||||
|
|
||||||
// embeddings -> tokens -> 24 x 24
|
// embeddings -> tokens -> 24 x 24
|
||||||
/**
|
/**
|
||||||
* We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval
|
* We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval
|
||||||
|
@ -66,13 +66,13 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
|
||||||
image_feature = image_feature.view(2, 2, 24, 24*4096)
|
image_feature = image_feature.view(2, 2, 24, 24*4096)
|
||||||
image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
|
image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
|
||||||
image_feature = image_feature.view(-1, 4096)
|
image_feature = image_feature.view(-1, 4096)
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
model.ctx = ggml_init(params);
|
model.ctx = ggml_init(params);
|
||||||
|
|
||||||
ggml_context *ctx_noalloc = ggml_init({2048, NULL, true});
|
ggml_context *ctx_noalloc = ggml_init({2048, NULL, true});
|
||||||
// struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1));
|
// struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1));
|
||||||
|
|
||||||
ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip);
|
ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip);
|
||||||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
||||||
if (newline_tmp->backend != GGML_BACKEND_CPU) {
|
if (newline_tmp->backend != GGML_BACKEND_CPU) {
|
||||||
|
@ -112,28 +112,28 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
|
||||||
size_t size_ele = ggml_type_size(GGML_TYPE_F32);
|
size_t size_ele = ggml_type_size(GGML_TYPE_F32);
|
||||||
// struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip));
|
// struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip));
|
||||||
|
|
||||||
struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features,
|
struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features,
|
||||||
num_patches_height, // nb0 : 4 byte für jedes
|
num_patches_height,
|
||||||
num_patches_width,
|
num_patches_width,
|
||||||
num_patches_per_side * num_patches_per_side,
|
num_patches_per_side * num_patches_per_side,
|
||||||
clip_n_mmproj_embd(ctx_clip),
|
clip_n_mmproj_embd(ctx_clip),
|
||||||
|
|
||||||
size_ele * num_patches_height,
|
size_ele * num_patches_height,
|
||||||
size_ele * num_patches_height * num_patches_width,
|
size_ele * num_patches_height * num_patches_width,
|
||||||
size_ele * num_patches_height * num_patches_width * num_patches_per_side,
|
size_ele * num_patches_height * num_patches_width * num_patches_per_side,
|
||||||
0);
|
0);
|
||||||
|
|
||||||
struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
|
struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
|
||||||
num_patches_height,
|
num_patches_height,
|
||||||
num_patches_width,
|
num_patches_width,
|
||||||
num_patches_per_side,
|
num_patches_per_side,
|
||||||
num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
|
num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
|
||||||
|
|
||||||
size_ele * num_patches_height,
|
size_ele * num_patches_height,
|
||||||
size_ele * num_patches_height * num_patches_width,
|
size_ele * num_patches_height * num_patches_width,
|
||||||
size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0);
|
size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0);
|
||||||
|
|
||||||
struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
|
struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
|
||||||
permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug
|
permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug
|
||||||
|
|
||||||
struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0);
|
struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0);
|
||||||
|
@ -172,9 +172,8 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
|
||||||
// memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
|
// memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
|
||||||
// memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
|
// memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
|
||||||
// *n_img_pos_out=576;
|
// *n_img_pos_out=576;
|
||||||
|
|
||||||
ggml_free(model.ctx);
|
|
||||||
|
|
||||||
|
ggml_free(model.ctx);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -205,7 +204,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0)
|
if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0)
|
||||||
{
|
{
|
||||||
// flat / default llava-1.5 type embedding
|
// flat / default llava-1.5 type embedding
|
||||||
*n_img_pos = clip_n_patches(ctx_clip);
|
*n_img_pos = clip_n_patches(ctx_clip);
|
||||||
|
@ -233,7 +232,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||||
printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
printf("%s: %d segments encoded in %8.2f ms\n", __func__, img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||||
|
|
||||||
|
|
||||||
std::vector<std::pair<int, int>> grid_pinpoints;
|
std::vector<std::pair<int, int>> grid_pinpoints;
|
||||||
|
@ -260,7 +259,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||||
|
|
||||||
}
|
}
|
||||||
printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
||||||
|
|
||||||
|
|
||||||
const int64_t t_img_enc_end_us = ggml_time_us();
|
const int64_t t_img_enc_end_us = ggml_time_us();
|
||||||
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
|
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue