Fix ViT & patch merging

This commit is contained in:
root 2024-10-05 00:41:54 +00:00
parent 56e149d627
commit aa23425236
2 changed files with 40 additions and 65 deletions

View file

@ -3168,8 +3168,8 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
int image_size_width = image_size; int image_size_width = image_size;
int image_size_height = image_size; int image_size_height = image_size;
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
// const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
// const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
if(ctx->load_image_size==nullptr){ if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init(); ctx->load_image_size= clip_image_size_init();
} }
@ -3206,28 +3206,16 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
free(data); free(data);
} }
// copy from minicpm implementation for positional embedding. {
// inspired from siglip: // compute positions
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions)); int* positions_data = (int*)malloc(ggml_nbytes(positions));
int bucket_coords_h[70]; for (int i = 0; i < num_patches; i++){
int bucket_coords_w[70]; positions_data[i] = i;
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
} }
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data); free(positions_data);
}
if (ggml_backend_is_cpu(ctx->backend)) { if (ggml_backend_is_cpu(ctx->backend)) {

View file

@ -438,34 +438,21 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
float* base_image_feature_data = (float*)base_image_feature->data; float* base_image_feature_data = (float*)base_image_feature->data;
for (int i=0; i < dim0; i++) for (int i=0; i < dim0; i++)
{ {
if (i==0) for (int j=0; j < dim1; j++)
{ {
// base_image_feature_data for (int k=0; k < dim2; k++)
float* image_embd = image_embd_v[i];
for (int j=0; j < dim1; j++)
{ {
for (int k=0; k < dim2; k++) image_features_data[i * dim1 * dim2 + j * dim2 + k] =
image_embd_v[i+1][j * dim2 + k];
if (i == 0)
{ {
base_image_feature_data[j * dim2 + k] = image_embd[j * dim2 + k]; base_image_feature_data[j * dim2 + k] = image_embd_v[i][j * dim2 + k];
}
}
}
else
{
// other sub-images
float* image_embd = image_embd_v[i+1];
for (int j=0; j < dim1; j++)
{
for (int k=0; k < dim2; k++)
{
image_features_data[i * dim1 * dim2 + j * dim2 + k] = image_embd[j * dim2 + k];
} }
} }
} }
} }
struct ggml_tensor* image_features_patchview = ggml_view_4d( struct ggml_tensor* image_features_patchview = ggml_view_4d(
model.ctx, image_features, num_patches_per_side * hidden_size, num_patches_per_side, model.ctx, image_features, num_patches_per_side * hidden_size, num_patches_per_side,
num_patches_width, num_patches_height, size_ele * num_patches_per_side * hidden_size, num_patches_width, num_patches_height, size_ele * num_patches_per_side * hidden_size,
@ -575,31 +562,31 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
attention_mask = gf->nodes[gf->n_nodes - 1]; attention_mask = gf->nodes[gf->n_nodes - 1];
// memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask)); // memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask));
{ // {
printf((" ========================= DEBUG =========================\n")); // printf((" ========================= DEBUG =========================\n"));
printf("Load pre-computed image embeddings and attention_mask\n"); // printf("Load pre-computed image embeddings and attention_mask\n");
std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf"; // std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
tensor_from_gguf tensor; // tensor_from_gguf tensor;
bool is_successful = load_tensor_from_file(filename.c_str(), tensor); // bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
if (!is_successful) // if (!is_successful)
{ // {
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__); // fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
return 1; // return 1;
} // }
result = tensor.data; // result = tensor.data;
// print_tensor(result, "result", 1); // // print_tensor(result, "result", 1);
filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf"; // filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
is_successful = load_tensor_from_file(filename.c_str(), tensor); // is_successful = load_tensor_from_file(filename.c_str(), tensor);
if (!is_successful) // if (!is_successful)
{ // {
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__); // fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
return 1; // return 1;
} // }
attention_mask = tensor.data; // attention_mask = tensor.data;
// print_tensor(attention_mask, "attention_mask", 1); // // print_tensor(attention_mask, "attention_mask", 1);
num_patches_width = 2; // num_patches_width = 2;
num_patches_height = 2; // num_patches_height = 2;
} // }
// compute attention masks outside of the graph // compute attention masks outside of the graph
@ -1126,4 +1113,4 @@ void llava_image_embed_free(struct llava_image_embed *embed)
{ {
free(embed->embed); free(embed->embed);
free(embed); free(embed);
} }