diff --git a/examples/xgenmm/clip.cpp b/examples/xgenmm/clip.cpp
index d7444354d..558f906f2 100644
--- a/examples/xgenmm/clip.cpp
+++ b/examples/xgenmm/clip.cpp
@@ -3168,8 +3168,8 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
     int image_size_width = image_size;
     int image_size_height = image_size;
     const int patch_size = hparams.patch_size;
-    // const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    // const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
     if(ctx->load_image_size==nullptr){
         ctx->load_image_size= clip_image_size_init();
     }
@@ -3206,28 +3206,16 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
         free(data);
     }
 
-    // copy from minicpm implementation for positional embedding.
-    // inspired from siglip:
-    // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
-    // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+    {
+        // compute positions
     struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
     int* positions_data = (int*)malloc(ggml_nbytes(positions));
-    int bucket_coords_h[70];
-    int bucket_coords_w[70];
-    for (int i = 0; i < pos_h; i++){
-        bucket_coords_h[i] = std::floor(70.0*i/pos_h);
-    }
-    for (int i = 0; i < pos_w; i++){
-        bucket_coords_w[i] = std::floor(70.0*i/pos_w);
-    }
-    for (int i = 0, id = 0; i < pos_h; i++){
-        for (int j = 0; j < pos_w; j++){
-            positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
-        }
+    for (int i = 0; i < num_positions; i++){
+        positions_data[i] = i;
     }
     ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
     free(positions_data);
-
+    }
 
 
     if (ggml_backend_is_cpu(ctx->backend)) {
diff --git a/examples/xgenmm/xgenmm.cpp b/examples/xgenmm/xgenmm.cpp
index 75469a30c..b82cfa728 100644
--- a/examples/xgenmm/xgenmm.cpp
+++ b/examples/xgenmm/xgenmm.cpp
@@ -438,34 +438,21 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
     float* base_image_feature_data = (float*)base_image_feature->data;
 
     for (int i=0; i < dim0; i++)
-    {
-        if (i==0)
+    {
+        for (int j=0; j < dim1; j++)
         {
-            // base_image_feature_data
-            float* image_embd = image_embd_v[i];
-            for (int j=0; j < dim1; j++)
+            for (int k=0; k < dim2; k++)
             {
-                for (int k=0; k < dim2; k++)
+                image_features_data[i * dim1 * dim2 + j * dim2 + k] =
+                    image_embd_v[i+1][j * dim2 + k];
+                if (i == 0)
                 {
-                    base_image_feature_data[j * dim2 + k] = image_embd[j * dim2 + k];
-                }
-            }
-        }
-        else
-        {
-            // other sub-images
-            float* image_embd = image_embd_v[i+1];
-            for (int j=0; j < dim1; j++)
-            {
-                for (int k=0; k < dim2; k++)
-                {
-                    image_features_data[i * dim1 * dim2 + j * dim2 + k] = image_embd[j * dim2 + k];
+                    base_image_feature_data[j * dim2 + k] = image_embd_v[i][j * dim2 + k];
                 }
             }
         }
     }
 
-
     struct ggml_tensor* image_features_patchview = ggml_view_4d(
         model.ctx, image_features, num_patches_per_side * hidden_size, num_patches_per_side, num_patches_width,
         num_patches_height, size_ele * num_patches_per_side * hidden_size,
@@ -575,31 +562,31 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
     attention_mask = gf->nodes[gf->n_nodes - 1];
     // memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask));
 
-    {
-        printf((" ========================= DEBUG =========================\n"));
-        printf("Load pre-computed image embeddings and attention_mask\n");
-        std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
-        tensor_from_gguf tensor;
-        bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
-        if (!is_successful)
-        {
-            fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
-            return 1;
-        }
-        result = tensor.data;
-        // print_tensor(result, "result", 1);
-        filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
-        is_successful = load_tensor_from_file(filename.c_str(), tensor);
-        if (!is_successful)
-        {
-            fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
-            return 1;
-        }
-        attention_mask = tensor.data;
-        // print_tensor(attention_mask, "attention_mask", 1);
-        num_patches_width = 2;
-        num_patches_height = 2;
-    }
+    // {
+    //     printf((" ========================= DEBUG =========================\n"));
+    //     printf("Load pre-computed image embeddings and attention_mask\n");
+    //     std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
+    //     tensor_from_gguf tensor;
+    //     bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
+    //     if (!is_successful)
+    //     {
+    //         fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
+    //         return 1;
+    //     }
+    //     result = tensor.data;
+    //     // print_tensor(result, "result", 1);
+    //     filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
+    //     is_successful = load_tensor_from_file(filename.c_str(), tensor);
+    //     if (!is_successful)
+    //     {
+    //         fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
+    //         return 1;
+    //     }
+    //     attention_mask = tensor.data;
+    //     // print_tensor(attention_mask, "attention_mask", 1);
+    //     num_patches_width = 2;
+    //     num_patches_height = 2;
+    // }
 
 
     // compute attnetion masks outside of the graph