Tensors are now properly permuted.

Before, the embeddings were inserted 1:1; now they are split into 24x24 patches and permuted as in the reference implementation.
John 2024-02-12 04:02:54 +01:00
parent 7107b9098e
commit 51e60c996f
2 changed files with 46 additions and 94 deletions
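As a rough illustration of the commit message (not part of the diff; PyTorch with toy sizes standing in for the 2x2 grid, 24x24 patches and 4096-dim embeddings; all names are illustrative):

    # Sketch of "inserted 1:1" vs "split into patches and permuted".
    import torch

    gh, gw, p, d = 2, 2, 3, 1                                     # grid h/w, patch side, embd dim (toy)
    emb = torch.arange(gh * gw * p * p * d).view(gh * gw, p * p, d)  # per-sub-image embeddings

    # Before: sub-image embeddings concatenated 1:1, one sub-image after the other
    before = emb.reshape(-1, d)

    # After: rows of p features from neighbouring sub-images are interleaved, i.e. tokens
    # follow raster order over the stitched (gh*p) x (gw*p) feature grid
    after = (emb.view(gh, gw, p, p, d)
                .permute(0, 2, 1, 3, 4)    # (gh, p, gw, p, d)
                .reshape(-1, d))

    print(before[:8].flatten().tolist())   # [0, 1, 2, 3, 4, 5, 6, 7]: walks sub-image 0 row by row
    print(after[:8].flatten().tolist())    # [0, 1, 2, 9, 10, 11, 3, 4]: row 0 of sub-images 0 and 1

The new ordering walks the stitched feature grid row by row instead of emitting one sub-image after the other.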


@@ -1,6 +1,7 @@
 // NOTE: This is modified from clip.cpp only for LLaVA,
 // so there might be still unnecessary artifacts hanging around
 // I'll gradually clean and extend it
+// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -1622,7 +1623,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
         possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
     }
     std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
-    // fprintf(stderr, "%s - Working with resolution: %d %d\n", __func__, best_resolution.first, best_resolution.second);
     // clip_image_save_to_bmp(*img, "input.bmp");
     resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
     // clip_image_save_to_bmp(*temp, "resized.bmp");
@@ -1646,7 +1646,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
     res_tensor.clear();
     for (auto& patch : patches) {
         clip_image_f32 *temp_image_f32 = clip_image_f32_init();
-        normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, true);
+        normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, false); // set to true for pytorch fp16 value replication
        res_tensor.push_back(temp_image_f32);
     }
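For context on the normalize_image_u8_to_f32() change above: the helper converts 8-bit pixel values to floats and applies the model's per-channel mean/std normalization; the new last argument is documented in the diff only by its comment ("set to true for pytorch fp16 value replication"). A minimal sketch of that behaviour in numpy, where the helper name and the fp16 round-trip are assumptions based on that comment (the mean/std shown are the standard CLIP constants):

    import numpy as np

    def normalize_u8_to_f32(pixels_u8, mean, std, fp16_replicate=False):
        x = pixels_u8.astype(np.float32) / 255.0          # u8 -> [0, 1]
        if fp16_replicate:
            x = x.astype(np.float16).astype(np.float32)   # mimic pytorch fp16 rounding (assumed)
        return (x - mean) / std                           # per-channel CLIP normalization

    img  = np.random.randint(0, 256, size=(24, 24, 3), dtype=np.uint8)
    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
    std  = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
    out  = normalize_u8_to_f32(img, mean, std)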


@@ -34,44 +34,40 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
     };
-    // Python reference for full unpad:
-    // base_image_feature = image_feature[0]
-    // image_feature = image_feature[1:]
-    // image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-    // image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-    // image_feature = unpad_image(image_feature, image_sizes[image_idx])
-    // image_feature = torch.cat((
-    //     image_feature,
-    //     self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
-    // ), dim=-1)
-    // image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-    // image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-    // embeddings -> tokens -> 24 x 24
-    /**
-     * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval
-     * In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet
-     * Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
-     * Once all images are processed to prepended the base_image_features without any changes.
-     */
-    /**
-    Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
-    # image_feature = image_feature.view(2, 2, 24, 24, 4096)
-    # image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
-    # image_feature = image_feature.view(2, 24, 2, 24, 4096)
-    # image_feature = image_feature.flatten(0, 3)
-    # Reshape to 4D tensor by merging the last two dimensions
+    // Python reference code for full unpad:
+    /*
+        base_image_feature = image_feature[0]
+        image_feature = image_feature[1:]
+        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+        image_feature = unpad_image(image_feature, image_sizes[image_idx])
+        image_feature = torch.cat((
+            image_feature,
+            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
+        ), dim=-1)
+        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+    */
+    // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
+    // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
+    // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
+    // Once all images are processed, the base_image_features are prepended without any changes.
+    // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
+    /*
+        image_feature = image_feature.view(2, 2, 24, 24, 4096)
+        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+        image_feature = image_feature.view(2, 24, 2, 24, 4096)
+        image_feature = image_feature.flatten(0, 3)
+        // Reshape to 4D tensor by merging the last two dimensions
         image_feature = image_feature.view(2, 2, 24, 24*4096)
         image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
         image_feature = image_feature.view(-1, 4096)
-    * */
+    */
     model.ctx = ggml_init(params);
     ggml_context *ctx_noalloc = ggml_init({2048, NULL, true});
-    // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1));
     ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip);
     model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
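The claim in the comment block above, that the 4-D reshape with the last two dimensions merged produces the same token ordering as the 5-D reference, is what makes the ggml implementation possible without 5-D tensor support. It can be checked with a few lines of PyTorch (toy sizes stand in for 2, 2, 24, 24, 4096; names are illustrative):

    import torch

    gh, gw, p, d = 2, 2, 3, 5
    x = torch.arange(gh * gw * p * p * d, dtype=torch.float32)

    # 5-D reference from the comment block
    ref = x.view(gh, gw, p, p, d)
    ref = ref.permute(0, 2, 1, 3, 4).contiguous()   # swap grid column and patch row
    ref = ref.view(gh, p, gw, p, d).flatten(0, 3)   # -> (gh*p*gw*p, d)

    # 4-D variant: merge the trailing dims so only a 4-D permute is needed
    alt = x.view(gh, gw, p, p * d)
    alt = alt.permute(0, 2, 1, 3).contiguous()
    alt = alt.view(-1, d)

    assert torch.equal(ref, alt)   # identical token ordering, as stated in the comment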
@@ -88,83 +84,39 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
         }
     }
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, image_embd_v.size() - 1, clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip));
-    // fill it with the image embeddings, ignoring the first
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), image_embd_v.size() - 1); // example: 4096 x 576 x 4
+    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
+    // fill it with the image embeddings, ignoring the base
     for (int i = 1; i < image_embd_v.size(); i++)
     {
-        // printf("Copying image_embd_v[%d] to image_features tensor\n", i);
         size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
-        // for debugging we now try and set the entire tensor row to 0.0001f,0.0002f,0.0003f,0.0004f etc:
-        // float *floatPtr = static_cast<float*>(image_embd_v[i]);
-        // for (int j = 0; j < clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip); j++)
-        // {
-        //     // floatPtr[j] = (j + 1) / 10000.0f;
-        //     int feature = j % clip_n_mmproj_embd(ctx_clip) + 1;
-        //     floatPtr[j] = i + feature / 10000.0f;
-        // }
         memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
     }
-    // printf("image_features size = %d\n", clip_embd_nbytes(ctx_clip) * (image_embd_v.size() - 1));
     struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
-    // image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
     size_t size_ele = ggml_type_size(GGML_TYPE_F32);
-    // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip));
-    struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features,
-        num_patches_height,
-        num_patches_width,
-        num_patches_per_side * num_patches_per_side,
-        clip_n_mmproj_embd(ctx_clip),
-        size_ele * num_patches_height,
-        size_ele * num_patches_height * num_patches_width,
-        size_ele * num_patches_height * num_patches_width * num_patches_per_side,
-        0);
     struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
-        num_patches_height,
-        num_patches_width,
-        num_patches_per_side,
-        num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
-        size_ele * num_patches_height,
-        size_ele * num_patches_height * num_patches_width,
-        size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0);
+        num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+        num_patches_per_side,
+        num_patches_width,
+        num_patches_height,
+        size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+        size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
+        size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
+    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
     struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
-    permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug
-    struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0);
-    struct ggml_tensor *prepared_cont = ggml_cont(model.ctx, prepared); // not needed
-    // struct ggml_tensor *prepared_cont = prepared; // the view only flattens
-    ggml_build_forward_expand(gf, prepared_cont);
+    // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
+    struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
+    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
+    ggml_build_forward_expand(gf, flatten);
     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
     struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
-    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,true);
-    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,true);
-    // ggml_tensor_printf(prepared_cont,"prepared_cont",__LINE__,false,true);
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
-    // append without newline tokens:
-    // memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches
-    // append with newline tokens:
-    for (size_t i = 0; i < image_embd_v.size() - 1; ++i) {
-        // we append with +1 offset (base image is prepended)
-        memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+1) + model.newline->ne[0] * i,
-            (float*)prepared_cont->data + i * clip_n_mmproj_embd(ctx_clip) * clip_n_patches(ctx_clip),
-            clip_embd_nbytes(ctx_clip));
-        memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+2) + model.newline->ne[0] * i,
-            (float*)model.newline->data,
-            ggml_nbytes(model.newline));
-    }
-    size_t newline_tokens = image_embd_v.size()-1;
-    *n_img_pos_out = prepared_cont->ne[0]+clip_n_patches(ctx_clip) + newline_tokens;
+    // append without newline tokens (default behavior in llava_arch when not using unpad):
+    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches
+    *n_img_pos_out = result->ne[1]+clip_n_patches(ctx_clip);
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
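For reference, the new graph above expresses the simplified PyTorch sequence from the earlier comment in ggml's reversed dimension order (ne[0] is the innermost dimension, so the "4096 x 576 x 4" tensor corresponds to a row-major 4 x 576 x 4096 array). A sketch mirroring the three ggml steps in PyTorch, assuming the 2x2 grid from the example comment (the ggml ne tuples noted alongside are each torch shape reversed):

    import torch

    grid_h, grid_w, side, embd = 2, 2, 24, 4096
    feats = torch.randn(grid_h * grid_w, side * side, embd)    # ggml_new_tensor_3d: ne = (4096, 576, 4)

    patchview = feats.view(grid_h, grid_w, side, side * embd)  # ggml_view_4d:  ne = (24*4096, 24, 2, 2)
    permuted  = patchview.permute(0, 2, 1, 3).contiguous()     # ggml_permute(.., 0, 2, 1, 3) + ggml_cont
    flatten   = permuted.view(-1, embd)                        # ggml_view_2d:  ne = (4096, 2304)

    # Output layout written by the memcpy calls: base image first, then the permuted grid patches,
    # with no newline tokens appended anymore.
    n_img_pos_out = flatten.shape[0] + side * side             # 2304 + 576 = 2880 positions
    print(flatten.shape, n_img_pos_out)

With no newline tokens appended, the output is the 576 base-image embeddings followed by the 2304 permuted grid-patch embeddings, 2880 positions in total for this example.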