patch merging + masking done

This commit is contained in:
Yutong Dai 2024-09-03 18:39:46 +00:00
parent 6c1f137ba5
commit a81ba75193
10 changed files with 610 additions and 55 deletions

.gitignore (vendored): 2 lines changed
View file

@@ -153,3 +153,5 @@ examples/xgenmm copy/imgs/image_res_3.csv
examples/xgenmm copy/imgs/image_res_4.csv
examples/xgenmm copy/imgs/image-1d100e9-1.jpg
examples/xgenmm copy/imgs/image-1d100e9.jpg
examples/xgenmm/imgs/4patches_embeddings.pt
examples/xgenmm/imgs/attention_mask_4patchhes.pt

View file

@@ -21,6 +21,7 @@ BUILD_TARGETS = \
llama-llava-cli \
llama-minicpmv-cli\
xgenmm-cli\
test_anyres_handle_patches\
llama-lookahead \
llama-lookup \
llama-lookup-create \
@@ -1482,6 +1483,14 @@ xgenmm-cli: examples/xgenmm/xgenmm-cli.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
test_anyres_handle_patches: examples/xgenmm/test_anyres_handle_patches.cpp \
examples/xgenmm/xgenmm.cpp \
examples/xgenmm/xgenmm.h \
examples/xgenmm/clip.cpp \
examples/xgenmm/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
(cd examples/batched.swift; make build)

View file

@@ -44,6 +44,12 @@ install(TARGETS test_anyres_handle_patches RUNTIME)
target_link_libraries(test_anyres_handle_patches PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(xgenmm PRIVATE cxx_std_11)
set(TARGET test_patch_ops)
add_executable(test_patch_ops test_patch_ops.cpp)
install(TARGETS test_patch_ops RUNTIME)
target_link_libraries(test_patch_ops PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(xgenmm PRIVATE cxx_std_11)
# not implemented yet
# set(TARGET xgenmm-cli)

View file

@@ -485,7 +485,7 @@ struct clip_vision_model {
struct ggml_tensor * projection;
// LLaVA projection
// LLaVA projecclip_image_encodeion
struct ggml_tensor * mm_0_w = NULL;
struct ggml_tensor * mm_0_b = NULL;
struct ggml_tensor * mm_2_w = NULL;

View file

@@ -1,15 +1,32 @@
from torchvision.transforms import Resize
from torchvision.transforms import InterpolationMode
from PIL import Image
# from torchvision.transforms import Resize
# from torchvision.transforms import InterpolationMode
# from PIL import Image
# import numpy as np
# n_px = 384
# resize_func = Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC, antialias=True)
# img_dir = "./imgs"
# image_path_1 = f'{img_dir}/image-1d100e9-1.jpg'
# image_path_2 = f'{img_dir}/image-1d100e9.jpg'
# image_1 = Image.open(image_path_1).convert('RGB')
# image_2 = Image.open(image_path_2).convert('RGB')
# print(np.asarray(resize_func(image_2))[:5, :10, 0])
import gguf
import numpy as np
import torch
n_px = 384
resize_func = Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC, antialias=True)
patches_embeddings = torch.load('./imgs/4patches_embeddings.pt').numpy()
print(f'4patches_embeddings:{patches_embeddings.shape}\n')
print(patches_embeddings[1:,:,:])
img_dir = "./imgs"
image_path_1 = f'{img_dir}/image-1d100e9-1.jpg'
image_path_2 = f'{img_dir}/image-1d100e9.jpg'
image_1 = Image.open(image_path_1).convert('RGB')
image_2 = Image.open(image_path_2).convert('RGB')
print(np.asarray(resize_func(image_2))[:5, :10, 0])
# gguf_writer = gguf.GGUFWriter(path='./imgs/4patches_embeddings.gguf', arch='4patches_embeddings')
# gguf_writer.add_tensor("data", patches_embeddings)
# gguf_writer.write_header_to_file()
# gguf_writer.write_kv_data_to_file()
# gguf_writer.write_tensors_to_file()
# gguf_writer.close()

View file

@@ -5,6 +5,27 @@
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# check mask"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def csv_to_tensor(filename, axis=0):\n",
" matrix = np.loadtxt(filename, delimiter=',')\n",
" return tensor\n",
"\n",
"filename = 'imgs/attention_mask_4patchhes.csv'\n",
"pacthes = csv_to_tensor(filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},

View file

@@ -0,0 +1,9 @@
#!/bin/bash
make xgenmm-cli
./xgenmm-cli -m /export/share/llamacpp_models/MiniCPM-Llama3-V-2_5/ggml-model-Q4_K_M.gguf \
--mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
-c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 \
--image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg \
-p "What is in the image?"

View file

@@ -535,8 +535,8 @@ int main(){
part of:
llava_image_embed_make_with_filename
*/
const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg"; // Porcelain
// const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg";
// const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg"; // Porcelain
const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg";
unsigned char* image_bytes;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
@@ -618,31 +618,36 @@ int main(){
std::vector<float*> image_embd_v;
image_embd_v.resize(img_res_v.size);
printf("image_embd_v.size():%d\n", image_embd_v.size());
for (size_t i = 0; i < img_res_v.size; i++)
{
printf("encode patch %d\n", i);
const int nx = img_res_v.data[i].nx;
const int ny = img_res_v.data[i].ny;
const int vec_len = img_res_v.data[i].buf.size();
printf(" i:%d | nx:%d | ny:%d | vec len:%d\n", i, nx, ny, vec_len); // 384^2 * 3(channel) = 442368
auto start = std::chrono::high_resolution_clock::now();
image_embd_v[i] =
(float*)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(
ctx_clip, 1, &img_res_v.data[i],
image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded)
{
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1, (int)img_res_v.size);
return false;
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> duration = end - start;
std::cout << " Wall time: " << duration.count() << " seconds" << std::endl;
}
// for (size_t i = 0; i < img_res_v.size; i++)
// {
// printf("encode patch %d\n", i);
// const int nx = img_res_v.data[i].nx;
// const int ny = img_res_v.data[i].ny;
// const int vec_len = img_res_v.data[i].buf.size();
// printf(" i:%d | nx:%d | ny:%d | vec len:%d\n", i, nx, ny, vec_len); // 384^2 * 3(channel) = 442368
// auto start = std::chrono::high_resolution_clock::now();
// image_embd_v[i] =
// (float*)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
// const bool encoded = clip_image_encode(
// ctx_clip, 1, &img_res_v.data[i],
// image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
// if (!encoded)
// {
// LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1, (int)img_res_v.size);
// return false;
// }
// auto end = std::chrono::high_resolution_clock::now();
// std::chrono::duration<double> duration = end - start;
// std::cout << " Wall time: " << duration.count() << " seconds" << std::endl;
// for (int j = 0; j < 5; j++)
// {
// printf(" %.4f ", image_embd_v[i][j]);
// }
// printf("\n");
// }
// handle patches goes here
return 0;
}

View file

@@ -0,0 +1,401 @@
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "ggml.h"
void print_tensor(ggml_tensor* tensor, const char* name = "", int verbosity = 0)
{
if (tensor->ne[2] == 1)
{
printf("---> %s: (%ld, %ld)\n", name, tensor->ne[0], tensor->ne[1]);
}
else if (ggml_is_3d(tensor))
{
printf("---> %s: (%ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2]);
}
else
{
printf("---> %s: (%ld, %ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
}
if (verbosity == 1)
{
printf("*********************************************************************\n");
if (tensor->ne[2] == 1)
{
const float* mat = (float*)tensor->data;
int dim0 = tensor->ne[1];
int dim1 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6)
{
for (int i = 0; i < dim0; i++)
{
for (int j = 0; j < dim1; j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("\n");
}
if (dim0 > 3)
{
printf("...................... omit ......................\n");
for (int i = dim0 - 3; i < dim0; i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("\n");
}
}
}
}
else if (ggml_is_3d(tensor))
{
const float* data = (float*)tensor->data;
int dim0 = tensor->ne[2];
int dim1 = tensor->ne[1];
int dim2 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6 && dim2 < 6)
{
for (int i = 0; i < dim0; i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < dim1; j++)
{
for (int k = 0; k < dim2; k++)
{
printf("%+.4f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 4); i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < std::min(dim1, 3); j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.4f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.4f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("........................ omit .....................\n");
for (int j = dim1 - 3; j < dim1; j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.4f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.4f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("---------------------------------------------------\n");
}
printf("\n");
}
}
}
printf("*********************************************************************\n");
printf("\n");
}
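// Writes a 2D F32 tensor as comma-separated values, one tensor row per line;
// used below to export the attention mask so it can be inspected from the notebook.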
void tensor_to_csv(ggml_tensor* tensor, const char* filename)
{
std::ofstream outFile(filename);
if (!outFile.is_open())
{
std::cerr << "Error opening file!" << std::endl;
}
const float* mat = (float*)tensor->data;
int dim0 = tensor->ne[1];
int dim1 = tensor->ne[0];
{
for (int i = 0; i < dim0; i++)
{
for (int j = 0; j < dim1; j++)
{
outFile << float(mat[i * dim1 + j]);
if (j < dim1 - 1)
{
outFile << ",";
}
}
outFile << std::endl;
}
}
outFile.close();
printf("file saved to %s\n", filename);
}
struct tensor_from_gguf
{
struct ggml_tensor* data;
struct ggml_context* ctx;
};
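// Loads the tensor named "data" from a GGUF file (matching the name used by the commented
// gguf_writer.add_tensor("data", ...) call in the conversion script); the caller owns
// tensor.ctx and is responsible for ggml_free-ing it.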
bool load_tensor_from_file(const char* filename, tensor_from_gguf& tensor)
{
struct gguf_init_params params = {
/*.no_alloc =*/false,
/*.ctx =*/&tensor.ctx,
};
gguf_context* ctx = gguf_init_from_file(filename, params);
if (!ctx)
{
fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
return false;
}
tensor.data = ggml_get_tensor(tensor.ctx, "data");
return true;
}
int main(){
tensor_from_gguf tensor;
std::string filename = "../examples/xgenmm/imgs/4patches_embeddings.gguf";
bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
if (!is_successful)
{
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
return 1;
}
ggml_tensor* patch_embeds = tensor.data;
// print_tensor(patch_embeds, "patch_embeds", 1);
/*
hardcoded values
*/
int original_width = 955;
int original_height = 289;
int num_images = 4; // 3 patches + 1 base
int32_t num_patches_per_side = 384 / 14;
int num_patches_width = 3; //grid_shape.first
int num_patches_height = 1; // grid_shape.second
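// With a 384-px SigLIP tower and 14-px patches: 384 / 14 = 27 patches per side, so each
// sub-image contributes 27 * 27 = 729 tokens of dim 1152; the 955x289 input maps to a
// 3x1 grid of sub-images plus the resized base image (num_images = 4).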
size_t size_ele = ggml_type_size(GGML_TYPE_F32);
struct
{
struct ggml_context* ctx;
} model;
// TODO: the required context size is not computed precisely - it's only tens of MB
size_t ctx_size = 0;
{
ctx_size +=
num_patches_per_side * num_patches_per_side * 1152 * sizeof(float) * num_images * 8; // image_features
ctx_size += 1024 * 1024 * ggml_type_size(GGML_TYPE_F32);
}
struct ggml_init_params params
{
/*.mem_size =*/ctx_size,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/false, // NOTE: this should be false when using the legacy API
};
model.ctx = ggml_init(params);
// FIXME: hardcoded for the patch size and vit embedding size
struct ggml_tensor* image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, 1152, 729, num_images - 1);
struct ggml_tensor* base_image_feature = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, 1152, 729, 1);
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
// for (size_t i = 1; i < num_images; i++)
// {
// size_t offset = (i - 1) * 729 * 1152 * sizeof(float);
// // size_t offset = (i - 1) * clip_embd_nbytes(ctx_clip);
// // memcpy((uint8_t*)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
// }
int dim0 = num_images - 1;
int dim1 = num_patches_per_side * num_patches_per_side;
int dim2 = 1152;
float* patch_embeds_data = (float*)patch_embeds->data;
float* image_features_data = (float*)image_features->data;
float* base_image_feature_data = (float*)base_image_feature->data;
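// Copy sub-images 1..3 of the loaded patch embeddings into image_features and keep
// sub-image 0 (the base image) separately in base_image_feature.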
for (int i=0; i < dim0; i++)
{
for (int j=0; j < dim1; j++)
{
for (int k=0; k < dim2; k++)
{
image_features_data[i * dim1 * dim2 + j * dim2 + k] =
patch_embeds_data[(i + 1) * dim1 * dim2 + j * dim2 + k];
if (i == 0)
{
base_image_feature_data[j * dim2 + k] = patch_embeds_data[j * dim2 + k];
}
}
}
}
// print_tensor(image_features, "image_features", 1);
struct ggml_tensor* image_features_patchview = ggml_view_4d(
model.ctx, image_features, num_patches_per_side * 1152, num_patches_per_side,
num_patches_width, num_patches_height, size_ele * num_patches_per_side * 1152,
size_ele * num_patches_per_side * 1152 * num_patches_per_side,
size_ele * num_patches_per_side * 1152 * num_patches_per_side * num_patches_width, 0);
print_tensor(image_features_patchview, "image_features_patchview", 0); // (27 * 1152, 27, 3, 1)
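// The permute below swaps the "token row within a sub-image" and "sub-image" axes, so after
// ggml_cont the tokens of row r from all sub-images of a grid row sit next to each other;
// flattening then gives raster order over the stitched 27 x (27*3) token grid, and the
// reshape regroups that sequence into 3 chunks of 729 tokens before the base image is concatenated.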
struct ggml_tensor* permuted_cont =
ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
print_tensor(permuted_cont, "permuted_cont", 0); // (27 * 1152, 3, 27, 1)
struct ggml_tensor* flatten =
ggml_view_2d(model.ctx, permuted_cont, 1152,
num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side,
size_ele * 1152, 0);
print_tensor(flatten, "flatten", 0); // (1152, 27 * 27 * 3)
// struct ggml_tensor* tensor_3d =
// ggml_view_3d(model.ctx, flatten,
// 1152, // ne0
// num_patches_per_side * num_patches_per_side, // ne1
// num_patches_width * num_patches_height, // ne2 = num_patches_width * num_patches_height,
// size_ele * num_patches_width * num_patches_height, // nb1 = sizeof(float) × ne2,
// size_ele * num_patches_width * num_patches_height * num_patches_per_side *
// num_patches_per_side, // nb2 = sizeof(float)×ne1×ne2
// 0);
struct ggml_tensor* tensor_3d =
ggml_reshape_3d(model.ctx, flatten,
1152,
num_patches_per_side * num_patches_per_side,
num_patches_width * num_patches_height);
tensor_3d = ggml_cont(model.ctx, tensor_3d);
tensor_3d = ggml_concat(model.ctx, base_image_feature, tensor_3d, 2);
struct ggml_cgraph* gf = ggml_new_graph(model.ctx);
ggml_build_forward_expand(gf, tensor_3d);
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
print_tensor(result, "result", 1); // (1152, 27 * 27, 3)
struct
{
struct ggml_context* ctx;
} mask;
// TODO: the required context size is not computed precisely - it's only tens of MB
ctx_size = 0;
{
ctx_size +=
num_patches_per_side * num_patches_width * num_patches_per_side * num_patches_height * sizeof(float) * 2;
}
params =
{
/*.mem_size =*/ctx_size,
/*.mem_buffer =*/NULL,
/*.no_alloc =*/false, // NOTE: this should be false when using the legacy API
};
mask.ctx = ggml_init(params);
int current_height = num_patches_per_side * num_patches_height;
int current_width = num_patches_per_side * num_patches_width;
float original_aspect_ratio = (float)original_width / (float)original_height;
float current_aspect_ratio = (float)current_width / (float)current_height;
printf("original_height: %d, original_width: %d, original_aspect_ratio: %.2f\n", original_height, original_width,
original_aspect_ratio);
printf("current_height: %d, current_width: %d, current_aspect_ratio: %.2f\n", current_height, current_width,
current_aspect_ratio);
float scale_factor = 1.0;
struct ggml_tensor* attention_mask = ggml_new_tensor_2d(mask.ctx, GGML_TYPE_F32, current_width, current_height);
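// Worked example with the hardcoded 955x289 input: original AR = 955/289 ~ 3.30 is wider than
// the 81x27 token grid's AR = 3.0, so scale = 81/955 ~ 0.085, new_height = 24,
// padding = (27 - 24) / 2 = 1, and only the first and last rows of the mask are zeroed.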
if (original_aspect_ratio > current_aspect_ratio){
scale_factor = (float)current_width / (float)original_width;
int new_height = int(original_height * scale_factor);
int padding = (current_height - new_height) / 2;
// printf("new_height: %d, padding: %d\n", new_height, padding);
float* attention_mask_data = (float*)attention_mask->data;
for (int i = 0; i < current_height; i++){
for (int j = 0; j < current_width; j++){
if (i < padding || i > padding + new_height){
attention_mask_data[i * current_width + j] = 0.0;
} else {
attention_mask_data[i * current_width + j] = 1.0;
}
}
}
}else{
scale_factor = (float)current_height / (float)original_height; // cast to avoid integer division truncating to 0
int new_width = int(original_width * scale_factor);
int padding = (current_width - new_width) / 2;
float* attention_mask_data = (float*)attention_mask->data;
for (int i = 0; i < current_height; i++){
for (int j = 0; j < current_width; j++){
if (j < padding || j > padding + new_width){
attention_mask_data[i * current_width + j] = 0.0;
} else {
attention_mask_data[i * current_width + j] = 1.0;
}
}
}
}
print_tensor(attention_mask, "attention_mask", 1);
tensor_to_csv(attention_mask, "/export/home/llama.cpp/examples/xgenmm/imgs/attention_mask_4patchhes.csv");
ggml_free(model.ctx);
ggml_free(mask.ctx);
ggml_free(tensor.ctx);
return 0;
}
// make test_patch_ops && ./bin/test_patch_ops
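
As a cross-check for the view/permute/flatten sequence above, the following is a minimal loop-based sketch (not part of this commit) of the same token reordering, assuming the grid sub-images (excluding the base image) are stored back to back in row-major grid order, 27x27 tokens each with 1152-dim embeddings:

#include <cstddef>
#include <vector>
// Reference reordering: src holds the per-sub-image token embeddings back to back
// ([sub_image][token][channel], tokens in raster order inside each sub-image);
// dst receives the tokens in raster order over the stitched grid, i.e. the same order
// produced by ggml_view_4d -> ggml_permute(0, 2, 1, 3) -> ggml_cont -> 2D flatten.
static std::vector<float> merge_patches_reference(const std::vector<float>& src,
                                                  int grid_h, int grid_w,
                                                  int side = 27, int dim = 1152)
{
    std::vector<float> dst(src.size());
    size_t out = 0;
    for (int gy = 0; gy < grid_h; gy++)          // grid row band
        for (int r = 0; r < side; r++)           // token row inside the band
            for (int gx = 0; gx < grid_w; gx++)  // sub-image within the band
                for (int c = 0; c < side; c++)   // token column inside the sub-image
                {
                    size_t img = (size_t)gy * grid_w + gx;
                    size_t in  = ((img * side + r) * side + c) * (size_t)dim;
                    for (int k = 0; k < dim; k++) dst[out++] = src[in + k];
                }
    return dst;
}

Comparing dst element by element against the flattened ggml tensor should give identical values.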

View file

@@ -343,31 +343,116 @@ static bool encode_image_with_clip(clip_ctx *ctx_clip, int n_threads, const clip
}
else if (clip_is_xgenmm(ctx_clip))
{
// xgenmm embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded =
clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 729 x
delete[] img_res_v.data;
if (!encoded)
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a
// solution to quickly get batching working
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
for (size_t i = 0; i < img_res_v.size; i++)
{
LOG_TEE("Unable to encode image\n");
return false;
image_embd_v[i] =
(float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(
ctx_clip, n_threads, &img_res_v.data[i],
image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded)
{
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1,
(int)img_res_v.size);
return false;
}
for (int j = 0; j < 5; j++)
{
printf(" %.4f ", image_embd_v[i][j]);
}
printf("\n");
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size,
(t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t *image_grid = clip_image_grid(ctx_clip);
std::vector<std::pair<int, int>> grid_pinpoints;
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2)
{
grid_pinpoints.push_back({image_grid[i], image_grid[i + 1]});
}
// free all img_res_v - not needed anymore
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
const int32_t image_size = clip_image_size(ctx_clip);
struct clip_image_grid_shape grid_shape =
get_anyres_image_grid_shape({img->nx, img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++)
{
free(image_embd_v[i]);
}
image_embd_v.clear();
// debug image/segment/normalization content:
// clip_image_u8 * tmp = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0)
{
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded =
clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded)
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
for (size_t i = 0; i < img_res_v.size; i++)
{
LOG_TEE("Unable to encode image\n");
return false;
image_embd_v[i] =
(float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(
ctx_clip, n_threads, &img_res_v.data[i],
image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded)
{
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1,
(int)img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size,
(t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t *image_grid = clip_image_grid(ctx_clip);
std::vector<std::pair<int, int>> grid_pinpoints;
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2)
{
grid_pinpoints.push_back({image_grid[i], image_grid[i + 1]});
}
// free all img_res_v - not needed anymore
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
const int32_t image_size = clip_image_size(ctx_clip);
struct clip_image_grid_shape grid_shape =
get_anyres_image_grid_shape({img->nx, img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++)
{
free(image_embd_v[i]);
}
image_embd_v.clear();
}
else
{