the difference is from resize

commit ba0861e384 (parent b31dc0b5ed)
11 changed files with 4200 additions and 0 deletions
examples/CMakeLists.txt
@@ -39,6 +39,7 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
+    add_subdirectory(xgenmm)
     if (GGML_RPC)
         add_subdirectory(rpc)
     endif()
examples/xgenmm/CMakeLists.txt (new file, 51 lines)
@@ -0,0 +1,51 @@
add_library(xgenmm OBJECT
            xgenmm.cpp
            xgenmm.h
            clip.cpp
            clip.h
            )

target_link_libraries(xgenmm PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})

target_include_directories(xgenmm PUBLIC .)
target_include_directories(xgenmm PUBLIC ../..)
target_include_directories(xgenmm PUBLIC ../../common)

target_compile_features(xgenmm PRIVATE cxx_std_11)

add_library(xgenmm_static STATIC $<TARGET_OBJECTS:xgenmm>)
if (BUILD_SHARED_LIBS)
    set_target_properties(xgenmm PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(xgenmm PRIVATE LLAMA_SHARED LLAMA_BUILD)
    add_library(xgenmm_shared SHARED $<TARGET_OBJECTS:xgenmm>)
    target_link_libraries(xgenmm_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
    install(TARGETS xgenmm_shared LIBRARY)
endif()

if (NOT MSVC)
    target_compile_options(xgenmm PRIVATE -Wno-cast-qual)  # stb_image.h
endif()

if(TARGET BUILD_INFO)
    add_dependencies(xgenmm BUILD_INFO)
endif()


set(TARGET test_anyres_img)
add_executable(test_anyres_img test_anyres_img.cpp)
install(TARGETS test_anyres_img RUNTIME)
target_link_libraries(test_anyres_img PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(test_anyres_img PRIVATE cxx_std_11)


# not implemented yet
# set(TARGET xgenmm-cli)
# add_executable(xgenmm-cli xgenmm-cli.cpp)
# install(TARGETS xgenmm-cli RUNTIME)
# target_link_libraries(xgenmm-cli PRIVATE common xgenmm_io xgenmm ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(xgenmm PRIVATE cxx_std_11)

# add_library(xgenmm_io OBJECT
#             xgenmm_io.cpp
# )
# target_link_libraries(xgenmm_io PRIVATE xgenmm ${CMAKE_THREAD_LIBS_INIT})
examples/xgenmm/clip.cpp (new file, 2618 lines)
File diff suppressed because it is too large.

examples/xgenmm/clip.h (new file, 98 lines)
@@ -0,0 +1,98 @@
/*
    08/18/2024 - Yutong - This file is adapted from examples/llava/clip.h in the llama.cpp repository.
*/

#ifndef CLIP_H
#define CLIP_H

#include <stddef.h>
#include <stdint.h>

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define CLIP_API __declspec(dllexport)
#        else
#            define CLIP_API __declspec(dllimport)
#        endif
#    else
#        define CLIP_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define CLIP_API
#endif

#ifdef __cplusplus
extern "C" {
#endif

struct clip_ctx;

struct clip_image_size {
    int width;
    int height;
};

struct clip_image_u8_batch {
    struct clip_image_u8 * data;
    size_t size;
};

struct clip_image_f32_batch {
    struct clip_image_f32 * data;
    size_t size;
};

CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);

CLIP_API void clip_free(struct clip_ctx * ctx);

CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);

CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);

// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);

CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);

CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);

CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8  * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();

CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs);

CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);

#ifdef __cplusplus
}
#endif

#endif // CLIP_H
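As a quick orientation, this is how the declarations above are meant to compose. A minimal sketch, not part of the commit; the model path, image path, and thread count are placeholders, and error handling is trimmed to the essentials:

#include <cstdio>
#include <cstdlib>
#include "clip.h"

int main() {
    // Hypothetical file names, for illustration only.
    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/1);
    if (!ctx) { fprintf(stderr, "failed to load model\n"); return 1; }

    struct clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_file("input.jpg", img)) { return 1; }

    // Preprocess into one or more f32 tiles (anyres models yield several).
    struct clip_image_f32_batch batch = {/*data=*/nullptr, /*size=*/0};
    if (!clip_image_preprocess(ctx, img, &batch)) { return 1; }

    // One embedding buffer per tile; clip_embd_nbytes gives the per-tile size.
    float * embd = (float *)malloc(clip_embd_nbytes(ctx));
    clip_image_encode(ctx, /*n_threads=*/4, &batch.data[0], embd);  // first tile only

    free(embd);
    clip_image_f32_batch_free(&batch);
    clip_image_u8_free(img);
    clip_free(ctx);
    return 0;
}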
examples/xgenmm/debug.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from torchvision.transforms import Resize
from torchvision.transforms import InterpolationMode
from PIL import Image
import numpy as np

n_px = 384
resize_func = Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC, antialias=True)

img_dir = "./imgs"
image_path_1 = f'{img_dir}/image-1d100e9-1.jpg'
image_path_2 = f'{img_dir}/image-1d100e9.jpg'
image_1 = Image.open(image_path_1).convert('RGB')
image_2 = Image.open(image_path_2).convert('RGB')

print(np.asarray(resize_func(image_2))[:5, :10, 0])
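debug.py prints the torchvision reference values for the resized image; per the commit title, this is the slice where the C++ output diverges. A sketch of the equivalent spot check on the C++ side, assuming the clip_image_u8 struct and bicubic_resize() defined in test_anyres_img.cpp further down are in scope:

#include <cstdio>

// Same slice as np.asarray(resize_func(image_2))[:5, :10, 0] in debug.py.
static void print_red_corner(const clip_image_u8 & img)
{
    clip_image_u8 resized;
    bicubic_resize(img, resized, 384, 384);
    for (int y = 0; y < 5; y++)
    {
        for (int x = 0; x < 10; x++)
        {
            printf("%d ", resized.buf[3 * (y * resized.nx + x) + 0]);  // channel 0 = R
        }
        printf("\n");
    }
}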
examples/xgenmm/imgs/image-1d100e9-1.jpg (new binary file, 52 KiB; not shown)
examples/xgenmm/imgs/image-1d100e9.jpg (new binary file, 63 KiB; not shown)
@@ -1,5 +1,242 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Image Resize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from omegaconf import OmegaConf\n",
    "from open_flamingo.train.any_res_data_utils import process_images\n",
    "from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, Lambda\n",
    "from torchvision.transforms import InterpolationMode\n",
    "BICUBIC = InterpolationMode.BICUBIC\n",
    "from PIL import Image\n",
    "from functools import partial\n",
    "\n",
    "cfg = dict(\n",
    "    model_family = 'kosmos',\n",
    "    lm_path = 'microsoft/Phi-3-mini-4k-instruct',\n",
    "    # vision_encoder_path = 'ViT-H-14-378-quickgelu',\n",
    "    # vision_encoder_pretrained = 'dfn5b',\n",
    "    vision_encoder_path = 'google/siglip-so400m-patch14-384',\n",
    "    vision_encoder_pretrained = 'google',\n",
    "    num_vision_tokens = 128,\n",
    "    image_aspect_ratio = 'anyres',\n",
    "    anyres_patch_sampling = True,\n",
    "    anyres_grids=[[1,2],[2,1],[2,2],[3,1],[1,3]],\n",
    "    ckpt_pth = '/export/share/manli_shu/models/open-flamingo-dev/anyres_ablation_HFSiglip_patch128-kosmos_non_instruct-phi3_4k_instruct_nq128_pre_V3_5-llava_1p6_ocrmathmix_v4-8x8-ckpt2/checkpoint_0.pt',\n",
    ")\n",
    "cfg = OmegaConf.create(cfg)\n",
    "n_px = 384\n",
    "image_processor = Compose([\n",
    "    Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC, antialias=True),\n",
    "    Lambda(lambda x: x.convert('RGB') if x.mode != 'RGB' else x),\n",
    "    ToTensor(),\n",
    "    Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))\n",
    "    ])\n",
    "image_proc = partial(process_images, image_processor=image_processor, model_cfg=cfg)\n",
    "base_img_size = image_processor.transforms[0].size[0]\n",
    "anyres_grids = []\n",
    "for (m,n) in cfg.anyres_grids:\n",
    "    anyres_grids.append([base_img_size*m, base_img_size*n])\n",
    "cfg.anyres_grids = anyres_grids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "image_aspect_ratio: anyres\n",
      "anyres_grids: [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]\n"
     ]
    }
   ],
   "source": [
    "image_aspect_ratio = cfg.image_aspect_ratio\n",
    "print(f\"image_aspect_ratio: {image_aspect_ratio}\")\n",
    "anyres_grids = cfg.anyres_grids\n",
    "print(f\"anyres_grids: {anyres_grids}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "img_dir = \"./imgs\"\n",
    "image_path_1 = f'{img_dir}/image-1d100e9-1.jpg'\n",
    "image_path_2 = f'{img_dir}/image-1d100e9.jpg'\n",
    "image_1 = Image.open(image_path_1).convert('RGB')\n",
    "image_2 = Image.open(image_path_2).convert('RGB')\n",
    "images = [image_1, image_2]\n",
    "image_size = [image_1.size, image_2.size]\n",
    "image_size = [image_size]\n",
    "vision_x = [image_proc([img]) for img in images]\n",
    "vision_x = [vision_x]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'torchvision.transforms.transforms.Resize'>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[Resize(size=(384, 384), interpolation=bicubic, max_size=None, antialias=True),\n",
       " Lambda(),\n",
       " ToTensor(),\n",
       " Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(type(image_processor.transforms[0]))\n",
    "image_processor.transforms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[217, 212, 211, 213, 213, 210, 210, 210, 213, 214],\n",
       "       [213, 211, 212, 212, 209, 212, 211, 210, 210, 211],\n",
       "       [213, 211, 211, 212, 210, 213, 212, 211, 210, 210],\n",
       "       [215, 211, 209, 212, 212, 211, 210, 210, 210, 210],\n",
       "       [211, 208, 209, 211, 210, 211, 211, 211, 211, 211]], dtype=uint8)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "np.asarray(image_processor.transforms[0](image_2))[:5, :10, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cubic_interpolate(p, x):\n",
    "    return (\n",
    "        p[1] +\n",
    "        0.5 * x * (p[2] - p[0] + \n",
    "        x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + \n",
    "        x * (3.0 * (p[1] - p[2]) + p[3] - p[0])))\n",
    "    )\n",
    "\n",
    "def bicubic_interpolate(p, x, y):\n",
    "    arr = np.array([cubic_interpolate(p[i], y) for i in range(4)])\n",
    "    return cubic_interpolate(arr, x)\n",
    "\n",
    "def resize_bicubic_pil(image, new_width, new_height):\n",
    "    # Convert the PIL image to a NumPy array\n",
    "    image_np = np.array(image)\n",
    "\n",
    "    height, width, channels = image_np.shape\n",
    "    resized_image = np.zeros((new_height, new_width, channels))\n",
    "\n",
    "    x_ratio = width / new_width\n",
    "    y_ratio = height / new_height\n",
    "\n",
    "    for i in range(new_height):\n",
    "        for j in range(new_width):\n",
    "            x = j * x_ratio\n",
    "            y = i * y_ratio\n",
    "\n",
    "            x_int = int(x)\n",
    "            y_int = int(y)\n",
    "\n",
    "            x_diff = x - x_int\n",
    "            y_diff = y - y_int\n",
    "\n",
    "            p = np.zeros((4, 4, channels))\n",
    "\n",
    "            for m in range(-1, 3):\n",
    "                for n in range(-1, 3):\n",
    "                    xm = min(max(x_int + m, 0), width - 1)\n",
    "                    yn = min(max(y_int + n, 0), height - 1)\n",
    "                    p[m + 1, n + 1] = image_np[yn, xm]\n",
    "\n",
    "            for c in range(channels):\n",
    "                resized_image[i, j, c] = bicubic_interpolate(p[:, :, c], x_diff, y_diff)\n",
    "\n",
    "    # Convert the NumPy array back to a PIL image\n",
    "    resized_image = np.clip(resized_image, 0, 255).astype(np.uint8)\n",
    "    return Image.fromarray(resized_image)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[222, 217, 214, 216, 218, 213, 212, 214, 216, 218],\n",
       "       [213, 209, 209, 211, 209, 209, 209, 209, 208, 210],\n",
       "       [212, 210, 211, 212, 209, 213, 212, 209, 209, 210],\n",
       "       [217, 212, 211, 212, 212, 212, 211, 210, 210, 211],\n",
       "       [212, 208, 208, 210, 210, 210, 211, 211, 211, 210]], dtype=uint8)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = resize_bicubic_pil(image_2, base_img_size, base_img_size)\n",
    "np.asarray(res)[:5, :10, 0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model surgery"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
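The notebook derives its anyres pinpoints from the base tile size and the (rows, cols) grid list; the C++ test below hard-codes the same numbers into image_grid_pinpoints. A minimal sketch of that derivation, assuming the base size of 384 and the grids from the config cell above:

#include <cstdio>

int main() {
    // Mirrors the notebook loop: anyres_grids = [[1,2],[2,1],[2,2],[3,1],[1,3]] * 384.
    const int base_img_size = 384;
    const int grids[5][2] = {{1, 2}, {2, 1}, {2, 2}, {3, 1}, {1, 3}};
    for (const auto &g : grids) {
        printf("[%d, %d] ", base_img_size * g[0], base_img_size * g[1]);
    }
    printf("\n");  // [384, 768] [768, 384] [768, 768] [1152, 384] [384, 1152]
    return 0;
}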
examples/xgenmm/test_anyres_img.cpp (new file, 530 lines)
@@ -0,0 +1,530 @@
#include "ggml.h"
#include "common.h"
#include "clip.h"
#include "xgenmm.h"
#include "llama.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <string>
#include <vector>



struct clip_image_u8
{
    int nx;
    int ny;

    std::vector<uint8_t> buf;
};

struct clip_image_f32
{
    int nx;
    int ny;

    std::vector<float> buf;
};

inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); }

static bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height)
{
    const int nx = img.nx;
    const int ny = img.ny;

    dst.nx = target_width;
    dst.ny = target_height;
    dst.buf.resize(3 * target_width * target_height);

    float Cc;
    float C[5];
    float d0, d2, d3, a0, a1, a2, a3;
    int   i, j, k, jj;
    int   x, y;
    float dx, dy;
    float tx, ty;

    tx = (float)nx / (float)target_width;
    ty = (float)ny / (float)target_height;

    // Bicubic interpolation; adapted from ViT.cpp, inspired from:
    //  -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
    //  -> https://en.wikipedia.org/wiki/Bicubic_interpolation

    for (i = 0; i < target_height; i++)
    {
        for (j = 0; j < target_width; j++)
        {
            x = (int)(tx * j);
            y = (int)(ty * i);

            dx = tx * j - x;
            dy = ty * i - y;

            for (k = 0; k < 3; k++)
            {
                for (jj = 0; jj <= 3; jj++)
                {
                    d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];

                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                    a2 =  1.0 / 2 * d0 + 1.0 / 2 * d2;
                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;

                    C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;

                    d0 = C[0] - C[1];
                    d2 = C[2] - C[1];
                    d3 = C[3] - C[1];
                    a0 = C[1];
                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                    a2 =  1.0 / 2 * d0 + 1.0 / 2 * d2;
                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
                    Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;

                    const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
                    dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
                }
            }
        }
    }

    return true;
}

enum projector_type
{
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_RESAMPLER,
    PROJECTOR_TYPE_UNKNOWN,
};

static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    {PROJECTOR_TYPE_MLP, "mlp"},
    {PROJECTOR_TYPE_LDP, "ldp"},
    {PROJECTOR_TYPE_LDPV2, "ldpv2"},
    {PROJECTOR_TYPE_RESAMPLER, "resampler"},
};



struct clip_hparams
{
    int32_t image_size;
    int32_t patch_size;
    int32_t hidden_size;
    int32_t n_intermediate;
    int32_t projection_dim;
    int32_t n_head;
    int32_t n_layer;

    float eps;

    char mm_patch_merge_type[32] = "flat";  // spatial_unpad or flat (default)

    int32_t image_grid_pinpoints[32];
    int32_t image_crop_resolution;
};

struct clip_layer
{
    // attention
    struct ggml_tensor* k_w;
    struct ggml_tensor* k_b;
    struct ggml_tensor* q_w;
    struct ggml_tensor* q_b;
    struct ggml_tensor* v_w;
    struct ggml_tensor* v_b;

    struct ggml_tensor* o_w;
    struct ggml_tensor* o_b;

    // layernorm 1
    struct ggml_tensor* ln_1_w;
    struct ggml_tensor* ln_1_b;

    // ff
    struct ggml_tensor* ff_i_w;
    struct ggml_tensor* ff_i_b;

    struct ggml_tensor* ff_o_w;
    struct ggml_tensor* ff_o_b;

    // layernorm 2
    struct ggml_tensor* ln_2_w;
    struct ggml_tensor* ln_2_b;
};

struct clip_vision_model
{
    struct clip_hparams hparams;

    // embeddings
    struct ggml_tensor* class_embedding;
    struct ggml_tensor* patch_embeddings;
    struct ggml_tensor* patch_bias;
    struct ggml_tensor* position_embeddings;

    struct ggml_tensor* pre_ln_w;
    struct ggml_tensor* pre_ln_b;

    std::vector<clip_layer> layers;

    struct ggml_tensor* post_ln_w;
    struct ggml_tensor* post_ln_b;

    struct ggml_tensor* projection;

    // LLaVA projection
    struct ggml_tensor* mm_0_w = NULL;
    struct ggml_tensor* mm_0_b = NULL;
    struct ggml_tensor* mm_2_w = NULL;
    struct ggml_tensor* mm_2_b = NULL;

    struct ggml_tensor* image_newline = NULL;

    // Yi type models with mlp+normalization projection
    struct ggml_tensor* mm_1_w = NULL;  // Yi type models have 0, 1, 3, 4
    struct ggml_tensor* mm_1_b = NULL;
    struct ggml_tensor* mm_3_w = NULL;
    struct ggml_tensor* mm_3_b = NULL;
    struct ggml_tensor* mm_4_w = NULL;
    struct ggml_tensor* mm_4_b = NULL;

    // MobileVLM projection
    struct ggml_tensor* mm_model_mlp_1_w;
    struct ggml_tensor* mm_model_mlp_1_b;
    struct ggml_tensor* mm_model_mlp_3_w;
    struct ggml_tensor* mm_model_mlp_3_b;
    struct ggml_tensor* mm_model_block_1_block_0_0_w;
    struct ggml_tensor* mm_model_block_1_block_0_1_w;
    struct ggml_tensor* mm_model_block_1_block_0_1_b;
    struct ggml_tensor* mm_model_block_1_block_1_fc1_w;
    struct ggml_tensor* mm_model_block_1_block_1_fc1_b;
    struct ggml_tensor* mm_model_block_1_block_1_fc2_w;
    struct ggml_tensor* mm_model_block_1_block_1_fc2_b;
    struct ggml_tensor* mm_model_block_1_block_2_0_w;
    struct ggml_tensor* mm_model_block_1_block_2_1_w;
    struct ggml_tensor* mm_model_block_1_block_2_1_b;
    struct ggml_tensor* mm_model_block_2_block_0_0_w;
    struct ggml_tensor* mm_model_block_2_block_0_1_w;
    struct ggml_tensor* mm_model_block_2_block_0_1_b;
    struct ggml_tensor* mm_model_block_2_block_1_fc1_w;
    struct ggml_tensor* mm_model_block_2_block_1_fc1_b;
    struct ggml_tensor* mm_model_block_2_block_1_fc2_w;
    struct ggml_tensor* mm_model_block_2_block_1_fc2_b;
    struct ggml_tensor* mm_model_block_2_block_2_0_w;
    struct ggml_tensor* mm_model_block_2_block_2_1_w;
    struct ggml_tensor* mm_model_block_2_block_2_1_b;

    // MobileVLM_V2 projection
    struct ggml_tensor* mm_model_mlp_0_w;
    struct ggml_tensor* mm_model_mlp_0_b;
    struct ggml_tensor* mm_model_mlp_2_w;
    struct ggml_tensor* mm_model_mlp_2_b;
    struct ggml_tensor* mm_model_peg_0_w;
    struct ggml_tensor* mm_model_peg_0_b;

    // MINICPMV projection
    struct ggml_tensor* mm_model_pos_embed_k;
    struct ggml_tensor* mm_model_query;
    struct ggml_tensor* mm_model_proj;
    struct ggml_tensor* mm_model_kv_proj;
    struct ggml_tensor* mm_model_attn_q_w;
    struct ggml_tensor* mm_model_attn_q_b;
    struct ggml_tensor* mm_model_attn_k_w;
    struct ggml_tensor* mm_model_attn_k_b;
    struct ggml_tensor* mm_model_attn_v_w;
    struct ggml_tensor* mm_model_attn_v_b;
    struct ggml_tensor* mm_model_attn_o_w;
    struct ggml_tensor* mm_model_attn_o_b;
    struct ggml_tensor* mm_model_ln_q_w;
    struct ggml_tensor* mm_model_ln_q_b;
    struct ggml_tensor* mm_model_ln_kv_w;
    struct ggml_tensor* mm_model_ln_kv_b;
    struct ggml_tensor* mm_model_ln_post_w;
    struct ggml_tensor* mm_model_ln_post_b;
};

struct clip_ctx {
    bool has_text_encoder = false;
    bool has_vision_encoder = false;
    bool has_llava_projector = false;
    bool has_minicpmv_projector = false;
    int minicpmv_version = 2;

    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;

    float image_mean[3];
    float image_std[3];
    bool use_gelu = false;
    int32_t ftype = 1;

    bool has_class_embedding = true;
    bool has_pre_norm = true;
    bool has_post_norm = false;
    bool has_patch_bias = false;

    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_data;

    std::vector<uint8_t> buf_compute_meta;

    // memory buffers to evaluate the model
    ggml_backend_buffer_t params_buffer = NULL;

    ggml_backend_t backend = NULL;
    ggml_gallocr_t compute_alloc = NULL;

    struct clip_image_size * load_image_size;
};

static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long* sizeOut)
{
    auto file = fopen(path, "rb");
    if (file == NULL)
    {
        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

    fseek(file, 0, SEEK_END);
    auto fileSize = ftell(file);
    fseek(file, 0, SEEK_SET);

    auto buffer = (unsigned char*)malloc(fileSize);  // Allocate memory to hold the file data
    if (buffer == NULL)
    {
        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
    }
    errno = 0;
    size_t ret = fread(buffer, 1, fileSize, file);  // Read the file into the buffer
    if (ferror(file))
    {
        die_fmt("read error: %s", strerror(errno));
    }
    if (ret != (size_t)fileSize)
    {
        die("unexpectedly reached end of file");
    }
    fclose(file);  // Close the file

    *bytesOut = buffer;
    *sizeOut  = fileSize;
    return true;
}

void print_img(clip_image_u8* img)
{
    const int nx = img->nx;
    const int ny = img->ny;
    printf("num pixels: %zu\n", img->buf.size());
    printf("raw img: nx:%d | ny:%d\n", nx, ny);

    const int n = nx * ny;
    for (int k = 0; k < 3; k++)
    {
        for (int y = 0; y < 5; y++)
        {
            for (int x = 0; x < 10; x++)
            {
                // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k];
                printf("%d ", img->buf[3 * (y * nx + x) + k]);
            }
            printf("\n");
        }
        printf("\n");
    }
}

int main(){
    /*
        Pytorch Image Processing Pipeline
        n_px = hf_processor.image_processor.size['height']
        image_processor = Compose([
            Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC, antialias=True),
            Lambda(lambda x: x.convert('RGB') if x.mode != 'RGB' else x),
            ToTensor(),
            Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])
        anyres_grids = [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
        grid_pinpoints = anyres_grids
        best_resolution = select_best_resolution(image.size, possible_resolutions)
        image_padded = resize_and_pad_image(image, best_resolution)
        processor_size = processor.transforms[0].size
        patches = divide_to_patches(image_padded, processor_size[0])
        image_original_resize = image.resize((processor_size[0], processor_size[0]))
        image_patches = [image_original_resize] + patches
        image_patches = [processor(image_patch) for image_patch in image_patches]
        return torch.stack(image_patches, dim=0)

        This part is already implemented in the clip_image_preprocess function in clip.cpp.
    */

    const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
    // struct ggml_context* meta = NULL;

    // struct gguf_init_params params = {
    //     /*.no_alloc = */ true,
    //     /*.ctx      = */ &meta,
    // };

    // struct gguf_context* ctx = gguf_init_from_file(clip_path, params);
    // if (!ctx)
    // {
    //     throw std::runtime_error(
    //         format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, clip_path));
    // }
    struct clip_ctx * ctx = clip_model_load(clip_path, /*verbosity=*/2);
    printf("Model loaded\n");
    for (int i = 0; i < 3; i++){
        ctx->image_mean[i] = 0.5;
        ctx->image_std[i]  = 0.5;
    }
    LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
    LOG_TEE("v_image_std  %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
    // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
    ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
    ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
    for (int i = 0; i < 10; i++)
    {
        printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
    }
    printf("\n");
    ctx->vision_model.hparams.image_size = 384;
    printf("params.image_size:%d\n", ctx->vision_model.hparams.image_size);
    /*
        part of:
        llava_image_embed_make_with_filename
    */
    const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg";  // Porcelain
    // const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg";
    unsigned char* image_bytes;
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded)
    {
        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return 1;
    }

    /*
        part of:
        llava_image_embed_make_with_bytes
    */
    clip_image_u8* img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img))
    {
        clip_image_u8_free(img);
        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
        return 1;
    }

    print_img(img);

    clip_image_u8* image_original_resize = clip_image_u8_init();
    bicubic_resize(*img, *image_original_resize, 384, 384);

    print_img(image_original_resize);

    // printf("num pixels: %d\n", image_original_resize->buf.size());
    // printf("raw img: nx:%d | ny:%d\n", image_original_resize->nx, image_original_resize->ny);

    // /*
    //     part of:
    //     encode_image_with_clip
    // */
    // clip_image_f32_batch img_res_v;
    // img_res_v.size = 0;
    // img_res_v.data = nullptr;

    // if (!clip_image_preprocess(ctx, img, &img_res_v))
    // {
    //     LOG_TEE("%s: unable to preprocess image\n", __func__);
    //     delete[] img_res_v.data;
    //     return false;
    // }
    // printf("img->nx:%ld | img->ny:%ld\n", img->nx, img->ny);
    // // printf("img_res_v.size:%ld\n", img_res_v.size);
    // printf("img_res_v->nx:%ld | img_res_v->ny:%ld\n", img_res_v.data->nx, img_res_v.data->ny);
    // // std::cout << img_res_v.data->nx << " | " << img_res_v.data->ny << std::endl;
    // // std::cout << img_res_v.data->buf.size() << std::endl;

    // const char* mm_patch_merge_type = clip_patch_merge_type(ctx);
    // printf("mm_patch_merge_type:%s\n", mm_patch_merge_type);


    // for (size_t i = 0; i < img_res_v.size; i++) {
    //     const int nx = img_res_v.data[i].nx;
    //     const int ny = img_res_v.data[i].ny;
    //     printf("i:%d | nx:%d | ny:%d\n", i, nx, ny);

    //     const int n = nx * ny;

    //     for (int k = 0; k < 1; k++) {
    //         for (int y = 0; y < 5; y++) {
    //             for (int x = 0; x < 10; x++) {
    //                 // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k];
    //                 printf("%.4f ", img_res_v.data[i].buf[3 * (y * nx + x) + k]);
    //             }
    //             printf("\n");
    //         }
    //         printf("\n");
    //     }

    // }


    // /*
    //     part of:
    //     clip_image_encode
    // */
    // clip_image_f32_batch imgs{};
    // imgs.size = 1;
    // imgs.data = &img_res_v.data[0];


    // /*
    //     part of:
    //     clip_image_batch_encode
    // */
    // const clip_image_f32_batch * imgs_f32_const = &imgs;
    // int batch_size = imgs_f32_const->size;
    // if (ctx->has_llava_projector) {
    //     GGML_ASSERT(batch_size == 1);  // TODO: support multiple images
    // }
    // if (ctx->has_minicpmv_projector) {
    //     GGML_ASSERT(batch_size == 1);
    // }




    return 0;
}


// make test_anyres_img && ./bin/test_anyres_img
examples/xgenmm/xgenmm.cpp (new file, 597 lines)
@@ -0,0 +1,597 @@
/*
    08/18/2024 - Yutong - This file is adapted from examples/llava/llava.cpp in the llama.cpp repository.
*/


#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <numeric>
#include <vector>

#include "base64.hpp"
#include "clip.h"
#include "common.h"
#include "llama.h"
#include "xgenmm.h"

// RGB uint8 image
struct clip_image_u8
{
    int nx;
    int ny;

    std::vector<uint8_t> buf;
};

// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32
{
    int nx;
    int ny;

    std::vector<float> buf;
};

struct clip_image_grid_shape
{
    int first;
    int second;
};

/**
 * Selects the best resolution from a list of possible resolutions based on the original size.
 *
 * @param original_size The original size of the image in the format (width, height).
 * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
 * @return The best fit resolution in the format (width, height).
 */
static std::pair<int, int> select_best_resolution(const std::pair<int, int> &original_size,
                                                  const std::vector<std::pair<int, int>> &possible_resolutions)
{
    int original_width  = original_size.first;
    int original_height = original_size.second;

    std::pair<int, int> best_fit;
    int max_effective_resolution = 0;
    int min_wasted_resolution    = std::numeric_limits<int>::max();

    for (const auto &resolution : possible_resolutions)
    {
        int width  = resolution.first;
        int height = resolution.second;
        float scale =
            std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
        int downscaled_width  = static_cast<int>(original_width * scale);
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale,
        //         downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution ||
            (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution))
        {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution    = wasted_resolution;
            best_fit = resolution;
        }
    }

    return best_fit;
}
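To make the selection concrete, here is a small self-contained sketch of the same scoring rule applied to a hypothetical 800x600 input with the pinpoints used throughout this commit; 768x768 wins because it preserves the most effective resolution, and dividing by the 384 image_size then gives a 2x2 grid:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Same scoring as select_best_resolution, applied to a hypothetical 800x600 image.
int main() {
    std::pair<int, int> original = {800, 600};
    std::vector<std::pair<int, int>> pinpoints = {
        {384, 768}, {768, 384}, {768, 768}, {1152, 384}, {384, 1152}};
    std::pair<int, int> best; int max_eff = 0, min_wasted = 1 << 30;
    for (const auto &r : pinpoints) {
        float scale = std::min((float)r.first / original.first, (float)r.second / original.second);
        int eff = std::min((int)(original.first * scale) * (int)(original.second * scale),
                           original.first * original.second);
        int wasted = r.first * r.second - eff;
        if (eff > max_eff || (eff == max_eff && wasted < min_wasted)) {
            max_eff = eff; min_wasted = wasted; best = r;
        }
    }
    // Prints: best: 768x768 -> grid 2x2   (with image_size = 384)
    printf("best: %dx%d -> grid %dx%d\n", best.first, best.second, best.first / 384, best.second / 384);
    return 0;
}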

/**
 * @brief Get the anyres image grid shape object
 *
 * @param image_size
 * @param grid_pinpoints
 * @param image_patch_size
 * @return <int, int>
 */
static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> &image_size,
                                                                const std::vector<std::pair<int, int>> &grid_pinpoints,
                                                                int image_patch_size)
{
    /**
        Conversion from gguf flat array to vector:
        std::vector<std::pair<int, int>> possible_resolutions;
        for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
        }
     */
    auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
    return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
}

// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into
// preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx *ctx_clip, std::vector<float *> &image_embd_v,
                                      struct clip_image_grid_shape grid_shape, float *image_embd_out,
                                      int *n_img_pos_out)
{
    struct
    {
        struct ggml_context *ctx;
    } model;

    const int32_t image_size = clip_image_size(ctx_clip);
    const int32_t patch_size = clip_patch_size(ctx_clip);

    int32_t num_patches_per_side =
        image_size / patch_size;  // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)

    int num_patches_width  = grid_shape.first;   // grid 1-4
    int num_patches_height = grid_shape.second;  // grid 1-4

    const size_t num_images = num_patches_width * num_patches_height + 1;

    // TODO: size calculation is not calculated - it's only tens of MB
    size_t ctx_size = 0;

    {
        ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8;  // image_features
        ctx_size += 1024 * 1024 * ggml_type_size(GGML_TYPE_F32);
    }

    struct ggml_init_params params
    {
        /*.mem_size   =*/ctx_size,
        /*.mem_buffer =*/NULL,
        /*.no_alloc   =*/false,  // NOTE: this should be false when using the legacy API
    };

    // Python reference code for full unpad:
    /*
        base_image_feature = image_feature[0]
        image_feature = image_feature[1:]
        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
        image_feature = unpad_image(image_feature, image_sizes[image_idx])
        image_feature = torch.cat((
            image_feature,
            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
        ), dim=-1)
        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
    */
    // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
    // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D
    // tensors are not supported in ggml yet. Without unpad we have to split the sub-image embeddings into patches of 24
    // features each and permute them. Once all images are processed, we prepend the base_image_features without any
    // changes.

    // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2
    // grid image (676x676 scaling))
    /*
        image_feature = image_feature.view(2, 2, 24, 24, 4096)
        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
        image_feature = image_feature.view(2, 24, 2, 24, 4096)
        image_feature = image_feature.flatten(0, 3)

        // Reshape to 4D tensor by merging the last two dimensions
        image_feature = image_feature.view(2, 2, 24, 24*4096)
        image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
        image_feature = image_feature.view(-1, 4096)
    */

    model.ctx = ggml_init(params);

    struct ggml_tensor *image_features =
        ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip),
                           num_images - 1);  // example: 4096 x 576 x 4
    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
    // fill it with the image embeddings, ignoring the base
    for (size_t i = 1; i < num_images; i++)
    {
        size_t offset = (i - 1) * clip_embd_nbytes(ctx_clip);
        memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
    }

    struct ggml_cgraph *gf = ggml_new_graph(model.ctx);
    size_t size_ele = ggml_type_size(GGML_TYPE_F32);

    struct ggml_tensor *image_features_patchview = ggml_view_4d(
        model.ctx, image_features, num_patches_per_side * clip_n_mmproj_embd(ctx_clip), num_patches_per_side,
        num_patches_width, num_patches_height, size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
        size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
        size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
    struct ggml_tensor *permuted_cont =
        ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
    /**
        At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
        image_feature = torch.cat((
            image_feature,
            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
        ), dim=-1)
     */

    // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
    struct ggml_tensor *flatten =
        ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip),
                     num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side,
                     size_ele * clip_n_mmproj_embd(ctx_clip), 0);
    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
    ggml_build_forward_expand(gf, flatten);
    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
    struct ggml_tensor *result = gf->nodes[gf->n_nodes - 1];

    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip));  // main image as global context
    // append without newline tokens (default behavior in llava_arch when not using unpad):
    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float *)result->data,
           clip_embd_nbytes(ctx_clip) * (num_images - 1));  // grid patches
    *n_img_pos_out = static_cast<int>(result->ne[1] + clip_n_patches(ctx_clip));

    // Debug: Test single segments
    // Current findings: sending base image, sending a segment embedding all works similar to python
    // However, permuted embeddings do not work yet (stride issue?)
    // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
    // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
    // *n_img_pos_out=576;

    ggml_free(model.ctx);
    return true;
}
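The stride arithmetic above is easier to follow with numbers. A sketch of the shape bookkeeping for the 2x2-grid case described in the comments; the 24 patches per side and 4096-dim projector output are assumptions taken from those comments, not read from a model:

#include <cstdio>

int main() {
    // Shape bookkeeping for clip_llava_handle_patches, 2x2 grid example.
    const long embd = 4096;  // clip_n_mmproj_embd
    const long side = 24;    // num_patches_per_side (336 / 14)
    const long gw = 2, gh = 2;

    // image_features: [embd, side*side, gw*gh] = [4096, 576, 4]
    printf("image_features: %ld x %ld x %ld\n", embd, side * side, gw * gh);
    // 4D patch view: [side*embd, side, gw, gh]
    printf("patchview:      %ld x %ld x %ld x %ld\n", side * embd, side, gw, gh);
    // after permute(0, 2, 1, 3) + cont, flattened back to [embd, gh*gw*side*side] = [4096, 2304]
    printf("flatten:        %ld x %ld\n", embd, gh * gw * side * side);
    // positions written out: base image (576) + grid patches (2304) = 2880
    printf("n_img_pos:      %ld\n", side * side + gh * gw * side * side);
    return 0;
}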

static clip_image_f32 *only_v2_5_reshape_by_patch(clip_image_f32 *image, int patch_size)
{
    int width  = image->nx;
    int height = image->ny;
    int num_patches = (height / patch_size) * (width / patch_size);
    clip_image_f32 *patch = clip_image_f32_init();
    patch->nx = patch_size * num_patches;
    patch->ny = patch_size;
    patch->buf.resize(3 * patch->nx * patch->ny);

    int patch_index = 0;

    for (int i = 0; i < height; i += patch_size)
    {
        for (int j = 0; j < width; j += patch_size)
        {
            for (int pi = 0; pi < patch_size; ++pi)
            {
                for (int pj = 0; pj < patch_size; ++pj)
                {
                    int input_index  = ((i + pi) * width + (j + pj)) * 3;
                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
                    patch->buf[output_index]     = image->buf[input_index];
                    patch->buf[output_index + 1] = image->buf[input_index + 1];
                    patch->buf[output_index + 2] = image->buf[input_index + 2];
                }
            }
            patch_index++;
        }
    }
    return patch;
}

static bool encode_image_with_clip(clip_ctx *ctx_clip, int n_threads, const clip_image_u8 *img, float *image_embd,
                                   int *n_img_pos)
{
    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB -
    // different to the python implementation which is N x 3 x 336 x 336
    clip_image_f32_batch img_res_v;
    img_res_v.size = 0;
    img_res_v.data = nullptr;
    if (!clip_image_preprocess(ctx_clip, img, &img_res_v))
    {
        LOG_TEE("%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
    }

    const int64_t t_img_enc_start_us = ggml_time_us();

    const char *mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

    if (clip_is_minicpmv(ctx_clip))
    {
        std::vector<float *> image_embd_v;
        image_embd_v.resize(img_res_v.size);
        struct clip_image_size *load_image_size = clip_image_size_init();
        for (size_t i = 0; i < img_res_v.size; i++)
        {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
            int patch_size = 14;
            load_image_size->width  = img_res_v.data[i].nx;
            load_image_size->height = img_res_v.data[i].ny;
            clip_add_load_image_size(ctx_clip, load_image_size);
            bool encoded = false;
            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
            if (has_minicpmv_projector == 2)
            {
                encoded = clip_image_encode(
                    ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
            }
            else if (has_minicpmv_projector == 3)
            {
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            if (!encoded)
            {
                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1,
                        (int)img_res_v.size);
                return false;
            }
            const int64_t t_img_enc_step_end_us = ggml_time_us();
            LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i + 1, (int)img_res_v.size,
                    (t_img_enc_step_end_us - t_img_enc_step_start_us) / 1000.0);
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
        LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size,
                (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++)
        {
            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i],
                        clip_embd_nbytes(ctx_clip));
            n_img_pos_out += clip_n_patches(ctx_clip);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++)
        {
            free(image_embd_v[i]);
        }
        image_embd_v.clear();
        load_image_size->width  = img->nx;
        load_image_size->height = img->ny;
        clip_add_load_image_size(ctx_clip, load_image_size);
        LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0)
    {
        // flat / default llava-1.5 type embedding
        *n_img_pos = clip_n_patches(ctx_clip);
        bool encoded =
            clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);  // image_embd shape is 576 x 4096
        delete[] img_res_v.data;
        if (!encoded)
        {
            LOG_TEE("Unable to encode image\n");

            return false;
        }
    }
    else
    {
        // spatial_unpad llava-1.6 type embedding
        // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a
        // solution to quickly get batching working
        std::vector<float *> image_embd_v;
        image_embd_v.resize(img_res_v.size);
        for (size_t i = 0; i < img_res_v.size; i++)
        {
            image_embd_v[i] =
                (float *)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
            const bool encoded = clip_image_encode(
                ctx_clip, n_threads, &img_res_v.data[i],
                image_embd_v[i]);  // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded)
            {
                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1,
                        (int)img_res_v.size);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size,
                (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t *image_grid = clip_image_grid(ctx_clip);

        std::vector<std::pair<int, int>> grid_pinpoints;
        for (int i = 0; i < 32 && image_grid[i] != 0; i += 2)
        {
            grid_pinpoints.push_back({image_grid[i], image_grid[i + 1]});
        }

        // free all img_res_v - not needed anymore
        delete[] img_res_v.data;
        img_res_v.size = 0;
        img_res_v.data = nullptr;

        const int32_t image_size = clip_image_size(ctx_clip);

        struct clip_image_grid_shape grid_shape =
            get_anyres_image_grid_shape({img->nx, img->ny}, grid_pinpoints, image_size);

        int n_img_pos_out;
        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
        *n_img_pos = n_img_pos_out;

        for (size_t i = 0; i < image_embd_v.size(); i++)
        {
            free(image_embd_v[i]);
        }
        image_embd_v.clear();

        // debug image/segment/normalization content:
        // clip_image_u8 * tmp = clip_image_u8_init();
        // clip_image_convert_f32_to_u8(*image_feature, *tmp);
        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms,
            t_img_enc_ms / *n_img_pos);

    return true;
}

bool llava_validate_embed_size(const llama_context *ctx_llama, const clip_ctx *ctx_clip)
{
    // make sure that the correct mmproj was used, i.e., compare apples to apples
    int n_llama_embd  = llama_n_embd(llama_get_model(ctx_llama));
    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
    if (n_image_embd != n_llama_embd)
    {
        LOG_TEE(
            "%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you "
            "use the correct mmproj file.\n",
            __func__, n_image_embd, n_llama_embd);
        return false;
    }
    return true;
}

bool llava_image_embed_make_with_clip_img(clip_ctx *ctx_clip, int n_threads, const clip_image_u8 *img,
                                          float **image_embd_out, int *n_img_pos_out)
{
    int num_max_patches = 6;
    if (clip_is_minicpmv(ctx_clip))
    {
        num_max_patches = 10;
    }
    float *image_embd =
        (float *)malloc(clip_embd_nbytes(ctx_clip) * num_max_patches);  // TODO: base on gridsize/llava model
    if (!image_embd)
    {
        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_img_pos;
    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos))
    {
        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
        free(image_embd);
        return false;
    }
    *image_embd_out = image_embd;
    *n_img_pos_out  = n_img_pos;

    return true;
}

bool llava_eval_image_embed(llama_context *ctx_llama, const struct llava_image_embed *image_embed, int n_batch,
                            int *n_past)
{
    int n_embd = llama_n_embd(llama_get_model(ctx_llama));

    for (int i = 0; i < image_embed->n_image_pos; i += n_batch)
    {
        int n_eval = image_embed->n_image_pos - i;
        if (n_eval > n_batch)
        {
            n_eval = n_batch;
        }
        llama_batch batch = {
            int32_t(n_eval),
            nullptr,
            (image_embed->embed + i * n_embd),
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            *n_past,
            1,
            0,
        };
        if (llama_decode(ctx_llama, batch))
        {
            LOG_TEE("%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

struct llava_image_embed *llava_image_embed_make_with_bytes(struct clip_ctx *ctx_clip, int n_threads,
                                                            const unsigned char *image_bytes, int image_bytes_length)
{
    clip_image_u8 *img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img))
    {
        clip_image_u8_free(img);
        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
        return NULL;
    }

    float *image_embed = NULL;
    int n_image_pos = 0;
    bool image_embed_result =
        llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
    if (!image_embed_result)
    {
        clip_image_u8_free(img);
        LOG_TEE("%s: couldn't embed the image\n", __func__);
        return NULL;
    }

    clip_image_u8_free(img);
    auto result = (llava_image_embed *)malloc(sizeof(llava_image_embed));
    result->embed = image_embed;
    result->n_image_pos = n_image_pos;
    return result;
}

static bool load_file_to_bytes(const char *path, unsigned char **bytesOut, long *sizeOut)
{
    auto file = fopen(path, "rb");
    if (file == NULL)
    {
        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

    fseek(file, 0, SEEK_END);
    auto fileSize = ftell(file);
    fseek(file, 0, SEEK_SET);

    auto buffer = (unsigned char *)malloc(fileSize);  // Allocate memory to hold the file data
    if (buffer == NULL)
    {
        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
    }
    errno = 0;
    size_t ret = fread(buffer, 1, fileSize, file);  // Read the file into the buffer
    if (ferror(file))
    {
        die_fmt("read error: %s", strerror(errno));
    }
    if (ret != (size_t)fileSize)
    {
        die("unexpectedly reached end of file");
    }
    fclose(file);  // Close the file

    *bytesOut = buffer;
    *sizeOut  = fileSize;
    return true;
}

struct llava_image_embed *llava_image_embed_make_with_filename(struct clip_ctx *ctx_clip, int n_threads,
                                                               const char *image_path)
{
    unsigned char *image_bytes;
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded)
    {
        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }

    llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
    free(image_bytes);

    return embed;
}

void llava_image_embed_free(struct llava_image_embed *embed)
{
    free(embed->embed);
    free(embed);
}
examples/xgenmm/xgenmm.h (new file, 53 lines)
@@ -0,0 +1,53 @@
/*
    08/18/2024 - Yutong - This file is adapted from examples/llava/llava.h in the llama.cpp repository.
*/

#ifndef LLAVA_H
#define LLAVA_H

#include "ggml.h"

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define XGENMM_API __declspec(dllexport)
#        else
#            define XGENMM_API __declspec(dllimport)
#        endif
#    else
#        define XGENMM_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define XGENMM_API
#endif

#ifdef __cplusplus
extern "C" {
#endif

struct clip_ctx;
struct llava_image_embed {
    float * embed;
    int n_image_pos;
};

/** sanity check for clip <-> llava embed size match */
XGENMM_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);

XGENMM_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

/** build an image embed from image file bytes */
XGENMM_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
XGENMM_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
/** free an embedding made with llava_image_embed_make_* */
XGENMM_API void llava_image_embed_free(struct llava_image_embed * embed);

/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
XGENMM_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);

#ifdef __cplusplus
}
#endif

#endif
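Taken together, the intended call sequence mirrors the llava example. A minimal sketch, assuming a llama_context and a clip_ctx have already been created; the image path and thread count are placeholders:

#include "llama.h"
#include "xgenmm.h"

// Sketch: embed one image and write it into the llama context at *n_past.
bool eval_one_image(struct llama_context * ctx_llama, struct clip_ctx * ctx_clip,
                    const char * image_path, int n_batch, int * n_past)
{
    if (!llava_validate_embed_size(ctx_llama, ctx_clip))
    {
        return false;  // wrong mmproj for this language model
    }
    struct llava_image_embed * embed =
        llava_image_embed_make_with_filename(ctx_clip, /*n_threads=*/4, image_path);
    if (!embed)
    {
        return false;
    }
    // Decodes the embedding in n_batch-sized chunks and advances *n_past.
    bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, n_past);
    llava_image_embed_free(embed);
    return ok;
}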