diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 1cdb2be74..73438e3f5 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1483,7 +1483,7 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1648,9 +1648,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // clip_image_u8_free(temp2); // } // res_imgs.push_back(res); + res_imgs.size = 1; res_imgs.data = new clip_image_f32[res_imgs.size]; res_imgs.data[0] = std::move(*res); + return true; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a9f71725d..6e3434030 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -31,23 +31,6 @@ using json = nlohmann::json; -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; - struct server_params { std::string hostname = "127.0.0.1"; @@ -992,10 +975,13 @@ struct llama_server_context { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); - delete[] img_res_v.data; + clip_image_f32_free(img_res_v.data); return false; } - clip_image_f32 * img_res = &img_res_v.data[0]; + + // note: assumes only one image was returned by clip_image_preprocess + clip_image_f32 * img_res = img_res_v.data; + img.image_tokens = clip_n_patches(clp_ctx); img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); if (!img.image_embedding) @@ -1010,8 +996,9 @@ struct llama_server_context LOG_TEE("Unable to encode image\n"); return false; } - // clip_image_f32_free(img_res); - delete[] img_res_v.data; + + clip_image_f32_free(img_res_v.data); + img.request_encode_image = false; }