llava : fixes

- avoid ggml_repeat
- use GGML_USE_ instead of CLIP_USE_ macros
- remove unused vars
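
For context, a minimal sketch of what the first bullet changes, paraphrased from the clip.cpp hunk further down (ctx0, ctx, model, embeddings, hidden_size and batch_size are the surrounding variables in clip_image_build_graph; this is an illustration of the pattern, not the full function):

```cpp
// before: materialize a scratch tensor, repeat the class embedding into it, then accumulate
struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size);
ggml_allocr_alloc(ctx->compute_alloc, temp);
embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp),
        embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);

// after: accumulate the class embedding directly; the scratch tensor and the
// ggml_repeat node are no longer needed
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
        embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
```

This drops one graph node and one temporary allocation from the image graph build.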
Georgi Gerganov 2023-12-29 18:44:31 +02:00
parent 2cf4f37e36
commit 44c5f7b1dd
2 changed files with 18 additions and 30 deletions
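
For context on the second bullet: clip.cpp previously gated its CUDA and Metal paths on llava-specific CLIP_USE_CUBLAS / CLIP_USE_METAL definitions added in the example's CMakeLists.txt; after this commit it uses the project-wide GGML_USE_CUBLAS / GGML_USE_METAL macros instead, so the per-example add_definitions() calls can go away. A condensed sketch of the resulting guard pattern (collected from the clip.cpp hunks below, assuming those macros are defined by the top-level build when LLAMA_CUBLAS / LLAMA_METAL are enabled):

```cpp
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

// inside clip_model_load(): pick a GPU backend when one was compiled in,
// otherwise fall back to the CPU backend
#ifdef GGML_USE_CUBLAS
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
#endif

#ifdef GGML_USE_METAL
    new_clip->backend = ggml_backend_metal_init();
    printf("%s: CLIP using Metal backend\n", __func__);
#endif

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
        printf("%s: CLIP using CPU backend\n", __func__);
    }
```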

examples/llava/CMakeLists.txt

@@ -24,7 +24,8 @@ endif()
if (NOT MSVC)
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
endif()
endif()
if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
endif()
@@ -34,10 +35,3 @@ add_executable(llava-cli llava-cli.cpp)
install(TARGETS llava-cli RUNTIME)
target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(llava PRIVATE cxx_std_11)
if(LLAMA_CUBLAS)
add_definitions(-DCLIP_USE_CUBLAS)
endif()
if(LLAMA_METAL)
add_definitions(-DCLIP_USE_METAL)
endif()

examples/llava/clip.cpp

@@ -18,19 +18,17 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#ifdef CLIP_USE_CUBLAS
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif
#ifdef CLIP_USE_METAL
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define CLIP_DEBUG
static std::string format(const char * fmt, ...) {
va_list ap;
va_list ap2;
@@ -299,13 +297,11 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
free(zero_mem);
}
struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size);
ggml_allocr_alloc(ctx->compute_alloc, temp);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1],
embeddings->nb[2], embeddings->nb[3], 0);
embeddings =
ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_allocr_alloc(ctx->compute_alloc, positions);
@@ -446,7 +442,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
// read and create ggml_context containing the tensors and their data
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
struct ggml_context * meta = NULL;
struct gguf_init_params params = {
@@ -511,19 +506,19 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
buffer_size += n_tensors * 128 /* CLIP PADDING */;
clip_ctx * new_clip = new clip_ctx;
#ifdef CLIP_USE_CUBLAS
#ifdef GGML_USE_CUBLAS
new_clip->backend = ggml_backend_cuda_init(0);
printf("CLIP using CUDA backend\n");
printf("%s: CLIP using CUDA backend\n", __func__);
#endif
#ifdef CLIP_USE_METAL
#ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init();
printf("CLIP using Metal backend\n");
printf("%s: CLIP using Metal backend\n", __func__);
#endif
if(!new_clip->backend) {
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
printf("CLIP using CPU backend\n");
printf("%s: CLIP using CPU backend\n", __func__);
}
// model size and capabilities
@@ -604,7 +599,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
int num_bytes = ggml_nbytes(cur);
if (ggml_backend_is_cpu(new_clip->backend)
#ifdef CLIP_USE_METAL
#ifdef GGML_USE_METAL
|| ggml_backend_is_metal(new_clip->backend)
#endif
) {
@@ -692,7 +687,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// measure mem requirement and allocate
{
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
static const size_t tensor_alignment = 32;
clip_image_f32_batch batch;
batch.size = 1;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
@@ -895,7 +889,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
}
#ifdef CLIP_USE_METAL
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(ctx->backend)) {
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
}
@@ -1078,8 +1072,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
gguf_free(ctx_out);
{
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
int64_t sum_all = 0;
for (size_t i = 0; i < hist_all.size(); ++i) {