From 44c5f7b1ddbf66ac9aaf92569a68924137380955 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 29 Dec 2023 18:44:31 +0200
Subject: [PATCH] llava : fixes

- avoid ggml_repeat
- use GGML_USE_ instead of CLIP_USE_ macros
- remove unused vars
---
 examples/llava/CMakeLists.txt | 10 ++-------
 examples/llava/clip.cpp       | 38 +++++++++++++++--------------------
 2 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index 69c6be3df..2985caff8 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -24,7 +24,8 @@ endif()
 
 if (NOT MSVC)
     target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
-    endif()
+endif()
+
 if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
 endif()
@@ -34,10 +35,3 @@ add_executable(llava-cli llava-cli.cpp)
 install(TARGETS llava-cli RUNTIME)
 target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(llava PRIVATE cxx_std_11)
-
-if(LLAMA_CUBLAS)
-    add_definitions(-DCLIP_USE_CUBLAS)
-endif()
-if(LLAMA_METAL)
-    add_definitions(-DCLIP_USE_METAL)
-endif()
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 2ee14d2e5..f9326a5cc 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -18,19 +18,17 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef CLIP_USE_CUBLAS
+#ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #endif
 
-#ifdef CLIP_USE_METAL
+#ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
-#define CLIP_DEBUG
-
 static std::string format(const char * fmt, ...) {
     va_list ap;
     va_list ap2;
@@ -299,13 +297,11 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
             free(zero_mem);
         }
 
-        struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size);
-        ggml_allocr_alloc(ctx->compute_alloc, temp);
+        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
 
-        embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1],
-                              embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings =
-            ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+        embeddings = ggml_acc(ctx0, embeddings, inp,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
 
         struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
         ggml_allocr_alloc(ctx->compute_alloc, positions);
@@ -446,7 +442,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
 
 // read and create ggml_context containing the tensors and their data
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
-
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
@@ -511,19 +506,19 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     buffer_size += n_tensors * 128 /* CLIP PADDING */;
 
     clip_ctx * new_clip = new clip_ctx;
-#ifdef CLIP_USE_CUBLAS
+#ifdef GGML_USE_CUBLAS
     new_clip->backend = ggml_backend_cuda_init(0);
-    printf("CLIP using CUDA backend\n");
+    printf("%s: CLIP using CUDA backend\n", __func__);
 #endif
 
-#ifdef CLIP_USE_METAL
+#ifdef GGML_USE_METAL
     new_clip->backend = ggml_backend_metal_init();
-    printf("CLIP using Metal backend\n");
+    printf("%s: CLIP using Metal backend\n", __func__);
 #endif
 
-    if(!new_clip->backend) {
+    if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
-        printf("CLIP using CPU backend\n");
+        printf("%s: CLIP using CPU backend\n", __func__);
     }
 
     // model size and capabilities
@@ -604,7 +599,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
         int num_bytes = ggml_nbytes(cur);
         if (ggml_backend_is_cpu(new_clip->backend)
-#ifdef CLIP_USE_METAL
+#ifdef GGML_USE_METAL
                 || ggml_backend_is_metal(new_clip->backend)
 #endif
             ) {
@@ -692,7 +687,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // measure mem requirement and allocate
     {
         new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
-        static const size_t tensor_alignment = 32;
         clip_image_f32_batch batch;
         batch.size = 1;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
@@ -895,7 +889,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
 
-#ifdef CLIP_USE_METAL
+#ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(ctx->backend)) {
         ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
    }
@@ -1078,8 +1072,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
     gguf_free(ctx_out);
 
     {
-        printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        printf("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
 
         int64_t sum_all = 0;
         for (size_t i = 0; i < hist_all.size(); ++i) {
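Note on the "avoid ggml_repeat" change: ggml_acc(ctx, a, b, nb1, nb2, nb3, offset)
adds b into a view of a described by the given byte strides and a starting byte
offset, so the class embedding can be accumulated straight into the embeddings
tensor instead of first materializing a copy with ggml_repeat into a temporary.
Below is a minimal standalone sketch of the pattern, assuming only ggml.h; the
function and tensor names (prepend_row, dst, row, rest) are illustrative, not
from the patch:

    #include "ggml.h"

    // dst : [n_embd, n_pos]      already-allocated destination
    // row : [n_embd]             single vector to place at position 0
    // rest: [n_embd, n_pos - 1]  remaining positions
    static struct ggml_tensor * prepend_row(struct ggml_context * ctx,
                                            struct ggml_tensor  * dst,
                                            struct ggml_tensor  * row,
                                            struct ggml_tensor  * rest) {
        // add `row` at byte offset 0, reusing dst's own strides for the view
        dst = ggml_acc(ctx, dst, row,
                dst->nb[1], dst->nb[2], dst->nb[3], 0);
        // add `rest` one row further in: the offset is row->nb[1] bytes,
        // matching how the patch passes model.class_embedding->nb[1] for inp
        dst = ggml_acc(ctx, dst, rest,
                dst->nb[1], dst->nb[2], dst->nb[3], row->nb[1]);
        return dst;
    }

Compared to the removed code, this drops one temporary tensor
(ggml_new_tensor_3d + ggml_allocr_alloc) and one ggml_repeat node from each
graph build.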