diff --git a/Makefile b/Makefile
index 61981f683..9a8faef45 100644
--- a/Makefile
+++ b/Makefile
@@ -627,8 +627,8 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm
 llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h examples/llava/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/examples/llava/stb_image.h b/common/stb_image.h
similarity index 100%
rename from examples/llava/stb_image.h
rename to common/stb_image.h
diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index 9fb8e441d..9ddb5af5c 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -3,8 +3,9 @@ add_library(${TARGET} clip.cpp clip.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
 endif()
 
 set(TARGET llava)
@@ -13,5 +14,5 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
 endif()
diff --git a/examples/llava/README.md b/examples/llava/README.md
index f4d61414c..fc3446b60 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -7,6 +7,7 @@ and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
 models are available.
 After API is confirmed, more models will be supported / uploaded.
 
+
 ## Usage
 Build with cmake or run `make llava` to build it.
 
@@ -28,16 +29,16 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```
 
-2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
 
 ```sh
-python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
+python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 ```
 
-3. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
 
 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./examples/llava/convert-image-encoder-to-gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```
 
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
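The Makefile and CMake changes above go together: `stb_image.h` is now shared from `common/`, and the targets that include it are compiled with `-Wno-cast-qual`, since the vendored header contains casts that drop qualifiers and would otherwise flood a `-Wcast-qual` build with warnings (the CMake comment attributes the flag to `stb_image.h`). A minimal, standalone sketch of the pattern the flag silences; this is illustrative code, not an excerpt from stb_image.h:

```cpp
#include <cstdio>

// -Wcast-qual fires on any cast that removes a qualifier from the pointee,
// which is what a C-style (unsigned char *) cast on const data does.
static void print_first_byte(const unsigned char * data) {
    unsigned char * mutable_view = (unsigned char *) data; // warns under -Wcast-qual
    std::printf("%d\n", mutable_view[0]);
}

int main() {
    const unsigned char buf[1] = { 42 };
    print_first_byte(buf); // g++ -Wcast-qual demo.cpp     -> warning
    return 0;              // g++ -Wno-cast-qual demo.cpp  -> silent
}
```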
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3d24d736b..f4258b34d 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -97,19 +97,19 @@ static int get_key_idx(const gguf_context * ctx, const char * key) {
     return i;
 }
 
-static const uint32_t get_u32(const gguf_context * ctx, std::string key) {
+static uint32_t get_u32(const gguf_context * ctx, const std::string & key) {
     const int i = get_key_idx(ctx, key.c_str());
 
     return gguf_get_val_u32(ctx, i);
 }
 
-static const float get_f32(const gguf_context * ctx, std::string key) {
+static float get_f32(const gguf_context * ctx, const std::string & key) {
     const int i = get_key_idx(ctx, key.c_str());
 
     return gguf_get_val_f32(ctx, i);
 }
 
-static struct ggml_tensor * get_tensor(struct ggml_context * ctx, std::string name) {
+static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
     struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
     if (!cur) {
         printf("unable to find tensor %s\n", name.c_str());
@@ -123,25 +123,18 @@ static std::string get_ftype(int ftype) {
     switch (ftype) {
     case 0:
         return "f32";
-        break;
     case 1:
         return "f16";
-        break;
     case 2:
         return "q4_0";
-        break;
     case 3:
         return "q4_1";
-        break;
     case 6:
         return "q5_0";
-        break;
     case 7:
         return "q5_1";
-        break;
     case 8:
         return "q8_0";
-        break;
     default:
         throw std::runtime_error(format("Unrecognized file type: %d\n", ftype));
     }
@@ -237,7 +230,6 @@ struct clip_ctx {
 };
 
 static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
-
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -254,15 +246,15 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
     const int n_layer = hparams.n_layer;
-    const int n_intermediate = hparams.n_intermediate;
-    const int projection_dim = hparams.projection_dim;
+    //const int n_intermediate = hparams.n_intermediate;
+    //const int projection_dim = hparams.projection_dim;
     const float eps = hparams.eps;
     int batch_size = imgs->size;
     if(ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1);
     }
 
-    auto & buf_compute = ctx->buf_compute;
+    const auto & buf_compute = ctx->buf_compute;
 
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
@@ -281,9 +273,9 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
     if (!ggml_allocr_is_measure(ctx->alloc)) {
         float * data = (float *)ggml_get_data(inp_raw);
 
-        for (int b = 0; b < imgs->size; b++) {
-            const int nx = imgs->data[b].nx;
-            const int ny = imgs->data[b].ny;
+        for (size_t i = 0; i < imgs->size; i++) {
+            const int nx = imgs->data[i].nx;
+            const int ny = imgs->data[i].ny;
             GGML_ASSERT(nx == image_size && ny == image_size);
 
             const int n = nx * ny;
@@ -339,17 +331,17 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
                                ggml_repeat(ctx0, model.pre_ln_b, embeddings));
     }
 
-struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-ggml_allocr_alloc(ctx->alloc, KQ_scale);
-if (!ggml_allocr_is_measure(ctx->alloc)) {
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(ctx->alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(ctx->alloc)) {
         ggml_set_f32(KQ_scale, 1.0f / sqrt((float)d_head));
-}
+    }
 
     // loop over layers
     for (int il = 0; il < n_layer - 1; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
-        const size_t nb_q_w = model.layers[il].q_w->nb[0];
+        //const size_t nb_q_w = model.layers[il].q_w->nb[0];
 
         // layernorm1
         {
@@ -730,7 +722,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
     uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
 
     // fill with background color
-    for (int i = 0; i < temp.size; i++) {
+    for (size_t i = 0; i < temp.size; i++) {
         temp.data[i] = bc[i % 3];
     }
 
@@ -963,7 +955,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             if (conv_buf.size() < n_elms) {
                 conv_buf.resize(n_elms);
             }
-            for (int j = 0; j < n_elms; ++j) {
+            for (size_t j = 0; j < n_elms; ++j) {
                 conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
             }
             f32_data = (float *)conv_buf.data();
@@ -981,28 +973,28 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
 
                 std::vector<int64_t> hist_cur(1 << 4, 0);
                 switch (new_type) {
-                case GGML_TYPE_Q4_0: {
-                    new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q4_1: {
-                    new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q5_0: {
-                    new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q5_1: {
-                    new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                case GGML_TYPE_Q8_0: {
-                    new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
-                } break;
-                default: {
-                    fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
-                    return false;
-                }
+                    case GGML_TYPE_Q4_0: {
+                        new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    } break;
+                    case GGML_TYPE_Q4_1: {
+                        new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    } break;
+                    case GGML_TYPE_Q5_0: {
+                        new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    } break;
+                    case GGML_TYPE_Q5_1: {
+                        new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    } break;
+                    case GGML_TYPE_Q8_0: {
+                        new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    } break;
+                    default: {
+                        fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
+                        return false;
+                    }
                 }
-                for (int j = 0; j < hist_cur.size(); ++j) {
+                for (size_t j = 0; j < hist_cur.size(); ++j) {
                     hist_all[j] += hist_cur[j];
                 }
             } else {
@@ -1017,7 +1009,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
         fout.write((const char *)new_data, new_size);
         size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
-        for (int j = 0; j < pad; ++j) {
+        for (size_t j = 0; j < pad; ++j) {
             fout.put(0);
         }
 
diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
similarity index 100%
rename from examples/llava/convert_image_encoder_to_gguf.py
rename to examples/llava/convert-image-encoder-to-gguf.py
diff --git a/examples/llava/llava_surgery.py b/examples/llava/llava-surgery.py
similarity index 100%
rename from examples/llava/llava_surgery.py
rename to examples/llava/llava-surgery.py
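Most of the clip.cpp changes above are mechanical warning cleanup: `std::string` parameters become `const` references instead of by-value copies, the meaningless top-level `const` on scalar return types is dropped, unreachable `break` statements after `return` are removed, and loop counters that are compared against unsigned sizes become `size_t`. A standalone sketch of the two recurring patterns; the names here are illustrative, not the real clip API:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Before: static const uint32_t get_u32(..., std::string key)
//   - copies the string on every call
//   - the top-level const on the return value does nothing (and can warn)
// After: pass by const reference, return a plain value.
static uint32_t get_u32_demo(const std::string & key) {
    return static_cast<uint32_t>(key.size()); // stand-in for the real GGUF lookup
}

int main() {
    const std::vector<int> hist = {1, 2, 3};

    // Before: for (int j = 0; ...) compared against hist.size() -> -Wsign-compare.
    // After: size_t matches the unsigned type returned by size().
    for (size_t j = 0; j < hist.size(); ++j) {
        std::printf("%d\n", hist[j]);
    }

    std::printf("%u\n", get_u32_demo("example.key"));
    return 0;
}
```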
diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h
index de17615c7..79e237c86 100644
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@@ -1,12 +1,15 @@
+#pragma once
+
 // this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
 
 #include "common.h"
 #include "llama.h"
 
-bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));
 
     for (int i = 0; i < N; i += n_batch) {
@@ -24,7 +27,7 @@ bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch
     return true;
 }
 
-bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
     int N = (int) tokens.size();
     for (int i = 0; i < N; i += n_batch) {
         int n_eval = (int) tokens.size() - i;
@@ -40,20 +43,21 @@ bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> toke
     return true;
 }
 
-bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
     std::vector<llama_token> tokens;
     tokens.push_back(id);
     return eval_tokens(ctx_llama, tokens, 1, n_past);
 }
 
-bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){
+inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){
    std::string str2 = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, true);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
 }
 
-llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
+// TODO: use common/sampling.h
+inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
     // out of user input, sample next token
     const float temp = params.sampling_params.temp;
     const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
@@ -128,7 +132,7 @@ llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
     return id;
 }
 
-const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
+inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
     int id = sample_id(ctx_llama, params);
     static std::string ret;
     if (id == llama_token_eos(ctx_llama)) {
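The llava-utils.h changes are about making the header safe to include from more than one source file: `#pragma once` guards against double inclusion within a translation unit, and every function defined in the header is marked `inline` so that multiple translation units can each carry a definition without violating the one-definition rule at link time. A hypothetical miniature of the same pattern; the file and function names are made up for illustration:

```cpp
// mini-utils.h - hypothetical header following the same rule as llava-utils.h
#pragma once

#include <string>

// Defined (not just declared) in a header, so it must be `inline`: every .cpp
// that includes the header emits its own copy, and `inline` tells the linker
// to merge those copies instead of failing with a multiple-definition error.
inline std::string greet(const std::string & who) {
    return "hello, " + who;
}
```

As long as only a single .cpp includes such a header the non-inline version happens to link, but marking the helpers `inline` removes that hidden constraint for any future second consumer.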
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 06617062a..14dacc780 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -1,13 +1,13 @@
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
-
 #include "clip.h"
 #include "llava-utils.h"
 #include "common.h"
 #include "llama.h"
 
-static void show_additional_info(int argc, char ** argv) {
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+static void show_additional_info(int /*argc*/, char ** argv) {
     printf("\n example usage: %s -m --mmproj --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
@@ -40,6 +40,7 @@ int main(int argc, char ** argv) {
     // load and preprocess the image
     clip_image_u8 img;
     clip_image_f32 img_res;
+
     if (!clip_image_load_from_file(img_path, &img)) {
         fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
 
@@ -54,8 +55,9 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    int n_img_pos = clip_n_patches(ctx_clip);
-    int n_img_embd = clip_n_mmproj_embd(ctx_clip);
+    int n_img_pos  = clip_n_patches(ctx_clip);
+    int n_img_embd = clip_n_mmproj_embd(ctx_clip);
+
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
 
     if (!image_embd) {
@@ -84,11 +86,13 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_context_params ctx_params = llama_context_default_params();
+    llama_context_params ctx_params = llama_context_default_params();
+
     ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@@ -111,26 +115,35 @@ int main(int argc, char ** argv) {
 
     // process the prompt
     // llava chat format is "USER: \n\nASSISTANT:"
-    int n_past = 0;
+    int n_past = 0;
+
+    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+
+    // GG: are we sure that the should be a trailing whitespace at the end of this string?
     eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past);
     eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
     eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
-eval_string(ctx_llama, "\nASSISTANT:", params.n_batch, &n_past);
+    eval_string(ctx_llama, "\nASSISTANT:", params.n_batch, &n_past);
 
     // generate the response
-    const char* tmp;
-    for (int i=0; i < max_tgt_len; i++) {
-        tmp = sample(ctx_llama, params, &n_past);
-        if (strcmp(tmp, "</s>")==0) break;
+    printf("\n");
+
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(ctx_llama, params, &n_past);
+        if (strcmp(tmp, "</s>") == 0) break;
+
         printf("%s", tmp);
         fflush(stdout);
     }
+
     printf("\n");
 
-    const float img_enc_duration = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
-    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, img_enc_duration, img_enc_duration / n_img_pos);
+    {
+        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
+
+        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
+    }
 
     llama_print_timings(ctx_llama);