Add fixes to reduce the number of warnings

2024-01-27 21:17:15 +01:00 · 2024-01-27 21:17:15 +01:00 · c3fe181c44
commit c3fe181c44
parent 6db2b41a76
24 changed files with 145 additions and 308 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1056,7 +1056,7 @@ std::string get_system_info(const gpt_params & params) {
 }

 std::string gpt_random_prompt(std::mt19937 & rng) {
-    const int r = rng() % 10;
+    const int r = static_cast<int>(rng() % 10);
    switch (r) {
        case 0: return "So";
        case 1: return "Once upon a time";
@ -1700,7 +1700,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
            if (cs_curr[j] < 0) { continue; }
            if (seqs.find(cs_curr[j]) == seqs.end()) {
                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                seqs[cs_curr[j]] = seqs.size();
+                auto current_size = seqs.size();
+                seqs[cs_curr[j]] = current_size;
            }
        }
        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
--- a/common/console.cpp
+++ b/common/console.cpp
@ -250,7 +250,7 @@ namespace console {
            return expectedWidth;
        }
        COORD initialPosition = bufferInfo.dwCursorPosition;
-        DWORD nNumberOfChars = length;
+        DWORD nNumberOfChars = static_cast<DWORD>(length);
        WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);

        CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
@ -404,7 +404,7 @@ namespace console {
                    } while (count == 0 && !widths.empty());
                }
            } else {
-                int offset = line.length();
+                int offset = static_cast<int>(line.length());
                append_utf8(input_char, line);
                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
                if (width < 0) {
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -73,7 +73,7 @@ llama_token llama_sampling_last(llama_sampling_context * ctx) {
 }

 std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
-    const int size = ctx_sampling->prev.size();
+    const int size = static_cast<int>(ctx_sampling->prev.size());

    n = std::min(n, size);

--- a/common/train.cpp
+++ b/common/train.cpp
@ -46,18 +46,12 @@ void free_train_state(struct train_state  * state) {
 struct random_normal_distribution * init_random_normal_distribution(
    int seed, float mean, float std, float min, float max
 ) {
-    struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution));
-    rnd->gen = std::mt19937(seed);
-    rnd->rd = std::normal_distribution<float>{mean, std};
-    rnd->min = min;
-    rnd->max = max;
+    auto rnd = new random_normal_distribution{ std::mt19937(seed), std::normal_distribution<float>{mean, std}, min, max};
    return rnd;
 }

 struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) {
-    struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution));
-    rnd->gen = std::mt19937(seed);
-    rnd->rd = std::uniform_real_distribution<float>{min, max};
+    auto rnd = new random_uniform_distribution{ std::mt19937(seed), std::uniform_real_distribution<float>{min, max} };
    return rnd;
 }

--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -70,7 +70,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(model, params.prompt, true);

-    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
+    const int n_kv_req = static_cast<int>(tokens_list.size() + (n_len - tokens_list.size())*n_parallel);

    // initialize the context

@ -112,11 +112,11 @@ int main(int argc, char ** argv) {

    // create a llama_batch
    // we use this object to submit token data for decoding
-    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
+    llama_batch batch = llama_batch_init(std::max(static_cast<int32_t>(tokens_list.size()), n_parallel), 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); ++i) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+        llama_batch_add(batch, tokens_list[i], static_cast<llama_pos>(i), { 0 }, false);
    }
    GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@ -160,12 +160,12 @@ int main(int argc, char ** argv)

    int n_past = 0;

-    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
+    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), static_cast<int32_t>(tokens_list.size()), n_past, 0)))
    {
        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
        return 1;
    }
-    n_past += tokens_list.size();
+    n_past += static_cast<int>(tokens_list.size());

    beam_search_callback_data callback_data{ctx, {}};
    size_t const beam_width = static_cast<size_t>(params.n_beams);
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -76,7 +76,7 @@ static T stdev(const std::vector<T> & v) {
    }
    T mean = avg(v);
    T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
-    T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
+    T stdev = static_cast<T>(std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)));
    return stdev;
 }

--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -98,7 +98,6 @@ static std::string format(const char * fmt, ...) {

 enum projector_type {
    PROJECTOR_TYPE_MLP,
-    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_UNKNOWN,
 };
@ -305,18 +304,10 @@ struct clip_vision_model {
    struct ggml_tensor * projection;

    // LLaVA projection
-    struct ggml_tensor * mm_0_w = NULL;
-    struct ggml_tensor * mm_0_b = NULL;
-    struct ggml_tensor * mm_2_w = NULL;
-    struct ggml_tensor * mm_2_b = NULL;
-
-    // Yi type models with mlp+normalization projection
-    struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
-    struct ggml_tensor * mm_1_b = NULL;
-    struct ggml_tensor * mm_3_w = NULL;
-    struct ggml_tensor * mm_3_b = NULL;
-    struct ggml_tensor * mm_4_w = NULL;
-    struct ggml_tensor * mm_4_b = NULL;
+    struct ggml_tensor * mm_0_w;
+    struct ggml_tensor * mm_0_b;
+    struct ggml_tensor * mm_2_w;
+    struct ggml_tensor * mm_2_b;

    // MobileVLM projection
    struct ggml_tensor * mm_model_mlp_1_w;
@ -390,7 +381,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    //const int n_intermediate = hparams.n_intermediate;
    //const int projection_dim = hparams.projection_dim;
    const float eps = hparams.eps;
-    int batch_size = imgs->size;
+    int batch_size = static_cast<int>(imgs->size);
    if (ctx->has_llava_projector) {
        GGML_ASSERT(batch_size == 1);
    }
@ -469,7 +460,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    // pre-layernorm
    {
        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "pre_ln");

        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
    }
@ -585,27 +575,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-        } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
-            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
-            // First LayerNorm
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
-                                model.mm_1_b);
-
-            // GELU activation
-            embeddings = ggml_gelu(ctx0, embeddings);
-
-            // Second linear layer
-            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
-
-            // Second LayerNorm
-            embeddings = ggml_norm(ctx0, embeddings, eps);
-            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
-                                model.mm_4_b);
        }
        else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projector
@ -638,7 +607,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                // hardswish
                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);

-                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, static_cast<int>(block_1_hw->ne[0]), static_cast<int>(block_1_hw->ne[1]),
+                    static_cast<int>(block_1_hw->ne[0]), static_cast<int>(block_1_hw->ne[1]), 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
@ -652,7 +622,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);

-                int w = block_1->ne[0], h = block_1->ne[1];
+                int w = static_cast<int>(block_1->ne[0]);
+                int h = static_cast<int>(block_1->ne[1]);
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));

@ -686,7 +657,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);

                // not sure the parameters is right for globalAvgPooling
-                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, static_cast<int>(block_1_hw->ne[0]), static_cast<int>(block_1_hw->ne[1]),
+                    static_cast<int>(block_1_hw->ne[0]), static_cast<int>(block_1_hw->ne[1]), 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
@ -701,7 +673,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);

-                int w = block_1->ne[0], h = block_1->ne[1];
+                int w = static_cast<int>(block_1->ne[0]);
+                int h = static_cast<int>(block_1->ne[1]);
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
@ -839,11 +812,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        else {
            new_clip->proj_type = PROJECTOR_TYPE_MLP;
        }
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
-            if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
-                new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
-            }
-        }
    }

 #ifdef GGML_USE_CUBLAS
@ -938,7 +906,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                clip_free(new_clip);
                return nullptr;
            }
-            int num_bytes = ggml_nbytes(cur);
+            int num_bytes = static_cast<int>(ggml_nbytes(cur));
            if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
                // for the CPU and Metal backend, we can read directly into the tensor
                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
@ -992,29 +960,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));

        // LLaVA projection
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
            vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
            vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            try {
-                // Yi-type llava
-                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
-                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-            } catch (std::runtime_error & e) {  }
-            try {
-                // missing in Yi-type llava
-                vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-                vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-            } catch (std::runtime_error & e) {  }
-            try {
-                // Yi-type llava
-                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
-                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-            } catch (std::runtime_error & e) {  }
-            try {
-                // Yi-type llava
-                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
-                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-            } catch (std::runtime_error & e) {  }
+            vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+            vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
        }
        else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
@ -1124,7 +1074,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {

 bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
    int nx, ny, nc;
-    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
+    auto * data = stbi_load_from_memory(bytes, static_cast<int>(bytes_length), &nx, &ny, &nc, 3);
    if (!data) {
        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
        return false;
@ -1224,7 +1174,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli

                const float v = v0 * (1.0f - dy) + v1 * dy;

-                const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
+                const uint8_t v2 = static_cast<std::uint8_t>(std::min(std::max(std::round(v), 0.0f), 255.0f));

                const int i = 3 * (y * nx3 + x) + c;

@ -1262,7 +1212,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        return false;
    }

-    int batch_size = imgs->size;
+    int batch_size = static_cast<int>(imgs->size);
    if(ctx->has_llava_projector) {
        GGML_ASSERT(batch_size == 1); // TODO: support multiple images
    }
@ -1392,34 +1342,34 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i

            switch (new_type) {
                case GGML_TYPE_Q4_0: {
-                    new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q4_0(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q4_1: {
-                    new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q4_1(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q5_0: {
-                    new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q5_0(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q5_1: {
-                    new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q5_1(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q8_0: {
-                    new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q8_0(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q2_K: {
-                    new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q2_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q3_K: {
-                    new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q3_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q4_K: {
-                    new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q4_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q5_K: {
-                    new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q5_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                case GGML_TYPE_Q6_K: {
-                    new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
+                    new_size = ggml_quantize_q6_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
                } break;
                default: {
                    fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
@ -1482,12 +1432,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i

 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
-        return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
+        return static_cast<int>(ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]);
    }
    else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
-        return ctx->vision_model.mm_2_b->ne[0];
-    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
-        return ctx->vision_model.mm_3_b->ne[0];
+        return static_cast<int>(ctx->vision_model.mm_2_b->ne[0]);
    }
    else {
        std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@ -85,7 +85,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
    auto img_bytes = std::vector<unsigned char>(required_bytes);
    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());

-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
+    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), static_cast<int>(img_bytes.size()));
    if (!embed) {
        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
        return NULL;
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@ -29,7 +29,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    }

    const int64_t t_img_enc_end_us = ggml_time_us();
-    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
+    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0f;

    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

@ -51,7 +51,6 @@ static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_thre
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
-        free(image_embd);
        return false;
    }

@ -104,6 +103,10 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct cl

    clip_image_u8_free(img);
    auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    if (!result) {
+        fprintf(stderr, "%s: unable to allocate memory\n", __func__);
+        return NULL;
+    }
    result->embed = image_embed;
    result->n_image_pos = n_image_pos;
    return result;
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -88,7 +88,7 @@ int main(int argc, char ** argv) {

    fflush(stderr);

-    const int n_input = inp.size();
+    const int n_input = static_cast<int>(inp.size());

    const auto t_enc_start = ggml_time_us();

@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
    int n_predict = 0;
    int n_accept  = 0;

-    int n_past = inp.size();
+    int n_past = static_cast<int>(inp.size());

    llama_token id = 0;

@ -362,7 +362,7 @@ int main(int argc, char ** argv) {
                if (v == 0) {
                    // sample from the last level
                    for (int i = 0; i < W; i++) {
-                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, static_cast<int>(ngrams_cur.size()*(N-1) + W*(N - 2) + i));
                    }
                } else {
                    for (int i = 0; i < W; i++) {
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -60,7 +60,7 @@ int main(int argc, char ** argv){

    fflush(stderr);

-    const int n_input = inp.size();
+    const int n_input = static_cast<int>(inp.size());

    const auto t_enc_start = ggml_time_us();

@ -73,7 +73,7 @@ int main(int argc, char ** argv){
    int n_drafted = 0;
    int n_accept  = 0;

-    int n_past = inp.size();
+    int n_past = static_cast<int>(inp.size());

    bool has_eos = false;

@ -160,7 +160,7 @@ int main(int argc, char ** argv){

        // generate n_pred tokens through prompt lookup
        auto prompt_lookup = [&]() -> void {
-            int inp_size = inp.size();
+            int inp_size = static_cast<int>(inp.size());
            for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
                const llama_token * ngram = &inp[inp_size - ngram_size];

--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -156,13 +156,13 @@ int main(int argc, char ** argv) {
    std::vector<client> clients(n_clients);
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
-        client.id = i;
+        client.id = static_cast<int32_t>(i);
        client.ctx_sampling = llama_sampling_init(params.sparams);
    }

    std::vector<llama_token> tokens_system;
    tokens_system = ::llama_tokenize(ctx, k_system, true);
-    const int32_t n_tokens_system = tokens_system.size();
+    const int32_t n_tokens_system = static_cast<int32_t>(tokens_system.size());

    llama_seq_id g_seq_id = 0;

@ -254,7 +254,7 @@ int main(int argc, char ** argv) {
                    tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);

                    for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
+                        llama_batch_add(batch, tokens_prompt[i], static_cast<llama_pos>(i + n_tokens_system), { client.id }, false);
                    }

                    // extract the logits only for the last token
@ -262,7 +262,7 @@ int main(int argc, char ** argv) {
                        batch.logits[batch.n_tokens - 1] = true;
                    }

-                    client.n_prompt  = tokens_prompt.size();
+                    client.n_prompt  = static_cast<int32_t>(tokens_prompt.size());
                    client.n_decoded = 0;
                    client.i_batch   = batch.n_tokens - 1;

--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@ -42,7 +42,7 @@ int main(int argc, char ** argv) {
    }

    if (seed == -1) {
-        seed = time(NULL);
+        seed = static_cast<int>(time(NULL));
    }

    srand(seed);
@ -110,9 +110,9 @@ int main(int argc, char ** argv) {
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

    // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+    const int n_tokens_prefix = static_cast<int>(::llama_tokenize(ctx, prompt_prefix, true).size());

-    const int n_tokens_all = tokens_list.size();
+    const int n_tokens_all = static_cast<int>(tokens_list.size());

    // we leave a margin of 16 tokens for the generated text - it should contain just the passkey
    const int n_predict = 16;
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -1090,7 +1090,6 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

-    std::vector<float> tok_logits(n_vocab);
    std::vector<float> batch_logits(n_vocab*n_ctx);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -20,7 +20,7 @@ int main(int argc, char ** argv) {
        params.n_predict = 16;
    }

-    auto n_past = 0;
+    llama_pos n_past = 0;

    std::string result0;
    std::string result1;
@ -39,8 +39,8 @@ int main(int argc, char ** argv) {
    auto tokens = llama_tokenize(ctx, params.prompt, true);

    // evaluate prompt
-    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
-    n_past += tokens.size();
+    llama_decode(ctx, llama_batch_get_one(tokens.data(), static_cast<llama_pos>(tokens.size()), n_past, 0));
+    n_past += static_cast<int>(tokens.size());

    // save state (rng, logits, embedding and kv_cache) to file
    {
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -184,12 +184,6 @@ struct llama_client_slot
    struct llama_sampling_params sparams;
    llama_sampling_context *ctx_sampling = nullptr;

-    int32_t ga_i = 0;   // group-attention state
-    int32_t ga_n = 1;// group-attention factor
-    int32_t ga_w = 512; // group-attention width
-
-    int32_t n_past_se = 0; // self-extend
-
    // multimodal
    std::vector<slot_image> images;

@ -218,8 +212,7 @@ struct llama_client_slot
        sent_count             = 0;
        sent_token_probs_index = 0;
        infill                 = false;
-        ga_i                   = 0;
-        n_past_se  = 0;
+
        generated_token_probs.clear();

        for (slot_image & img : images)
@ -406,26 +399,9 @@ struct llama_server_context

            slot.id = i;
            slot.n_ctx = n_ctx_slot;
-
-            LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
-
-            const int ga_n = params.grp_attn_n;
-            const int ga_w = params.grp_attn_w;
-
-            if (ga_n != 1) {
-                GGML_ASSERT(ga_n > 0                    && "ga_n must be positive");                     // NOLINT
-                GGML_ASSERT(ga_w % ga_n == 0            && "ga_w must be a multiple of ga_n");     // NOLINT
-                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
-                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-                LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
-            }
-
-            slot.ga_i = 0;
-            slot.ga_n = ga_n;
-            slot.ga_w = ga_w;
-
            slot.reset();

+            LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
            slots.push_back(slot);
        }

@ -660,7 +636,7 @@ struct llama_server_context
                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());

                    slot_image img_sl;
-                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
+                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : static_cast<int>(slot->images.size());
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
@ -704,6 +680,7 @@ struct llama_server_context
                                    return false;
                                }
                            } catch (const std::invalid_argument& e) {
+                                static_cast<void>(e);
                                LOG_TEE("Invalid image number id in prompt\n");
                                slot->images.clear();
                                return false;
@ -759,7 +736,7 @@ struct llama_server_context
        // assign the system KV cache to all parallel sequences
        for (int32_t i = 1; i < params.n_parallel; ++i)
        {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            llama_kv_cache_seq_cp(ctx, 0, i, 0, static_cast<llama_pos>(system_tokens.size()));
        }

        LOG_TEE("system prompt updated\n");
@ -1245,12 +1222,12 @@ struct llama_server_context

    void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
    {
-        int prompt_count = multiprompt_task.data.at("prompt").size();
+        std::size_t prompt_count = multiprompt_task.data.at("prompt").size();
        assert(prompt_count > 1);

        // generate all the ID for subtask
        std::vector<int> subtask_ids(prompt_count);
-        for (int i = 0; i < prompt_count; i++)
+        for (std::size_t i = 0; i < prompt_count; i++)
        {
            subtask_ids[i] = queue_tasks.get_new_id();
        }
@ -1259,7 +1236,7 @@ struct llama_server_context
        queue_tasks.add_multitask(multitask_id, subtask_ids);

        // add subtasks
-        for (int i = 0; i < prompt_count; i++)
+        for (std::size_t i = 0; i < prompt_count; i++)
        {
            json subtask_data = multiprompt_task.data;
            subtask_data["prompt"] = subtask_data["prompt"][i];
@ -1373,35 +1350,32 @@ struct llama_server_context

        for (llama_client_slot &slot : slots)
        {
-            if (slot.ga_n == 1)
+            if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
            {
-                if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
+                // Shift context
+                const int n_left    = slot.n_past - slot.params.n_keep - 1;
+                const int n_discard = n_left / 2;
+
+                LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
+                llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
+                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
+
+                for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
                {
-                    // Shift context
-                    const int n_left    = slot.n_past - slot.params.n_keep - 1;
-                    const int n_discard = n_left / 2;
-
-                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                    llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                    llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
-
-                    for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
-                    {
-                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                    }
-
-                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                    slot.n_past -= n_discard;
-
-                    slot.truncated = true;
-
-                    LOG_VERBOSE("context shift", {
-                        { "n_ctx", n_ctx },
-                        { "n_keep", params.n_keep },
-                        { "n_left", n_left },
-                    });
+                    slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
                }
+
+                slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+
+                slot.n_past -= n_discard;
+
+                slot.truncated = true;
+
+                LOG_VERBOSE("context shift", {
+                                                {"n_ctx",  n_ctx},
+                                                {"n_keep", params.n_keep},
+                                                {"n_left", n_left},
+                                            });
            }
        }

@ -1428,8 +1402,7 @@ struct llama_server_context

            slot.i_batch = batch.n_tokens;

-            const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            llama_batch_add(batch, slot.sampled, static_cast<llama_pos>(system_tokens.size() + slot.n_past), { slot.id }, true);

            slot.n_past += 1;
        }
@ -1491,7 +1464,7 @@ struct llama_server_context
                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
                    }

-                    slot.num_prompt_tokens = prompt_tokens.size();
+                    slot.num_prompt_tokens = static_cast<int32_t>(prompt_tokens.size());

                    if (slot.params.n_keep < 0)
                    {
@ -1518,7 +1491,7 @@ struct llama_server_context
                        slot.truncated = true;
                        prompt_tokens = new_tokens;

-                        slot.num_prompt_tokens = prompt_tokens.size();
+                        slot.num_prompt_tokens = static_cast<int32_t>(prompt_tokens.size());
                        GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
                    }

@ -1527,8 +1500,6 @@ struct llama_server_context
                        llama_sampling_reset(slot.ctx_sampling);

                        slot.n_past = 0;
-                        slot.n_past_se = 0;
-                        slot.ga_i = 0;
                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
                    }
                    else
@ -1539,34 +1510,15 @@ struct llama_server_context
                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
                        }

-                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+                        slot.n_past = static_cast<int32_t>(common_part(slot.cache_tokens, prompt_tokens));
                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;

-                        if (slot.ga_n != 1)
-                        {
-                            int ga_i = 0;
-                            int32_t ga_n = slot.ga_n;
-                            int32_t ga_w = slot.ga_w;
-                            int32_t slot_npast = 0;
-                            for (int k = 0; k < slot.n_past; ++k)
-                            {
-                                while (slot_npast >= ga_i + ga_w) {
-                                    const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                    slot_npast -= bd;
-                                    ga_i += ga_w/ga_n;
-                                }
-                                slot_npast++;
-                            }
-                            slot.n_past_se = slot_npast;
-                            slot.ga_i = ga_i;
-                        }
-
                        LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                    }

                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);

-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+                    llama_kv_cache_seq_rm(ctx, slot.id, static_cast<llama_pos>(system_tokens.size() + slot.n_past), -1);

                    slot.cache_tokens = prompt_tokens;

@ -1575,10 +1527,6 @@ struct llama_server_context
                        // we have to evaluate at least 1 token to generate logits.
                        LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
                        slot.n_past--;
-                        if (slot.ga_i > 0)
-                        {
-                            slot.n_past_se--;
-                        }
                    }

                    LOG_VERBOSE("prompt ingested", {
@ -1591,22 +1539,9 @@ struct llama_server_context

                    // process the prefix of first image
                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
-                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-                    int ga_i = slot.ga_i;
-                    int32_t ga_n = slot.ga_n;
-                    int32_t ga_w = slot.ga_w;
                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
                    {
-                        if (slot.ga_n != 1)
-                        {
-                            while (slot_npast >= ga_i + ga_w) {
-                                const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                slot_npast -= bd;
-                                ga_i += ga_w/ga_n;
-                            }
-                        }
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
-                        slot_npast += 1;
+                       llama_batch_add(batch, prefix_tokens[slot.n_past], static_cast<llama_pos>(system_tokens.size() + slot.n_past), { slot.id }, false);
                    }

                    if (has_images && !ingest_images(slot, n_batch))
@ -1636,36 +1571,6 @@ struct llama_server_context
        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
        {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
-            for (auto & slot : slots)
-            {
-                if (slot.ga_n != 1)
-                {
-                    // context extension via Self-Extend
-                    while (slot.n_past_se >= slot.ga_i + slot.ga_w)
-                    {
-                        const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
-                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
-                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
-
-                        LOG_TEE("\n");
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
-
-                        llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
-                        llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
-
-                        slot.n_past_se -= bd;
-
-                        slot.ga_i += slot.ga_w / slot.ga_n;
-
-                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
-                    }
-                    slot.n_past_se += n_tokens;
-                }
-            }
            llama_batch batch_view =
            {
                n_tokens,
@ -1679,7 +1584,6 @@ struct llama_server_context
            };

            const int ret = llama_decode(ctx, batch_view);
-
            if (ret != 0)
            {
                if (n_batch == 1 || ret < 0)
@ -1825,8 +1729,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -gan N, --grp-attn-n N    Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
-    printf("  -gaw N, --grp-attn-w N    Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
    printf("\n");
 }

@ -2012,25 +1914,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_threads = std::stoi(argv[i]);
        }
-        else if (arg == "--grp-attn-n" || arg == "-gan")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_n = std::stoi(argv[i]);
-        }
-        else if (arg == "--grp-attn-w" || arg == "-gaw")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_w = std::stoi(argv[i]);
-        }
        else if (arg == "--threads-batch" || arg == "-tb")
        {
            if (++i >= argc)
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -426,7 +426,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
    int j = 0;
    int in_ = 0;

-    int in_len = encoded_string.size();
+    int in_len = static_cast<int>(encoded_string.size());

    uint8_t char_array_4[4];
    uint8_t char_array_3[3];
@ -440,7 +440,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
        {
            for (i = 0; i <4; i++)
            {
-                char_array_4[i] = base64_chars.find(char_array_4[i]);
+                char_array_4[i] = static_cast<uint8_t>(base64_chars.find(char_array_4[i]));
            }

            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
@ -464,7 +464,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str

        for (j = 0; j <4; j++)
        {
-            char_array_4[j] = base64_chars.find(char_array_4[j]);
+            char_array_4[j] = static_cast<uint8_t>(base64_chars.find(char_array_4[j]));
        }

        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -67,8 +67,8 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int n_ctx    = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+    const int n_ctx    = static_cast<int>(llama_n_ctx(ctx));
+    const int n_kv_req = static_cast<int>(tokens_list.size() + (n_len - tokens_list.size()));

    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);

@ -96,7 +96,7 @@ int main(int argc, char ** argv) {

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+        llama_batch_add(batch, tokens_list[i], static_cast<llama_pos>(i), { 0 }, false);
    }

    // llama_decode will output logits only for the last token of the prompt
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -131,7 +131,7 @@ int main(int argc, char ** argv) {

    fflush(stderr);

-    const int n_input = inp.size();
+    const int n_input = static_cast<int>(inp.size());

    const auto t_enc_start = ggml_time_us();

@ -152,8 +152,8 @@ int main(int argc, char ** argv) {
    int n_drafted = 0;
    int n_accept  = 0;

-    int n_past_tgt = inp.size();
-    int n_past_dft = inp.size();
+    int n_past_tgt = static_cast<int>(inp.size());
+    int n_past_dft = static_cast<int>(inp.size());

    // used to determine end of generation
    bool has_eos = false;
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -58,6 +58,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));

    GGML_ASSERT(iface.get_base != NULL);
+    GGML_ASSERT(buffer != NULL);

    (*buffer) = (struct ggml_backend_buffer) {
        /* .interface = */ iface,
@ -647,6 +648,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+    GGML_ASSERT(cpu_plan != NULL);

    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
@ -723,12 +725,14 @@ static struct ggml_backend_i cpu_backend_i = {
 ggml_backend_t ggml_backend_cpu_init(void) {
    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
    
+    GGML_ASSERT(ctx != NULL);
    ctx->n_threads = GGML_DEFAULT_N_THREADS;
    ctx->work_data = NULL;
    ctx->work_size = 0;

    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));

+    GGML_ASSERT(cpu_backend != NULL);
    *cpu_backend = (struct ggml_backend) {
        /* .interface = */ cpu_backend_i,
        /* .context   = */ ctx
@ -1403,6 +1407,7 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back

    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

+    GGML_ASSERT(sched != NULL);
    // initialize hash table
    sched->hash_set    = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
    sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
@ -1602,6 +1607,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
        /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
    };
    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
+    GGML_ASSERT(node_copies != NULL);
    bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);

    struct ggml_init_params params = {
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -674,7 +674,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict
        for (int j = 0; j < QK8_0; ++j) {
            const float x0 = x[i*QK8_0 + j]*id;

-            y[i].qs[j] = roundf(x0);
+            y[i].qs[j] = (int8_t)roundf(x0);
        }
    }
 }
@ -892,8 +892,8 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
            const float v0 = x[i*QK8_1           + j]*id;
            const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id;

-            y[i].qs[          j] = roundf(v0);
-            y[i].qs[QK8_1/2 + j] = roundf(v1);
+            y[i].qs[          j] = (int8_t)roundf(v0);
+            y[i].qs[QK8_1/2 + j] = (int8_t)roundf(v1);

            sum += y[i].qs[          j];
            sum += y[i].qs[QK8_1/2 + j];
@ -8641,6 +8641,7 @@ void iq2xs_init_impl(int grid_size) {

    printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
    uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
+    assert(the_grid);
    for (int k = 0; k < grid_size; ++k) {
        int8_t * pos = (int8_t *)(the_grid + k);
        for (int i = 0; i < 8; ++i) {
@ -8864,7 +8865,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
                float sumqx = 0, sumq2 = 0;
                for (int i = 0; i < 32; ++i) {
                    float w = weight[i];
-                    float q = 2*Laux[i] + 1;
+                    float q = 2*Laux[i] + 1.f;
                    sumqx += w*xval[i]*q;
                    sumq2 += w*q*q;
                }
@ -8897,7 +8898,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
                float sumqx = 0, sumq2 = 0;
                for (int i = 0; i < 32; ++i) {
                    float w = weight[i];
-                    float q = 2*L[i] + 1;
+                    float q = 2*L[i] + 1.f;
                    sumqx += w*xval[i]*q;
                    sumq2 += w*q*q;
                }
@ -9085,7 +9086,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
                float sumqx = 0, sumq2 = 0;
                for (int i = 0; i < 16; ++i) {
                    float w = weight[i];
-                    float q = 2*Laux[i] + 1;
+                    float q = 2*Laux[i] + 1.f;
                    sumqx += w*xval[i]*q;
                    sumq2 += w*q*q;
                }
@ -9117,7 +9118,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
                float sumqx = 0, sumq2 = 0;
                for (int i = 0; i < 16; ++i) {
                    float w = weight[i];
-                    float q = 2*L[i] + 1;
+                    float q = 2*L[i] + 1.f;
                    sumqx += w*xval[i]*q;
                    sumq2 += w*q*q;
                }
--- a/llama.cpp
+++ b/llama.cpp
@ -10992,15 +10992,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
            if (llama_is_normal_token(model->vocab, token)) {
                std::string result = model->vocab.id_to_token[token].text;
                llama_unescape_whitespace(result);
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
+                if (length < static_cast<int32_t>(result.length())) {
+                    return -static_cast<int32_t>(result.length());
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
            } else if (llama_is_user_defined_token(model->vocab, token)) {
                std::string result = model->vocab.id_to_token[token].text;
-                if (length < (int) result.length()) {
-                    return -result.length();
+                if (length < static_cast<int32_t>(result.length())) {
+                    return -static_cast<int32_t>(result.length());
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
@ -11027,15 +11027,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
            if (llama_is_normal_token(model->vocab, token)) {
                std::string result = model->vocab.id_to_token[token].text;
                result = llama_decode_text(result);
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
+                if (length < static_cast<int32_t>(result.length())) {
+                    return -static_cast<int32_t>(result.length());
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
            } else if (llama_is_user_defined_token(model->vocab, token)) {
                std::string result = model->vocab.id_to_token[token].text;
-                if (length < (int) result.length()) {
-                    return -result.length();
+                if (length < static_cast<int32_t>(result.length())) {
+                    return -static_cast<int32_t>(result.length());
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -63,7 +63,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
                im = nullptr;
            }
        }
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
+        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, static_cast<int>(size/tensor->ne[0]),
+            static_cast<int>(tensor->ne[0]), hist, im);
        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
        // This is going to create some weird integers though.
@ -552,7 +553,7 @@ struct test_case {

        // duplicate the op
        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
+        int n_runs = static_cast<int>(std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1);
        for (int i = 1; i < n_runs; i++) {
            gf->nodes[gf->n_nodes++] = out;
        }
@ -583,7 +584,7 @@ struct test_case {
        ggml_backend_graph_compute(backend, gf);
        ggml_backend_synchronize(backend);
        int64_t end_time = ggml_time_us();
-        double time_us = end_time - start_time;
+        double time_us = static_cast<double>(end_time - start_time);

        printf("    %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
            n_runs,
@ -713,7 +714,8 @@ struct test_dup : public test_case {
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
        if (_use_permute) {
-            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+            src = ggml_permute(ctx, src, static_cast<int>(permute[0]), static_cast<int>(permute[1]),
+                static_cast<int>(permute[2]), static_cast<int>(permute[3]));
        }
        ggml_tensor * out = ggml_dup(ctx, src);
        return out;
@ -1239,7 +1241,7 @@ struct test_argsort : public test_case {
                for (int64_t r = 0; r < ggml_nrows(t); r++) {
                    std::vector<float> data(t->ne[0]);
                    for (int i = 0; i < t->ne[0]; i++) {
-                        data[i] = i;
+                        data[i] = static_cast<float>(i);
                    }
                    std::shuffle(data.begin(), data.end(), rng);
                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
@ -1421,7 +1423,7 @@ struct test_moe : public test_case {
        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));
+        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(static_cast<float>(n_embd)));

        // select experts
        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);