From c3fe181c444fe70a25c26f58c06b222ea2c01e5e Mon Sep 17 00:00:00 2001 From: Michael Klimenko Date: Sat, 27 Jan 2024 21:17:15 +0100 Subject: [PATCH] Add fixes to reduce the amount of warnings --- common/common.cpp | 5 +- common/console.cpp | 4 +- common/sampling.cpp | 2 +- common/train.cpp | 10 +- examples/batched/batched.cpp | 6 +- examples/beam-search/beam-search.cpp | 4 +- examples/llama-bench/llama-bench.cpp | 2 +- examples/llava/clip.cpp | 116 ++++------- examples/llava/llava-cli.cpp | 2 +- examples/llava/llava.cpp | 7 +- examples/lookahead/lookahead.cpp | 6 +- examples/lookup/lookup.cpp | 6 +- examples/parallel/parallel.cpp | 8 +- examples/passkey/passkey.cpp | 6 +- examples/perplexity/perplexity.cpp | 1 - examples/save-load-state/save-load-state.cpp | 6 +- examples/server/server.cpp | 191 ++++--------------- examples/server/utils.hpp | 6 +- examples/simple/simple.cpp | 6 +- examples/speculative/speculative.cpp | 6 +- ggml-backend.c | 8 +- ggml-quants.c | 15 +- llama.cpp | 16 +- tests/test-backend-ops.cpp | 14 +- 24 files changed, 145 insertions(+), 308 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6b07f1197..f8fdcfe23 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1056,7 +1056,7 @@ std::string get_system_info(const gpt_params & params) { } std::string gpt_random_prompt(std::mt19937 & rng) { - const int r = rng() % 10; + const int r = static_cast(rng() % 10); switch (r) { case 0: return "So"; case 1: return "Once upon a time"; @@ -1700,7 +1700,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { if (cs_curr[j] < 0) { continue; } if (seqs.find(cs_curr[j]) == seqs.end()) { if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } - seqs[cs_curr[j]] = seqs.size(); + auto current_size = seqs.size(); + seqs[cs_curr[j]] = current_size; } } if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } diff --git a/common/console.cpp b/common/console.cpp index f65cbc6ed..957416fd2 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -250,7 +250,7 @@ namespace console { return expectedWidth; } COORD initialPosition = bufferInfo.dwCursorPosition; - DWORD nNumberOfChars = length; + DWORD nNumberOfChars = static_cast(length); WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL); CONSOLE_SCREEN_BUFFER_INFO newBufferInfo; @@ -404,7 +404,7 @@ namespace console { } while (count == 0 && !widths.empty()); } } else { - int offset = line.length(); + int offset = static_cast(line.length()); append_utf8(input_char, line); int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char)); if (width < 0) { diff --git a/common/sampling.cpp b/common/sampling.cpp index e8675a8c0..8a93b4ecb 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -73,7 +73,7 @@ llama_token llama_sampling_last(llama_sampling_context * ctx) { } std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) { - const int size = ctx_sampling->prev.size(); + const int size = static_cast(ctx_sampling->prev.size()); n = std::min(n, size); diff --git a/common/train.cpp b/common/train.cpp index e6f2f7a2f..bab9bcc0b 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -46,18 +46,12 @@ void free_train_state(struct train_state * state) { struct random_normal_distribution * init_random_normal_distribution( int seed, float mean, float std, float min, float max ) { - struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution)); - rnd->gen = std::mt19937(seed); - rnd->rd = std::normal_distribution{mean, std}; - rnd->min = min; - rnd->max = max; + auto rnd = new random_normal_distribution{ std::mt19937(seed), std::normal_distribution{mean, std}, min, max}; return rnd; } struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) { - struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution)); - rnd->gen = std::mt19937(seed); - rnd->rd = std::uniform_real_distribution{min, max}; + auto rnd = new random_uniform_distribution{ std::mt19937(seed), std::uniform_real_distribution{min, max} }; return rnd; } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index b1775e0b0..87b32ddf2 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -70,7 +70,7 @@ int main(int argc, char ** argv) { std::vector tokens_list; tokens_list = ::llama_tokenize(model, params.prompt, true); - const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; + const int n_kv_req = static_cast(tokens_list.size() + (n_len - tokens_list.size())*n_parallel); // initialize the context @@ -112,11 +112,11 @@ int main(int argc, char ** argv) { // create a llama_batch // we use this object to submit token data for decoding - llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1); + llama_batch batch = llama_batch_init(std::max(static_cast(tokens_list.size()), n_parallel), 0, 1); // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); ++i) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); + llama_batch_add(batch, tokens_list[i], static_cast(i), { 0 }, false); } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 679b382e1..d3335abb5 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -160,12 +160,12 @@ int main(int argc, char ** argv) int n_past = 0; - if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0))) + if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), static_cast(tokens_list.size()), n_past, 0))) { fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); return 1; } - n_past += tokens_list.size(); + n_past += static_cast(tokens_list.size()); beam_search_callback_data callback_data{ctx, {}}; size_t const beam_width = static_cast(params.n_beams); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 97325b5bd..5053d3f52 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -76,7 +76,7 @@ static T stdev(const std::vector & v) { } T mean = avg(v); T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); - T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)); + T stdev = static_cast(std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1))); return stdev; } diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9129052a2..3d43c9d99 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -98,7 +98,6 @@ static std::string format(const char * fmt, ...) { enum projector_type { PROJECTOR_TYPE_MLP, - PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_UNKNOWN, }; @@ -305,18 +304,10 @@ struct clip_vision_model { struct ggml_tensor * projection; // LLaVA projection - struct ggml_tensor * mm_0_w = NULL; - struct ggml_tensor * mm_0_b = NULL; - struct ggml_tensor * mm_2_w = NULL; - struct ggml_tensor * mm_2_b = NULL; - - // Yi type models with mlp+normalization projection - struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 - struct ggml_tensor * mm_1_b = NULL; - struct ggml_tensor * mm_3_w = NULL; - struct ggml_tensor * mm_3_b = NULL; - struct ggml_tensor * mm_4_w = NULL; - struct ggml_tensor * mm_4_b = NULL; + struct ggml_tensor * mm_0_w; + struct ggml_tensor * mm_0_b; + struct ggml_tensor * mm_2_w; + struct ggml_tensor * mm_2_b; // MobileVLM projection struct ggml_tensor * mm_model_mlp_1_w; @@ -390,7 +381,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 //const int n_intermediate = hparams.n_intermediate; //const int projection_dim = hparams.projection_dim; const float eps = hparams.eps; - int batch_size = imgs->size; + int batch_size = static_cast(imgs->size); if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); } @@ -469,7 +460,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // pre-layernorm { embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "pre_ln"); embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); } @@ -585,27 +575,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - - } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); - // First LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), - model.mm_1_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); - - // Second LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), - model.mm_4_b); } else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projector @@ -638,7 +607,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // hardswish struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, static_cast(block_1_hw->ne[0]), static_cast(block_1_hw->ne[1]), + static_cast(block_1_hw->ne[0]), static_cast(block_1_hw->ne[1]), 0, 0); // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] // pointwise conv block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); @@ -652,7 +622,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); block_1 = ggml_mul(ctx0, block_1_hw, block_1); - int w = block_1->ne[0], h = block_1->ne[1]; + int w = static_cast(block_1->ne[0]); + int h = static_cast(block_1->ne[1]); block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); @@ -686,7 +657,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); // not sure the parameters is right for globalAvgPooling - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, static_cast(block_1_hw->ne[0]), static_cast(block_1_hw->ne[1]), + static_cast(block_1_hw->ne[0]), static_cast(block_1_hw->ne[1]), 0, 0); // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] // pointwise conv block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); @@ -701,7 +673,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); block_1 = ggml_mul(ctx0, block_1_hw, block_1); - int w = block_1->ne[0], h = block_1->ne[1]; + int w = static_cast(block_1->ne[0]); + int h = static_cast(block_1->ne[1]); block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] @@ -839,11 +812,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { else { new_clip->proj_type = PROJECTOR_TYPE_MLP; } - if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { - if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { - new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; - } - } } #ifdef GGML_USE_CUBLAS @@ -938,7 +906,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { clip_free(new_clip); return nullptr; } - int num_bytes = ggml_nbytes(cur); + int num_bytes = static_cast(ggml_nbytes(cur)); if (ggml_backend_buffer_is_host(new_clip->params_buffer)) { // for the CPU and Metal backend, we can read directly into the tensor fin.read(reinterpret_cast(cur->data), num_bytes); @@ -992,29 +960,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); // LLaVA projection - if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { + if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); - try { - // Yi-type llava - vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); - vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); - } catch (std::runtime_error & e) { } - try { - // missing in Yi-type llava - vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); - } catch (std::runtime_error & e) { } - try { - // Yi-type llava - vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); - vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); - } catch (std::runtime_error & e) { } - try { - // Yi-type llava - vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); - vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); - } catch (std::runtime_error & e) { } + vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1124,7 +1074,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { int nx, ny, nc; - auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); + auto * data = stbi_load_from_memory(bytes, static_cast(bytes_length), &nx, &ny, &nc, 3); if (!data) { fprintf(stderr, "%s: failed to decode image bytes\n", __func__); return false; @@ -1224,7 +1174,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli const float v = v0 * (1.0f - dy) + v1 * dy; - const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); + const uint8_t v2 = static_cast(std::min(std::max(std::round(v), 0.0f), 255.0f)); const int i = 3 * (y * nx3 + x) + c; @@ -1262,7 +1212,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return false; } - int batch_size = imgs->size; + int batch_size = static_cast(imgs->size); if(ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } @@ -1392,34 +1342,34 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i switch (new_type) { case GGML_TYPE_Q4_0: { - new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q4_0(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q4_1: { - new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q4_1(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q5_0: { - new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q5_0(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q5_1: { - new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q5_1(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q8_0: { - new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q8_0(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q2_K: { - new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q2_K(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q3_K: { - new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q3_K(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q4_K: { - new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q4_K(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q5_K: { - new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q5_K(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; case GGML_TYPE_Q6_K: { - new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + new_size = ggml_quantize_q6_K(f32_data, new_data, static_cast(n_elms), static_cast(cur->ne[0]), hist_cur.data()); } break; default: { fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type); @@ -1482,12 +1432,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + return static_cast(ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]); } else if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - return ctx->vision_model.mm_2_b->ne[0]; - } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; + return static_cast(ctx->vision_model.mm_2_b->ne[0]); } else { std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 6ac70ba69..98dbc1098 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -85,7 +85,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip auto img_bytes = std::vector(required_bytes); base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); - auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); + auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), static_cast(img_bytes.size())); if (!embed) { fprintf(stderr, "%s: could not load image from base64 string.\n", __func__); return NULL; diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index d42e7582e..f7cec1840 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -29,7 +29,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } const int64_t t_img_enc_end_us = ggml_time_us(); - float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; + float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0f; printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); @@ -51,7 +51,6 @@ static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_thre float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)); if (!image_embd) { fprintf(stderr, "Unable to allocate memory for image embeddings\n"); - free(image_embd); return false; } @@ -104,6 +103,10 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct cl clip_image_u8_free(img); auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + if (!result) { + fprintf(stderr, "%s: unable to allocate memory\n", __func__); + return NULL; + } result->embed = image_embed; result->n_image_pos = n_image_pos; return result; diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index e55a15a1b..ba949ff6e 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -88,7 +88,7 @@ int main(int argc, char ** argv) { fflush(stderr); - const int n_input = inp.size(); + const int n_input = static_cast(inp.size()); const auto t_enc_start = ggml_time_us(); @@ -105,7 +105,7 @@ int main(int argc, char ** argv) { int n_predict = 0; int n_accept = 0; - int n_past = inp.size(); + int n_past = static_cast(inp.size()); llama_token id = 0; @@ -362,7 +362,7 @@ int main(int argc, char ** argv) { if (v == 0) { // sample from the last level for (int i = 0; i < W; i++) { - tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i); + tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, static_cast(ngrams_cur.size()*(N-1) + W*(N - 2) + i)); } } else { for (int i = 0; i < W; i++) { diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index d8de7dd38..b19c35c0b 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -60,7 +60,7 @@ int main(int argc, char ** argv){ fflush(stderr); - const int n_input = inp.size(); + const int n_input = static_cast(inp.size()); const auto t_enc_start = ggml_time_us(); @@ -73,7 +73,7 @@ int main(int argc, char ** argv){ int n_drafted = 0; int n_accept = 0; - int n_past = inp.size(); + int n_past = static_cast(inp.size()); bool has_eos = false; @@ -160,7 +160,7 @@ int main(int argc, char ** argv){ // generate n_pred tokens through prompt lookup auto prompt_lookup = [&]() -> void { - int inp_size = inp.size(); + int inp_size = static_cast(inp.size()); for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){ const llama_token * ngram = &inp[inp_size - ngram_size]; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index d2e074d9e..3b1290229 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -156,13 +156,13 @@ int main(int argc, char ** argv) { std::vector clients(n_clients); for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; - client.id = i; + client.id = static_cast(i); client.ctx_sampling = llama_sampling_init(params.sparams); } std::vector tokens_system; tokens_system = ::llama_tokenize(ctx, k_system, true); - const int32_t n_tokens_system = tokens_system.size(); + const int32_t n_tokens_system = static_cast(tokens_system.size()); llama_seq_id g_seq_id = 0; @@ -254,7 +254,7 @@ int main(int argc, char ** argv) { tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false); + llama_batch_add(batch, tokens_prompt[i], static_cast(i + n_tokens_system), { client.id }, false); } // extract the logits only for the last token @@ -262,7 +262,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; } - client.n_prompt = tokens_prompt.size(); + client.n_prompt = static_cast(tokens_prompt.size()); client.n_decoded = 0; client.i_batch = batch.n_tokens - 1; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5c0022832..c4c527269 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -42,7 +42,7 @@ int main(int argc, char ** argv) { } if (seed == -1) { - seed = time(NULL); + seed = static_cast(time(NULL)); } srand(seed); @@ -110,9 +110,9 @@ int main(int argc, char ** argv) { tokens_list = ::llama_tokenize(ctx, params.prompt, true); // tokenize the prefix and use it as a sink - const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size(); + const int n_tokens_prefix = static_cast(::llama_tokenize(ctx, prompt_prefix, true).size()); - const int n_tokens_all = tokens_list.size(); + const int n_tokens_all = static_cast(tokens_list.size()); // we leave a margin of 16 tokens for the generated text - it should contain just the passkey const int n_predict = 16; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 8d2204969..a14a23313 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1090,7 +1090,6 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); - std::vector tok_logits(n_vocab); std::vector batch_logits(n_vocab*n_ctx); std::vector> eval_pairs; diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index ef952e2bd..ba21a50c0 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -20,7 +20,7 @@ int main(int argc, char ** argv) { params.n_predict = 16; } - auto n_past = 0; + llama_pos n_past = 0; std::string result0; std::string result1; @@ -39,8 +39,8 @@ int main(int argc, char ** argv) { auto tokens = llama_tokenize(ctx, params.prompt, true); // evaluate prompt - llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)); - n_past += tokens.size(); + llama_decode(ctx, llama_batch_get_one(tokens.data(), static_cast(tokens.size()), n_past, 0)); + n_past += static_cast(tokens.size()); // save state (rng, logits, embedding and kv_cache) to file { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f58a2acaa..f97e5e415 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -184,12 +184,6 @@ struct llama_client_slot struct llama_sampling_params sparams; llama_sampling_context *ctx_sampling = nullptr; - int32_t ga_i = 0; // group-attention state - int32_t ga_n = 1;// group-attention factor - int32_t ga_w = 512; // group-attention width - - int32_t n_past_se = 0; // self-extend - // multimodal std::vector images; @@ -218,8 +212,7 @@ struct llama_client_slot sent_count = 0; sent_token_probs_index = 0; infill = false; - ga_i = 0; - n_past_se = 0; + generated_token_probs.clear(); for (slot_image & img : images) @@ -406,26 +399,9 @@ struct llama_server_context slot.id = i; slot.n_ctx = n_ctx_slot; - - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); - - const int ga_n = params.grp_attn_n; - const int ga_w = params.grp_attn_w; - - if (ga_n != 1) { - GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT - GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT - //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT - //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); - } - - slot.ga_i = 0; - slot.ga_n = ga_n; - slot.ga_w = ga_w; - slot.reset(); + LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); slots.push_back(slot); } @@ -660,7 +636,7 @@ struct llama_server_context const std::vector image_buffer = base64_decode(img["data"].get()); slot_image img_sl; - img_sl.id = img.count("id") != 0 ? img["id"].get() : slot->images.size(); + img_sl.id = img.count("id") != 0 ? img["id"].get() : static_cast(slot->images.size()); img_sl.img_data = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) { @@ -704,6 +680,7 @@ struct llama_server_context return false; } } catch (const std::invalid_argument& e) { + static_cast(e); LOG_TEE("Invalid image number id in prompt\n"); slot->images.clear(); return false; @@ -759,7 +736,7 @@ struct llama_server_context // assign the system KV cache to all parallel sequences for (int32_t i = 1; i < params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + llama_kv_cache_seq_cp(ctx, 0, i, 0, static_cast(system_tokens.size())); } LOG_TEE("system prompt updated\n"); @@ -1245,12 +1222,12 @@ struct llama_server_context void split_multiprompt_task(int multitask_id, task_server& multiprompt_task) { - int prompt_count = multiprompt_task.data.at("prompt").size(); + std::size_t prompt_count = multiprompt_task.data.at("prompt").size(); assert(prompt_count > 1); // generate all the ID for subtask std::vector subtask_ids(prompt_count); - for (int i = 0; i < prompt_count; i++) + for (std::size_t i = 0; i < prompt_count; i++) { subtask_ids[i] = queue_tasks.get_new_id(); } @@ -1259,7 +1236,7 @@ struct llama_server_context queue_tasks.add_multitask(multitask_id, subtask_ids); // add subtasks - for (int i = 0; i < prompt_count; i++) + for (std::size_t i = 0; i < prompt_count; i++) { json subtask_data = multiprompt_task.data; subtask_data["prompt"] = subtask_data["prompt"][i]; @@ -1373,35 +1350,32 @@ struct llama_server_context for (llama_client_slot &slot : slots) { - if (slot.ga_n == 1) + if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) { - if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) + // Shift context + const int n_left = slot.n_past - slot.params.n_keep - 1; + const int n_discard = n_left / 2; + + LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); + + for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) { - // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; - const int n_discard = n_left / 2; - - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); - - for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) - { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; - } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - - slot.n_past -= n_discard; - - slot.truncated = true; - - LOG_VERBOSE("context shift", { - { "n_ctx", n_ctx }, - { "n_keep", params.n_keep }, - { "n_left", n_left }, - }); + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; } + + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + + slot.n_past -= n_discard; + + slot.truncated = true; + + LOG_VERBOSE("context shift", { + {"n_ctx", n_ctx}, + {"n_keep", params.n_keep}, + {"n_left", n_left}, + }); } } @@ -1428,8 +1402,7 @@ struct llama_server_context slot.i_batch = batch.n_tokens; - const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); + llama_batch_add(batch, slot.sampled, static_cast(system_tokens.size() + slot.n_past), { slot.id }, true); slot.n_past += 1; } @@ -1491,7 +1464,7 @@ struct llama_server_context prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt } - slot.num_prompt_tokens = prompt_tokens.size(); + slot.num_prompt_tokens = static_cast(prompt_tokens.size()); if (slot.params.n_keep < 0) { @@ -1518,7 +1491,7 @@ struct llama_server_context slot.truncated = true; prompt_tokens = new_tokens; - slot.num_prompt_tokens = prompt_tokens.size(); + slot.num_prompt_tokens = static_cast(prompt_tokens.size()); GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); } @@ -1527,8 +1500,6 @@ struct llama_server_context llama_sampling_reset(slot.ctx_sampling); slot.n_past = 0; - slot.n_past_se = 0; - slot.ga_i = 0; slot.num_prompt_tokens_processed = slot.num_prompt_tokens; } else @@ -1539,34 +1510,15 @@ struct llama_server_context llama_sampling_accept(slot.ctx_sampling, ctx, token, false); } - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.n_past = static_cast(common_part(slot.cache_tokens, prompt_tokens)); slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; - if (slot.ga_n != 1) - { - int ga_i = 0; - int32_t ga_n = slot.ga_n; - int32_t ga_w = slot.ga_w; - int32_t slot_npast = 0; - for (int k = 0; k < slot.n_past; ++k) - { - while (slot_npast >= ga_i + ga_w) { - const int bd = (ga_w/ga_n)*(ga_n - 1); - slot_npast -= bd; - ga_i += ga_w/ga_n; - } - slot_npast++; - } - slot.n_past_se = slot_npast; - slot.ga_i = ga_i; - } - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); } LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); - llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); + llama_kv_cache_seq_rm(ctx, slot.id, static_cast(system_tokens.size() + slot.n_past), -1); slot.cache_tokens = prompt_tokens; @@ -1575,10 +1527,6 @@ struct llama_server_context // we have to evaluate at least 1 token to generate logits. LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); slot.n_past--; - if (slot.ga_i > 0) - { - slot.n_past_se--; - } } LOG_VERBOSE("prompt ingested", { @@ -1591,22 +1539,9 @@ struct llama_server_context // process the prefix of first image std::vector prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; - int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - int ga_i = slot.ga_i; - int32_t ga_n = slot.ga_n; - int32_t ga_w = slot.ga_w; for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) { - if (slot.ga_n != 1) - { - while (slot_npast >= ga_i + ga_w) { - const int bd = (ga_w/ga_n)*(ga_n - 1); - slot_npast -= bd; - ga_i += ga_w/ga_n; - } - } - llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); - slot_npast += 1; + llama_batch_add(batch, prefix_tokens[slot.n_past], static_cast(system_tokens.size() + slot.n_past), { slot.id }, false); } if (has_images && !ingest_images(slot, n_batch)) @@ -1636,36 +1571,6 @@ struct llama_server_context for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); - - for (auto & slot : slots) - { - if (slot.ga_n != 1) - { - // context extension via Self-Extend - while (slot.n_past_se >= slot.ga_i + slot.ga_w) - { - const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; - const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); - const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); - llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); - - slot.n_past_se -= bd; - - slot.ga_i += slot.ga_w / slot.ga_n; - - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); - } - slot.n_past_se += n_tokens; - } - } llama_batch batch_view = { n_tokens, @@ -1679,7 +1584,6 @@ struct llama_server_context }; const int ret = llama_decode(ctx, batch_view); - if (ret != 0) { if (n_batch == 1 || ret < 0) @@ -1825,8 +1729,6 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); - printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); printf("\n"); } @@ -2012,25 +1914,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_threads = std::stoi(argv[i]); } - else if (arg == "--grp-attn-n" || arg == "-gan") - { - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_n = std::stoi(argv[i]); - } - else if (arg == "--grp-attn-w" || arg == "-gaw") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - - params.grp_attn_w = std::stoi(argv[i]); - } else if (arg == "--threads-batch" || arg == "-tb") { if (++i >= argc) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 70cce0721..b6d6d27c5 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -426,7 +426,7 @@ static inline std::vector base64_decode(const std::string & encoded_str int j = 0; int in_ = 0; - int in_len = encoded_string.size(); + int in_len = static_cast(encoded_string.size()); uint8_t char_array_4[4]; uint8_t char_array_3[3]; @@ -440,7 +440,7 @@ static inline std::vector base64_decode(const std::string & encoded_str { for (i = 0; i <4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); + char_array_4[i] = static_cast(base64_chars.find(char_array_4[i])); } char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); @@ -464,7 +464,7 @@ static inline std::vector base64_decode(const std::string & encoded_str for (j = 0; j <4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); + char_array_4[j] = static_cast(base64_chars.find(char_array_4[j])); } char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 9cfde8308..af7f1b706 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -67,8 +67,8 @@ int main(int argc, char ** argv) { std::vector tokens_list; tokens_list = ::llama_tokenize(ctx, params.prompt, true); - const int n_ctx = llama_n_ctx(ctx); - const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + const int n_ctx = static_cast(llama_n_ctx(ctx)); + const int n_kv_req = static_cast(tokens_list.size() + (n_len - tokens_list.size())); LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req); @@ -96,7 +96,7 @@ int main(int argc, char ** argv) { // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); i++) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); + llama_batch_add(batch, tokens_list[i], static_cast(i), { 0 }, false); } // llama_decode will output logits only for the last token of the prompt diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 7b3af01f3..188805b53 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -131,7 +131,7 @@ int main(int argc, char ** argv) { fflush(stderr); - const int n_input = inp.size(); + const int n_input = static_cast(inp.size()); const auto t_enc_start = ggml_time_us(); @@ -152,8 +152,8 @@ int main(int argc, char ** argv) { int n_drafted = 0; int n_accept = 0; - int n_past_tgt = inp.size(); - int n_past_dft = inp.size(); + int n_past_tgt = static_cast(inp.size()); + int n_past_dft = static_cast(inp.size()); // used to determine end of generation bool has_eos = false; diff --git a/ggml-backend.c b/ggml-backend.c index 3fff5fc87..9cdce576f 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -58,6 +58,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); GGML_ASSERT(iface.get_base != NULL); + GGML_ASSERT(buffer != NULL); (*buffer) = (struct ggml_backend_buffer) { /* .interface = */ iface, @@ -647,6 +648,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + GGML_ASSERT(cpu_plan != NULL); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cgraph = *cgraph; // FIXME: deep copy @@ -722,13 +724,15 @@ static struct ggml_backend_i cpu_backend_i = { ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); - + + GGML_ASSERT(ctx != NULL); ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->work_data = NULL; ctx->work_size = 0; ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + GGML_ASSERT(cpu_backend != NULL); *cpu_backend = (struct ggml_backend) { /* .interface = */ cpu_backend_i, /* .context = */ ctx @@ -1403,6 +1407,7 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); + GGML_ASSERT(sched != NULL); // initialize hash table sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); @@ -1602,6 +1607,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1) }; struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1); + GGML_ASSERT(node_copies != NULL); bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1); struct ggml_init_params params = { diff --git a/ggml-quants.c b/ggml-quants.c index 7d2f033e9..12ee50e10 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -674,7 +674,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict for (int j = 0; j < QK8_0; ++j) { const float x0 = x[i*QK8_0 + j]*id; - y[i].qs[j] = roundf(x0); + y[i].qs[j] = (int8_t)roundf(x0); } } } @@ -892,8 +892,8 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict const float v0 = x[i*QK8_1 + j]*id; const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; - y[i].qs[ j] = roundf(v0); - y[i].qs[QK8_1/2 + j] = roundf(v1); + y[i].qs[ j] = (int8_t)roundf(v0); + y[i].qs[QK8_1/2 + j] = (int8_t)roundf(v1); sum += y[i].qs[ j]; sum += y[i].qs[QK8_1/2 + j]; @@ -8641,6 +8641,7 @@ void iq2xs_init_impl(int grid_size) { printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t)); + assert(the_grid); for (int k = 0; k < grid_size; ++k) { int8_t * pos = (int8_t *)(the_grid + k); for (int i = 0; i < 8; ++i) { @@ -8864,7 +8865,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict float sumqx = 0, sumq2 = 0; for (int i = 0; i < 32; ++i) { float w = weight[i]; - float q = 2*Laux[i] + 1; + float q = 2*Laux[i] + 1.f; sumqx += w*xval[i]*q; sumq2 += w*q*q; } @@ -8897,7 +8898,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict float sumqx = 0, sumq2 = 0; for (int i = 0; i < 32; ++i) { float w = weight[i]; - float q = 2*L[i] + 1; + float q = 2*L[i] + 1.f; sumqx += w*xval[i]*q; sumq2 += w*q*q; } @@ -9085,7 +9086,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v float sumqx = 0, sumq2 = 0; for (int i = 0; i < 16; ++i) { float w = weight[i]; - float q = 2*Laux[i] + 1; + float q = 2*Laux[i] + 1.f; sumqx += w*xval[i]*q; sumq2 += w*q*q; } @@ -9117,7 +9118,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v float sumqx = 0, sumq2 = 0; for (int i = 0; i < 16; ++i) { float w = weight[i]; - float q = 2*L[i] + 1; + float q = 2*L[i] + 1.f; sumqx += w*xval[i]*q; sumq2 += w*q*q; } diff --git a/llama.cpp b/llama.cpp index b03b67e16..31482069c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10992,15 +10992,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; llama_unescape_whitespace(result); - if (length < (int) result.length()) { - return -(int) result.length(); + if (length < static_cast(result.length())) { + return -static_cast(result.length()); } memcpy(buf, result.c_str(), result.length()); return result.length(); } else if (llama_is_user_defined_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; - if (length < (int) result.length()) { - return -result.length(); + if (length < static_cast(result.length())) { + return -static_cast(result.length()); } memcpy(buf, result.c_str(), result.length()); return result.length(); @@ -11027,15 +11027,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; result = llama_decode_text(result); - if (length < (int) result.length()) { - return -(int) result.length(); + if (length < static_cast(result.length())) { + return -static_cast(result.length()); } memcpy(buf, result.c_str(), result.length()); return result.length(); } else if (llama_is_user_defined_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; - if (length < (int) result.length()) { - return -result.length(); + if (length < static_cast(result.length())) { + return -static_cast(result.length()); } memcpy(buf, result.c_str(), result.length()); return result.length(); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e3c656f56..5ec0ed335 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -63,7 +63,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m im = nullptr; } } - ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im); + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, static_cast(size/tensor->ne[0]), + static_cast(tensor->ne[0]), hist, im); ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. @@ -552,7 +553,7 @@ struct test_case { // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU - int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; + int n_runs = static_cast(std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1); for (int i = 1; i < n_runs; i++) { gf->nodes[gf->n_nodes++] = out; } @@ -583,7 +584,7 @@ struct test_case { ggml_backend_graph_compute(backend, gf); ggml_backend_synchronize(backend); int64_t end_time = ggml_time_us(); - double time_us = end_time - start_time; + double time_us = static_cast(end_time - start_time); printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n", n_runs, @@ -713,7 +714,8 @@ struct test_dup : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); if (_use_permute) { - src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]); + src = ggml_permute(ctx, src, static_cast(permute[0]), static_cast(permute[1]), + static_cast(permute[2]), static_cast(permute[3])); } ggml_tensor * out = ggml_dup(ctx, src); return out; @@ -1239,7 +1241,7 @@ struct test_argsort : public test_case { for (int64_t r = 0; r < ggml_nrows(t); r++) { std::vector data(t->ne[0]); for (int i = 0; i < t->ne[0]; i++) { - data[i] = i; + data[i] = static_cast(i); } std::shuffle(data.begin(), data.end(), rng); ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float)); @@ -1421,7 +1423,7 @@ struct test_moe : public test_case { ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); - ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd)); + ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(static_cast(n_embd))); // select experts ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);