From 168324a388c86334605694f2dc0f7025267af2f4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 11 Jan 2025 17:52:45 +0200 Subject: [PATCH 01/15] cmake : enable -Wshadow for C++ code [no ci] --- cmake/common.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index 0f54871e4..c64ddbc3d 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -13,7 +13,7 @@ function(llama_add_compile_flags) list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) - list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + list(APPEND CXX_FLAGS -Wshadow -Wmissing-declarations -Wmissing-noreturn) list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) From 0bebe45a25614401c372959770f89bab01165c47 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 12:15:19 +0200 Subject: [PATCH 02/15] llama : de-shadow (wip) [no ci] --- examples/gguf/gguf.cpp | 12 +++++++----- src/llama-kv-cache.h | 6 +++--- src/llama.cpp | 42 +++++++++++++++++++++--------------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index f31989c8c..d928db8fe 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -204,13 +204,15 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data); // print first 10 elements - const float * data = (const float *) cur->data; + { + const float * data = (const float *) cur->data; - printf("%s data[:10] : ", name); - for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) { - printf("%f ", data[j]); + printf("%s data[:10] : ", name); + for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) { + printf("%f ", data[j]); + } + printf("\n\n"); } - printf("\n\n"); // check data if (check_data) { diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dca6f3998..2645fd23b 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -58,12 +58,12 @@ struct llama_kv_cache { std::vector bufs; size_t total_size() const { - size_t size = 0; + size_t size_all = 0; for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); + size_all += ggml_backend_buffer_get_size(buf.get()); } - return size; + return size_all; } // TODO: better data structures to reduce the cost of this operation diff --git a/src/llama.cpp b/src/llama.cpp index daf1b7c97..83822668e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1174,14 +1174,15 @@ struct llm_build_context { ggml_set_input(lctx.inp_K_shift); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_head_kv_i = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il); + struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, + n_embd_head_k, n_head_kv_i, n_ctx, ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i), 0); struct ggml_tensor * tmp; @@ -1231,18 +1232,18 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - 
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(il); ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + n_embd_k_gqa_i, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*i)); ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + n_embd_k_gqa_i, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*id)); ggml_tensor * view_v_src; ggml_tensor * view_v_dst; @@ -1250,22 +1251,22 @@ struct llm_build_context { if (flash_attn) { // NOTE: the V cache is not transposed when using flash attention view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + n_embd_v_gqa_i, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*i)); view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + n_embd_v_gqa_i, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*id)); } else { view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, + nm, n_embd_v_gqa_i, ggml_row_size(kv_self.v_l[il]->type, kv_self.size), ggml_row_size(kv_self.v_l[il]->type, i)); view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, + nm, n_embd_v_gqa_i, ggml_row_size(kv_self.v_l[il]->type, kv_self.size), ggml_row_size(kv_self.v_l[il]->type, id)); } @@ -1459,7 +1460,6 @@ struct llm_build_context { } struct ggml_tensor * llm_build_inp_embd_enc() { - const int64_t n_embd = hparams.n_embd; lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); ggml_set_input(lctx.inp_embd_enc); cb(lctx.inp_embd_enc, "embd_enc", -1); From 0127774ae4410abf58ef816bf5a8deaa194afbd0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 12:17:24 +0200 Subject: [PATCH 03/15] llama : remove unused mutable n_tokens [no ci] --- src/llama.cpp | 60 --------------------------------------------------- 1 file changed, 60 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 83822668e..ed99094be 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1476,9 +1476,6 @@ struct llm_build_context { struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -1553,7 +1550,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -1642,9 +1638,6 @@ struct llm_build_context { 
struct ggml_cgraph * build_deci() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -1730,7 +1723,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -2141,9 +2133,6 @@ struct llm_build_context { struct ggml_cgraph * build_grok() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2218,7 +2207,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -2300,9 +2288,6 @@ struct llm_build_context { struct ggml_cgraph * build_dbrx() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -2370,7 +2355,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -3553,9 +3537,6 @@ struct llm_build_context { struct ggml_cgraph * build_qwen2moe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3620,7 +3601,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5440,9 +5420,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmo() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5513,7 +5490,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = 
ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5564,9 +5540,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmo2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5637,7 +5610,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5692,9 +5664,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmoe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5764,7 +5733,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6085,9 +6053,6 @@ struct llm_build_context { struct ggml_cgraph * build_arctic() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6146,7 +6111,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6219,9 +6183,6 @@ struct llm_build_context { struct ggml_cgraph * build_deepseek() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6295,7 +6256,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6376,9 +6336,6 @@ struct llm_build_context { struct ggml_cgraph * build_deepseek2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - bool is_lite = (hparams.n_layer == 27); // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. 
@@ -6527,7 +6484,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6757,9 +6713,6 @@ struct llm_build_context { struct ggml_cgraph * build_t5_enc() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6833,7 +6786,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6889,9 +6841,6 @@ struct llm_build_context { struct ggml_cgraph * build_t5_dec() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7033,7 +6982,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); @@ -7421,9 +7369,6 @@ struct llm_build_context { struct ggml_cgraph * build_exaone() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7497,7 +7442,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -7779,9 +7723,6 @@ struct llm_build_context { struct ggml_cgraph * build_chameleon() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7878,7 +7819,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } From 32e7b9dc995a27d52bcf7cb2c77c87a534b2f1ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 12:30:54 +0200 Subject: [PATCH 04/15] llama : de-shadow (cont) [no ci] --- src/llama-vocab.cpp | 17 +++++++++++----- 
src/llama.cpp | 47 +++++++++++++++++++++------------------------ 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d0fb85cea..cd943b97c 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -24,25 +24,30 @@ struct naive_trie { naive_trie() : has_value(false), value(0) { } - void insert(const char * key, size_t len, int32_t value = 0) { + + void insert(const char * key, size_t len, int32_t val = 0) { if (len == 0) { - this->has_value = true; - this->value = value; + has_value = true; + value = val; + return; } + char c = key[0]; auto res = children.find(c); if (res != children.end()) { - res->second.insert(key + 1, len - 1, value); + res->second.insert(key + 1, len - 1, val); } else { auto res = children.insert(std::make_pair(c, naive_trie())); - res.first->second.insert(key + 1, len - 1, value); + res.first->second.insert(key + 1, len - 1, val); } } + std::pair get_longest_prefix(const char * key, size_t len, size_t offset = 0) const { if (len == 0 || offset == len) { return std::make_pair(key, offset); } + char c = key[offset]; auto res = children.find(c); if (res != children.end()) { @@ -51,6 +56,7 @@ struct naive_trie { return std::make_pair(key, offset); } + const struct naive_trie * traverse(const char c) const { auto res = children.find(c); if (res != children.end()) { @@ -59,6 +65,7 @@ struct naive_trie { return NULL; } + std::map children; bool has_value; llama_token value; diff --git a/src/llama.cpp b/src/llama.cpp index ed99094be..d907c2d6e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1656,10 +1656,10 @@ struct llm_build_context { const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv_i = hparams.n_head_kv(il); + const int64_t n_head_i = hparams.n_head(il); - if (n_head == 0) { + if (n_head_i == 0) { // attention-free layer of Llama-3_1-Nemotron-51B cur = inpL; } else { @@ -1670,11 +1670,11 @@ struct llm_build_context { cb(cur, "attn_norm", il); } - if (n_head > 0 && n_head_kv == 0) { + if (n_head_i > 0 && n_head_kv_i == 0) { // "linear attention" of Llama-3_1-Nemotron-51B cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); cb(cur, "wo", il); - } else if (n_head > 0) { + } else if (n_head_i > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models struct ggml_tensor * rope_factors = build_rope_factors(il); @@ -1702,14 +1702,14 @@ struct llm_build_context { } Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -1734,7 +1734,7 @@ struct llm_build_context { // modified to support attention-free layer of Llama-3_1-Nemotron-51B struct ggml_tensor * ffn_inp = cur; - if (n_head > 0) { + if (n_head_i > 0) { ffn_inp = 
ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); } @@ -2643,7 +2643,7 @@ struct llm_build_context { // iterate layers for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur = inpL; + cur = inpL; struct ggml_tensor * Qcur; struct ggml_tensor * Kcur; @@ -4717,8 +4717,6 @@ struct llm_build_context { struct ggml_cgraph * build_gemma() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4825,8 +4823,6 @@ struct llm_build_context { struct ggml_cgraph * build_gemma2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4962,6 +4958,7 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5800,9 +5797,9 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head_qkv = 2*n_head_kv + n_head; + const int64_t n_head_i = hparams.n_head(il); + const int64_t n_head_kv_i = hparams.n_head_kv(il); + const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i; cur = inpL; struct ggml_tensor * residual = cur; @@ -5818,15 +5815,15 @@ struct llm_build_context { cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0)); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i)); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i))); cb(Vcur, "Vcur", il); Qcur = llm_build_norm(ctx0, Qcur, hparams, @@ -5851,7 +5848,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); + Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens); cb(Qcur, "Vcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, @@ -7495,9 +7492,9 @@ struct llm_build_context { // Token shift state dimensions should be 2 * n_emb GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; + GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs); 
GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); @@ -7608,9 +7605,9 @@ struct llm_build_context { GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; + GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); From 82caffa74e4e101df3adba878ecb99f6e25e3d84 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 13:22:16 +0200 Subject: [PATCH 05/15] llama : de-shadow libllama [no ci] --- ci/run.sh | 3 + src/llama-batch.cpp | 83 ++++++++++----------- src/llama-batch.h | 4 +- src/llama-context.cpp | 4 +- src/llama-grammar.cpp | 2 +- src/llama-mmap.cpp | 10 +-- src/llama-model-loader.cpp | 16 ++-- src/llama-model-loader.h | 2 +- src/llama-model.cpp | 147 +++++++++++++++++++------------------ src/llama-model.h | 2 - src/llama-quant.cpp | 39 +++++----- src/llama-sampling.cpp | 20 ++--- src/llama-vocab.cpp | 28 +++---- 13 files changed, 181 insertions(+), 179 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index abf08a4ff..bd3420e48 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -13,6 +13,9 @@ # # with SYCL support # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with METAL support +# GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# # # with VULKAN support # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 01d5ca57f..e92e5ba9d 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -7,9 +7,9 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { // clear empty sequences // the previous ubatch is assumed to be gone, // so nothing should refer to values in these sequences anymore. 
- for (size_t i = seq.size(); i-- > 0;) { - if (seq[i].length == 0) { - seq.pop_back(); + for (size_t i = seqs.size(); i-- > 0;) { + if (seqs[i].length == 0) { + seqs.pop_back(); } else { break; } @@ -36,48 +36,48 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { } void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) { - GGML_ASSERT(batch != nullptr); + GGML_ASSERT(batch_ptr != nullptr); GGML_ASSERT(length <= seq.length); // Can only add sequences of equal lengths to a batch, // otherwise it isn't clear to which sequence a token belongs GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs); GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs); // NOTE: loops are separated for cache-friendliness - if (batch->token) { + if (batch_ptr->token) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { - ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]]; + ubatch.token[ubatch.n_tokens + i] = batch_ptr->token[ids[seq.offset + i]]; } } else { // simple split - ubatch.token = batch->token + seq.offset; + ubatch.token = batch_ptr->token + seq.offset; } } else { ubatch.token = nullptr; } - if (batch->embd) { + if (batch_ptr->embd) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { memcpy( ubatch.embd + (n_embd * (ubatch.n_tokens + i)), - batch->embd + (n_embd * ids[seq.offset + i]), + batch_ptr->embd + (n_embd * ids[seq.offset + i]), n_embd * sizeof(float) ); } } else { // simple split - ubatch.embd = batch->embd + (n_embd * seq.offset); + ubatch.embd = batch_ptr->embd + (n_embd * seq.offset); } } else { ubatch.embd = nullptr; } if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { - ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]]; + ubatch.pos[ubatch.n_tokens + i] = batch_ptr->pos[ids[seq.offset + i]]; } } else { // simple split - ubatch.pos = batch->pos + seq.offset; + ubatch.pos = batch_ptr->pos + seq.offset; } if (ubatch.equal_seqs) { ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id; @@ -86,15 +86,15 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s } } else { // simple split - if (batch->n_seq_id) { - ubatch.n_seq_id = batch->n_seq_id + seq.offset; + if (batch_ptr->n_seq_id) { + ubatch.n_seq_id = batch_ptr->n_seq_id + seq.offset; } else { for (size_t i = 0; i < length; ++i) { ubatch.n_seq_id[ubatch.n_seqs + i] = 1; } } - if (batch->seq_id) { - ubatch.seq_id = batch->seq_id + seq.offset; + if (batch_ptr->seq_id) { + ubatch.seq_id = batch_ptr->seq_id + seq.offset; } } if (logits_all) { @@ -102,17 +102,17 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s ubatch.output[ubatch.n_tokens + i] = 1; out_ids.push_back(ids[seq.offset + i]); } - } else if (batch->logits) { + } else if (batch_ptr->logits) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { size_t id = ids[seq.offset + i]; - int8_t is_output = batch->logits[id]; + int8_t is_output = batch_ptr->logits[id]; ubatch.output[ubatch.n_tokens + i] = is_output; if (is_output) { out_ids.push_back(id); } } } else { // simple split - ubatch.output = batch->logits + seq.offset; + ubatch.output = batch_ptr->logits + seq.offset; for (size_t i = 0; i < length; ++i) { if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); } } @@ -139,12 +139,12 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s llama_ubatch llama_sbatch::split_simple(size_t 
n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr); ubatch.equal_seqs = false; - if (!seq.empty()) { - llama_sbatch_seq & s = seq[0]; + if (!seqs.empty()) { + llama_sbatch_seq & s = seqs[0]; size_t length = s.length < n_ubatch ? s.length : n_ubatch; - GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits + GGML_ASSERT(seqs.size() == 1 && s.n_seq_id == 0); // don't mix with other splits add_seq_to_ubatch(ubatch, s, length); } return ubatch; @@ -152,15 +152,15 @@ llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) { llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); - if (!seq.empty()) { + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr); + if (!seqs.empty()) { size_t length = 0; size_t n_tokens_in_ubatch = 0; - GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits + GGML_ASSERT(seqs[0].n_seq_id > 0); // should not be mixed with simple splits // smallest first, because it's easier to split this way; // starting from the end to pop in constant time. - for (size_t i = seq.size(); i-- > 0;) { - llama_sbatch_seq & s = seq[i]; + for (size_t i = seqs.size(); i-- > 0;) { + llama_sbatch_seq & s = seqs[i]; GGML_ASSERT(s.length > 0); if (length == 0) { length = s.length < n_ubatch ? s.length : n_ubatch; @@ -179,9 +179,9 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) { llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); - if (!seq.empty()) { - llama_sbatch_seq & s = seq[seq.size() - 1]; + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr); + if (!seqs.empty()) { + llama_sbatch_seq & s = seqs.back(); size_t length = s.length < n_ubatch ? s.length : n_ubatch; GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits add_seq_to_ubatch(ubatch, s, length); @@ -189,23 +189,24 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) { return ubatch; } -void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) { +void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd_cur, bool simple_split, bool logits_all_cur) { GGML_ASSERT(batch.n_tokens >= 0); - this->batch = &batch; - this->n_embd = n_embd; - this->logits_all = logits_all; + + batch_ptr = &batch; + n_embd = n_embd_cur; + logits_all = logits_all_cur; n_tokens = batch.n_tokens; ids.resize(n_tokens); out_ids.clear(); - // TODO: reserve out_ids and seq + // TODO: reserve out_ids and seqs for (size_t i = 0; i < n_tokens; ++i) { ids[i] = i; } if (simple_split) { - seq.resize(1); - llama_sbatch_seq & s = seq[0]; + seqs.resize(1); + llama_sbatch_seq & s = seqs[0]; s.n_seq_id = 0; s.seq_id = nullptr; s.offset = 0; @@ -259,11 +260,11 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim } } llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1}; - seq.push_back(new_seq); - last_seq = &seq.back(); + seqs.push_back(new_seq); + last_seq = &seqs.back(); } // keep shared prompts first at the end, then sort by length descending. 
- std::sort(seq.begin(), seq.end(), + std::sort(seqs.begin(), seqs.end(), [](llama_sbatch_seq & a, llama_sbatch_seq & b) { if (a.n_seq_id == b.n_seq_id) { return a.length > b.length; diff --git a/src/llama-batch.h b/src/llama-batch.h index 773c3808b..572eb79fd 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -45,9 +45,9 @@ struct llama_sbatch { std::vector ids; // batch indices of the output std::vector out_ids; - std::vector seq; + std::vector seqs; - const llama_batch * batch = nullptr; + const llama_batch * batch_ptr = nullptr; // buffers for the ubatch std::vector ubatch_token; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 671d2a81a..c761a4a21 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -916,8 +916,8 @@ struct llama_data_write { write(&n_seq_id, sizeof(n_seq_id)); if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - write(&seq_id, sizeof(seq_id)); + for (auto sid : cell.seq_id) { + write(&sid, sizeof(sid)); } } } diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index bebe4e9a3..bea7d0b1a 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -490,7 +490,7 @@ const char * llama_grammar_parser::parse_sequence( pos = parse_space(pos + 1, is_nested); if (is_digit_char(*pos)) { - const char * int_end = parse_int(pos); + int_end = parse_int(pos); max_times = std::stoul(std::string(pos, int_end - pos)); pos = parse_space(int_end, is_nested); } diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 57c6e4f51..db4c4bcbe 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -454,8 +454,8 @@ struct llama_mlock::impl { return (size_t) sysconf(_SC_PAGESIZE); } - bool raw_lock(const void * addr, size_t size) const { - if (!mlock(addr, size)) { + bool raw_lock(const void * addr_cur, size_t size_cur) const { + if (!mlock(addr_cur, size_cur)) { return true; } @@ -475,12 +475,12 @@ struct llama_mlock::impl { if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { suggest = false; } - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size_cur)) { suggest = false; } LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + size_cur, size, errmsg, suggest ? 
MLOCK_SUGGESTION : ""); return false; } @@ -535,7 +535,7 @@ struct llama_mlock::impl { return (size_t) 65536; } - bool raw_lock(const void * addr, size_t len) const { + bool raw_lock(const void * addr_cur, size_t size_cur) const { LLAMA_LOG_WARN("warning: mlock not supported on this system\n"); return false; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 53175f0e0..a781b2884 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -413,7 +413,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); -llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { +llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -626,11 +626,11 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, if (!llama_mmap::SUPPORTED) { LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); - use_mmap = false; + use_mmap_cur = false; } - this->use_mmap = use_mmap; - this->check_tensors = check_tensors; + use_mmap = use_mmap_cur; + check_tensors = check_tensors_cur; } std::string llama_model_loader::get_arch_name() const { @@ -887,15 +887,15 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. for (size_t idx = 0; idx < n_buffers; ++idx) { - auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); - if (!buf) { + auto * buf_new = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf_new) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); return nullptr; } - host_buffers.emplace_back(buf); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); + host_buffers.emplace_back(buf_new); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf_new)); auto * event = ggml_backend_event_new(dev); if (!event) { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index b63d158d9..4814bbdc9 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -90,7 +90,7 @@ struct llama_model_loader { size_t size_data = 0; std::vector> mmaps_used; - llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p); + llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p); template typename std::enable_if::value, bool>::type diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f90f5e746..1229d8738 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -340,7 +340,8 @@ struct llama_model::impl { size_t n_bytes = 0; - std::string desc_str; + std::string name_str = "n/a"; + std::string desc_str = "n/a"; // model memory mapped files llama_mmaps mappings; @@ -390,17 +391,17 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { - enum gguf_type type = gguf_get_kv_type(ctx, i); - if (type == 
GGUF_TYPE_ARRAY) { + gguf_type type_cur = gguf_get_kv_type(ctx, i); + if (type_cur == GGUF_TYPE_ARRAY) { continue; } - const char * name = gguf_get_key(ctx, i); - const std::string value = gguf_kv_to_str(ctx, i); - gguf_kv.emplace(name, value); + const char * name_cur = gguf_get_key(ctx, i); + const std::string value_cur = gguf_kv_to_str(ctx, i); + gguf_kv.emplace(name_cur, value_cur); } // get general kv - ml.get_key(LLM_KV_GENERAL_NAME, name, false); + ml.get_key(LLM_KV_GENERAL_NAME, pimpl->name_str, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -1333,13 +1334,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { - ggml_init_params params = { + ggml_init_params params_cur = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - ggml_context * ctx = ggml_init(params); + ggml_context * ctx = ggml_init(params_cur); if (!ctx) { throw std::runtime_error(format("failed to create ggml context")); } @@ -1557,31 +1558,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i); - const int64_t n_ff = hparams.n_ff(i); - const int64_t n_head = hparams.n_head(i); - const int64_t n_head_kv = hparams.n_head_kv(i); + const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i); + const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i); + const int64_t n_embd_gqa_i = hparams.n_embd_v_gqa(i); + const int64_t n_ff_i = hparams.n_ff(i); + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_head_kv_i = hparams.n_head_kv(i); - if (n_head_kv == 0 && n_head > 0) { + if (n_head_kv_i == 0 && n_head_i > 0) { // linear attention for DeciLMCausalModel layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); } - else if (n_head_kv > 0) { + else if (n_head_kv_i > 0) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0); } // optional bias tensors layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); - layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, 
"bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -1594,14 +1595,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); } - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff_i, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0); // optional MLP bias - layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED); layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED); } } break; case LLM_ARCH_MINICPM3: @@ -2653,23 +2654,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); for (int i = 0; i < n_layer; ++i) { - const int64_t n_head = hparams.n_head(i); - const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head; - const int64_t n_ff = hparams.n_ff(i); + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_head_qkv_i = 2*hparams.n_head_kv(i) + n_head_i; + const int64_t n_ff_i = hparams.n_ff(i); auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv_i*n_embd_head_k}, 0); layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head_i*n_embd_head_k, n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_i, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0); } } break; case LLM_ARCH_GPTNEOX: @@ -3167,11 +3168,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm_b = 
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const int time_mix_extra_dim = hparams.time_mix_extra_dim; + const int time_mix_extra_dim = hparams.time_mix_extra_dim; const int time_decay_extra_dim = hparams.time_decay_extra_dim; - const int head_size = hparams.wkv_head_size; - const int attn_hidden_size = n_embd; - const int n_head_kv = hparams.n_head_kv(); + const int head_size = hparams.wkv_head_size; + const int attn_hidden_size = n_embd; + int attn_key_value_size; if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) { attn_key_value_size = attn_hidden_size; @@ -3254,7 +3255,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // posnet { - const int64_t n_embd = hparams.posnet.n_embd; + const int64_t n_embd_cur = hparams.posnet.n_embd; for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) { auto & layer = layers[i].posnet; @@ -3274,39 +3275,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case 3: case 4: { - layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0); - layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0); + layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd_cur}, 0); + layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd_cur}, 0); - layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0); - layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0); + layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd_cur, n_embd_cur}, 0); + layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd_cur}, 0); - layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0); - layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0); + layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd_cur}, 0); + layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd_cur}, 0); - layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0); - layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0); + layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd_cur, n_embd_cur}, 0); + layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd_cur}, 0); } break; case 2: { - layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); - layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0); - layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0); + layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd_cur}, 0); - layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_k_b = 
create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0); + layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd_cur}, 0); - layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0); + layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd_cur}, 0); - layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0); + layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd_cur}, 0); } break; case 5: { - layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); - layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0); } break; default: GGML_ABORT("unknown posnet layer"); }; @@ -3320,29 +3321,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // convnext { - const int64_t n_embd = hparams.convnext.n_embd; + const int64_t n_embd_cur = hparams.convnext.n_embd; for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) { auto & layer = layers[i].convnext; - layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0); - layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0); + layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd_cur}, 0); + layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd_cur}, 0); - layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0); - layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0); + layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd_cur}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd_cur}, 0); - layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0); + layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd_cur, n_ff}, 0); layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0); - layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0); - layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0); + layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd_cur}, 0); + layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd_cur}, 0); - layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0); + layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd_cur}, 0); } // output - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); + output_norm = 
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_cur}, 0); + output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd_cur}, 0); } output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0); @@ -3601,7 +3602,7 @@ void llama_model::print_info() const { } // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str()); + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, pimpl->name_str.c_str()); if (arch == LLM_ARCH_DEEPSEEK) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); diff --git a/src/llama-model.h b/src/llama-model.h index 4cc8abb75..39e26fae7 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -290,8 +290,6 @@ struct llama_model { llm_type type = LLM_TYPE_UNKNOWN; llm_arch arch = LLM_ARCH_UNKNOWN; - std::string name = "n/a"; - llama_hparams hparams = {}; llama_vocab vocab; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d4947a780..6c59e1730 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -423,8 +423,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * int64_t counter = 0; size_t new_size = 0; bool valid = true; - auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, - nrows, n_per_row, imatrix]() { + auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix]() { const int64_t nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; while (true) { @@ -437,6 +436,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * break; } lock.unlock(); + const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); local_size += this_size; @@ -445,7 +445,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * const size_t row_size = ggml_row_size(new_type, n_per_row); void * this_data = (char *) new_data + first_row * row_size; if (!ggml_validate_row_data(new_type, this_data, this_size)) { - std::unique_lock lock(mutex); + lock.lock(); valid = false; break; } @@ -589,15 +589,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } // make a list of weights - std::vector tensors; - tensors.reserve(ml.weights_map.size()); + std::vector tensor_weights; + tensor_weights.reserve(ml.weights_map.size()); for (const auto & it : ml.weights_map) { - tensors.push_back(&it.second); + tensor_weights.push_back(&it.second); } // keep_split requires that the weights are sorted by split index if (params->keep_split) { - std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { + std::sort(tensor_weights.begin(), tensor_weights.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { if (a->idx == b->idx) { return a->offs < b->offs; } @@ -605,8 +605,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }); } - for (const auto * it : tensors) { - const struct ggml_tensor * tensor = it->tensor; + for (const auto * tw : tensor_weights) { + const ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); @@ -650,17 +650,17 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: 
// Assume split index is continuous if (params->keep_split) { - for (const auto * it : tensors) { - n_split = std::max(uint16_t(it->idx + 1), n_split); + for (const auto * tw : tensor_weights) { + n_split = std::max(uint16_t(tw->idx + 1), n_split); } } std::vector ctx_outs(n_split); ctx_outs[0] = std::move(ctx_out); - // populate the original tensors so we get an initial meta data - for (const auto * it : tensors) { - uint16_t i_split = params->keep_split ? it->idx : 0; - struct ggml_tensor * tensor = it->tensor; + // populate the original tensor_weights so we get an initial meta data + for (const auto * tw : tensor_weights) { + uint16_t i_split = params->keep_split ? tw->idx : 0; + ggml_tensor * tensor = tw->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } @@ -707,12 +707,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const auto tn = LLM_TN(model.arch); new_ofstream(0); - for (const auto * it : tensors) { - const auto & weight = *it; - struct ggml_tensor * tensor = weight.tensor; - if (weight.idx != cur_split && params->keep_split) { + for (const auto * tw : tensor_weights) { + ggml_tensor * tensor = tw->tensor; + if (tw->idx != cur_split && params->keep_split) { close_ofstream(); - new_ofstream(weight.idx); + new_ofstream(tw->idx); } const std::string name = ggml_get_name(tensor); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index b3a12386e..711de388e 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -412,8 +412,8 @@ static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token time_meas tm(chain->t_sample_us, chain->params.no_perf); - for (auto * smpl : chain->samplers) { - llama_sampler_accept(smpl, token); + for (auto * cur : chain->samplers) { + llama_sampler_accept(cur, token); } chain->n_sample++; @@ -424,16 +424,16 @@ static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_d time_meas tm(chain->t_sample_us, chain->params.no_perf); - for (auto * smpl : chain->samplers) { - llama_sampler_apply(smpl, cur_p); + for (auto * cur : chain->samplers) { + llama_sampler_apply(cur, cur_p); } } static void llama_sampler_chain_reset(struct llama_sampler * smpl) { auto * chain = (llama_sampler_chain *) smpl->ctx; - for (auto * smpl : chain->samplers) { - llama_sampler_reset(smpl); + for (auto * cur : chain->samplers) { + llama_sampler_reset(cur); } chain->t_sample_us = 0; @@ -445,8 +445,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl auto * result = llama_sampler_chain_init(chain_src->params); - for (auto * smpl : chain_src->samplers) { - llama_sampler_chain_add(result, llama_sampler_clone(smpl)); + for (auto * cur : chain_src->samplers) { + llama_sampler_chain_add(result, llama_sampler_clone(cur)); } return result; @@ -455,8 +455,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl static void llama_sampler_chain_free(struct llama_sampler * smpl) { auto * chain = (llama_sampler_chain *) smpl->ctx; - for (auto * smpl : chain->samplers) { - llama_sampler_free(smpl); + for (auto * cur : chain->samplers) { + llama_sampler_free(cur); } delete chain; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index cd943b97c..df6bcdf6a 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -34,12 +34,12 @@ struct naive_trie { } char c = key[0]; - auto res = children.find(c); - if (res != children.end()) { - res->second.insert(key + 1, len - 1, val); + auto child = children.find(c); + 
if (child != children.end()) { + child->second.insert(key + 1, len - 1, val); } else { - auto res = children.insert(std::make_pair(c, naive_trie())); - res.first->second.insert(key + 1, len - 1, val); + auto child_new = children.insert(std::make_pair(c, naive_trie())); + child_new.first->second.insert(key + 1, len - 1, val); } } @@ -49,18 +49,18 @@ struct naive_trie { } char c = key[offset]; - auto res = children.find(c); - if (res != children.end()) { - return res->second.get_longest_prefix(key, len, offset + 1); + auto child = children.find(c); + if (child != children.end()) { + return child->second.get_longest_prefix(key, len, offset + 1); } return std::make_pair(key, offset); } const struct naive_trie * traverse(const char c) const { - auto res = children.find(c); - if (res != children.end()) { - return &res->second; + auto child = children.find(c); + if (child != children.end()) { + return &child->second; } return NULL; @@ -1285,7 +1285,7 @@ struct llama_vocab::impl { llama_token_attr token_get_attr(llama_token id) const; - void init_tokenizer(enum llama_vocab_type type); + void init_tokenizer(); void tokenizer_st_partition(std::forward_list & buffer, bool parse_special) const; @@ -1675,7 +1675,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } GGML_ASSERT(id_to_token.size() == token_to_id.size()); - init_tokenizer(type); + init_tokenizer(); // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (type == LLAMA_VOCAB_TYPE_SPM) { @@ -2116,7 +2116,7 @@ llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const { return id_to_token.at(id).attr; } -void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) { +void llama_vocab::impl::init_tokenizer() { LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type); switch (type) { From 9a735ae6d84b4bf76a8444d72fabef8ad353abcf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 14:25:32 +0200 Subject: [PATCH 06/15] examplse : de-shadow ggml-ci --- common/arg.cpp | 26 +++---- common/arg.h | 12 ++-- common/common.cpp | 10 +-- common/console.cpp | 32 ++++----- common/log.cpp | 8 +-- .../convert-llama2c-to-ggml.cpp | 6 +- examples/gbnf-validator/gbnf-validator.cpp | 7 -- examples/imatrix/imatrix.cpp | 24 +++---- examples/infill/infill.cpp | 12 ++-- examples/llama-bench/llama-bench.cpp | 8 +-- examples/llava/clip.cpp | 39 +++++----- examples/perplexity/perplexity.cpp | 26 +++---- examples/run/CMakeLists.txt | 1 + examples/server/server.cpp | 72 +++++++++---------- examples/server/utils.hpp | 8 +-- examples/speculative/speculative.cpp | 20 +++--- 16 files changed, 152 insertions(+), 159 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 27886b84e..b551596df 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -17,19 +17,19 @@ using json = nlohmann::ordered_json; -common_arg & common_arg::set_examples(std::initializer_list examples) { - this->examples = std::move(examples); +common_arg & common_arg::set_examples(std::initializer_list vals) { + examples = std::move(vals); return *this; } -common_arg & common_arg::set_excludes(std::initializer_list excludes) { - this->excludes = std::move(excludes); +common_arg & common_arg::set_excludes(std::initializer_list vals) { + excludes = std::move(vals); return *this; } -common_arg & common_arg::set_env(const char * env) { - help = help + "\n(env: " + env + ")"; - this->env = env; +common_arg & common_arg::set_env(const char * val) { + help = help + "\n(env: " + val + ")"; + env = val; 
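
The three setters being rewritten here are typical of this de-shadow pass: a parameter that shares its name with a data member forces an explicit this-> on every assignment and trips GCC's -Wshadow, which also covers member shadowing. Renaming the parameter removes both problems. Below is a reduced sketch of the warning and the fix, with an int member standing in for the real fields and purely hypothetical names; it is an illustration, not part of the patch:

    // setter_demo.cpp (hypothetical) - compile with: g++ -Wshadow -Wall -c setter_demo.cpp
    struct setter_demo {
        int examples = 0;

        // GCC warns under -Wshadow: the parameter hides the member, so the
        // assignment only reaches the member through an explicit this->
        setter_demo & set_shadowed(int examples) {
            this->examples = examples;
            return *this;
        }

        // the style adopted in this series: rename the parameter, drop this->
        setter_demo & set_renamed(int vals) {
            examples = vals;
            return *this;
        }
    };
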
return *this; } @@ -46,8 +46,10 @@ bool common_arg::is_exclude(enum llama_example ex) { return excludes.find(ex) != excludes.end(); } -bool common_arg::get_value_from_env(std::string & output) { - if (env == nullptr) return false; +bool common_arg::get_value_from_env(std::string & output) const { + if (env == nullptr) { + return false; + } char * value = std::getenv(env); if (value) { output = value; @@ -56,7 +58,7 @@ bool common_arg::get_value_from_env(std::string & output) { return false; } -bool common_arg::has_value_from_env() { +bool common_arg::has_value_from_env() const { return env != nullptr && std::getenv(env); } @@ -87,7 +89,7 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -std::string common_arg::to_string() { +std::string common_arg::to_string() const { // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console @@ -192,8 +194,6 @@ static std::string get_all_kv_cache_types() { // static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { - std::string arg; - const std::string arg_prefix = "--"; common_params & params = ctx_arg.params; std::unordered_map arg_to_options; diff --git a/common/arg.h b/common/arg.h index 49ab8667b..d88efa462 100644 --- a/common/arg.h +++ b/common/arg.h @@ -53,15 +53,15 @@ struct common_arg { void (*handler)(common_params & params, const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - common_arg & set_examples(std::initializer_list examples); - common_arg & set_excludes(std::initializer_list excludes); - common_arg & set_env(const char * env); + common_arg & set_examples(std::initializer_list vals); + common_arg & set_excludes(std::initializer_list vals); + common_arg & set_env(const char * val); common_arg & set_sparam(); bool in_example(enum llama_example ex); bool is_exclude(enum llama_example ex); - bool get_value_from_env(std::string & output); - bool has_value_from_env(); - std::string to_string(); + bool get_value_from_env(std::string & output) const; + bool has_value_from_env() const; + std::string to_string() const; }; struct common_params_context { diff --git a/common/common.cpp b/common/common.cpp index 39bfb0c2e..16cc3f41c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -763,9 +763,11 @@ bool fs_create_directory_with_parents(const std::string & path) { return true; #else // if the path already exists, check whether it's a directory - struct stat info; - if (stat(path.c_str(), &info) == 0) { - return S_ISDIR(info.st_mode); + { + struct stat info; + if (stat(path.c_str(), &info) == 0) { + return S_ISDIR(info.st_mode); + } } size_t pos_slash = 1; // skip leading slashes for directory creation @@ -796,7 +798,7 @@ bool fs_create_directory_with_parents(const std::string & path) { } std::string fs_get_cache_directory() { - std::string cache_directory = ""; + std::string cache_directory; auto ensure_trailing_slash = [](std::string p) { // Make sure to add trailing slash if (p.back() != DIRECTORY_SEPARATOR) { diff --git a/common/console.cpp b/common/console.cpp index 078a8d678..8d3c8fa5f 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -43,7 +43,7 @@ namespace console { static bool simple_io = true; static display_t current_display = reset; - static FILE* out = stdout; + static FILE* fout = stdout; #if defined (_WIN32) static void* hConsole; @@ -110,7 
+110,7 @@ namespace console { tty = fopen("/dev/tty", "w+"); if (tty != nullptr) { - out = tty; + fout = tty; } } @@ -126,7 +126,7 @@ namespace console { // Restore settings on POSIX systems if (!simple_io) { if (tty != nullptr) { - out = stdout; + fout = stdout; fclose(tty); tty = nullptr; } @@ -145,19 +145,19 @@ namespace console { fflush(stdout); switch(display) { case reset: - fprintf(out, ANSI_COLOR_RESET); + fprintf(fout, ANSI_COLOR_RESET); break; case prompt: - fprintf(out, ANSI_COLOR_YELLOW); + fprintf(fout, ANSI_COLOR_YELLOW); break; case user_input: - fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN); + fprintf(fout, ANSI_BOLD ANSI_COLOR_GREEN); break; case error: - fprintf(out, ANSI_BOLD ANSI_COLOR_RED); + fprintf(fout, ANSI_BOLD ANSI_COLOR_RED); } current_display = display; - fflush(out); + fflush(fout); } } @@ -233,7 +233,7 @@ namespace console { return; } #endif - putc('\b', out); + putc('\b', fout); } static int estimateWidth(char32_t codepoint) { @@ -274,7 +274,7 @@ namespace console { #else // We can trust expectedWidth if we've got one if (expectedWidth >= 0 || tty == nullptr) { - fwrite(utf8_codepoint, length, 1, out); + fwrite(utf8_codepoint, length, 1, fout); return expectedWidth; } @@ -311,7 +311,7 @@ namespace console { pop_cursor(); put_codepoint(&ch, 1, 1); #else - fprintf(out, "\b%c", ch); + fprintf(fout, "\b%c", ch); #endif } @@ -353,7 +353,7 @@ namespace console { } static bool readline_advanced(std::string & line, bool multiline_input) { - if (out != stdout) { + if (fout != stdout) { fflush(stdout); } @@ -364,7 +364,7 @@ namespace console { char32_t input_char; while (true) { - fflush(out); // Ensure all output is displayed before waiting for input + fflush(fout); // Ensure all output is displayed before waiting for input input_char = getchar32(); if (input_char == '\r' || input_char == '\n') { @@ -432,7 +432,7 @@ namespace console { line.pop_back(); if (last == '\\') { line += '\n'; - fputc('\n', out); + fputc('\n', fout); has_more = !has_more; } else { // llama will just eat the single space, it won't act as a space @@ -447,11 +447,11 @@ namespace console { has_more = false; } else { line += '\n'; - fputc('\n', out); + fputc('\n', fout); } } - fflush(out); + fflush(fout); return has_more; } diff --git a/common/log.cpp b/common/log.cpp index 04c7c0ed1..7a94bf7f9 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -338,16 +338,16 @@ public: resume(); } - void set_prefix(bool prefix) { + void set_prefix(bool val) { std::lock_guard lock(mtx); - this->prefix = prefix; + prefix = val; } - void set_timestamps(bool timestamps) { + void set_timestamps(bool val) { std::lock_guard lock(mtx); - this->timestamps = timestamps; + timestamps = val; } }; diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index bdf0eed2a..ef0b22a3d 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -471,12 +471,12 @@ struct my_llama_file { GGML_ASSERT(ret == 0); // same } - void read_raw(void * ptr, size_t size) { - if (size == 0) { + void read_raw(void * ptr, size_t size_cur) { + if (size_cur == 0) { return; } errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); + std::size_t ret = std::fread(ptr, size_cur, 1, fp); if (ferror(fp)) { die_fmt("fread failed: %s", strerror(errno)); } diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 17a0e27c4..12e7e762d 100644 --- 
a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -60,13 +60,6 @@ int main(int argc, char** argv) { const std::string grammar_filename = argv[1]; const std::string input_filename = argv[2]; - // Read the GBNF grammar file - FILE* grammar_file = fopen(grammar_filename.c_str(), "r"); - if (!grammar_file) { - fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str()); - return 1; - } - std::string grammar_str; { std::ifstream grammar_file(grammar_filename); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index b5f3feb9f..d4d3fc7c8 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -294,7 +294,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { bool IMatrixCollector::load_imatrix(const char * fname) { std::ifstream in(fname, std::ios::binary); if (!in) { - LOG_ERR("%s: failed to open %s\n",__func__, fname); + LOG_ERR("%s: failed to open %s\n", __func__, fname); return false; } int n_entries; @@ -308,7 +308,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); + LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname); return false; } name_as_vec[len] = 0; @@ -319,7 +319,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i); m_stats = {}; return false; } @@ -332,15 +332,15 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector tmp(nval); in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { - LOG_ERR("%s: failed reading data for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading data for entry %d\n", __func__, i); m_stats = {}; return false; } // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. 
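
The rename in the lines that follow, where the inner accumulation loop switches from i to j, is the other recurring shadow in this series: a nested counter reusing the name of the enclosing loop variable. The original code works as written, but the inner declaration hides the entry index, which is exactly the situation -Wshadow is meant to flag. A reduced sketch with hypothetical types rather than the real imatrix statistics structures:

    #include <cstddef>
    #include <vector>

    struct entry_sums {
        std::vector<float> values;
    };

    // accumulate one chunk of per-value sums into every entry
    static void accumulate(std::vector<entry_sums> & entries, const std::vector<float> & tmp) {
        for (std::size_t i = 0; i < entries.size(); ++i) {
            entries[i].values.resize(tmp.size());
            // writing "for (std::size_t i = ..." here would compile, but it
            // shadows the entry index above and draws a -Wshadow warning
            for (std::size_t j = 0; j < tmp.size(); ++j) {
                entries[i].values[j] += tmp[j];
            }
        }
    }
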
- for (int i = 0; i < nval; i++) { - e.values[i] += tmp[i]; - e.counts[i] += ncall; + for (int j = 0; j < nval; j++) { + e.values[j] += tmp[j]; + e.counts[j] += ncall; } e.ncall += ncall; @@ -488,12 +488,10 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { logits.reserve((size_t)n_ctx * n_vocab); } - for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; + for (int ich = 0; ich < n_chunk; ++ich) { + const int start = ich * n_ctx; const int end = start + n_ctx; - std::vector logits; - const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache @@ -537,7 +535,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (ich == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -555,7 +553,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += n_ctx - first - 1; - LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", ich + 1, std::exp(nll / count)); fflush(stdout); logits.clear(); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 489a208b6..f8d099591 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -462,14 +462,14 @@ int main(int argc, char ** argv) { } // tokenize new prefix and suffix - std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx_cur = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx_cur = common_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab)); - inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab)); + inp_pfx_cur.insert(inp_pfx_cur.begin(), llama_vocab_fim_pre(vocab)); + inp_sfx_cur.insert(inp_sfx_cur.begin(), llama_vocab_fim_suf(vocab)); - embd_inp = params.spm_infill ? inp_sfx : inp_pfx; - embd_end = params.spm_infill ? inp_pfx : inp_sfx; + embd_inp = params.spm_infill ? inp_sfx_cur : inp_pfx_cur; + embd_end = params.spm_infill ? 
inp_pfx_cur : inp_sfx_cur; if (add_bos) { embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a3b4c5ac8..faa8d5f87 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -548,11 +548,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { GGML_ASSERT(split_arg.size() <= llama_max_devices()); std::vector tensor_split(llama_max_devices()); - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - tensor_split[i] = std::stof(split_arg[i]); + for (size_t is = 0; is < llama_max_devices(); ++is) { + if (is < split_arg.size()) { + tensor_split[is] = std::stof(split_arg[is]); } else { - tensor_split[i] = 0.0f; + tensor_split[is] = 0.0f; } } params.tensor_split.push_back(tensor_split); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 7a8a3156b..dc827e814 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1039,41 +1039,40 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } { // attention - int hidden_size = 4096; - const int d_head = 128; - int n_head = hidden_size/d_head; + int hidden_size_cur = 4096; int num_query = 96; if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; + hidden_size_cur = 4096; num_query = 96; } else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; + hidden_size_cur = 3584; num_query = 64; } + const int d_head_cur = 128; + const int n_head_cur = hidden_size_cur/d_head_cur; + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head_cur)); struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_reshape_4d(ctx0, Q, d_head_cur, n_head_cur, num_query, batch_size); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + Q = ggml_reshape_3d(ctx0, Q, d_head_cur, num_query, n_head_cur * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head_cur, n_head_cur, num_positions, batch_size); K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + K = ggml_reshape_3d(ctx0, K, d_head_cur, num_positions, n_head_cur * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head_cur, n_head_cur, num_positions, batch_size); V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head_cur, n_head_cur * batch_size); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_soft_max_inplace(ctx0, KQ); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_reshape_4d(ctx0, KQV, d_head_cur, num_query, n_head_cur, batch_size); KQV = ggml_permute(ctx0, 
KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size_cur, num_query, batch_size); embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); } @@ -1113,12 +1112,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { struct ggml_context * meta = NULL; - struct gguf_init_params params = { + struct gguf_init_params params_meta = { /*.no_alloc = */ true, /*.ctx = */ &meta, }; - struct gguf_context * ctx = gguf_init_from_file(fname, params); + struct gguf_context * ctx = gguf_init_from_file(fname, params_meta); if (!ctx) { throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname)); } @@ -1310,13 +1309,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // load tensors { std::vector read_buf; - struct ggml_init_params params = { + struct ggml_init_params params_data = { /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - new_clip->ctx_data = ggml_init(params); + new_clip->ctx_data = ggml_init(params_data); if (!new_clip->ctx_data) { LOG_ERR("%s: ggml_init() failed\n", __func__); clip_free(new_clip); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9bf6c5743..c9239ecda 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -348,8 +348,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); - for (int i = 0; i < n_chunk; ++i) { - const int start = i * params.ppl_stride; + for (int ich = 0; ich < n_chunk; ++ich) { + const int start = ich * params.ppl_stride; const int end = start + calc_chunk; const int num_batches = (calc_chunk + n_batch - 1) / n_batch; @@ -400,7 +400,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (ich == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -427,9 +427,9 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params } // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", ich + 1, std::exp(nll / count)); } else { - LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); + LOG("%8d %.4lf\n", ich*params.ppl_stride, std::exp(nll / count)); } } LOG("\n"); @@ -659,7 +659,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int n_batch, int n_vocab) { int prev_outputs = 0; - for (int i = 0; i < (int) batch.n_tokens; i += n_batch) { + for (int i = 0; i < batch.n_tokens; i += n_batch) { const int n_tokens = std::min(n_batch, batch.n_tokens - i); llama_batch batch_view = { @@ -679,8 +679,8 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< } int n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - n_outputs += batch_view.logits[i] != 0; + for (int 
iv = 0; iv < n_tokens; ++iv) { + n_outputs += batch_view.logits[iv] != 0; } memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float)); @@ -1752,14 +1752,14 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { auto kld_ptr = kld_values.data(); auto p_diff_ptr = p_diff_values.data(); - for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; + for (int ich = 0; ich < n_chunk; ++ich) { + const int start = ich * n_ctx; const int end = start + n_ctx; const auto t_start = std::chrono::high_resolution_clock::now(); if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { - LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i); + LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, ich); return; } @@ -1804,7 +1804,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (ich == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -1824,7 +1824,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { p_diff_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first; - LOG("%4d", i+1); + LOG("%4d", ich + 1); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 0686d6305..22b43524b 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -3,3 +3,4 @@ add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_options(${TARGET} PRIVATE -Wno-shadow) # TMP diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 64c0c4ef6..aa8b54680 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -122,9 +122,9 @@ struct slot_params { samplers.emplace_back(common_sampler_type_to_str(sampler)); } - json lora = json::array(); - for (size_t i = 0; i < this->lora.size(); ++i) { - lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); + json json_lora = json::array(); + for (size_t i = 0; i < lora.size(); ++i) { + json_lora.push_back({{"id", i}, {"scale", lora[i].scale}}); } return json { @@ -167,7 +167,7 @@ struct slot_params { {"speculative.p_min", speculative.p_min}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, - {"lora", lora}, + {"lora", json_lora}, }; } }; @@ -1641,7 +1641,7 @@ struct server_context { llama_context_params cparams_dft; - llama_batch batch = {}; + llama_batch batch_main = {}; bool clean_kv_cache = true; bool add_bos_token = true; @@ -1676,7 +1676,7 @@ struct server_context { llama_batch_free(slot.batch_spec); } - llama_batch_free(batch); + llama_batch_free(batch_main); } bool load_model(const common_params & params) { @@ -1797,7 +1797,7 @@ struct server_context { const int32_t n_batch = llama_n_batch(ctx); // only a single seq_id per token is needed - batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); + batch_main = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } metrics.init(); @@ -2655,7 +2655,7 @@ struct server_context 
{ } // start populating the batch for this iteration - common_batch_clear(batch); + common_batch_clear(batch_main); // track if given slot can be batched with slots already in the batch server_slot * slot_batched = nullptr; @@ -2673,9 +2673,9 @@ struct server_context { continue; } - slot.i_batch = batch.n_tokens; + slot.i_batch = batch_main.n_tokens; - common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); + common_batch_add(batch_main, slot.sampled, slot.n_past, { slot.id }, true); slot.n_past += 1; @@ -2692,7 +2692,7 @@ struct server_context { int32_t n_ubatch = llama_n_ubatch(ctx); // next, batch any pending prompts without exceeding n_batch - if (params_base.cont_batching || batch.n_tokens == 0) { + if (params_base.cont_batching || batch_main.n_tokens == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one if (slot.is_processing()) { @@ -2858,7 +2858,7 @@ struct server_context { // non-causal tasks require to fit the entire prompt in the physical batch if (slot.is_non_causal()) { // cannot fit the prompt in the current batch - will try next iter - if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { + if (batch_main.n_tokens + slot.n_prompt_tokens > n_batch) { continue; } } @@ -2878,11 +2878,11 @@ struct server_context { slot.cache_tokens.resize(slot.n_past); // add prompt tokens for processing in the current batch - while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { + while (slot.n_past < slot.n_prompt_tokens && batch_main.n_tokens < n_batch) { // without pooling, we want to output the embeddings for all the tokens in the batch const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); + common_batch_add(batch_main, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); @@ -2892,13 +2892,13 @@ struct server_context { slot.n_past++; } - SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch_main.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { slot.state = SLOT_STATE_DONE_PROMPT; - GGML_ASSERT(batch.n_tokens > 0); + GGML_ASSERT(batch_main.n_tokens > 0); common_sampler_reset(slot.smpl); @@ -2908,27 +2908,27 @@ struct server_context { } // extract the logits only for the last token - batch.logits[batch.n_tokens - 1] = true; + batch_main.logits[batch_main.n_tokens - 1] = true; slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; + slot.i_batch = batch_main.n_tokens - 1; - SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); + SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch_main.n_tokens); } } - if (batch.n_tokens >= n_batch) { + if (batch_main.n_tokens >= n_batch) { break; } } } - if (batch.n_tokens == 0) { + if (batch_main.n_tokens == 0) { SRV_WRN("%s", "no tokens to decode\n"); return; } - SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); + SRV_DBG("decoding batch, n_tokens = %d\n", batch_main.n_tokens); if (slot_batched) { // make sure we're 
in the right embedding mode @@ -2938,17 +2938,17 @@ struct server_context { } // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + for (int32_t i_batch = 0; i_batch < batch_main.n_tokens; i_batch += n_batch) { + const int32_t n_tokens = std::min(n_batch, batch_main.n_tokens - i_batch); llama_batch batch_view = { n_tokens, - batch.token + i, + batch_main.token + i_batch, nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, + batch_main.pos + i_batch, + batch_main.n_seq_id + i_batch, + batch_main.seq_id + i_batch, + batch_main.logits + i_batch, }; const int ret = llama_decode(ctx, batch_view); @@ -2957,7 +2957,7 @@ struct server_context { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); + SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i_batch = %d, n_batch = %d, ret = %d\n", i_batch, n_batch, ret); for (auto & slot : slots) { slot.release(); send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); @@ -2967,15 +2967,15 @@ struct server_context { // retry with half the batch size to try to find a free slot in the KV cache n_batch /= 2; - i -= n_batch; + i_batch -= n_batch; - SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i_batch = %d, n_batch = %d, ret = %d\n", i_batch, n_batch, ret); continue; // continue loop of n_batch } for (auto & slot : slots) { - if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { + if (slot.i_batch < (int) i_batch || slot.i_batch >= (int) (i_batch + n_tokens)) { continue; // continue loop of slots } @@ -3001,7 +3001,7 @@ struct server_context { continue; // continue loop of slots } - const int tok_idx = slot.i_batch - i; + const int tok_idx = slot.i_batch - i_batch; llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); @@ -3687,8 +3687,8 @@ int main(int argc, char ** argv) { } else { // multiple results (multitask) json arr = json::array(); - for (auto & res : results) { - arr.push_back(res->to_json()); + for (auto & result : results) { + arr.push_back(result->to_json()); } res_ok(res, arr); } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 699480f90..fab0850e5 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -129,15 +129,15 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_ if (p.is_string()) { auto s = p.template get(); - llama_tokens p; + llama_tokens ids; if (first) { - p = common_tokenize(vocab, s, add_special, parse_special); + ids = common_tokenize(vocab, s, add_special, parse_special); first = false; } else { - p = common_tokenize(vocab, s, false, parse_special); + ids = common_tokenize(vocab, s, false, parse_special); } - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + prompt_tokens.insert(prompt_tokens.end(), ids.begin(), ids.end()); } else { if (first) { 
first = false; diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index c7ccea50d..56700db3f 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -544,26 +544,26 @@ int main(int argc, char ** argv) { for (int is = 0; is < (int) sa.size(); ++is) { const llama_token id = cur_p->data[is].id; - const int s = sa[is]; + const int sd = sa[is]; - common_sampler_accept(drafts[s].smpl, id, true); + common_sampler_accept(drafts[sd].smpl, id, true); - drafts[s].tokens.push_back(id); - // save cur_p.data into drafts[s].dists - drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size}); + drafts[sd].tokens.push_back(id); + // save cur_p.data into drafts[sd].dists + drafts[sd].dists.push_back({cur_p->data, cur_p->data + cur_p->size}); // add unique drafted tokens to the target batch - drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); + drafts[sd].i_batch_tgt.push_back(batch_tgt.n_tokens); - common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { sd }, true); // add the token to the batch for batched decoding with the draft model - drafts[s].i_batch_dft = batch_dft.n_tokens; + drafts[sd].i_batch_dft = batch_dft.n_tokens; - common_batch_add(batch_dft, id, n_past_cur, { s }, true); + common_batch_add(batch_dft, id, n_past_cur, { sd }, true); if (batch_tgt.n_tokens > n_draft) { - drafts[s].drafting = false; + drafts[sd].drafting = false; } } } From e159e7751c5e358da439745141584d57f2056e40 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 14:35:29 +0200 Subject: [PATCH 07/15] cmake : disable -Wshadow for GCC ggml-ci --- cmake/common.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index c64ddbc3d..bbc9c412e 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -13,7 +13,12 @@ function(llama_add_compile_flags) list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) - list(APPEND CXX_FLAGS -Wshadow -Wmissing-declarations -Wmissing-noreturn) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + # GCC -Wshadow is way too agressive + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + list(APPEND CXX_FLAGS -Wshadow) + endif() list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) From 34889bf8102e806289613e566e835420d7ea3d70 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 15:11:52 +0200 Subject: [PATCH 08/15] cmake : cont ggml-ci --- cmake/common.cmake | 2 +- examples/run/CMakeLists.txt | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index bbc9c412e..5dee785c3 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -16,7 +16,7 @@ function(llama_add_compile_flags) list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) # GCC -Wshadow is way too agressive - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") list(APPEND CXX_FLAGS -Wshadow) endif() diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 22b43524b..5e9c57bbc 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -3,4 +3,7 @@ add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 
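
The change just below keeps the temporary -Wno-shadow opt-out for this example but guards it behind the same compiler check that enables the warning globally: at this point in the series -Wshadow is only added for Clang, so the per-target escape hatch only needs to exist for Clang as well. A sketch of the same arrangement for a hypothetical target, not the exact flag set used by llama.cpp:

    # demo-tool is a hypothetical target used only to illustrate the pattern
    add_executable(demo-tool demo.cpp)

    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        # the project-wide flags add -Wshadow for Clang, so the opt-out is
        # scoped to Clang too and can be deleted once the target is cleaned up
        target_compile_options(demo-tool PRIVATE -Wno-shadow)
    endif()
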
target_compile_features(${TARGET} PRIVATE cxx_std_17) -target_compile_options(${TARGET} PRIVATE -Wno-shadow) # TMP + +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(${TARGET} PRIVATE -Wno-shadow) # TMP +endif() From 439e68c1e5889a01116ba6eec1c03c9fe11bfaa0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 15:29:33 +0200 Subject: [PATCH 09/15] cmake : re-enable GCC -Wshadow ggml-ci --- cmake/common.cmake | 7 ++++-- common/arg.h | 34 ++++++++++++++-------------- examples/export-lora/export-lora.cpp | 8 +++---- examples/gguf-split/gguf-split.cpp | 16 ++++++------- examples/run/CMakeLists.txt | 9 ++++++-- examples/server/server.cpp | 2 +- src/llama-adapter.h | 2 +- src/llama-arch.cpp | 2 +- src/llama-arch.h | 2 +- src/llama-context.h | 4 ++-- src/llama-impl.cpp | 2 +- src/llama-model-loader.h | 2 +- src/llama-model.cpp | 2 +- src/llama-quant.cpp | 6 ++--- src/llama-vocab.cpp | 14 ++++++------ src/llama.cpp | 26 ++++++++++----------- 16 files changed, 73 insertions(+), 65 deletions(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index 5dee785c3..45bac7af8 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -15,9 +15,12 @@ function(llama_add_compile_flags) list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - # GCC -Wshadow is way too agressive - if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") list(APPEND CXX_FLAGS -Wshadow) + + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND CXX_FLAGS -Wshadow -Wshadow-field-in-constructor) + endif() endif() list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) diff --git a/common/arg.h b/common/arg.h index d88efa462..eff9e6e1f 100644 --- a/common/arg.h +++ b/common/arg.h @@ -25,33 +25,33 @@ struct common_arg { void (*handler_int) (common_params & params, int) = nullptr; common_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, + const std::initializer_list & args_, + const char * value_hint_, + const std::string & help_, void (*handler)(common_params & params, const std::string &) - ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + ) : args(args_), value_hint(value_hint_), help(help_), handler_string(handler) {} common_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, + const std::initializer_list & args_, + const char * value_hint_, + const std::string & help_, void (*handler)(common_params & params, int) - ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + ) : args(args_), value_hint(value_hint_), help(help_), handler_int(handler) {} common_arg( - const std::initializer_list & args, - const std::string & help, + const std::initializer_list & args_, + const std::string & help_, void (*handler)(common_params & params) - ) : args(args), help(help), handler_void(handler) {} + ) : args(args_), help(help_), handler_void(handler) {} // support 2 values for arg common_arg( - const std::initializer_list & args, - const char * value_hint, - const char * value_hint_2, - const std::string & help, + const std::initializer_list & args_, + const char * value_hint_, + const char * value_hint_2_, + const std::string & help_, void (*handler)(common_params & params, const std::string &, const std::string &) - ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + ) : args(args_), 
value_hint(value_hint_), value_hint_2(value_hint_2_), help(help_), handler_str_str(handler) {} common_arg & set_examples(std::initializer_list vals); common_arg & set_excludes(std::initializer_list vals); @@ -69,7 +69,7 @@ struct common_params_context { common_params & params; std::vector options; void(*print_usage)(int, char **) = nullptr; - common_params_context(common_params & params) : params(params) {} + common_params_context(common_params & params_) : params(params_) {} }; // parse input arguments from CLI diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 99063b5d5..592cffbf4 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -66,7 +66,7 @@ struct file_input { float alpha; float scale; - file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) { + file_input(std::string & fname, float scale_): f_in(fname, std::ios::binary), scale(scale_) { if (!f_in.is_open()) { throw std::runtime_error("failed to open input gguf from " + fname); } @@ -131,7 +131,7 @@ struct lora_merge_ctx { std::string & base_fname, std::vector & lora_files, std::string & outfile, - int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { + int n_threads_) : base_model(base_fname, 0), n_threads(n_threads_), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) { @@ -157,7 +157,7 @@ struct lora_merge_ctx { allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); } - void check_metadata_lora(file_input * adapter) { + void check_metadata_lora(const file_input * adapter) const { auto general_type = get_kv_str(adapter->ctx_gguf, "general.type"); if (general_type != "adapter") { throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); @@ -175,7 +175,7 @@ struct lora_merge_ctx { } } - ggml_type get_out_tensor_type(struct ggml_tensor * t) { + static ggml_type get_out_tensor_type(struct ggml_tensor * t) { if (t->type == GGML_TYPE_F32) { return GGML_TYPE_F32; } else { diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index ef3ceb686..3b9ae6a58 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -204,14 +204,14 @@ struct split_strategy { // temporary buffer for reading in tensor data std::vector read_buf; - split_strategy(const split_params & params, - std::ifstream & f_input, - struct gguf_context * ctx_gguf, - struct ggml_context * ctx_meta) : - params(params), - f_input(f_input), - ctx_gguf(ctx_gguf), - ctx_meta(ctx_meta), + split_strategy(const split_params & params_, + std::ifstream & f_input_, + struct gguf_context * ctx_gguf_, + struct ggml_context * ctx_meta_) : + params(params_), + f_input(f_input_), + ctx_gguf(ctx_gguf_), + ctx_meta(ctx_meta_), n_tensors(gguf_get_n_tensors(ctx_gguf)) { // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 5e9c57bbc..8735c9dc2 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -4,6 +4,11 @@ install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) -if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - 
target_compile_options(${TARGET} PRIVATE -Wno-shadow) # TMP +# TMP +if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(${TARGET} PRIVATE -Wno-shadow) + + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(${TARGET} PRIVATE -Wno-shadow-field-in-constructor) + endif() endif() diff --git a/examples/server/server.cpp b/examples/server/server.cpp index aa8b54680..0c0f066ca 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -200,7 +200,7 @@ struct server_task { // used by SERVER_TASK_TYPE_SET_LORA std::vector set_lora; - server_task(server_task_type type) : type(type) {} + server_task(server_task_type type_) : type(type_) {} static slot_params params_from_json_cmpl( const llama_context * ctx, diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 603fa08f6..7cfc49689 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -55,7 +55,7 @@ struct llama_adapter_lora_weight { } llama_adapter_lora_weight() = default; - llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {} + llama_adapter_lora_weight(struct ggml_tensor * a_, struct ggml_tensor * b_) : a(a_), b(b_) {} }; struct llama_adapter_lora { diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 5c1f14cfd..17d7939af 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1443,7 +1443,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, }; -LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {} +LLM_KV::LLM_KV(llm_arch arch_) : arch(arch_) {} std::string LLM_KV::operator()(llm_kv kv) const { return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch)); diff --git a/src/llama-arch.h b/src/llama-arch.h index 349844790..d6a79db1e 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -374,7 +374,7 @@ struct LLM_TN_IMPL { }; struct LLM_TN { - LLM_TN(llm_arch arch) : arch(arch) {} + LLM_TN(llm_arch arch_) : arch(arch_) {} llm_arch arch; diff --git a/src/llama-context.h b/src/llama-context.h index a9268b292..70c3d0ad7 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -15,8 +15,8 @@ #include struct llama_context { - llama_context(const llama_model & model) - : model(model) + llama_context(const llama_model & model_) + : model(model_) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {} diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd3..37cf7cdb7 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -17,7 +17,7 @@ struct llama_logger_state { static llama_logger_state g_logger_state; -time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} +time_meas::time_meas(int64_t & t_acc_, bool disable) : t_start_us(disable ? 
-1 : ggml_time_us()), t_acc(t_acc_) {} time_meas::~time_meas() { if (t_start_us >= 0) { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 4814bbdc9..2164da710 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -31,7 +31,7 @@ struct llama_model_loader { ggml_tensor * tensor; - llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { + llama_tensor_weight(const llama_file * file, uint16_t idx_, const struct gguf_context * gguf_ctx, ggml_tensor * tensor_) : idx(idx_), tensor(tensor_) { const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); if (tensor_idx < 0) { throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1229d8738..01a3afa40 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -369,7 +369,7 @@ struct llama_model::impl { std::vector dev_layer; }; -llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique()) { +llama_model::llama_model(const struct llama_model_params & params_) : params(params_), pimpl(std::make_unique()) { } llama_model::~llama_model() {} diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6c59e1730..75899d142 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -41,9 +41,9 @@ struct quantize_state_impl { // used to figure out if a model shares tok_embd with the output weight bool has_output = false; - quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) - : model(model) - , params(params) + quantize_state_impl(const llama_model & model_, const llama_model_quantize_params * params_) + : model(model_) + , params(params_) {} }; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index df6bcdf6a..ef108b991 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -115,7 +115,7 @@ struct llm_tokenizer_spm : llm_tokenizer { }; struct llm_tokenizer_spm_session { - llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {} + llm_tokenizer_spm_session(const llama_vocab & vocab_) : vocab(vocab_) {} void tokenize(const std::string & text, std::vector & output) { // split string into utf8 chars @@ -415,7 +415,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { }; struct llm_tokenizer_bpe_session { - llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} + llm_tokenizer_bpe_session(const llama_vocab & vocab_, const llm_tokenizer_bpe & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {} static void append(const llama_token token_id, std::vector & output) { output.push_back(token_id); @@ -603,7 +603,7 @@ struct llm_tokenizer_wpm : llm_tokenizer { }; struct llm_tokenizer_wpm_session { - llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {} + llm_tokenizer_wpm_session(const llama_vocab & vocab_) : vocab(vocab_) {} void tokenize(const std::string & text, std::vector & output) { // normalize and split by whitespace @@ -782,7 +782,7 @@ struct llm_tokenizer_ugm : llm_tokenizer { }; struct llm_tokenizer_ugm_session { - llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} + llm_tokenizer_ugm_session(const llama_vocab & vocab_, const llm_tokenizer_ugm & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {} /* This implementation is based on 
SentencePiece optimized Viterbi algorithm for * unigram language models. The general idea is to: @@ -949,7 +949,7 @@ private: */ struct xcda_array_view { public: - xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) { + xcda_array_view(const uint32_t * xcda_array_, size_t xcda_array_size_) : xcda_array(xcda_array_), xcda_array_size(xcda_array_size_) { } uint32_t get_base(size_t index) { uint32_t packed_node = get_node(index); @@ -1135,7 +1135,7 @@ struct llm_tokenizer_rwkv : llm_tokenizer { }; struct llm_tokenizer_rwkv_session { - llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} + llm_tokenizer_rwkv_session(const llama_vocab & vocab_, const llm_tokenizer_rwkv & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {} void tokenize(const std::string & text, std::vector & output) { uint32_t position = 0; @@ -1262,7 +1262,7 @@ struct llama_vocab::impl { std::vector precompiled_charsmap; - impl(const llama_vocab & vocab) : vocab(vocab) { + impl(const llama_vocab & vocab_) : vocab(vocab_) { } ~impl() = default; diff --git a/src/llama.cpp b/src/llama.cpp index d907c2d6e..094ed0024 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1089,16 +1089,16 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, + llama_context & lctx_, + const llama_ubatch & ubatch_, + const llm_build_cb & cb_, bool worst_case) : - model (lctx.model), - lctx (lctx), + model (lctx_.model), + lctx (lctx_), hparams (model.hparams), - cparams (lctx.cparams), - ubatch (ubatch), - kv_self (lctx.kv_self), + cparams (lctx_.cparams), + ubatch (ubatch_), + kv_self (lctx_.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -1119,17 +1119,17 @@ struct llm_build_context { beta_slow (cparams.yarn_beta_slow), norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), - n_tokens (ubatch.n_tokens), + n_tokens (ubatch_.n_tokens), n_kv (worst_case ? kv_self.size : kv_self.n), - n_outputs (worst_case ? n_tokens : lctx.n_outputs), - n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), + n_outputs (worst_case ? n_tokens : lctx_.n_outputs), + n_outputs_enc (worst_case ? n_tokens : lctx_.embd_enc.size() / hparams.n_embd), kv_head (worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_ctx_orig (cparams.n_ctx_orig_yarn), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - cb (cb), - buf_compute_meta (lctx.buf_compute_meta) { + cb (cb_), + buf_compute_meta (lctx_.buf_compute_meta) { // all initializations should be done in init() } From f65e3d324d641b8f4f110687e998ebfee0a94586 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 15:34:48 +0200 Subject: [PATCH 10/15] ggml : ggml_backend_graph_copy -> ggml_backend_graph_copy_init --- ggml/include/ggml-backend.h | 2 +- ggml/src/ggml-backend.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 7221a0830..ce4fb4652 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -331,7 +331,7 @@ extern "C" { }; // Copy a graph to a different backend - GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); + GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index dba7be33b..cbc57a2d3 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1724,7 +1724,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_ } } -struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { +struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); @@ -1812,7 +1812,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { } bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { - struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); + struct ggml_backend_graph_copy copy = ggml_backend_graph_copy_init(backend2, graph); if (copy.buffer == NULL) { return false; } From 10eb87409ec0797ec79dec87f1004b380e094cfd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 16:09:49 +0200 Subject: [PATCH 11/15] shadow : cont gcc ggml-ci --- common/arg.cpp | 936 +++++++++++------------ common/json-schema-to-grammar.cpp | 4 +- common/log.cpp | 4 +- examples/batched-bench/batched-bench.cpp | 8 +- examples/llava/clip.cpp | 8 +- examples/llava/clip.h | 6 +- examples/llava/llava.cpp | 6 +- examples/server/server.cpp | 16 +- examples/simple-chat/simple-chat.cpp | 3 +- src/llama-model.cpp | 13 +- src/llama-quant.cpp | 8 +- src/llama-vocab.cpp | 8 +- 12 files changed, 509 insertions(+), 511 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b551596df..d1faccee1 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -383,8 +383,8 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e } exit(0); } - } catch (const std::invalid_argument 
& ex) { - fprintf(stderr, "%s\n", ex.what()); + } catch (const std::invalid_argument & e) { + fprintf(stderr, "%s\n", e.what()); ctx_arg.params = params_org; return false; } @@ -438,8 +438,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [](common_params & params) { - params.usage = true; + [](common_params & cur) { + cur.usage = true; } )); add_opt(common_arg( @@ -454,50 +454,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--verbose-prompt"}, string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](common_params & params) { - params.verbose_prompt = true; + [](common_params & cur) { + cur.verbose_prompt = true; } )); add_opt(common_arg( {"--no-display-prompt"}, string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](common_params & params) { - params.display_prompt = false; + [](common_params & cur) { + cur.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-co", "--color"}, string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](common_params & params) { - params.use_color = true; + [](common_params & cur) { + cur.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](common_params & params, int value) { - params.cpuparams.n_threads = value; - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.cpuparams.n_threads = value; + if (cur.cpuparams.n_threads <= 0) { + cur.cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_env("LLAMA_ARG_THREADS")); add_opt(common_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [](common_params & params, int value) { - params.cpuparams_batch.n_threads = value; - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.cpuparams_batch.n_threads = value; + if (cur.cpuparams_batch.n_threads <= 0) { + cur.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } )); add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](common_params & params, const std::string & mask) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, cur.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -505,9 +505,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. 
Complements --cpu-mask", - [](common_params & params, const std::string & range) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, cur.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -515,33 +515,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict"}, "<0|1>", string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](common_params & params, const std::string & value) { - params.cpuparams.strict_cpu = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(common_arg( {"--prio"}, "N", string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.cpuparams.priority = (enum ggml_sched_priority) prio; + cur.cpuparams.priority = (enum ggml_sched_priority) prio; } )); add_opt(common_arg( {"--poll"}, "<0...100>", string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](common_params & params, const std::string & value) { - params.cpuparams.poll = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.cpuparams.poll = std::stoul(value); } )); add_opt(common_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, cur.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -549,9 +549,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. 
Complements --cpu-mask-batch", - [](common_params & params, const std::string & range) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, cur.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -559,95 +559,95 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.cpuparams_batch.strict_cpu = value; + [](common_params & cur, int value) { + cur.cpuparams_batch.strict_cpu = value; } )); add_opt(common_arg( {"--prio-batch"}, "N", string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + cur.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } )); add_opt(common_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [](common_params & params, int value) { - params.cpuparams_batch.poll = value; + [](common_params & cur, int value) { + cur.cpuparams_batch.poll = value; } )); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](common_params & params, const std::string & value) { - params.lookup_cache_static = value; + [](common_params & cur, const std::string & value) { + cur.lookup_cache_static = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](common_params & params, const std::string & value) { - params.lookup_cache_dynamic = value; + [](common_params & cur, const std::string & value) { + cur.lookup_cache_dynamic = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-c", "--ctx-size"}, "N", string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](common_params & params, int value) { - params.n_ctx = value; + [](common_params & cur, int value) { + cur.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](common_params & params, int value) { - params.n_predict = value; + [](common_params & cur, int value) { + cur.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(common_arg( {"-b", "--batch-size"}, "N", string_format("logical maximum batch size (default: %d)", params.n_batch), - [](common_params & params, int value) { - params.n_batch = value; + [](common_params & cur, int value) { + cur.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", string_format("physical maximum batch size (default: %d)", params.n_ubatch), - [](common_params & params, int value) { - params.n_ubatch = value; + [](common_params & cur, int value) { + cur.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(common_arg( 
{"--keep"}, "N", string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](common_params & params, int value) { - params.n_keep = value; + [](common_params & cur, int value) { + cur.n_keep = value; } )); add_opt(common_arg( {"--no-context-shift"}, string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](common_params & params) { - params.ctx_shift = false; + [](common_params & cur) { + cur.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](common_params & params, int value) { - params.n_chunks = value; + [](common_params & cur, int value) { + cur.n_chunks = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](common_params & params) { - params.flash_attn = true; + [](common_params & cur) { + cur.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(common_arg( @@ -655,115 +655,115 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ex == LLAMA_EXAMPLE_MAIN ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [](common_params & params, const std::string & value) { - params.prompt = value; + [](common_params & cur, const std::string & value) { + cur.prompt = value; } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), - [](common_params & params) { - params.no_perf = true; - params.sampling.no_perf = true; + [](common_params & cur) { + cur.no_perf = true; + cur.sampling.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - // store the external file name in params - params.prompt_file = value; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); + // store the external file name in cur + cur.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(cur.prompt)); + if (!cur.prompt.empty() && cur.prompt.back() == '\n') { + cur.prompt.pop_back(); } } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - params.in_files.push_back(value); + cur.in_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - // store the external file name in params - params.prompt_file = value; + // store the external file name in cur + cur.prompt_file = value; std::ostringstream ss; ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + cur.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", cur.prompt.size(), value.c_str()); } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), - [](common_params & params) { - params.escape = true; + [](common_params & cur) { + cur.escape = true; } )); add_opt(common_arg( {"--no-escape"}, "do not process escape sequences", - [](common_params & params) { - params.escape = false; + [](common_params & cur) { + cur.escape = false; } )); add_opt(common_arg( {"-ptc", "--print-token-count"}, "N", string_format("print token count every N tokens (default: %d)", params.n_print), - [](common_params & params, int value) { - params.n_print = value; + [](common_params & cur, int value) { + cur.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [](common_params & params, const std::string & value) { - params.path_prompt_cache = value; + [](common_params & cur, const std::string & value) { + cur.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [](common_params & params) { - params.prompt_cache_all = true; + [](common_params & cur) { + cur.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [](common_params & params) { - params.prompt_cache_ro = true; + [](common_params & cur) { + cur.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [](common_params & params, const std::string & value) { - params.antiprompt.emplace_back(value); + [](common_params & cur, const std::string & value) { + cur.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-sp", "--special"}, string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](common_params & params) { - params.special = true; + [](common_params & cur) { + cur.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -775,60 +775,60 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "(default: %s)", params.conversation ? "true" : "false" ), - [](common_params & params) { - params.conversation = true; + [](common_params & cur) { + cur.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-i", "--interactive"}, string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](common_params & params) { - params.interactive = true; + [](common_params & cur) { + cur.interactive = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-if", "--interactive-first"}, string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), - [](common_params & params) { - params.interactive_first = true; + [](common_params & cur) { + cur.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [](common_params & params) { - params.multiline_input = true; + [](common_params & cur) { + cur.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](common_params & params) { - params.input_prefix_bos = true; - params.enable_chat_template = false; + [](common_params & cur) { + cur.input_prefix_bos = true; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [](common_params & params, const std::string & value) { - params.input_prefix = value; - params.enable_chat_template = false; + [](common_params & cur, const std::string & value) { + cur.input_prefix = value; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [](common_params & params, const std::string & value) { - params.input_suffix = value; - params.enable_chat_template = false; + [](common_params & cur, const std::string & value) { + cur.input_suffix = value; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [](common_params & params) { - params.warmup = false; + [](common_params & cur) { + cur.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -837,154 +837,154 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" ), - [](common_params & params) { - params.spm_infill = true; + [](common_params & cur) { + cur.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--samplers"}, "SAMPLERS", string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); + cur.sampling.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), - [](common_params & params, const std::string & value) { - params.sampling.seed = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.sampling.seed = std::stoul(value); } ).set_sparam()); add_opt(common_arg( {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](common_params & params, const std::string & value) { - params.sampling.samplers = common_sampler_types_from_chars(value); + [](common_params & cur, const std::string & value) { + cur.sampling.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](common_params & params) { - params.sampling.ignore_eos = true; + [](common_params & cur) { + cur.sampling.ignore_eos = true; } ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", string_format("temperature (default: %.1f)", (double)params.sampling.temp), - [](common_params & params, const std::string & value) { - params.sampling.temp = std::stof(value); - params.sampling.temp = std::max(params.sampling.temp, 0.0f); + [](common_params & cur, const std::string & value) { + cur.sampling.temp = std::stof(value); + cur.sampling.temp = std::max(cur.sampling.temp, 0.0f); } ).set_sparam()); add_opt(common_arg( {"--top-k"}, "N", string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), - [](common_params & params, int value) { - params.sampling.top_k = value; + [](common_params & cur, int value) { + cur.sampling.top_k = value; } ).set_sparam()); add_opt(common_arg( {"--top-p"}, "N", string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), - [](common_params & params, const std::string & value) { - params.sampling.top_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.top_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), - [](common_params & params, const std::string & value) { - params.sampling.min_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.min_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), - [](common_params & params, const std::string & value) { - params.sampling.xtc_probability = std::stof(value); + [](common_params & cur, 
const std::string & value) { + cur.sampling.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-threshold"}, "N", string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), - [](common_params & params, const std::string & value) { - params.sampling.xtc_threshold = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.xtc_threshold = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), - [](common_params & params, const std::string & value) { - params.sampling.typ_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.typ_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--repeat-last-n"}, "N", string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), - [](common_params & params, int value) { + [](common_params & cur, int value) { if (value < -1) { throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value)); } - params.sampling.penalty_last_n = value; - params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); + cur.sampling.penalty_last_n = value; + cur.sampling.n_prev = std::max(cur.sampling.n_prev, cur.sampling.penalty_last_n); } ).set_sparam()); add_opt(common_arg( {"--repeat-penalty"}, "N", string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), - [](common_params & params, const std::string & value) { - params.sampling.penalty_repeat = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), - [](common_params & params, const std::string & value) { - params.sampling.penalty_present = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), - [](common_params & params, const std::string & value) { - params.sampling.penalty_freq = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-multiplier"}, "N", string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), - [](common_params & params, const std::string & value) { - params.sampling.dry_multiplier = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.dry_multiplier = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-base"}, "N", string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { float potential_base = std::stof(value); if (potential_base >= 1.0f) { - params.sampling.dry_base = potential_base; + cur.sampling.dry_base = potential_base; } } ).set_sparam()); 
add_opt(common_arg( {"--dry-allowed-length"}, "N", string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length), - [](common_params & params, int value) { - params.sampling.dry_allowed_length = value; + [](common_params & cur, int value) { + cur.sampling.dry_allowed_length = value; } ).set_sparam()); add_opt(common_arg( {"--dry-penalty-last-n"}, "N", string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), - [](common_params & params, int value) { + [](common_params & cur, int value) { if (value < -1) { throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value)); } - params.sampling.dry_penalty_last_n = value; + cur.sampling.dry_penalty_last_n = value; } ).set_sparam()); add_opt(common_arg( @@ -998,55 +998,55 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::string formatted_b = (b == "\n") ? "\\n" : b; return a + ", '" + formatted_b + "'"; }).c_str()), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { static bool defaults_cleared = false; if (!defaults_cleared) { - params.sampling.dry_sequence_breakers.clear(); + cur.sampling.dry_sequence_breakers.clear(); defaults_cleared = true; } if (value == "none") { - params.sampling.dry_sequence_breakers.clear(); + cur.sampling.dry_sequence_breakers.clear(); } else { - params.sampling.dry_sequence_breakers.emplace_back(value); + cur.sampling.dry_sequence_breakers.emplace_back(value); } } ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), - [](common_params & params, const std::string & value) { - params.sampling.dynatemp_range = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), - [](common_params & params, const std::string & value) { - params.sampling.dynatemp_exponent = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.dynatemp_exponent = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat"}, "N", string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat), - [](common_params & params, int value) { - params.sampling.mirostat = value; + [](common_params & cur, int value) { + cur.sampling.mirostat = value; } ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), - [](common_params & params, const std::string & value) { - params.sampling.mirostat_eta = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.mirostat_eta = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), - [](common_params & params, const std::string & value) { - params.sampling.mirostat_tau = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.mirostat_tau 
= std::stof(value); } ).set_sparam()); add_opt(common_arg( @@ -1054,7 +1054,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "modifies the likelihood of token appearing in the completion,\n" "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; @@ -1062,7 +1062,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - params.sampling.logit_bias.push_back({key, bias}); + cur.sampling.logit_bias.push_back({key, bias}); } else { throw std::invalid_argument("invalid input format"); } @@ -1074,14 +1074,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--grammar"}, "GRAMMAR", string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()), - [](common_params & params, const std::string & value) { - params.sampling.grammar = value; + [](common_params & cur, const std::string & value) { + cur.sampling.grammar = value; } ).set_sparam()); add_opt(common_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); @@ -1089,130 +1089,130 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.sampling.grammar) + std::back_inserter(cur.sampling.grammar) ); } ).set_sparam()); add_opt(common_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](common_params & params, const std::string & value) { - params.sampling.grammar = json_schema_to_grammar(json::parse(value)); + [](common_params & cur, const std::string & value) { + cur.sampling.grammar = json_schema_to_grammar(json::parse(value)); } ).set_sparam()); add_opt(common_arg( {"--pooling"}, "{none,mean,cls,last,rank}", "pooling type for embeddings, use model default if unspecified", - [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "none") { cur.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { cur.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { cur.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { cur.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else if (value == "rank") { cur.pooling_type = LLAMA_POOLING_TYPE_RANK; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING")); add_opt(common_arg( {"--attention"}, "{causal,non-causal}", "attention type for embeddings, use model default if unspecified", - [](common_params & params, const std::string & value) { - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "causal") { cur.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { cur.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "none") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE")); add_opt(common_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [](common_params & params, const std::string & value) { - params.rope_freq_scale = 1.0f / std::stof(value); + [](common_params & cur, const std::string & value) { + cur.rope_freq_scale = 1.0f / std::stof(value); } ).set_env("LLAMA_ARG_ROPE_SCALE")); add_opt(common_arg( 
{"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](common_params & params, const std::string & value) { - params.rope_freq_base = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.rope_freq_base = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); add_opt(common_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](common_params & params, const std::string & value) { - params.rope_freq_scale = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.rope_freq_scale = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); add_opt(common_arg( {"--yarn-orig-ctx"}, "N", string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](common_params & params, int value) { - params.yarn_orig_ctx = value; + [](common_params & cur, int value) { + cur.yarn_orig_ctx = value; } ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](common_params & params, const std::string & value) { - params.yarn_ext_factor = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](common_params & params, const std::string & value) { - params.yarn_attn_factor = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](common_params & params, const std::string & value) { - params.yarn_beta_slow = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](common_params & params, const std::string & value) { - params.yarn_beta_fast = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_beta_fast = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_FAST")); add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", string_format("group-attention factor (default: %d)", params.grp_attn_n), - [](common_params & params, int value) { - params.grp_attn_n = value; + [](common_params & cur, int value) { + cur.grp_attn_n = value; } ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", string_format("group-attention width (default: %d)", params.grp_attn_w), - [](common_params & params, int value) { - params.grp_attn_w = value; + [](common_params & cur, int value) { + cur.grp_attn_w = value; } ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [](common_params & params) { - params.dump_kv_cache = true; + [](common_params & cur) { + cur.dump_kv_cache = true; } )); 
add_opt(common_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", - [](common_params & params) { - params.no_kv_offload = true; + [](common_params & cur) { + cur.no_kv_offload = true; } ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( @@ -1224,8 +1224,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex get_all_kv_cache_types().c_str(), ggml_type_name(params.cache_type_k) ), - [](common_params & params, const std::string & value) { - params.cache_type_k = kv_cache_type_from_str(value); + [](common_params & cur, const std::string & value) { + cur.cache_type_k = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_K")); add_opt(common_arg( @@ -1237,157 +1237,157 @@ common_params_context common_params_parser_init(common_params & params, llama_ex get_all_kv_cache_types().c_str(), ggml_type_name(params.cache_type_v) ), - [](common_params & params, const std::string & value) { - params.cache_type_v = kv_cache_type_from_str(value); + [](common_params & cur, const std::string & value) { + cur.cache_type_v = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); add_opt(common_arg( {"--perplexity", "--all-logits"}, string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](common_params & params) { - params.logits_all = true; + [](common_params & cur) { + cur.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.hellaswag = true; + [](common_params & cur) { + cur.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag-tasks"}, "N", string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](common_params & params, int value) { - params.hellaswag_tasks = value; + [](common_params & cur, int value) { + cur.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.winogrande = true; + [](common_params & cur) { + cur.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande-tasks"}, "N", string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](common_params & params, int value) { - params.winogrande_tasks = value; + [](common_params & cur, int value) { + cur.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.multiple_choice = true; + [](common_params & cur) { + cur.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice-tasks"}, "N", string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](common_params & params, int value) { - params.multiple_choice_tasks = value; + [](common_params & cur, int value) { + cur.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [](common_params & params) { 
- params.kl_divergence = true; + [](common_params & cur) { + cur.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--save-all-logits", "--kl-divergence-base"}, "FNAME", "set logits file", - [](common_params & params, const std::string & value) { - params.logits_file = value; + [](common_params & cur, const std::string & value) { + cur.logits_file = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-stride"}, "N", string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](common_params & params, int value) { - params.ppl_stride = value; + [](common_params & cur, int value) { + cur.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](common_params & params, int value) { - params.ppl_output_type = value; + [](common_params & cur, int value) { + cur.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](common_params & params, const std::string & value) { - params.defrag_thold = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(common_arg( {"-np", "--parallel"}, "N", string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](common_params & params, int value) { - params.n_parallel = value; + [](common_params & cur, int value) { + cur.n_parallel = value; } ).set_env("LLAMA_ARG_N_PARALLEL")); add_opt(common_arg( {"-ns", "--sequences"}, "N", string_format("number of sequences to decode (default: %d)", params.n_sequences), - [](common_params & params, int value) { - params.n_sequences = value; + [](common_params & cur, int value) { + cur.n_sequences = value; } ).set_examples({LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](common_params & params) { - params.cont_batching = true; + [](common_params & cur) { + cur.cont_batching = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(common_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [](common_params & params) { - params.cont_batching = false; + [](common_params & cur) { + cur.cont_batching = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(common_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](common_params & params, const std::string & value) { - params.mmproj = value; + [](common_params & cur, const std::string & value) { + cur.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching", - [](common_params & params, const std::string & value) { - params.image.emplace_back(value); + [](common_params & cur, const std::string & value) { + cur.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", - [](common_params & params, const std::string & value) { - params.rpc_servers = value; + [](common_params & cur, const std::string & value) { + cur.rpc_servers = value; } ).set_env("LLAMA_ARG_RPC")); } add_opt(common_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [](common_params & params) { - params.use_mlock = true; + [](common_params & cur) { + cur.use_mlock = true; } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](common_params & params) { - params.use_mmap = false; + [](common_params & cur) { + cur.use_mmap = false; } ).set_env("LLAMA_ARG_NO_MMAP")); add_opt(common_arg( @@ -1398,10 +1398,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](common_params & params, const std::string & value) { - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "distribute" || value == "") { cur.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { cur.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { cur.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_NUMA")); @@ -1409,8 +1409,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-dev", "--device"}, "", "comma-separated list of devices to use for offloading (none = don't offload)\n" "use --list-devices to see a list of available devices", - [](common_params & params, const std::string & value) { - params.devices = parse_device_list(value); + [](common_params & cur, const std::string & value) { + cur.devices = parse_device_list(value); } ).set_env("LLAMA_ARG_DEVICE")); add_opt(common_arg( @@ -1432,8 +1432,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", - [](common_params & params, int value) { - params.n_gpu_layers = value; + [](common_params & cur, int value) { + cur.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); @@ -1447,14 +1447,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { 
std::string arg_next = value; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + cur.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + cur.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; + cur.split_mode = LLAMA_SPLIT_MODE_ROW; } else { throw std::invalid_argument("invalid value"); } @@ -1466,7 +1466,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1480,9 +1480,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); + cur.tensor_split[i] = std::stof(split_arg[i]); } else { - params.tensor_split[i] = 0.0f; + cur.tensor_split[i] = 0.0f; } } if (!llama_supports_gpu_offload()) { @@ -1493,8 +1493,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-mg", "--main-gpu"}, "INDEX", string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](common_params & params, int value) { - params.main_gpu = value; + [](common_params & cur, int value) { + cur.main_gpu = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); } @@ -1503,16 +1503,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--check-tensors"}, string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](common_params & params) { - params.check_tensors = true; + [](common_params & cur) { + cur.check_tensors = true; } )); add_opt(common_arg( {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](common_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + [](common_params & cur, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), cur.kv_overrides)) { throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); } } @@ -1520,47 +1520,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); + [](common_params & cur, const std::string & value) { + cur.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); + [](common_params & cur, const std::string & fname, const std::string & scale) { + cur.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](common_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); + [](common_params & cur, const std::string & value) { + cur.control_vectors.push_back({ 1.0f, value, }); } )); add_opt(common_arg( {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); + [](common_params & cur, const std::string & fname, const std::string & scale) { + cur.control_vectors.push_back({ std::stof(scale), fname }); } )); add_opt(common_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [](common_params & params, const std::string & start, const std::string & end) { - params.control_vector_layer_start = std::stoi(start); - params.control_vector_layer_end = std::stoi(end); + [](common_params & cur, const std::string & start, const std::string & end) { + cur.control_vector_layer_start = std::stoi(start); + cur.control_vector_layer_end = std::stoi(end); } )); add_opt(common_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [](common_params & params, const std::string & value) { - params.model_alias = value; + [](common_params & cur, const std::string & value) { + cur.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); add_opt(common_arg( @@ -1571,89 
+1571,89 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [](common_params & params, const std::string & value) { - params.model = value; + [](common_params & cur, const std::string & value) { + cur.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [](common_params & params, const std::string & value) { - params.model_url = value; + [](common_params & cur, const std::string & value) { + cur.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [](common_params & params, const std::string & value) { - params.hf_repo = value; + [](common_params & cur, const std::string & value) { + cur.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [](common_params & params, const std::string & value) { - params.hf_file = value; + [](common_params & cur, const std::string & value) { + cur.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( {"-hfrv", "--hf-repo-v"}, "REPO", "Hugging Face model repository for the vocoder model (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.hf_repo = value; + [](common_params & cur, const std::string & value) { + cur.vocoder.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO_V")); add_opt(common_arg( {"-hffv", "--hf-file-v"}, "FILE", "Hugging Face model file for the vocoder model (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.hf_file = value; + [](common_params & cur, const std::string & value) { + cur.vocoder.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE_V")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](common_params & params, const std::string & value) { - params.hf_token = value; + [](common_params & cur, const std::string & value) { + cur.hf_token = value; } ).set_env("HF_TOKEN")); add_opt(common_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - params.context_files.push_back(value); + cur.context_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-size"}, "N", string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](common_params & params, int value) { - params.chunk_size = value; + [](common_params & cur, int value) { + cur.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-separator"}, "STRING", string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](common_params & params, const std::string & value) { - params.chunk_separator = value; + [](common_params & cur, const std::string & value) { + cur.chunk_separator = value; } 
).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--junk"}, "N", string_format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](common_params & params, int value) { - params.n_junk = value; + [](common_params & cur, int value) { + cur.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"--pos"}, "N", string_format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](common_params & params, int value) { - params.i_pos = value; + [](common_params & cur, int value) { + cur.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( @@ -1664,152 +1664,152 @@ common_params_context common_params_parser_init(common_params & params, llama_ex : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile.c_str() : params.out_file.c_str()), - [](common_params & params, const std::string & value) { - params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; + [](common_params & cur, const std::string & value) { + cur.out_file = value; + cur.cvector_outfile = value; + cur.lora_outfile = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](common_params & params, int value) { - params.n_out_freq = value; + [](common_params & cur, int value) { + cur.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--save-frequency"}, "N", string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](common_params & params, int value) { - params.n_save_freq = value; + [](common_params & cur, int value) { + cur.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--process-output"}, string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](common_params & params) { - params.process_output = true; + [](common_params & cur) { + cur.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--no-ppl"}, string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](common_params & params) { - params.compute_ppl = false; + [](common_params & cur) { + cur.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", string_format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](common_params & params, int value) { - params.i_chunk = value; + [](common_params & cur, int value) { + cur.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-pps"}, string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), - [](common_params & params) { - params.is_pp_shared = true; + [](common_params & cur) { + cur.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + cur.n_pp.insert(cur.n_pp.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + cur.n_tg.insert(cur.n_tg.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + cur.n_pl.insert(cur.n_pl.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"--embd-normalize"}, "N", string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](common_params & params, int value) { - params.embd_normalize = value; + [](common_params & cur, int value) { + cur.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](common_params & params, const std::string & value) { - params.embd_out = value; + [](common_params & cur, const std::string & value) { + cur.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-separator"}, "STRING", "separator of embeddings (default \\n) for example \"<#sep#>\"", - [](common_params & params, const std::string & value) { - params.embd_sep = value; + [](common_params & cur, const std::string & value) { + cur.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", string_format("ip address to listen (default: %s)", params.hostname.c_str()), - [](common_params & params, const std::string & value) { - params.hostname = value; + [](common_params & cur, const std::string & value) { + cur.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(common_arg( {"--port"}, "PORT", string_format("port to listen (default: %d)", params.port), - [](common_params & params, int value) { - params.port = value; + [](common_params & cur, int value) { + cur.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(common_arg( {"--path"}, "PATH", string_format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](common_params & params, const std::string & value) { - params.public_path = value; + [](common_params & cur, const std::string & value) { + cur.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); add_opt(common_arg( {"--no-webui"}, string_format("Disable the Web UI (default: %s)", params.webui ? 
"enabled" : "disabled"), - [](common_params & params) { - params.webui = false; + [](common_params & cur) { + cur.webui = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](common_params & params) { - params.embedding = true; + [](common_params & cur) { + cur.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), - [](common_params & params) { - params.reranking = true; + [](common_params & cur) { + cur.reranking = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); add_opt(common_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [](common_params & params, const std::string & value) { - params.api_keys.push_back(value); + [](common_params & cur, const std::string & value) { + cur.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(common_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream key_file(value); if (!key_file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); @@ -1817,7 +1817,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::string key; while (std::getline(key_file, key)) { if (!key.empty()) { - params.api_keys.push_back(key); + cur.api_keys.push_back(key); } } key_file.close(); @@ -1826,75 +1826,75 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [](common_params & params, const std::string & value) { - params.ssl_file_key = value; + [](common_params & cur, const std::string & value) { + cur.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); add_opt(common_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [](common_params & params, const std::string & value) { - params.ssl_file_cert = value; + [](common_params & cur, const std::string & value) { + cur.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); add_opt(common_arg( {"-to", "--timeout"}, "N", string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](common_params & params, int value) { - params.timeout_read = value; - params.timeout_write = value; + [](common_params & cur, int value) { + cur.timeout_read = value; + cur.timeout_write = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); add_opt(common_arg( {"--threads-http"}, "N", string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](common_params & params, int value) { - params.n_threads_http = value; + [](common_params & cur, int value) { + cur.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(common_arg( {"--cache-reuse"}, "N", string_format("min chunk 
size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), - [](common_params & params, int value) { - params.n_cache_reuse = value; + [](common_params & cur, int value) { + cur.n_cache_reuse = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); add_opt(common_arg( {"--metrics"}, string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_metrics = true; + [](common_params & cur) { + cur.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(common_arg( {"--slots"}, string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_slots = true; + [](common_params & cur) { + cur.endpoint_slots = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); add_opt(common_arg( {"--props"}, string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_props = true; + [](common_params & cur) { + cur.endpoint_props = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--no-slots"}, "disables slots monitoring endpoint", - [](common_params & params) { - params.endpoint_slots = false; + [](common_params & cur) { + cur.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [](common_params & params, const std::string & value) { - params.slot_save_path = value; + [](common_params & cur, const std::string & value) { + cur.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; + if (!cur.slot_save_path.empty() && cur.slot_save_path[cur.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + cur.slot_save_path += DIRECTORY_SEPARATOR; } } ).set_examples({LLAMA_EXAMPLE_SERVER})); @@ -1905,7 +1905,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "if suffix/prefix are specified, template will be disabled\n" "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() ), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { if (!common_chat_verify_template(value)) { throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s\n" @@ -1913,73 +1913,73 @@ common_params_context common_params_parser_init(common_params & params, llama_ex value.c_str() )); } - params.chat_template = value; + cur.chat_template = value; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](common_params & params, const std::string & value) { - params.slot_prompt_similarity = std::stof(value); + [](common_params & cur, const std::string & value) 
{ + cur.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--lora-init-without-apply"}, string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), - [](common_params & params) { - params.lora_init_without_apply = true; + [](common_params & cur) { + cur.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [](common_params & params) { - params.simple_io = true; + [](common_params & cur) { + cur.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--positive-file"}, "FNAME", string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](common_params & params, const std::string & value) { - params.cvector_positive_file = value; + [](common_params & cur, const std::string & value) { + cur.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--negative-file"}, "FNAME", string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](common_params & params, const std::string & value) { - params.cvector_negative_file = value; + [](common_params & cur, const std::string & value) { + cur.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-batch"}, "N", string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](common_params & params, int value) { - params.n_pca_batch = value; + [](common_params & cur, int value) { + cur.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-iter"}, "N", string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](common_params & params, int value) { - params.n_pca_iterations = value; + [](common_params & cur, int value) { + cur.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [](common_params & params, const std::string & value) { - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "pca") { cur.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { cur.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [](common_params & params, const std::string & value) { - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "jsonl") { cur.batched_bench_output_jsonl = true; } + else if (value == "md") { cur.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); @@ 
-2007,16 +2007,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", - [](common_params & params) { - params.verbosity = INT_MAX; + [](common_params & cur) { + cur.verbosity = INT_MAX; common_log_set_verbosity_thold(INT_MAX); } )); add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", - [](common_params & params, int value) { - params.verbosity = value; + [](common_params & cur, int value) { + cur.verbosity = value; common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); @@ -2039,29 +2039,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [](common_params & params, int value) { - params.speculative.cpuparams.n_threads = value; - if (params.speculative.cpuparams.n_threads <= 0) { - params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.speculative.cpuparams.n_threads = value; + if (cur.speculative.cpuparams.n_threads <= 0) { + cur.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.n_threads = value; - if (params.speculative.cpuparams_batch.n_threads <= 0) { - params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.n_threads = value; + if (cur.speculative.cpuparams_batch.n_threads <= 0) { + cur.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, cur.speculative.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2069,9 +2069,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](common_params & params, const std::string & range) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, cur.speculative.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -2079,33 +2079,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.speculative.cpuparams.strict_cpu = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; + cur.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [](common_params & params, int value) { - params.speculative.cpuparams.poll = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, cur.speculative.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2113,9 +2113,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", - [](common_params & params, const std::string & range) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, cur.speculative.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2123,75 +2123,75 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.strict_cpu = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-batch-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + cur.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.poll = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--draft-max", "--draft", "--draft-n"}, "N", string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), - [](common_params & params, int value) { - params.speculative.n_max = value; + [](common_params & cur, int value) { + cur.speculative.n_max = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); add_opt(common_arg( {"--draft-min", "--draft-n-min"}, "N", string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), - [](common_params & params, int value) { - params.speculative.n_min = value; + [](common_params & cur, int value) { + cur.speculative.n_min = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), - [](common_params & params, const std::string & value) { - params.speculative.p_split = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.speculative.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); add_opt(common_arg( {"--draft-p-min"}, "P", string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), - [](common_params & params, const std::string & value) { - params.speculative.p_min = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.speculative.p_min = 
std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), - [](common_params & params, int value) { - params.speculative.n_ctx = value; + [](common_params & cur, int value) { + cur.speculative.n_ctx = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); add_opt(common_arg( {"-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" "use --list-devices to see a list of available devices", - [](common_params & params, const std::string & value) { - params.speculative.devices = parse_device_list(value); + [](common_params & cur, const std::string & value) { + cur.speculative.devices = parse_device_list(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [](common_params & params, int value) { - params.speculative.n_gpu_layers = value; + [](common_params & cur, int value) { + cur.speculative.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); @@ -2202,16 +2202,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [](common_params & params, const std::string & value) { - params.speculative.model = value; + [](common_params & cur, const std::string & value) { + cur.speculative.model = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); add_opt(common_arg( {"-mv", "--model-vocoder"}, "FNAME", "vocoder model for audio generation (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.model = value; + [](common_params & cur, const std::string & value) { + cur.vocoder.model = value; } ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); @@ -2219,11 +2219,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--tts-oute-default"}, string_format("use default OuteTTS models (note: can download weights from the internet)"), - [](common_params & params) { - params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; - params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; - params.vocoder.hf_repo = "ggml-org/WavTokenizer"; - params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; + [](common_params & cur) { + cur.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; + cur.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; + cur.vocoder.hf_repo = "ggml-org/WavTokenizer"; + cur.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; } ).set_examples({LLAMA_EXAMPLE_TTS})); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index dadc18c8b..5bf67ecc1 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -579,8 +579,8 @@ private: seq.back().second = false; } else { std::string literal; - auto is_non_literal = [&](char c) { - return 
NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end(); + auto is_non_literal = [&](char ch) { + return NON_LITERAL_SET.find(ch) != NON_LITERAL_SET.end(); }; while (i < length) { if (sub_pattern[i] == '\\' && i < length - 1) { diff --git a/common/log.cpp b/common/log.cpp index 7a94bf7f9..76715d629 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -255,8 +255,8 @@ public: thrd = std::thread([this]() { while (true) { { - std::unique_lock lock(mtx); - cv.wait(lock, [this]() { return head != tail; }); + std::unique_lock lock_thrd(mtx); + cv.wait(lock_thrd, [this]() { return head != tail; }); cur = entries[head]; diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f1..b17d6bc57 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -62,7 +62,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens - auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { + auto decode_helper = [&ctx, &batch](int32_t n_batch) { for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); @@ -94,7 +94,7 @@ int main(int argc, char ** argv) { common_batch_add(batch, 0, i, { 0 }, false); } - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -134,7 +134,7 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -156,7 +156,7 @@ int main(int argc, char ** argv) { common_batch_add(batch, 0, pp + i, { j }, true); } - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index dc827e814..2e8812f03 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2082,7 +2082,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } else if (ctx->has_qwen2vl_merger) { clip_image_u8 * resized = clip_image_u8_init(); - auto patch_size = clip_patch_size(ctx) * 2; + auto patch_size = clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; int ny = ceil((float)img->ny / patch_size) * patch_size; bicubic_resize(*img, *resized, nx, ny); @@ -2293,15 +2293,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); } -int32_t clip_image_size(const struct clip_ctx * ctx) { +int32_t clip_get_image_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_size; } -int32_t clip_patch_size(const struct clip_ctx * ctx) { +int32_t clip_get_patch_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.patch_size; } -int32_t clip_hidden_size(const struct clip_ctx * ctx) { +int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.hidden_size; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 1603edd26..3b60f161d 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -47,9 +47,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx); 
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); -CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx); // TODO: should be enum, not string CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index c598caf3d..1978ce180 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -105,8 +105,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_context * ctx; } model; - const int32_t image_size = clip_image_size(ctx_clip); - const int32_t patch_size = clip_patch_size(ctx_clip); + const int32_t image_size = clip_get_image_size(ctx_clip); + const int32_t patch_size = clip_get_patch_size(ctx_clip); int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) @@ -353,7 +353,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; - const int32_t image_size = clip_image_size(ctx_clip); + const int32_t image_size = clip_get_image_size(ctx_clip); struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0c0f066ca..ab8d6c6b4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3702,8 +3702,8 @@ int main(int argc, char ** argv) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { json res_json = result->to_json(); if (res_json.is_array()) { - for (const auto & res : res_json) { - if (!server_sent_event(sink, "data", res)) { + for (const auto & item : res_json) { + if (!server_sent_event(sink, "data", item)) { return false; } } @@ -3973,9 +3973,9 @@ int main(int argc, char ** argv) { std::unordered_set task_ids = server_task::get_list_id(tasks); ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - for (auto & res : results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); + for (auto & result : results) { + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + responses.push_back(result->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -4063,9 +4063,9 @@ int main(int argc, char ** argv) { std::unordered_set task_ids = server_task::get_list_id(tasks); ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - for (auto & res : results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); + for (auto & result : results) { + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + responses.push_back(result->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index e8eda9c22..2b2d906e5 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -110,9 +110,8 @@ int main(int argc, char ** argv) { 
llama_token new_token_id; while (true) { // check if we have enough space in the context to evaluate this batch - int n_ctx = llama_n_ctx(ctx); int n_ctx_used = llama_get_kv_cache_used_cells(ctx); - if (n_ctx_used + batch.n_tokens > n_ctx) { + if (n_ctx_used + batch.n_tokens > (int) llama_n_ctx(ctx)) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); exit(0); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 01a3afa40..9026fbcf5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -311,9 +311,9 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_m ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type"); if (ggml_backend_split_buffer_type_fn) { size_t dev_index = [&]() { - auto * reg = ggml_backend_dev_backend_reg(dev); - for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) { - if (ggml_backend_reg_dev_get(reg, i) == dev) { + ggml_backend_reg_t reg_dev = ggml_backend_dev_backend_reg(dev); + for (size_t i = 0; i < ggml_backend_reg_dev_count(reg_dev); ++i) { + if (ggml_backend_reg_dev_get(reg_dev, i) == dev) { return i; } } @@ -1304,7 +1304,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { - return {cpu_dev, &pimpl->cpu_buft_list}; + return { cpu_dev, &pimpl->cpu_buft_list }; } const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin(); auto * dev = devices.at(layer_gpu); @@ -1453,7 +1453,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // avoid using a host buffer when using mmap auto * buft_dev = ggml_backend_buft_get_device(buft); if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); buft = ggml_backend_dev_buffer_type(cpu_dev); } @@ -3697,8 +3696,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { const struct ggml_tensor * llama_model::get_tensor(const char * name) const { auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(), - [name](const std::pair & it) { - return it.first == name; + [name](const std::pair & entry) { + return entry.first == name; }); if (it == tensors_by_name.end()) { return nullptr; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 75899d142..c1e751e70 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -130,17 +130,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; }; const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); - auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { + auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name_layer) { if (n_expert > 1) { // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work // for getting the current layer as I initially thought, and we need to resort to parsing the // tensor name. 
- if (sscanf(name, "blk.%d.", &i_layer) != 1) { - throw std::runtime_error(format("Failed to determine layer for tensor %s", name)); + if (sscanf(name_layer, "blk.%d.", &i_layer) != 1) { + throw std::runtime_error(format("Failed to determine layer for tensor %s", name_layer)); } if (i_layer < 0 || i_layer >= n_layer) { - throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer)); + throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name_layer, n_layer)); } } return std::make_pair(i_layer, n_layer); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ef108b991..b03f40485 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2496,15 +2496,15 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t // copy piece chars to output text buffer // skip up to 'lstrip' leading spaces before copying - auto _try_copy = [=] (const char * token, size_t size) -> int32_t { - for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { - token++; + auto _try_copy = [=] (const char * text, size_t size) -> int32_t { + for (int32_t i = 0; i < lstrip && size && *text == ' '; ++i) { + text++; size--; } if (length < (int32_t)size) { return -(int32_t) size; } - memcpy(buf, token, size); + memcpy(buf, text, size); return (int32_t) size; }; From a59ee7c4eb3efa39718af405dc1fad43bdca6dce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 16:19:18 +0200 Subject: [PATCH 12/15] common : cont ggml-ci --- cmake/common.cmake | 2 +- common/common.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index 45bac7af8..5a39cbf78 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -19,7 +19,7 @@ function(llama_add_compile_flags) list(APPEND CXX_FLAGS -Wshadow) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND CXX_FLAGS -Wshadow -Wshadow-field-in-constructor) + list(APPEND CXX_FLAGS -Wshadow-field-in-constructor) endif() endif() diff --git a/common/common.cpp b/common/common.cpp index 16cc3f41c..447fb03ea 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1208,7 +1208,7 @@ static bool common_download_file(const std::string & url, const std::string & pa { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; + common_load_model_from_url_headers * cur = (common_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); @@ -1220,9 +1220,9 @@ static bool common_download_file(const std::string & url, const std::string & pa const std::string & key = match[1]; const std::string & value = match[2]; if (std::regex_match(key, match, etag_regex)) { - headers->etag = value; + cur->etag = value; } else if (std::regex_match(key, match, last_modified_regex)) { - headers->last_modified = value; + cur->last_modified = value; } } return n_items; From 36803b1902195f3489ede64644fc5d2e8d51ea77 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 16:53:44 +0200 Subject: [PATCH 13/15] common : cont ggml-ci --- common/common.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 447fb03ea..e83537306 100644 
--- a/common/common.cpp +++ b/common/common.cpp @@ -1294,18 +1294,18 @@ static bool common_download_file(const std::string & url, const std::string & pa curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); // helper function to hide password in URL - auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { - std::size_t protocol_pos = url.find("://"); + auto llama_download_hide_password_in_url = [](const std::string & url_full) -> std::string { + std::size_t protocol_pos = url_full.find("://"); if (protocol_pos == std::string::npos) { - return url; // Malformed URL + return url_full; // Malformed URL } - std::size_t at_pos = url.find('@', protocol_pos + 3); + std::size_t at_pos = url_full.find('@', protocol_pos + 3); if (at_pos == std::string::npos) { - return url; // No password in URL + return url_full; // No password in URL } - return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + return url_full.substr(0, protocol_pos + 3) + "********" + url_full.substr(at_pos); }; // start the download From afd40ea206540e96f0a9dce45d43ac51dc966f1a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 17:22:16 +0200 Subject: [PATCH 14/15] minor : better names ggml-ci --- .../convert-llama2c-to-ggml.cpp | 6 +++--- src/llama-mmap.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index ef0b22a3d..e597fa279 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -471,12 +471,12 @@ struct my_llama_file { GGML_ASSERT(ret == 0); // same } - void read_raw(void * ptr, size_t size_cur) { - if (size_cur == 0) { + void read_raw(void * raw_addr, size_t raw_size) { + if (raw_size == 0) { return; } errno = 0; - std::size_t ret = std::fread(ptr, size_cur, 1, fp); + std::size_t ret = std::fread(raw_addr, raw_size, 1, fp); if (ferror(fp)) { die_fmt("fread failed: %s", strerror(errno)); } diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index db4c4bcbe..7f43bccda 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -454,8 +454,8 @@ struct llama_mlock::impl { return (size_t) sysconf(_SC_PAGESIZE); } - bool raw_lock(const void * addr_cur, size_t size_cur) const { - if (!mlock(addr_cur, size_cur)) { + bool raw_lock(const void * lock_addr, size_t lock_len) const { + if (!mlock(lock_addr, lock_len)) { return true; } @@ -475,12 +475,12 @@ struct llama_mlock::impl { if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { suggest = false; } - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size_cur)) { + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + lock_len)) { suggest = false; } LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size_cur, size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + lock_len, size, errmsg, suggest ? 
MLOCK_SUGGESTION : ""); return false; } @@ -535,7 +535,7 @@ struct llama_mlock::impl { return (size_t) 65536; } - bool raw_lock(const void * addr_cur, size_t size_cur) const { + bool raw_lock(const void * lock_addr, size_t lock_len) const { LLAMA_LOG_WARN("warning: mlock not supported on this system\n"); return false; } From a97b3621cf40f264f2f73b41d87ec70ee8b79c17 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 17:57:51 +0200 Subject: [PATCH 15/15] ggml : ggml_backend_graph_copy -> ggml_backend_graph_copy_state ggml-ci --- ggml/include/ggml-backend.h | 6 +++--- ggml/src/ggml-backend.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index ce4fb4652..df6faa4b9 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -323,7 +323,7 @@ extern "C" { // Utils // - struct ggml_backend_graph_copy { + struct ggml_backend_graph_copy_state { ggml_backend_buffer_t buffer; struct ggml_context * ctx_allocated; struct ggml_context * ctx_unallocated; @@ -331,8 +331,8 @@ extern "C" { }; // Copy a graph to a different backend - GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backend, struct ggml_cgraph * graph); - GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); + GGML_API struct ggml_backend_graph_copy_state ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); + GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy_state copy); typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index cbc57a2d3..8f15805ba 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1724,7 +1724,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_ } } -struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backend, struct ggml_cgraph * graph) { +struct ggml_backend_graph_copy_state ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); @@ -1805,14 +1805,14 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backe }; } -void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { +void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy_state copy) { ggml_backend_buffer_free(copy.buffer); ggml_free(copy.ctx_allocated); ggml_free(copy.ctx_unallocated); } bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { - struct ggml_backend_graph_copy copy = ggml_backend_graph_copy_init(backend2, graph); + struct ggml_backend_graph_copy_state copy = ggml_backend_graph_copy(backend2, graph); if (copy.buffer == NULL) { return false; }
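
For reference, a minimal caller-side sketch of the renamed API after PATCH 15/15. It mirrors the call pattern visible in ggml_backend_compare_graph_backend above; the struct and function names are taken from the hunks, while the surrounding backend and graph setup is assumed to exist elsewhere — this is an illustration, not part of the patch.

    // copy an existing graph to another backend, check the allocation, and release the copy
    struct ggml_backend_graph_copy_state copy = ggml_backend_graph_copy(backend, graph);
    if (copy.buffer == NULL) {
        // the copy could not be allocated on the target backend
        return false;
    }
    // ... run or inspect the copied graph on the target backend ...
    ggml_backend_graph_copy_free(copy); // frees copy.buffer, copy.ctx_allocated and copy.ctx_unallocated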