From 82caffa74e4e101df3adba878ecb99f6e25e3d84 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 13:22:16 +0200 Subject: [PATCH] llama : de-shadow libllama [no ci] --- ci/run.sh | 3 + src/llama-batch.cpp | 83 ++++++++++----------- src/llama-batch.h | 4 +- src/llama-context.cpp | 4 +- src/llama-grammar.cpp | 2 +- src/llama-mmap.cpp | 10 +-- src/llama-model-loader.cpp | 16 ++-- src/llama-model-loader.h | 2 +- src/llama-model.cpp | 147 +++++++++++++++++++------------------ src/llama-model.h | 2 - src/llama-quant.cpp | 39 +++++----- src/llama-sampling.cpp | 20 ++--- src/llama-vocab.cpp | 28 +++---- 13 files changed, 181 insertions(+), 179 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index abf08a4ff..bd3420e48 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -13,6 +13,9 @@ # # with SYCL support # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with METAL support +# GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# # # with VULKAN support # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 01d5ca57f..e92e5ba9d 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -7,9 +7,9 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { // clear empty sequences // the previous ubatch is assumed to be gone, // so nothing should refer to values in these sequences anymore. - for (size_t i = seq.size(); i-- > 0;) { - if (seq[i].length == 0) { - seq.pop_back(); + for (size_t i = seqs.size(); i-- > 0;) { + if (seqs[i].length == 0) { + seqs.pop_back(); } else { break; } @@ -36,48 +36,48 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { } void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) { - GGML_ASSERT(batch != nullptr); + GGML_ASSERT(batch_ptr != nullptr); GGML_ASSERT(length <= seq.length); // Can only add sequences of equal lengths to a batch, // otherwise it isn't clear to which sequence a token belongs GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs); GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs); // NOTE: loops are separated for cache-friendliness - if (batch->token) { + if (batch_ptr->token) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { - ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]]; + ubatch.token[ubatch.n_tokens + i] = batch_ptr->token[ids[seq.offset + i]]; } } else { // simple split - ubatch.token = batch->token + seq.offset; + ubatch.token = batch_ptr->token + seq.offset; } } else { ubatch.token = nullptr; } - if (batch->embd) { + if (batch_ptr->embd) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { memcpy( ubatch.embd + (n_embd * (ubatch.n_tokens + i)), - batch->embd + (n_embd * ids[seq.offset + i]), + batch_ptr->embd + (n_embd * ids[seq.offset + i]), n_embd * sizeof(float) ); } } else { // simple split - ubatch.embd = batch->embd + (n_embd * seq.offset); + ubatch.embd = batch_ptr->embd + (n_embd * seq.offset); } } else { ubatch.embd = nullptr; } if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { - ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]]; + ubatch.pos[ubatch.n_tokens + i] = batch_ptr->pos[ids[seq.offset + i]]; } } else { // simple split - ubatch.pos = batch->pos + seq.offset; + ubatch.pos = batch_ptr->pos + seq.offset; } if (ubatch.equal_seqs) { ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id; @@ -86,15 +86,15 @@ void 
llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s } } else { // simple split - if (batch->n_seq_id) { - ubatch.n_seq_id = batch->n_seq_id + seq.offset; + if (batch_ptr->n_seq_id) { + ubatch.n_seq_id = batch_ptr->n_seq_id + seq.offset; } else { for (size_t i = 0; i < length; ++i) { ubatch.n_seq_id[ubatch.n_seqs + i] = 1; } } - if (batch->seq_id) { - ubatch.seq_id = batch->seq_id + seq.offset; + if (batch_ptr->seq_id) { + ubatch.seq_id = batch_ptr->seq_id + seq.offset; } } if (logits_all) { @@ -102,17 +102,17 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s ubatch.output[ubatch.n_tokens + i] = 1; out_ids.push_back(ids[seq.offset + i]); } - } else if (batch->logits) { + } else if (batch_ptr->logits) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { size_t id = ids[seq.offset + i]; - int8_t is_output = batch->logits[id]; + int8_t is_output = batch_ptr->logits[id]; ubatch.output[ubatch.n_tokens + i] = is_output; if (is_output) { out_ids.push_back(id); } } } else { // simple split - ubatch.output = batch->logits + seq.offset; + ubatch.output = batch_ptr->logits + seq.offset; for (size_t i = 0; i < length; ++i) { if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); } } @@ -139,12 +139,12 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr); ubatch.equal_seqs = false; - if (!seq.empty()) { - llama_sbatch_seq & s = seq[0]; + if (!seqs.empty()) { + llama_sbatch_seq & s = seqs[0]; size_t length = s.length < n_ubatch ? s.length : n_ubatch; - GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits + GGML_ASSERT(seqs.size() == 1 && s.n_seq_id == 0); // don't mix with other splits add_seq_to_ubatch(ubatch, s, length); } return ubatch; @@ -152,15 +152,15 @@ llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) { llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); - if (!seq.empty()) { + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr); + if (!seqs.empty()) { size_t length = 0; size_t n_tokens_in_ubatch = 0; - GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits + GGML_ASSERT(seqs[0].n_seq_id > 0); // should not be mixed with simple splits // smallest first, because it's easier to split this way; // starting from the end to pop in constant time. - for (size_t i = seq.size(); i-- > 0;) { - llama_sbatch_seq & s = seq[i]; + for (size_t i = seqs.size(); i-- > 0;) { + llama_sbatch_seq & s = seqs[i]; GGML_ASSERT(s.length > 0); if (length == 0) { length = s.length < n_ubatch ? s.length : n_ubatch; @@ -179,9 +179,9 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) { llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? 
n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
-    if (!seq.empty()) {
-        llama_sbatch_seq & s = seq[seq.size() - 1];
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
+    if (!seqs.empty()) {
+        llama_sbatch_seq & s = seqs.back();
         size_t length = s.length < n_ubatch ? s.length : n_ubatch;
         GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
         add_seq_to_ubatch(ubatch, s, length);
@@ -189,23 +189,24 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }

-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd_cur, bool simple_split, bool logits_all_cur) {
     GGML_ASSERT(batch.n_tokens >= 0);
-    this->batch = &batch;
-    this->n_embd = n_embd;
-    this->logits_all = logits_all;
+
+    batch_ptr  = &batch;
+    n_embd     = n_embd_cur;
+    logits_all = logits_all_cur;
     n_tokens = batch.n_tokens;
     ids.resize(n_tokens);
     out_ids.clear();
-    // TODO: reserve out_ids and seq
+    // TODO: reserve out_ids and seqs
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
     if (simple_split) {
-        seq.resize(1);
-        llama_sbatch_seq & s = seq[0];
+        seqs.resize(1);
+        llama_sbatch_seq & s = seqs[0];
         s.n_seq_id = 0;
         s.seq_id = nullptr;
         s.offset = 0;
@@ -259,11 +260,11 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             }
         }
         llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
-        seq.push_back(new_seq);
-        last_seq = &seq.back();
+        seqs.push_back(new_seq);
+        last_seq = &seqs.back();
     }
     // keep shared prompts first at the end, then sort by length descending.
-    std::sort(seq.begin(), seq.end(),
+    std::sort(seqs.begin(), seqs.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
             if (a.n_seq_id == b.n_seq_id) {
                 return a.length > b.length;
diff --git a/src/llama-batch.h b/src/llama-batch.h
index 773c3808b..572eb79fd 100644
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -45,9 +45,9 @@ struct llama_sbatch {
     std::vector<size_t> ids;
     // batch indices of the output
    std::vector<size_t> out_ids;
-    std::vector<llama_sbatch_seq> seq;
+    std::vector<llama_sbatch_seq> seqs;

-    const llama_batch * batch = nullptr;
+    const llama_batch * batch_ptr = nullptr;

     // buffers for the ubatch
     std::vector<llama_token> ubatch_token;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 671d2a81a..c761a4a21 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -916,8 +916,8 @@ struct llama_data_write {
         write(&n_seq_id, sizeof(n_seq_id));

         if (n_seq_id) {
-            for (auto seq_id : cell.seq_id) {
-                write(&seq_id, sizeof(seq_id));
+            for (auto sid : cell.seq_id) {
+                write(&sid, sizeof(sid));
             }
         }
     }
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bebe4e9a3..bea7d0b1a 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -490,7 +490,7 @@ const char * llama_grammar_parser::parse_sequence(
             pos = parse_space(pos + 1, is_nested);

             if (is_digit_char(*pos)) {
-                const char * int_end = parse_int(pos);
+                int_end = parse_int(pos);
                 max_times = std::stoul(std::string(pos, int_end - pos));
                 pos = parse_space(int_end, is_nested);
             }
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 57c6e4f51..db4c4bcbe 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -454,8 +454,8 @@ struct llama_mlock::impl {
         return (size_t) sysconf(_SC_PAGESIZE);
     }

-    bool raw_lock(const void * addr, size_t size) const {
-        if (!mlock(addr, size)) {
+    bool raw_lock(const void * addr_cur, size_t size_cur) const {
+        if (!mlock(addr_cur,
size_cur)) { return true; } @@ -475,12 +475,12 @@ struct llama_mlock::impl { if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { suggest = false; } - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size_cur)) { suggest = false; } LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + size_cur, size, errmsg, suggest ? MLOCK_SUGGESTION : ""); return false; } @@ -535,7 +535,7 @@ struct llama_mlock::impl { return (size_t) 65536; } - bool raw_lock(const void * addr, size_t len) const { + bool raw_lock(const void * addr_cur, size_t size_cur) const { LLAMA_LOG_WARN("warning: mlock not supported on this system\n"); return false; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 53175f0e0..a781b2884 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -413,7 +413,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); -llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { +llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -626,11 +626,11 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, if (!llama_mmap::SUPPORTED) { LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); - use_mmap = false; + use_mmap_cur = false; } - this->use_mmap = use_mmap; - this->check_tensors = check_tensors; + use_mmap = use_mmap_cur; + check_tensors = check_tensors_cur; } std::string llama_model_loader::get_arch_name() const { @@ -887,15 +887,15 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. 
for (size_t idx = 0; idx < n_buffers; ++idx) { - auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); - if (!buf) { + auto * buf_new = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf_new) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); return nullptr; } - host_buffers.emplace_back(buf); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); + host_buffers.emplace_back(buf_new); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf_new)); auto * event = ggml_backend_event_new(dev); if (!event) { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index b63d158d9..4814bbdc9 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -90,7 +90,7 @@ struct llama_model_loader { size_t size_data = 0; std::vector> mmaps_used; - llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p); + llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p); template typename std::enable_if::value, bool>::type diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f90f5e746..1229d8738 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -340,7 +340,8 @@ struct llama_model::impl { size_t n_bytes = 0; - std::string desc_str; + std::string name_str = "n/a"; + std::string desc_str = "n/a"; // model memory mapped files llama_mmaps mappings; @@ -390,17 +391,17 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { - enum gguf_type type = gguf_get_kv_type(ctx, i); - if (type == GGUF_TYPE_ARRAY) { + gguf_type type_cur = gguf_get_kv_type(ctx, i); + if (type_cur == GGUF_TYPE_ARRAY) { continue; } - const char * name = gguf_get_key(ctx, i); - const std::string value = gguf_kv_to_str(ctx, i); - gguf_kv.emplace(name, value); + const char * name_cur = gguf_get_key(ctx, i); + const std::string value_cur = gguf_kv_to_str(ctx, i); + gguf_kv.emplace(name_cur, value_cur); } // get general kv - ml.get_key(LLM_KV_GENERAL_NAME, name, false); + ml.get_key(LLM_KV_GENERAL_NAME, pimpl->name_str, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -1333,13 +1334,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { - ggml_init_params params = { + ggml_init_params params_cur = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - ggml_context * ctx = ggml_init(params); + ggml_context * ctx = ggml_init(params_cur); if (!ctx) { throw std::runtime_error(format("failed to create ggml context")); } @@ -1557,31 +1558,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i); - const int64_t n_ff = hparams.n_ff(i); - const int64_t n_head = hparams.n_head(i); - const int64_t n_head_kv = hparams.n_head_kv(i); + const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i); + const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i); + const int64_t n_embd_gqa_i = hparams.n_embd_v_gqa(i); + const int64_t n_ff_i = 
hparams.n_ff(i); + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_head_kv_i = hparams.n_head_kv(i); - if (n_head_kv == 0 && n_head > 0) { + if (n_head_kv_i == 0 && n_head_i > 0) { // linear attention for DeciLMCausalModel layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); } - else if (n_head_kv > 0) { + else if (n_head_kv_i > 0) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0); } // optional bias tensors layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); - layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -1594,14 +1595,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); } - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff_i, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0); // optional MLP bias - layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED); layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED); } } break; case LLM_ARCH_MINICPM3: @@ -2653,23 +2654,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); for (int i = 0; i < n_layer; ++i) { - const int64_t n_head = hparams.n_head(i); - const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head; - const int64_t n_ff = hparams.n_ff(i); + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_head_qkv_i = 2*hparams.n_head_kv(i) + n_head_i; + const int64_t n_ff_i = hparams.n_ff(i); auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv_i*n_embd_head_k}, 0); layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head_i*n_embd_head_k, n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_i, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0); } } break; case LLM_ARCH_GPTNEOX: @@ -3167,11 +3168,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const int time_mix_extra_dim = hparams.time_mix_extra_dim; + const int time_mix_extra_dim = hparams.time_mix_extra_dim; const int time_decay_extra_dim = hparams.time_decay_extra_dim; - const int head_size = hparams.wkv_head_size; - const int 
attn_hidden_size = n_embd; - const int n_head_kv = hparams.n_head_kv(); + const int head_size = hparams.wkv_head_size; + const int attn_hidden_size = n_embd; + int attn_key_value_size; if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) { attn_key_value_size = attn_hidden_size; @@ -3254,7 +3255,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // posnet { - const int64_t n_embd = hparams.posnet.n_embd; + const int64_t n_embd_cur = hparams.posnet.n_embd; for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) { auto & layer = layers[i].posnet; @@ -3274,39 +3275,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case 3: case 4: { - layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0); - layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0); + layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd_cur}, 0); + layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd_cur}, 0); - layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0); - layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0); + layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd_cur, n_embd_cur}, 0); + layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd_cur}, 0); - layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0); - layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0); + layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd_cur}, 0); + layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd_cur}, 0); - layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0); - layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0); + layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd_cur, n_embd_cur}, 0); + layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd_cur}, 0); } break; case 2: { - layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); - layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0); - layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0); + layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd_cur}, 0); - layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0); + layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd_cur}, 0); - layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_v_b = 
create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0); + layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd_cur}, 0); - layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0); + layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd_cur}, 0); } break; case 5: { - layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); - layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0); } break; default: GGML_ABORT("unknown posnet layer"); }; @@ -3320,29 +3321,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // convnext { - const int64_t n_embd = hparams.convnext.n_embd; + const int64_t n_embd_cur = hparams.convnext.n_embd; for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) { auto & layer = layers[i].convnext; - layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0); - layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0); + layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd_cur}, 0); + layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd_cur}, 0); - layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0); - layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0); + layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd_cur}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd_cur}, 0); - layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0); + layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd_cur, n_ff}, 0); layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0); - layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0); - layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0); + layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd_cur}, 0); + layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd_cur}, 0); - layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0); + layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd_cur}, 0); } // output - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_cur}, 0); + output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd_cur}, 0); } output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0); @@ -3601,7 +3602,7 @@ void llama_model::print_info() const { } // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str()); + LLAMA_LOG_INFO("%s: 
general.name = %s\n", __func__, pimpl->name_str.c_str()); if (arch == LLM_ARCH_DEEPSEEK) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); diff --git a/src/llama-model.h b/src/llama-model.h index 4cc8abb75..39e26fae7 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -290,8 +290,6 @@ struct llama_model { llm_type type = LLM_TYPE_UNKNOWN; llm_arch arch = LLM_ARCH_UNKNOWN; - std::string name = "n/a"; - llama_hparams hparams = {}; llama_vocab vocab; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d4947a780..6c59e1730 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -423,8 +423,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * int64_t counter = 0; size_t new_size = 0; bool valid = true; - auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, - nrows, n_per_row, imatrix]() { + auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix]() { const int64_t nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; while (true) { @@ -437,6 +436,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * break; } lock.unlock(); + const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); local_size += this_size; @@ -445,7 +445,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * const size_t row_size = ggml_row_size(new_type, n_per_row); void * this_data = (char *) new_data + first_row * row_size; if (!ggml_validate_row_data(new_type, this_data, this_size)) { - std::unique_lock lock(mutex); + lock.lock(); valid = false; break; } @@ -589,15 +589,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } // make a list of weights - std::vector tensors; - tensors.reserve(ml.weights_map.size()); + std::vector tensor_weights; + tensor_weights.reserve(ml.weights_map.size()); for (const auto & it : ml.weights_map) { - tensors.push_back(&it.second); + tensor_weights.push_back(&it.second); } // keep_split requires that the weights are sorted by split index if (params->keep_split) { - std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { + std::sort(tensor_weights.begin(), tensor_weights.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { if (a->idx == b->idx) { return a->offs < b->offs; } @@ -605,8 +605,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }); } - for (const auto * it : tensors) { - const struct ggml_tensor * tensor = it->tensor; + for (const auto * tw : tensor_weights) { + const ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); @@ -650,17 +650,17 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // Assume split index is continuous if (params->keep_split) { - for (const auto * it : tensors) { - n_split = std::max(uint16_t(it->idx + 1), n_split); + for (const auto * tw : tensor_weights) { + n_split = std::max(uint16_t(tw->idx + 1), n_split); } } std::vector ctx_outs(n_split); ctx_outs[0] = std::move(ctx_out); - // populate the original tensors so we get an initial meta data - for (const auto * it : tensors) { 
- uint16_t i_split = params->keep_split ? it->idx : 0; - struct ggml_tensor * tensor = it->tensor; + // populate the original tensor_weights so we get an initial meta data + for (const auto * tw : tensor_weights) { + uint16_t i_split = params->keep_split ? tw->idx : 0; + ggml_tensor * tensor = tw->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } @@ -707,12 +707,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const auto tn = LLM_TN(model.arch); new_ofstream(0); - for (const auto * it : tensors) { - const auto & weight = *it; - struct ggml_tensor * tensor = weight.tensor; - if (weight.idx != cur_split && params->keep_split) { + for (const auto * tw : tensor_weights) { + ggml_tensor * tensor = tw->tensor; + if (tw->idx != cur_split && params->keep_split) { close_ofstream(); - new_ofstream(weight.idx); + new_ofstream(tw->idx); } const std::string name = ggml_get_name(tensor); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index b3a12386e..711de388e 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -412,8 +412,8 @@ static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token time_meas tm(chain->t_sample_us, chain->params.no_perf); - for (auto * smpl : chain->samplers) { - llama_sampler_accept(smpl, token); + for (auto * cur : chain->samplers) { + llama_sampler_accept(cur, token); } chain->n_sample++; @@ -424,16 +424,16 @@ static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_d time_meas tm(chain->t_sample_us, chain->params.no_perf); - for (auto * smpl : chain->samplers) { - llama_sampler_apply(smpl, cur_p); + for (auto * cur : chain->samplers) { + llama_sampler_apply(cur, cur_p); } } static void llama_sampler_chain_reset(struct llama_sampler * smpl) { auto * chain = (llama_sampler_chain *) smpl->ctx; - for (auto * smpl : chain->samplers) { - llama_sampler_reset(smpl); + for (auto * cur : chain->samplers) { + llama_sampler_reset(cur); } chain->t_sample_us = 0; @@ -445,8 +445,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl auto * result = llama_sampler_chain_init(chain_src->params); - for (auto * smpl : chain_src->samplers) { - llama_sampler_chain_add(result, llama_sampler_clone(smpl)); + for (auto * cur : chain_src->samplers) { + llama_sampler_chain_add(result, llama_sampler_clone(cur)); } return result; @@ -455,8 +455,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl static void llama_sampler_chain_free(struct llama_sampler * smpl) { auto * chain = (llama_sampler_chain *) smpl->ctx; - for (auto * smpl : chain->samplers) { - llama_sampler_free(smpl); + for (auto * cur : chain->samplers) { + llama_sampler_free(cur); } delete chain; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index cd943b97c..df6bcdf6a 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -34,12 +34,12 @@ struct naive_trie { } char c = key[0]; - auto res = children.find(c); - if (res != children.end()) { - res->second.insert(key + 1, len - 1, val); + auto child = children.find(c); + if (child != children.end()) { + child->second.insert(key + 1, len - 1, val); } else { - auto res = children.insert(std::make_pair(c, naive_trie())); - res.first->second.insert(key + 1, len - 1, val); + auto child_new = children.insert(std::make_pair(c, naive_trie())); + child_new.first->second.insert(key + 1, len - 1, val); } } @@ -49,18 +49,18 @@ struct naive_trie { } char c = key[offset]; - auto res = 
children.find(c); - if (res != children.end()) { - return res->second.get_longest_prefix(key, len, offset + 1); + auto child = children.find(c); + if (child != children.end()) { + return child->second.get_longest_prefix(key, len, offset + 1); } return std::make_pair(key, offset); } const struct naive_trie * traverse(const char c) const { - auto res = children.find(c); - if (res != children.end()) { - return &res->second; + auto child = children.find(c); + if (child != children.end()) { + return &child->second; } return NULL; @@ -1285,7 +1285,7 @@ struct llama_vocab::impl { llama_token_attr token_get_attr(llama_token id) const; - void init_tokenizer(enum llama_vocab_type type); + void init_tokenizer(); void tokenizer_st_partition(std::forward_list & buffer, bool parse_special) const; @@ -1675,7 +1675,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } GGML_ASSERT(id_to_token.size() == token_to_id.size()); - init_tokenizer(type); + init_tokenizer(); // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (type == LLAMA_VOCAB_TYPE_SPM) { @@ -2116,7 +2116,7 @@ llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const { return id_to_token.at(id).attr; } -void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) { +void llama_vocab::impl::init_tokenizer() { LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type); switch (type) {
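
The renames above follow a small set of de-shadowing patterns: parameters and locals that collide with an enclosing name get a suffix or a more specific name (seq -> seqs, batch -> batch_ptr, n_embd -> n_embd_cur, per-layer n_ff -> n_ff_i), inner loop variables that reuse an outer name are renamed (smpl -> cur, res -> child), and redundant redeclarations are dropped in favour of the existing variable (int_end in llama-grammar.cpp, lock.lock() in llama-quant.cpp). The standalone sketch below is illustrative only and not part of the patch; the struct and names are invented. It shows the before/after shape of the parameter-rename pattern and why it removes the need for this-> disambiguation. Compilers report the "before" form when shadowing warnings are enabled (e.g. GCC's -Wshadow; exact flags and coverage vary by compiler).

// de-shadow-example.cpp -- minimal sketch, not part of libllama
#include <cstddef>
#include <vector>

struct sbatch_example {
    std::vector<int> seq;   // member that locals/parameters could collide with
    size_t n_embd = 0;

    // BEFORE: the parameter n_embd shadows the member n_embd,
    // so the assignment must be disambiguated with this->
    void from_batch_shadowed(size_t n_embd) {
        this->n_embd = n_embd;
    }

    // AFTER: the parameter carries a suffix (the patch uses _cur),
    // no shadowing occurs and this-> is no longer needed
    void from_batch(size_t n_embd_cur) {
        n_embd = n_embd_cur;
    }
};

int main() {
    sbatch_example sb;
    sb.from_batch(128);   // both variants behave identically; only the names differ
    return 0;
}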