llama : de-shadow libllama [no ci]

Georgi Gerganov 2025-01-12 13:22:16 +02:00
parent 32e7b9dc99
commit 82caffa74e
13 changed files with 181 additions and 179 deletions
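The whole change is a de-shadowing pass: identifiers that hid a name from an enclosing scope are renamed so that every use is unambiguous. The llama_sbatch members seq and batch become seqs and batch_ptr, parameters and locals that collide with a member of the same name gain a _cur suffix, and loop variables such as smpl become cur. A minimal sketch of the pattern (hypothetical types, not code from this commit; compilers can flag the first form with -Wshadow):

    #include <cstddef>
    #include <vector>

    // Before: the parameter `seq` hides the member `seq`; the member can only
    // be reached through `this->`, and -Wshadow warns about the declaration.
    struct sbatch_before {
        std::vector<int> seq;
        void add(int seq, size_t n) {
            (void) n;
            this->seq.push_back(seq);
        }
    };

    // After: the member is renamed (seq -> seqs), the parameter keeps its
    // natural name, and no this-> qualification is needed.
    struct sbatch_after {
        std::vector<int> seqs;
        void add(int seq, size_t n) {
            (void) n;
            seqs.push_back(seq);
        }
    };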

View file

@@ -13,6 +13,9 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with METAL support
+# GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #

View file

@@ -7,9 +7,9 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
     // clear empty sequences
     // the previous ubatch is assumed to be gone,
     // so nothing should refer to values in these sequences anymore.
-    for (size_t i = seq.size(); i-- > 0;) {
-        if (seq[i].length == 0) {
-            seq.pop_back();
+    for (size_t i = seqs.size(); i-- > 0;) {
+        if (seqs[i].length == 0) {
+            seqs.pop_back();
         } else {
             break;
         }
@@ -36,48 +36,48 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
 }
 void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) {
-    GGML_ASSERT(batch != nullptr);
+    GGML_ASSERT(batch_ptr != nullptr);
     GGML_ASSERT(length <= seq.length);
     // Can only add sequences of equal lengths to a batch,
     // otherwise it isn't clear to which sequence a token belongs
     GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs);
     GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs);
     // NOTE: loops are separated for cache-friendliness
-    if (batch->token) {
+    if (batch_ptr->token) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
-                ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]];
+                ubatch.token[ubatch.n_tokens + i] = batch_ptr->token[ids[seq.offset + i]];
             }
         } else {
             // simple split
-            ubatch.token = batch->token + seq.offset;
+            ubatch.token = batch_ptr->token + seq.offset;
         }
     } else {
         ubatch.token = nullptr;
     }
-    if (batch->embd) {
+    if (batch_ptr->embd) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 memcpy(
                     ubatch.embd + (n_embd * (ubatch.n_tokens + i)),
-                    batch->embd + (n_embd * ids[seq.offset + i]),
+                    batch_ptr->embd + (n_embd * ids[seq.offset + i]),
                     n_embd * sizeof(float)
                 );
             }
         } else {
             // simple split
-            ubatch.embd = batch->embd + (n_embd * seq.offset);
+            ubatch.embd = batch_ptr->embd + (n_embd * seq.offset);
         }
     } else {
         ubatch.embd = nullptr;
     }
     if (ubatch.equal_seqs) {
         for (size_t i = 0; i < length; ++i) {
-            ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
+            ubatch.pos[ubatch.n_tokens + i] = batch_ptr->pos[ids[seq.offset + i]];
         }
     } else {
         // simple split
-        ubatch.pos = batch->pos + seq.offset;
+        ubatch.pos = batch_ptr->pos + seq.offset;
     }
     if (ubatch.equal_seqs) {
         ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
@@ -86,15 +86,15 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
         }
     } else {
         // simple split
-        if (batch->n_seq_id) {
-            ubatch.n_seq_id = batch->n_seq_id + seq.offset;
+        if (batch_ptr->n_seq_id) {
+            ubatch.n_seq_id = batch_ptr->n_seq_id + seq.offset;
         } else {
             for (size_t i = 0; i < length; ++i) {
                 ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
             }
         }
-        if (batch->seq_id) {
-            ubatch.seq_id = batch->seq_id + seq.offset;
+        if (batch_ptr->seq_id) {
+            ubatch.seq_id = batch_ptr->seq_id + seq.offset;
         }
     }
     if (logits_all) {
@@ -102,17 +102,17 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
             ubatch.output[ubatch.n_tokens + i] = 1;
             out_ids.push_back(ids[seq.offset + i]);
         }
-    } else if (batch->logits) {
+    } else if (batch_ptr->logits) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 size_t id = ids[seq.offset + i];
-                int8_t is_output = batch->logits[id];
+                int8_t is_output = batch_ptr->logits[id];
                 ubatch.output[ubatch.n_tokens + i] = is_output;
                 if (is_output) { out_ids.push_back(id); }
             }
         } else {
             // simple split
-            ubatch.output = batch->logits + seq.offset;
+            ubatch.output = batch_ptr->logits + seq.offset;
             for (size_t i = 0; i < length; ++i) {
                 if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); }
             }
@@ -139,12 +139,12 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
 llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
     ubatch.equal_seqs = false;
-    if (!seq.empty()) {
-        llama_sbatch_seq & s = seq[0];
+    if (!seqs.empty()) {
+        llama_sbatch_seq & s = seqs[0];
         size_t length = s.length < n_ubatch ? s.length : n_ubatch;
-        GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
+        GGML_ASSERT(seqs.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
         add_seq_to_ubatch(ubatch, s, length);
     }
     return ubatch;
@@ -152,15 +152,15 @@ llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
 llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
-    if (!seq.empty()) {
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
+    if (!seqs.empty()) {
         size_t length = 0;
         size_t n_tokens_in_ubatch = 0;
-        GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits
+        GGML_ASSERT(seqs[0].n_seq_id > 0); // should not be mixed with simple splits
         // smallest first, because it's easier to split this way;
         // starting from the end to pop in constant time.
-        for (size_t i = seq.size(); i-- > 0;) {
-            llama_sbatch_seq & s = seq[i];
+        for (size_t i = seqs.size(); i-- > 0;) {
+            llama_sbatch_seq & s = seqs[i];
             GGML_ASSERT(s.length > 0);
             if (length == 0) {
                 length = s.length < n_ubatch ? s.length : n_ubatch;
@@ -179,9 +179,9 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
 llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
-    if (!seq.empty()) {
-        llama_sbatch_seq & s = seq[seq.size() - 1];
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
+    if (!seqs.empty()) {
+        llama_sbatch_seq & s = seqs.back();
         size_t length = s.length < n_ubatch ? s.length : n_ubatch;
         GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
         add_seq_to_ubatch(ubatch, s, length);
@@ -189,23 +189,24 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }
-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd_cur, bool simple_split, bool logits_all_cur) {
     GGML_ASSERT(batch.n_tokens >= 0);
-    this->batch = &batch;
-    this->n_embd = n_embd;
-    this->logits_all = logits_all;
+    batch_ptr = &batch;
+    n_embd = n_embd_cur;
+    logits_all = logits_all_cur;
     n_tokens = batch.n_tokens;
     ids.resize(n_tokens);
     out_ids.clear();
-    // TODO: reserve out_ids and seq
+    // TODO: reserve out_ids and seqs
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
     if (simple_split) {
-        seq.resize(1);
-        llama_sbatch_seq & s = seq[0];
+        seqs.resize(1);
+        llama_sbatch_seq & s = seqs[0];
         s.n_seq_id = 0;
         s.seq_id = nullptr;
         s.offset = 0;
@@ -259,11 +260,11 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             }
         }
         llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
-        seq.push_back(new_seq);
-        last_seq = &seq.back();
+        seqs.push_back(new_seq);
+        last_seq = &seqs.back();
     }
     // keep shared prompts first at the end, then sort by length descending.
-    std::sort(seq.begin(), seq.end(),
+    std::sort(seqs.begin(), seqs.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
             if (a.n_seq_id == b.n_seq_id) {
                 return a.length > b.length;

View file

@@ -45,9 +45,9 @@ struct llama_sbatch {
     std::vector<size_t> ids;
     // batch indices of the output
     std::vector<size_t> out_ids;
-    std::vector<llama_sbatch_seq> seq;
-    const llama_batch * batch = nullptr;
+    std::vector<llama_sbatch_seq> seqs;
+    const llama_batch * batch_ptr = nullptr;
     // buffers for the ubatch
     std::vector<llama_token> ubatch_token;
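The two renamed members above are the root of the llama_sbatch changes earlier in this commit: the old member seq collided with the llama_sbatch_seq & seq parameter of add_seq_to_ubatch, and the old member batch collided with the const llama_batch & batch parameter of from_batch, so the members become seqs and batch_ptr and the this-> qualifiers inside from_batch can be dropped.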

View file

@@ -916,8 +916,8 @@ struct llama_data_write {
         write(&n_seq_id, sizeof(n_seq_id));
         if (n_seq_id) {
-            for (auto seq_id : cell.seq_id) {
-                write(&seq_id, sizeof(seq_id));
+            for (auto sid : cell.seq_id) {
+                write(&sid, sizeof(sid));
             }
         }
     }

View file

@@ -490,7 +490,7 @@ const char * llama_grammar_parser::parse_sequence(
         pos = parse_space(pos + 1, is_nested);
         if (is_digit_char(*pos)) {
-            const char * int_end = parse_int(pos);
+            int_end = parse_int(pos);
             max_times = std::stoul(std::string(pos, int_end - pos));
             pos = parse_space(int_end, is_nested);
         }

View file

@@ -454,8 +454,8 @@ struct llama_mlock::impl {
         return (size_t) sysconf(_SC_PAGESIZE);
     }
-    bool raw_lock(const void * addr, size_t size) const {
-        if (!mlock(addr, size)) {
+    bool raw_lock(const void * addr_cur, size_t size_cur) const {
+        if (!mlock(addr_cur, size_cur)) {
             return true;
         }
@@ -475,12 +475,12 @@ struct llama_mlock::impl {
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
         }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size_cur)) {
             suggest = false;
         }
         LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+                size_cur, size, errmsg, suggest ? MLOCK_SUGGESTION : "");
         return false;
     }
@@ -535,7 +535,7 @@ struct llama_mlock::impl {
         return (size_t) 65536;
     }
-    bool raw_lock(const void * addr, size_t len) const {
+    bool raw_lock(const void * addr_cur, size_t size_cur) const {
         LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
         return false;
     }

View file

@@ -413,7 +413,7 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
-llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
+llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -626,11 +626,11 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
     if (!llama_mmap::SUPPORTED) {
         LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
-        use_mmap = false;
+        use_mmap_cur = false;
     }
-    this->use_mmap = use_mmap;
-    this->check_tensors = check_tensors;
+    use_mmap = use_mmap_cur;
+    check_tensors = check_tensors_cur;
 }
 std::string llama_model_loader::get_arch_name() const {
@@ -887,15 +887,15 @@ bool llama_model_loader::load_all_data(
             // If the backend is supported, create pinned memory buffers and events for synchronisation.
             for (size_t idx = 0; idx < n_buffers; ++idx) {
-                auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
-                if (!buf) {
+                auto * buf_new = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+                if (!buf_new) {
                     LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                         ggml_backend_dev_name(dev));
                     return nullptr;
                 }
-                host_buffers.emplace_back(buf);
-                host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+                host_buffers.emplace_back(buf_new);
+                host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf_new));
                 auto * event = ggml_backend_event_new(dev);
                 if (!event) {

View file

@@ -90,7 +90,7 @@ struct llama_model_loader {
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;
-    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
+    llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p);
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type

View file

@@ -340,7 +340,8 @@ struct llama_model::impl {
     size_t n_bytes = 0;
-    std::string desc_str;
+    std::string name_str = "n/a";
+    std::string desc_str = "n/a";
     // model memory mapped files
     llama_mmaps mappings;
@@ -390,17 +391,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
-        enum gguf_type type = gguf_get_kv_type(ctx, i);
-        if (type == GGUF_TYPE_ARRAY) {
+        gguf_type type_cur = gguf_get_kv_type(ctx, i);
+        if (type_cur == GGUF_TYPE_ARRAY) {
             continue;
         }
-        const char * name = gguf_get_key(ctx, i);
-        const std::string value = gguf_kv_to_str(ctx, i);
-        gguf_kv.emplace(name, value);
+        const char * name_cur = gguf_get_key(ctx, i);
+        const std::string value_cur = gguf_kv_to_str(ctx, i);
+        gguf_kv.emplace(name_cur, value_cur);
     }
     // get general kv
-    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+    ml.get_key(LLM_KV_GENERAL_NAME, pimpl->name_str, false);
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -1333,13 +1334,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-            ggml_init_params params = {
+            ggml_init_params params_cur = {
                 /*.mem_size =*/ ctx_size,
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc =*/ true,
             };
-            ggml_context * ctx = ggml_init(params);
+            ggml_context * ctx = ggml_init(params_cur);
             if (!ctx) {
                 throw std::runtime_error(format("failed to create ggml context"));
             }
@@ -1557,31 +1558,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
-                    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
-                    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
-                    const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
-                    const int64_t n_ff = hparams.n_ff(i);
-                    const int64_t n_head = hparams.n_head(i);
-                    const int64_t n_head_kv = hparams.n_head_kv(i);
+                    const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+                    const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+                    const int64_t n_embd_gqa_i = hparams.n_embd_v_gqa(i);
+                    const int64_t n_ff_i = hparams.n_ff(i);
+                    const int64_t n_head_i = hparams.n_head(i);
+                    const int64_t n_head_kv_i = hparams.n_head_kv(i);
-                    if (n_head_kv == 0 && n_head > 0) {
+                    if (n_head_kv_i == 0 && n_head_i > 0) {
                         // linear attention for DeciLMCausalModel
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                     }
-                    else if (n_head_kv > 0) {
+                    else if (n_head_kv_i > 0) {
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
                     }
                     // optional bias tensors
                     layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED);
                     layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -1594,14 +1595,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff_i, n_embd}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0);
                     // optional MLP bias
-                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED);
                     layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED);
                 }
             } break;
         case LLM_ARCH_MINICPM3:
@@ -2653,23 +2654,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                 for (int i = 0; i < n_layer; ++i) {
-                    const int64_t n_head = hparams.n_head(i);
-                    const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
-                    const int64_t n_ff = hparams.n_ff(i);
+                    const int64_t n_head_i = hparams.n_head(i);
+                    const int64_t n_head_qkv_i = 2*hparams.n_head_kv(i) + n_head_i;
+                    const int64_t n_ff_i = hparams.n_ff(i);
                     auto & layer = layers[i];
                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv_i*n_embd_head_k}, 0);
                     layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
                     layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head_i*n_embd_head_k, n_embd}, 0);
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_i, n_embd}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0);
                 }
             } break;
         case LLM_ARCH_GPTNEOX:
@@ -3167,11 +3168,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
                 const int time_mix_extra_dim = hparams.time_mix_extra_dim;
                 const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                 const int head_size = hparams.wkv_head_size;
                 const int attn_hidden_size = n_embd;
-                const int n_head_kv = hparams.n_head_kv();
                 int attn_key_value_size;
                 if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
                     attn_key_value_size = attn_hidden_size;
@@ -3254,7 +3255,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 // posnet
                 {
-                    const int64_t n_embd = hparams.posnet.n_embd;
+                    const int64_t n_embd_cur = hparams.posnet.n_embd;
                     for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
                         auto & layer = layers[i].posnet;
@@ -3274,39 +3275,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                             case 3:
                             case 4:
                                 {
-                                    layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
-                                    layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
-                                    layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
-                                    layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
-                                    layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
-                                    layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
-                                    layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
-                                    layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
+                                    layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd_cur}, 0);
+                                    layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd_cur}, 0);
+                                    layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd_cur, n_embd_cur}, 0);
+                                    layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd_cur}, 0);
+                                    layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd_cur}, 0);
+                                    layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd_cur}, 0);
+                                    layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd_cur, n_embd_cur}, 0);
+                                    layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd_cur}, 0);
                                 } break;
                             case 2:
                                 {
-                                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
-                                    layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
-                                    layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
-                                    layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
-                                    layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
-                                    layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
-                                    layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
-                                    layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
-                                    layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
-                                    layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
+                                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0);
+                                    layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0);
+                                    layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd_cur, n_embd_cur}, 0);
+                                    layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd_cur}, 0);
+                                    layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd_cur, n_embd_cur}, 0);
+                                    layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd_cur}, 0);
+                                    layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd_cur, n_embd_cur}, 0);
+                                    layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd_cur}, 0);
+                                    layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd_cur, n_embd_cur}, 0);
+                                    layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd_cur}, 0);
                                 } break;
                             case 5:
                                 {
-                                    layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
-                                    layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+                                    layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0);
+                                    layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0);
                                 } break;
                             default: GGML_ABORT("unknown posnet layer");
                         };
@@ -3320,29 +3321,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 // convnext
                 {
-                    const int64_t n_embd = hparams.convnext.n_embd;
+                    const int64_t n_embd_cur = hparams.convnext.n_embd;
                     for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
                         auto & layer = layers[i].convnext;
-                        layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
-                        layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
-                        layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
-                        layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
-                        layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
+                        layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd_cur}, 0);
+                        layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd_cur}, 0);
+                        layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd_cur}, 0);
+                        layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd_cur}, 0);
+                        layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd_cur, n_ff}, 0);
                         layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
-                        layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
-                        layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
-                        layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+                        layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd_cur}, 0);
+                        layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd_cur}, 0);
+                        layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd_cur}, 0);
                     }
                     // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_cur}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd_cur}, 0);
                 }
                 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
@@ -3601,7 +3602,7 @@ void llama_model::print_info() const {
     }
     // general kv
-    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, pimpl->name_str.c_str());
     if (arch == LLM_ARCH_DEEPSEEK) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);

View file

@@ -290,8 +290,6 @@ struct llama_model {
     llm_type type = LLM_TYPE_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;
-    std::string name = "n/a";
     llama_hparams hparams = {};
     llama_vocab vocab;

View file

@@ -423,8 +423,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     int64_t counter = 0;
     size_t new_size = 0;
     bool valid = true;
-    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
-        nrows, n_per_row, imatrix]() {
+    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix]() {
         const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
         while (true) {
@@ -437,6 +436,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
                 break;
             }
             lock.unlock();
+
             const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
             size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
             local_size += this_size;
@@ -445,7 +445,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
             const size_t row_size = ggml_row_size(new_type, n_per_row);
             void * this_data = (char *) new_data + first_row * row_size;
             if (!ggml_validate_row_data(new_type, this_data, this_size)) {
-                std::unique_lock<std::mutex> lock(mutex);
+                lock.lock();
                 valid = false;
                 break;
             }
@@ -589,15 +589,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
     // make a list of weights
-    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
-    tensors.reserve(ml.weights_map.size());
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensor_weights;
+    tensor_weights.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        tensors.push_back(&it.second);
+        tensor_weights.push_back(&it.second);
     }
     // keep_split requires that the weights are sorted by split index
     if (params->keep_split) {
-        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+        std::sort(tensor_weights.begin(), tensor_weights.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
             if (a->idx == b->idx) {
                 return a->offs < b->offs;
             }
@@ -605,8 +605,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
-    for (const auto * it : tensors) {
-        const struct ggml_tensor * tensor = it->tensor;
+    for (const auto * tw : tensor_weights) {
+        const ggml_tensor * tensor = tw->tensor;
         const std::string name = ggml_get_name(tensor);
@@ -650,17 +650,17 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto * it : tensors) {
-            n_split = std::max(uint16_t(it->idx + 1), n_split);
+        for (const auto * tw : tensor_weights) {
+            n_split = std::max(uint16_t(tw->idx + 1), n_split);
         }
     }
     std::vector<gguf_context_ptr> ctx_outs(n_split);
     ctx_outs[0] = std::move(ctx_out);
-    // populate the original tensors so we get an initial meta data
-    for (const auto * it : tensors) {
-        uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+    // populate the original tensor_weights so we get an initial meta data
+    for (const auto * tw : tensor_weights) {
+        uint16_t i_split = params->keep_split ? tw->idx : 0;
+        ggml_tensor * tensor = tw->tensor;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());
         }
@@ -707,12 +707,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto * it : tensors) {
-        const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
-        if (weight.idx != cur_split && params->keep_split) {
+    for (const auto * tw : tensor_weights) {
+        ggml_tensor * tensor = tw->tensor;
+        if (tw->idx != cur_split && params->keep_split) {
             close_ofstream();
-            new_ofstream(weight.idx);
+            new_ofstream(tw->idx);
         }
         const std::string name = ggml_get_name(tensor);

View file

@@ -412,8 +412,8 @@ static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token
     time_meas tm(chain->t_sample_us, chain->params.no_perf);
-    for (auto * smpl : chain->samplers) {
-        llama_sampler_accept(smpl, token);
+    for (auto * cur : chain->samplers) {
+        llama_sampler_accept(cur, token);
     }
     chain->n_sample++;
@@ -424,16 +424,16 @@ static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_d
     time_meas tm(chain->t_sample_us, chain->params.no_perf);
-    for (auto * smpl : chain->samplers) {
-        llama_sampler_apply(smpl, cur_p);
+    for (auto * cur : chain->samplers) {
+        llama_sampler_apply(cur, cur_p);
     }
 }
 static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     auto * chain = (llama_sampler_chain *) smpl->ctx;
-    for (auto * smpl : chain->samplers) {
-        llama_sampler_reset(smpl);
+    for (auto * cur : chain->samplers) {
+        llama_sampler_reset(cur);
     }
     chain->t_sample_us = 0;
@@ -445,8 +445,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl
     auto * result = llama_sampler_chain_init(chain_src->params);
-    for (auto * smpl : chain_src->samplers) {
-        llama_sampler_chain_add(result, llama_sampler_clone(smpl));
+    for (auto * cur : chain_src->samplers) {
+        llama_sampler_chain_add(result, llama_sampler_clone(cur));
     }
     return result;
@@ -455,8 +455,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl
 static void llama_sampler_chain_free(struct llama_sampler * smpl) {
     auto * chain = (llama_sampler_chain *) smpl->ctx;
-    for (auto * smpl : chain->samplers) {
-        llama_sampler_free(smpl);
+    for (auto * cur : chain->samplers) {
+        llama_sampler_free(cur);
     }
     delete chain;

View file

@@ -34,12 +34,12 @@ struct naive_trie {
         }
         char c = key[0];
-        auto res = children.find(c);
-        if (res != children.end()) {
-            res->second.insert(key + 1, len - 1, val);
+        auto child = children.find(c);
+        if (child != children.end()) {
+            child->second.insert(key + 1, len - 1, val);
         } else {
-            auto res = children.insert(std::make_pair(c, naive_trie()));
-            res.first->second.insert(key + 1, len - 1, val);
+            auto child_new = children.insert(std::make_pair(c, naive_trie()));
+            child_new.first->second.insert(key + 1, len - 1, val);
         }
     }
@@ -49,18 +49,18 @@ struct naive_trie {
         }
         char c = key[offset];
-        auto res = children.find(c);
-        if (res != children.end()) {
-            return res->second.get_longest_prefix(key, len, offset + 1);
+        auto child = children.find(c);
+        if (child != children.end()) {
+            return child->second.get_longest_prefix(key, len, offset + 1);
         }
         return std::make_pair(key, offset);
     }
     const struct naive_trie * traverse(const char c) const {
-        auto res = children.find(c);
-        if (res != children.end()) {
-            return &res->second;
+        auto child = children.find(c);
+        if (child != children.end()) {
+            return &child->second;
         }
         return NULL;
@@ -1285,7 +1285,7 @@ struct llama_vocab::impl {
     llama_token_attr token_get_attr(llama_token id) const;
-    void init_tokenizer(enum llama_vocab_type type);
+    void init_tokenizer();
     void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
@@ -1675,7 +1675,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     }
     GGML_ASSERT(id_to_token.size() == token_to_id.size());
-    init_tokenizer(type);
+    init_tokenizer();
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (type == LLAMA_VOCAB_TYPE_SPM) {
@@ -2116,7 +2116,7 @@ llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
     return id_to_token.at(id).attr;
 }
-void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
+void llama_vocab::impl::init_tokenizer() {
     LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
     switch (type) {