diff --git a/llama.cpp b/llama.cpp
index 685e4b2e3..0fa5e89f4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -364,23 +364,11 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
-struct llama_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_off;
-
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
-    }
-};
-
 struct llama_load_tensor {
-    llama_load_tensor_shard first_shard;
-
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
     std::vector<uint32_t> ne;
+    size_t file_off;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
     uint8_t * data;
@@ -388,20 +376,6 @@ struct llama_load_tensor {
     llama_load_tensor(const std::string & name) : name(name) {}
 
     void calc_all() {
-        calc_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        type = first_shard.type;
-    }
-
-    void calc_ne() {
-        ne = first_shard.ne;
-    }
-
-    void calc_size() {
         size = llama_calc_tensor_size(ne, type);
     }
 };
@@ -491,17 +465,17 @@ struct llama_file_loader {
     }
     void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
-            llama_load_tensor_shard shard;
             uint32_t n_dims = file.read_u32();
             uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
+            ggml_type type = (enum ggml_type) file.read_u32();
+            std::vector<uint32_t> ne;
+            ne.resize(n_dims);
+            file.read_raw(ne.data(), sizeof(ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
                 throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
-            switch (shard.type) {
+            switch (type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
@@ -516,7 +490,7 @@
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
+                    throw std::runtime_error(format("unrecognized tensor type %u\n", type));
                 }
             }
 
@@ -525,11 +499,6 @@
                 file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
             }
 
-            shard.file_off = file.tell();
-
-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
-
             auto it = tensors_map.name_to_idx.find(name);
             size_t idx;
             if (it != tensors_map.name_to_idx.end()) {
@@ -539,7 +508,14 @@
                 idx = tensors_map.tensors.size() - 1;
                 tensors_map.name_to_idx.emplace(name, idx);
             }
-            tensors_map.tensors.at(idx).first_shard = shard;
+            auto & tensor = tensors_map.tensors.at(idx);
+
+            tensor.ne = ne;
+            tensor.type = type;
+            tensor.file_off = file.tell();
+
+            tensor.calc_all();
+            file.seek(tensor.size, SEEK_CUR);
         }
     }
 };
@@ -633,7 +609,7 @@ struct llama_model_loader {
 
     bool alignment_prevents_mmap() {
         for (const llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.first_shard.file_off & 3) {
+            if (lt.file_off & 3) {
                 return true;
             }
         }
@@ -646,7 +622,7 @@ struct llama_model_loader {
             throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loader->hparams.n_embd / lt.first_shard.ne.at(0);
+        return file_loader->hparams.n_embd / lt.ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -768,10 +744,10 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.first_shard.file_off;
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
         } else {
             llama_file & file = file_loader->file;
-            file.seek(lt.first_shard.file_off, SEEK_SET);
+            file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
         }
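
Taken together, the patch removes the shard indirection entirely: each llama_load_tensor now carries its own ne, type, and file_off, and read_tensor_metadata records where each payload starts before seeking past it. The sketch below illustrates that record-offset-and-skip pattern in isolation; it is an approximation, not llama.cpp's API: tensor_entry, kElemSize, scan_tensors, and the plain-stdio I/O are stand-ins, and the real loader derives sizes from ggml_type (via ggml_type_size/ggml_blck_size) and handles the 32-byte alignment seen above.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for llama_load_tensor after this patch: metadata plus the payload
// offset live directly on the entry, with no shard indirection.
struct tensor_entry {
    std::string           name;
    std::vector<uint32_t> ne;        // dimensions
    size_t                file_off;  // where the payload starts in the file
    size_t                size;      // payload size in bytes
};

// Hypothetical fixed element size; the real code derives bytes-per-element
// from the ggml_type value read out of the file.
constexpr size_t kElemSize = sizeof(float);

// Walk a [metadata][payload][metadata][payload]... file, keeping only the
// metadata and seeking past each payload, as read_tensor_metadata does.
static std::vector<tensor_entry> scan_tensors(FILE * f, long file_size) {
    std::vector<tensor_entry> entries;
    while (ftell(f) < file_size) {
        uint32_t n_dims = 0, name_len = 0;
        if (fread(&n_dims,   sizeof n_dims,   1, f) != 1) break;
        if (fread(&name_len, sizeof name_len, 1, f) != 1) break;

        tensor_entry t;
        t.ne.resize(n_dims);
        fread(t.ne.data(), sizeof t.ne[0], n_dims, f);
        t.name.resize(name_len);
        fread(&t.name[0], 1, name_len, f);

        t.file_off = (size_t) ftell(f);     // payload begins right here
        t.size = kElemSize;                 // bytes = element size * all dims
        for (uint32_t d : t.ne) {
            t.size *= d;
        }

        fseek(f, (long) t.size, SEEK_CUR);  // skip payload to the next record
        entries.push_back(std::move(t));
    }
    return entries;
}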