quantize : add '--keep-split' to quantize model into shards (#6688)
* Implement '--keep-split' to quantize model into several shards * Add test script * Update examples/quantize/quantize.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Split model correctly even if tensor id is out-of-order * Update llama_model_quantize_params * Fix preci failures --------- Co-authored-by: z5269887 <z5269887@unsw.edu.au> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
784e11dea1
commit
1966eb2615
4 changed files with 148 additions and 26 deletions
93
llama.cpp
93
llama.cpp
|
@ -3297,6 +3297,10 @@ struct llama_model_loader {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
const llama_tensor_weight * get_weight(int i) const {
|
||||
return get_weight(get_tensor_name(i));
|
||||
}
|
||||
|
||||
const llama_tensor_weight & require_weight(const char * name) const {
|
||||
const llama_tensor_weight * weight = get_weight(name);
|
||||
if (!weight) {
|
||||
|
@ -14528,26 +14532,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
std::vector<no_init<uint8_t>> work;
|
||||
std::vector<no_init<float>> f32_conv_buf;
|
||||
|
||||
uint16_t n_split = 1;
|
||||
// Assume split index is continuous
|
||||
if (params->keep_split) {
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
|
||||
}
|
||||
}
|
||||
std::vector<gguf_context*> ctx_outs(n_split, NULL);
|
||||
ctx_outs[0] = ctx_out;
|
||||
|
||||
// populate the original tensors so we get an initial meta data
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
gguf_add_tensor(ctx_out, meta);
|
||||
auto weight = ml.get_weight(i);
|
||||
uint16_t i_split = params->keep_split ? weight->idx : 0;
|
||||
struct ggml_tensor * tensor = weight->tensor;
|
||||
if (ctx_outs[i_split] == NULL) {
|
||||
ctx_outs[i_split] = gguf_init_empty();
|
||||
}
|
||||
gguf_add_tensor(ctx_outs[i_split], tensor);
|
||||
}
|
||||
|
||||
std::ofstream fout(fname_out, std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
// Set split info if needed
|
||||
if (n_split > 1) {
|
||||
for (size_t i = 0; i < ctx_outs.size(); ++i) {
|
||||
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
|
||||
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
|
||||
gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t meta_size = gguf_get_meta_size(ctx_out);
|
||||
int cur_split = -1;
|
||||
std::ofstream fout;
|
||||
auto close_ofstream = [&]() {
|
||||
// Write metadata and close file handler
|
||||
if (fout.is_open()) {
|
||||
fout.seekp(0);
|
||||
std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
|
||||
gguf_get_meta_data(ctx_outs[cur_split], data.data());
|
||||
fout.write((const char *) data.data(), data.size());
|
||||
fout.close();
|
||||
}
|
||||
};
|
||||
auto new_ofstream = [&](int index = 0) {
|
||||
cur_split = index;
|
||||
GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
|
||||
std::string fname = fname_out;
|
||||
if (params->keep_split) {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
|
||||
fname = std::string(split_path);
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
|
||||
|
||||
// placeholder for the meta data
|
||||
::zeros(fout, meta_size);
|
||||
fout = std::ofstream(fname, std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
|
||||
// placeholder for the meta data
|
||||
::zeros(fout, meta_size);
|
||||
};
|
||||
|
||||
const auto tn = LLM_TN(model.arch);
|
||||
|
||||
new_ofstream();
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
||||
auto weight = ml.get_weight(i);
|
||||
struct ggml_tensor * tensor = weight->tensor;
|
||||
if (weight->idx != cur_split && params->keep_split) {
|
||||
close_ofstream();
|
||||
new_ofstream(weight->idx);
|
||||
}
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
|
@ -14702,26 +14754,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
total_size_new += new_size;
|
||||
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
|
||||
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *) new_data, new_size);
|
||||
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
||||
}
|
||||
|
||||
// go back to beginning of file and write the updated meta data
|
||||
{
|
||||
fout.seekp(0);
|
||||
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
||||
gguf_get_meta_data(ctx_out, data.data());
|
||||
fout.write((const char *) data.data(), data.size());
|
||||
close_ofstream();
|
||||
for (auto & c:ctx_outs) {
|
||||
gguf_free(c);
|
||||
}
|
||||
|
||||
fout.close();
|
||||
|
||||
gguf_free(ctx_out);
|
||||
|
||||
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
||||
|
||||
|
@ -15077,6 +15121,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|||
/*.quantize_output_tensor =*/ true,
|
||||
/*.only_copy =*/ false,
|
||||
/*.pure =*/ false,
|
||||
/*.keep_split =*/ false,
|
||||
/*.imatrix =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue