Implement '--keep-split' to quantize model into several shards
This commit is contained in:
parent 4cc120c744
commit 17519e110f

3 changed files with 75 additions and 27 deletions
@@ -97,6 +97,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf("     Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -300,6 +301,8 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+            params.keep_split = true;
         } else {
             usage(argv[0]);
         }
@@ -332,20 +335,28 @@ int main(int argc, char ** argv) {
     std::string fname_out;

     std::string ftype_str;
+    std::string suffix = ".gguf";
     if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
+
         arg_idx++;
         if (ftype_str == "COPY") {
             params.only_copy = true;
         }
     } else {
         fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
+
         arg_idx++;

         if (argc <= arg_idx) {
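Note: with --keep-split the output name is deliberately left as a bare prefix here; the per-shard file names are produced later by llama_split_path (used in the new_ofstream lambda in the llama.cpp hunk below). A minimal sketch of that helper, assuming llama.cpp's usual split naming convention; the prefix, shard index and shard count are made-up example values:

// Illustration only (not part of this diff): deriving a shard file name from a prefix.
// The prefix, index and count below are made-up example values.
#include <climits>   // PATH_MAX
#include <cstdio>
#include "llama.h"

int main() {
    char split_path[PATH_MAX] = {0};
    // the shard index is zero-based; the helper appends an index/count suffix,
    // yielding something like "ggml-model-Q4_K_M-00001-of-00003.gguf"
    llama_split_path(split_path, sizeof(split_path), "ggml-model-Q4_K_M", /*split_no=*/0, /*split_count=*/3);
    printf("%s\n", split_path);
    return 0;
}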
llama.cpp (86 changes)
@@ -3197,6 +3197,10 @@ struct llama_model_loader {
         return nullptr;
     }

+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
     const llama_tensor_weight & require_weight(const char * name) const {
         const llama_tensor_weight * weight = get_weight(name);
         if (!weight) {
@@ -13530,6 +13534,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
+    std::vector<gguf_context*> ctx_outs = {ctx_out};

     // copy the KV pairs from the input file
     gguf_set_kv (ctx_out, ml.meta);
@@ -13593,24 +13598,63 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
-        gguf_add_tensor(ctx_out, meta);
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != (ctx_outs.size() - 1) && params->keep_split) {
+            ctx_out = gguf_init_empty();
+            ctx_outs.push_back(ctx_out);
+        }
+        gguf_add_tensor(ctx_out, tensor);
     }

-    std::ofstream fout(fname_out, std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
-
-    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
-
-    // placeholder for the meta data
-    ::zeros(fout, meta_size);
+    // Set split info if needed
+    uint16_t n_split = ctx_outs.size();
+    if (n_split > 1) {
+        for (int i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }
+
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&]() {
+        ++cur_split;
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }
+
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };

     const auto tn = LLM_TN(model.arch);
+    new_ofstream();
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            GGML_ASSERT(cur_split == weight->idx - 1 && "Invalid split index found in weight");
+            close_ofstream();
+            new_ofstream();
+        }

         const std::string name = ggml_get_name(tensor);
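Note: the split bookkeeping set above is stored in every shard as ordinary GGUF KV pairs (LLM_KV_SPLIT_NO, LLM_KV_SPLIT_COUNT, LLM_KV_SPLIT_TENSORS_COUNT). A rough sketch of reading those values back from one produced shard through the gguf API; the shard file name and the literal key strings "split.no" and "split.count" are assumptions for illustration:

// Rough sketch (not from this diff): inspecting the split KVs of one output shard.
// The file name and key strings are assumed example values.
#include <cstdio>
#include "ggml.h"

int main() {
    struct gguf_init_params iparams = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("ggml-model-Q4_K_M-00001-of-00003.gguf", iparams);
    if (!ctx) {
        return 1;
    }
    const int id_no    = gguf_find_key(ctx, "split.no");    // zero-based shard index
    const int id_count = gguf_find_key(ctx, "split.count"); // total number of shards
    if (id_no >= 0 && id_count >= 0) {
        printf("shard %u of %u\n",
               (unsigned) gguf_get_val_u16(ctx, id_no) + 1,
               (unsigned) gguf_get_val_u16(ctx, id_count));
    }
    gguf_free(ctx);
    return 0;
}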
@@ -13765,26 +13809,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_new += new_size;

         // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);

         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
-
-    // go back to beginning of file and write the updated meta data
-    {
-        fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *) data.data(), data.size());
-    }
-
-    fout.close();
-
-    gguf_free(ctx_out);
+    close_ofstream();
+    for (auto & c : ctx_outs) {
+        gguf_free(c);
+    }

     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
llama.h (1 change)
@@ -290,6 +290,7 @@ extern "C" {
         bool pure;           // quantize all tensors to the default type
         void * imatrix;      // pointer to importance matrix data
         void * kv_overrides; // pointer to vector containing overrides
+        bool keep_split;     // quantize to the same number of shards
     } llama_model_quantize_params;

     // grammar types
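Note: callers that use the C API instead of the quantize example set the new field on the params struct before calling llama_model_quantize. A minimal sketch with made-up file names; when keep_split is enabled the output argument acts as a prefix and the shard names are generated internally:

// Minimal sketch (not part of this diff): using keep_split through the C API.
// File names and the chosen ftype are example values.
#include "llama.h"

int main() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qparams.keep_split = true; // one output shard per input shard
    // input: first shard of a split f16 model; output: prefix, shard suffixes are added internally
    const uint32_t rc = llama_model_quantize("ggml-model-f16-00001-of-00003.gguf", "ggml-model-Q4_K_M", &qparams);
    return rc == 0 ? 0 : 1;
}

From the command line, the quantize example exposes the same behaviour through the --keep-split flag added above.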