merge: accept quant input

parent 2cfae6d9a8, commit abec8c0c3a

2 changed files with 70 additions and 41 deletions

```diff
@@ -10,8 +10,15 @@
 #include <cmath>
 #include <algorithm>
 
+struct merge_params {
+    std::string config_path = "merge.csv";
+    std::vector<std::string> model_paths;
+    std::string output_path = "gguf-merged-f16.gguf";
+};
+
 [[noreturn]]
 static void usage(const char * executable, int exit_code) {
+    struct merge_params defaults;
     printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
     printf("\n");
     printf("Merging 2 models and change layers configuration.\n");
@@ -29,14 +36,14 @@ static void usage(const char * executable, int exit_code) {
     printf("2,0.5,1,0.5 meaning: output layer 3 = A[2]*0.5 + B[1]*0.5\n");
     printf("\n");
     printf("NOTE:\n");
-    printf("- The embedding layer of the first model will be used\n");
-    printf("- Currently, only F16 model type is supported\n");
+    printf("- The embedding and output layers of the first model will be used.\n");
+    printf("- Currently, we accept both quantized and non-quantized models as input, but only output FP16 model. To re-quantize it, please use \"quantize\" tool.\n");
     printf("\n");
     printf("Options:\n");
     printf("  -h, --help                Show this help message and exit\n");
-    printf("  -c, --config CONFIG_FILE  Path to config file (CSV format)\n");
+    printf("  -c, --config CONFIG_FILE  Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
     printf("  -m, --model MODEL_PATH    Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
-    printf("  -o, --output OUTPUT_FILE  Path to the output model\n");
+    printf("  -o, --output OUTPUT_FILE  Path to the output model (default: %s)\n", defaults.output_path.c_str());
     printf("\n");
     printf("Example: ./merge -c config.csv -o output.gguf -m model_a.gguf -m model_b.gguf\n");
     exit(exit_code);
```
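
The help text above defines the config file as a CSV with one row per output layer: each row gives a source layer index and scale for model A, then a source layer index and scale for model B, so `2,0.5,1,0.5` produces an output layer equal to `A[2]*0.5 + B[1]*0.5`. A hypothetical three-layer config for the `./merge -c config.csv -o output.gguf -m model_a.gguf -m model_b.gguf` example shown above could look like the sketch below; the merged file is always written as FP16 and can then be re-quantized with the separate `quantize` tool, as the NOTE says.

```csv
0,1.0,0,0.0
1,0.5,1,0.5
2,0.5,1,0.5
```

The exact row/layer indexing is handled by `parse_config`, which this diff does not show, so treat the rows above as illustrative values only.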
```diff
@@ -92,9 +99,7 @@ static std::vector<struct llama_merge_layer> parse_config(std::string & config_p
 
 int main(int argc, char ** argv) {
     bool invalid_param = false;
-    std::string config_path;
-    std::vector<std::string> model_paths;
-    std::string output_path;
+    struct merge_params params;
 
     std::string arg;
     for (int i = 1; i < argc; i++) {
@@ -106,40 +111,36 @@ int main(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            config_path = argv[i];
+            params.config_path = argv[i];
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            model_paths.push_back(argv[i]);
+            params.model_paths.push_back(argv[i]);
         } else if (arg == "-o" || arg == "--output") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            output_path = argv[i];
+            params.output_path = argv[i];
         }
     }
 
     try {
         if (invalid_param) {
             throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-        } else if (config_path.empty()) {
-            throw std::invalid_argument("error: missing config path");
-        } else if (model_paths.size() < 2) {
+        } else if (params.model_paths.size() < 2) {
             throw std::invalid_argument("error: require at least 2 models");
-        } else if (output_path.empty()) {
-            throw std::invalid_argument("error: missing output path");
         }
 
         // buffers to hold allocated data
         std::vector<int> buf_srcs;
         std::vector<float> buf_scales;
 
-        auto layers = parse_config(config_path, model_paths.size(), buf_srcs, buf_scales);
+        auto layers = parse_config(params.config_path, params.model_paths.size(), buf_srcs, buf_scales);
         std::vector<const char*> p_model_paths;
-        for (auto & m : model_paths) {
+        for (auto & m : params.model_paths) {
             p_model_paths.push_back(m.data());
         }
         const struct llama_merge_config config{
@@ -147,7 +148,7 @@ int main(int argc, char ** argv) {
             p_model_paths.size(),
             layers.data(),
             layers.size(),
-            output_path.data(),
+            params.output_path.data(),
         };
 
         llama_merge_models(&config);
```

llama.cpp (66 lines changed)

```diff
@@ -11309,8 +11309,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-int32_t llama_merge_models(const struct llama_merge_config * config)
-{
+int32_t llama_merge_models(const struct llama_merge_config * config) {
 #if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
 #else
@@ -11346,6 +11345,11 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
         return ss.str();
     };
 
+    // if the input model model is quantized, it will be dequant to FP16
+    auto get_output_type = [&](struct ggml_tensor * t) {
+        return ggml_is_quantized(t->type) ? GGML_TYPE_F16 : t->type;
+    };
+
     // remember to call before exit
     auto clean_up = [&]() {
        fout.close();
```
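
The new `get_output_type` lambda is what lets quantized inputs through: any quantized tensor type is mapped to F16 in the output file, while F16 and F32 tensors keep their type. A minimal standalone restatement of that rule (the free-function name below is illustrative, not part of the commit):

```cpp
#include "ggml.h"

// Sketch of the output-type rule above: quantized source tensors are
// written back as F16; non-quantized types pass through unchanged.
// e.g. Q4_0 or Q8_0 -> F16, while F16 -> F16 and F32 -> F32.
static enum ggml_type merge_output_type(enum ggml_type src_type) {
    return ggml_is_quantized(src_type) ? GGML_TYPE_F16 : src_type;
}
```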
```diff
@@ -11379,8 +11383,10 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
     gguf_set_kv(ctx_out, mls[0]->ctx_gguf);
 
     // correct layer count for output model
-    // TODO: is this key "llama.block_count" this the same for all architectures?
-    gguf_set_val_u32(ctx_out, "llama.block_count", config->n_layers);
+    std::stringstream ss;
+    ss << mls[0]->get_arch_name() << ".block_count";
+    gguf_set_val_u32(ctx_out, ss.str().c_str(), config->n_layers);
+    printf("====> Set new value of %s = %ld\n", ss.str().c_str(), config->n_layers);
 
     // read input layers
     for (int i = 0; i < mls[0]->n_tensors; i++) {
```
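
This removes the hard-coded "llama.block_count" key: GGUF metadata keys are prefixed with the architecture name, so the layer count of the merged model is written under `<arch>.block_count`, taking the architecture from the first input model. A small sketch of the key construction in isolation (`arch_name` is a stand-in for `mls[0]->get_arch_name()`):

```cpp
#include <sstream>
#include <string>

// Builds the architecture-prefixed GGUF key used above,
// e.g. "llama.block_count" when arch_name == "llama".
static std::string block_count_key(const std::string & arch_name) {
    std::stringstream ss;
    ss << arch_name << ".block_count";
    return ss.str();
}
```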
```diff
@@ -11409,10 +11415,12 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
             struct ggml_tensor * ref_tensor = mls[0]->get_tensor_meta(ref_name.c_str()); // get ref tensor from layer 0
             memcpy(out_tensor, ref_tensor, GGML_TENSOR_SIZE); // copy metadata (shape, type,...)
             ggml_set_name(out_tensor, get_name(i_layer, ref_name).c_str()); // set the correct name (with correct i_layer)
+            out_tensor->type = get_output_type(ref_tensor); // maybe dequant
             output_tensors.push_back(out_tensor);
             gguf_add_tensor(ctx_out, out_tensor);
             LLAMA_LOG_INFO("%s\n", ggml_get_name(out_tensor));
         }
+        // TODO: how to reuse tensor (duplicated layers)? we can play with ctx->infos[tensor_idx].offset
     }
 
     const size_t meta_size = gguf_get_meta_size(ctx_out);
@@ -11436,27 +11444,47 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
     };
 
     // TODO: maybe we should use ggml_add and ggml_scale? and how?
-    auto calc_output_tensor = [&](enum ggml_type type, std::vector<no_init<uint8_t>> & in_buf, float scale, std::vector<no_init<uint8_t>> & out_buf) {
-        GGML_ASSERT(in_buf.size() == out_buf.size());
+    auto calc_output_tensor = [&](struct ggml_tensor * in_tensor, float scale, std::vector<no_init<uint8_t>> & out_buf) {
+        enum ggml_type type = in_tensor->type;
+        const size_t nelements = ggml_nelements(in_tensor);
+        std::vector<no_init<uint8_t>> tmp_buf;
+        void * in_buf = in_tensor->data;
+        // TODO: if the tensor is quantized, we dequantize to FP16 then to FP32, do the calculation and re-quant to FP16. how can we simplify?
+        if (ggml_is_quantized(type)) {
+            // dequantize it to FP32
+            std::vector<std::thread> workers;
+            int nthread = std::thread::hardware_concurrency();
+            workers.reserve(nthread);
+            std::vector<no_init<float>> f32_conv_buf;
+            llama_convert_tensor_internal(in_tensor, f32_conv_buf, workers, nelements, nthread);
+            // quantize back to FP16
+            type = GGML_TYPE_F16;
+            tmp_buf.resize(nelements * sizeof(ggml_fp16_t));
+            for (size_t i = 0; i < nelements; i++) {
+                ggml_fp16_t * dest = (ggml_fp16_t *) tmp_buf.data();
+                dest[i] = ggml_fp32_to_fp16(f32_conv_buf[i].value);
+            }
+            in_buf = tmp_buf.data();
+        }
+        // then the code block below will calculate the merged tensor
         if (type == GGML_TYPE_F16) {
-            GGML_ASSERT(in_buf.size() % sizeof(ggml_fp16_t) == 0);
-            for (size_t i = 0; i < in_buf.size() / sizeof(ggml_fp16_t); i++) {
-                ggml_fp16_t * in = (ggml_fp16_t *) in_buf.data();
+            out_buf.resize(nelements * sizeof(ggml_fp16_t));
+            for (size_t i = 0; i < nelements; i++) {
+                ggml_fp16_t * in = (ggml_fp16_t *) in_buf;
                 ggml_fp16_t * dest = (ggml_fp16_t *) out_buf.data();
                 float in_dequant = ggml_fp16_to_fp32(in[i]);
                 float res = in_dequant * scale;
                 dest[i] = ggml_fp32_to_fp16(res);
             }
         } else if (type == GGML_TYPE_F32) {
-            GGML_ASSERT(in_buf.size() % sizeof(float) == 0);
-            for (size_t i = 0; i < in_buf.size() / sizeof(float); i++) {
-                float * in = (float *) in_buf.data();
+            out_buf.resize(nelements * sizeof(float));
+            for (size_t i = 0; i < nelements; i++) {
+                float * in = (float *) in_buf;
                 float * dest = (float *) out_buf.data();
                 dest[i] = in[i] * scale;
             }
         } else {
-            LLAMA_LOG_ERROR("Only GGML_TYPE_F16 or GGML_TYPE_F32 is supported, current type = %s\n", ggml_type_name(type));
-            return -1; // return of lambda, no need clean up
+            GGML_ASSERT(false); // should never reach here
         }
         return 0; // return of lambda, no need clean up
     };
```
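
The rewritten `calc_output_tensor` boils the quantized path down to: dequantize the source tensor to FP32 with `llama_convert_tensor_internal`, repack it as FP16, then run the same scale loop as for a plain F16 tensor. The element-wise step is simply widen, scale, narrow. A self-contained sketch of that inner loop without the loader plumbing (the function name is illustrative, and how the per-model contributions are combined into the final output tensor is outside this hunk):

```cpp
#include <cstddef>

#include "ggml.h"

// Mirrors the F16 branch above: each element is widened to FP32,
// multiplied by the per-layer scale, and narrowed back to FP16.
static void scale_f16_buffer(const ggml_fp16_t * in, ggml_fp16_t * out,
                             size_t nelements, float scale) {
    for (size_t i = 0; i < nelements; i++) {
        const float x = ggml_fp16_to_fp32(in[i]);
        out[i] = ggml_fp32_to_fp16(x * scale);
    }
}
```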
```diff
@@ -11479,7 +11507,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
             read_tensor_data(in_tensor, *mls[0], buf); // read from first model
 
             n_done++;
-            LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], type = %6s\n",
+            LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], input type = %6s\n",
                 n_done, output_tensors.size(),
                 name.c_str(),
                 llama_format_tensor_shape(out_tensor).c_str(),
@@ -11492,8 +11520,8 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
 
         // process layer output tensor
         for (auto & out_tensor : output_tensors) {
-            std::vector<no_init<uint8_t>> in_buf(ggml_nbytes(out_tensor));
-            std::vector<no_init<uint8_t>> out_buf(ggml_nbytes(out_tensor));
+            std::vector<no_init<uint8_t>> in_buf;
+            std::vector<no_init<uint8_t>> out_buf;
 
             std::string out_name = ggml_get_name(out_tensor);
             int i_layer_out = get_i_layer(out_name.c_str());
@@ -11515,7 +11543,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
                     return -1;
                 }
                 read_tensor_data(in_tensor, *mls[i_model], in_buf);
-                res = calc_output_tensor(in_tensor->type, in_buf, scale, out_buf);
+                res = calc_output_tensor(in_tensor, scale, out_buf);
                 if (res < 0) {
                     clean_up();
                     return res;
@@ -11523,7 +11551,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
             }
 
             n_done++;
-            LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], type = %6s\n",
+            LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], output type = %6s\n",
                 n_done, output_tensors.size(),
                 out_name.c_str(),
                 llama_format_tensor_shape(out_tensor).c_str(),
```