first working version

ngxson 2024-02-26 22:31:25 +01:00
parent 48582575ab
commit df9fb7e7bf
4 changed files with 121 additions and 52 deletions

.gitignore

@@ -72,6 +72,7 @@ models-mnt
 /train-text-from-scratch
 /tokenize
 /vdot
+/merge
 /common/build-info.cpp
 arm_neon.h
 compile_commands.json

merge example source (file name not shown)

@@ -12,32 +12,107 @@
 // usage:
-// ./merge ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2
+// ./merge ./path/model_1 CONFIG1 ./path/model_2 CONFIG2
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2\n\n", executable);
-    printf(" LAYERS must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
-    //printf(" Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1\n");
-    printf(" The embedding layer of the first model will be used");
+    printf("usage: %s ./path/model_1 CONFIG1 ./path/model_2 CONFIG2\n\n", executable);
+    printf(" CONFIG must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
+    printf(" Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1. By default, the scale is 0.5. Layers are numbered starting from 0.\n");
+    printf(" The embedding layer of the first model will be used\n");
+    printf(" NOTE: currently, only F16 model type is supported\n");
     exit(1);
 }
+
+inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
+    size_t pos = 0;
+    std::string token;
+    std::vector<std::string> output;
+    while ((pos = str.find(delimiter)) != std::string::npos) {
+        token = str.substr(0, pos);
+        output.push_back(token);
+        str.erase(0, pos + delimiter.length());
+    }
+    output.push_back(str); // the rest
+    return output;
+}
+
+static std::vector<struct llama_merge_config> parse_config(std::string & input) {
+    std::vector<struct llama_merge_config> configs;
+    auto intervals = str_split(input, ",");
+    for (auto & interval : intervals) {
+        auto components = str_split(interval, "*");
+        if (components.empty()) {
+            throw std::runtime_error("Config is incorrect");
+        }
+        float scale = components.size() == 2
+            ? std::stof(components[1])
+            : 0.5; // by default
+        auto p0p1 = str_split(components[0], "-");
+        if (p0p1.empty()) {
+            throw std::runtime_error("Layer interval is invalid");
+        }
+        int p0 = std::stoi(p0p1[0]);
+        int p1 = p0p1.size() == 2 ? std::stoi(p0p1[1]) : p0;
+        if (p0 > p1) {
+            throw std::runtime_error("Layer interval is invalid, the start layer number is bigger than the end layer number (p0 > p1)");
+        }
+        for (int i = p0; i <= p1; i++) {
+            struct llama_merge_config conf{i, scale, scale};
+            configs.push_back(conf);
+        }
+        // TODO: maybe check for overlapping intervals?
+    }
+    return configs;
+}
 int main(int argc, char ** argv) {
     llama_backend_init();
-    llama_model_params model_params = llama_model_default_params();
-    std::vector<struct llama_merge_config> configs;
-    for (int i = 0; i < 100; i++) {
-        struct llama_merge_config conf{i, 0.0, 0.0};
-        configs.push_back(conf);
+    if (argc < 6) {
+        usage(argv[0]);
     }
+
+    std::string fname_model1(argv[1]);
+    std::string config_model1(argv[2]);
+    std::string fname_model2(argv[3]);
+    std::string config_model2(argv[4]);
+    std::string fname_output(argv[5]);
+
+    // TODO: add try catch
+    auto configs1 = parse_config(config_model1);
+    auto configs2 = parse_config(config_model2);
+    std::vector<struct llama_merge_config> configs;
+
+    if (configs1.size() != configs2.size()) {
+        fprintf(stderr, "Number of layers between 2 configs does not match, config1 has %ld layers and config2 has %ld layers\n", configs1.size(), configs2.size());
+    }
+
+    // merge 2 configs
+    printf("Merge configs:\n");
+    for (auto c1 : configs1) {
+        float scale2 = -1;
+        for (auto c2 : configs2) {
+            if (c2.i_layer == c1.i_layer) {
+                scale2 = c2.scale2;
+            }
+        }
+        if (scale2 < 0) {
+            fprintf(stderr, "Cannot find config for layer %d in CONFIG2\n", c1.i_layer);
+            exit(1);
+        }
+        struct llama_merge_config conf{c1.i_layer, c1.scale1, scale2};
+        configs.push_back(conf);
+        printf(" Layer %d: scale1 = %f, scale2 = %f\n", conf.i_layer, conf.scale1, conf.scale2);
+    }
+
     llama_merge_models(
-        "",
-        "",
+        fname_model1.c_str(),
+        fname_model2.c_str(),
         configs.data(),
-        100,
-        "/tmp/dolphin-test-merge.gguf"
+        configs.size(),
+        fname_output.c_str()
     );
+    std::cout << "done\n";
     llama_backend_free();
 }
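For readers skimming the diff: the CONFIG argument introduced above encodes per-layer scaling as comma-separated intervals, each optionally followed by *scale. Below is a minimal, self-contained sketch of that same interval syntax; LayerScale, split and parse are illustrative stand-ins for the commit's llama_merge_config, str_split and parse_config, so the snippet compiles without llama.h.

// Standalone sketch of the CONFIG interval syntax ("p0-p1*scale,...").
// LayerScale stands in for llama_merge_config; not part of the commit.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

struct LayerScale { int i_layer; float scale; };

static std::vector<std::string> split(std::string s, const std::string & delim) {
    std::vector<std::string> out;
    size_t pos;
    while ((pos = s.find(delim)) != std::string::npos) {
        out.push_back(s.substr(0, pos));
        s.erase(0, pos + delim.length());
    }
    out.push_back(s); // the rest
    return out;
}

// "0-5*0.5,6-7*1" -> layers 0..5 with scale 0.5, layers 6..7 with scale 1.0
static std::vector<LayerScale> parse(const std::string & input) {
    std::vector<LayerScale> out;
    for (const auto & interval : split(input, ",")) {
        const auto parts = split(interval, "*");
        const float scale = parts.size() == 2 ? std::stof(parts[1]) : 0.5f; // default scale
        const auto range = split(parts[0], "-");
        const int p0 = std::stoi(range[0]);
        const int p1 = range.size() == 2 ? std::stoi(range[1]) : p0; // single layer, e.g. "7"
        if (p0 > p1) {
            throw std::runtime_error("invalid interval: " + interval);
        }
        for (int i = p0; i <= p1; i++) {
            out.push_back({i, scale});
        }
    }
    return out;
}

int main() {
    for (const auto & ls : parse("0-5*0.5,6-7*1")) {
        printf("layer %2d -> scale %.2f\n", ls.i_layer, ls.scale);
    }
    return 0;
}

Run on its own, this prints layers 0-5 with scale 0.50 and layers 6-7 with scale 1.00; main() above builds the same kind of per-layer list from CONFIG1 and CONFIG2 and pairs them into llama_merge_config entries before calling llama_merge_models.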

llama.cpp

@@ -11126,13 +11126,6 @@ static int32_t llama_merge_models_internal(
     // process layers
     for (int i = 0; i < ml1.n_tensors; ++i) {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ 1000u*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx_ggml = ggml_init(params);
         struct ggml_tensor * tensor1 = ml1.get_tensor_meta(i);
         std::vector<no_init<uint8_t>> buf1;
         const std::string name = ggml_get_name(tensor1);
@@ -11142,54 +11135,54 @@
         std::vector<no_init<uint8_t>> buf2;
         struct ggml_tensor * tensor2 = ml2.get_tensor_meta(idx_ml2);
-        struct ggml_tensor * result;
+        // GGML_TYPE_F16
+        std::vector<no_init<uint8_t>> result(tensor_size);

         if (llama_format_tensor_shape(tensor1) != llama_format_tensor_shape(tensor2)) {
             LLAMA_LOG_ERROR("Tensor shapes are different\n");
+            return -1;
         }

-        int i_layer;
+        int i_layer = -1;
         if (sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
             // non-layer, simply copy
             read_tensor_data(tensor1, ml1, buf1);
-            result = tensor1; // no change
+            memcpy(result.data(), tensor1->data, tensor_size);
         } else {
-            LLAMA_LOG_INFO("i_layer %d\n", i_layer);
+            auto conf = get_config_for_layer(i_layer);
             read_tensor_data(tensor1, ml1, buf1);
             read_tensor_data(tensor2, ml2, buf2);
-            auto conf = get_config_for_layer(i_layer);
-            struct ggml_cgraph * gf = ggml_new_graph(ctx_ggml);
-            struct ggml_tensor * t1 = ggml_dup_tensor(ctx_ggml, tensor1);
-            struct ggml_tensor * t2 = ggml_dup_tensor(ctx_ggml, tensor2);
-            t1 = ggml_cpy(ctx_ggml, tensor1, t1);
-            t2 = ggml_cpy(ctx_ggml, tensor2, t2);
-            t1 = ggml_scale(ctx_ggml, t1, conf->scale1);
-            t2 = ggml_scale(ctx_ggml, t2, conf->scale2);
-            result = ggml_add(ctx_ggml, t1, t2);
-            ggml_build_forward_expand(gf, result);
-            ggml_graph_dump_dot(gf, NULL, "/tmp/___cgraph.txt");
-            ggml_graph_compute_with_ctx(ctx_ggml, gf, 1);
-        }
-
-        LLAMA_LOG_INFO("i_layer %d ===\n", i_layer);
+            LLAMA_LOG_INFO("Merge layer %d with scale1 = %f, scale2 = %f\n", i_layer, conf->scale1, conf->scale2);
+
+            if (tensor1->type == GGML_TYPE_F16 && tensor2->type == GGML_TYPE_F16) {
+                for (size_t i = 0; i < result.size() / sizeof(float); i++) {
+                    float * t1 = (float *) tensor1->data;
+                    float * t2 = (float *) tensor2->data;
+                    float * dest = (float *) result.data();
+                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
+                }
+            } else if (tensor1->type == GGML_TYPE_F32 && tensor2->type == GGML_TYPE_F32) {
+                for (size_t i = 0; i < result.size() / sizeof(double); i++) {
+                    double * t1 = (double *) tensor1->data;
+                    double * t2 = (double *) tensor2->data;
+                    double * dest = (double *) result.data();
+                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
+                }
+            } else {
+                LLAMA_LOG_ERROR("Only GGML_TYPE_F16 or GGML_TYPE_F32 is supported, current type = %s\n", ggml_type_name(tensor1->type));
+                return -1;
+            }
+        }

         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s\n",
             i + 1, ml1.n_tensors,
-            ggml_get_name(result),
-            llama_format_tensor_shape(result).c_str(),
-            ggml_type_name(result->type));
+            ggml_get_name(tensor1),
+            llama_format_tensor_shape(tensor1).c_str(),
+            ggml_type_name(tensor1->type));

-        //std::vector<no_init<uint8_t>> tensor_data(tensor_size);
-        //ggml_backend_tensor_get(tensor1, tensor_data.data(), 0, tensor_size);
         // write tensor data + padding
-        const char * buf = (const char *) result->data;
-        printf("%d %d\n", buf[0], buf[1]);
-        fout.write((const char *) result->data, tensor_size);
+        fout.write((const char *) result.data(), tensor_size);
         zeros(fout, GGML_PAD(tensor_size, GGUF_DEFAULT_ALIGNMENT) - tensor_size);
-        ggml_free(ctx_ggml);
-        if (i > 3) break;
     }

     // go back to beginning of file and write the updated meta data
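The per-layer branch in this hunk blends the two tensors element by element, dest[i] = t1[i] * scale1 + t2[i] * scale2, with the element count derived from the buffer size. As a minimal stand-alone sketch of that same blend, the helper below is written against the element widths ggml uses (2-byte ggml_fp16_t for GGML_TYPE_F16, 4-byte float for GGML_TYPE_F32) and the public ggml_fp16_to_fp32 / ggml_fp32_to_fp16 conversions from ggml.h; the name blend_tensor_data and its signature are illustrative, not part of this commit.

// Sketch only: element-wise weighted sum of two same-shape, same-type tensors.
// dst[i] = scale1 * src1[i] + scale2 * src2[i]
#include <cstddef>
#include "ggml.h"

bool blend_tensor_data(ggml_type type,
                       const void * src1, const void * src2, void * dst,
                       size_t n_bytes, float scale1, float scale2) {
    if (type == GGML_TYPE_F32) {
        const size_t n = n_bytes / sizeof(float); // F32 elements are 4 bytes
        const float * a = (const float *) src1;
        const float * b = (const float *) src2;
        float * d = (float *) dst;
        for (size_t i = 0; i < n; i++) {
            d[i] = a[i] * scale1 + b[i] * scale2;
        }
        return true;
    }
    if (type == GGML_TYPE_F16) {
        const size_t n = n_bytes / sizeof(ggml_fp16_t); // F16 elements are 2 bytes
        const ggml_fp16_t * a = (const ggml_fp16_t *) src1;
        const ggml_fp16_t * b = (const ggml_fp16_t *) src2;
        ggml_fp16_t * d = (ggml_fp16_t *) dst;
        for (size_t i = 0; i < n; i++) {
            // convert to float, blend, convert back to half precision
            const float x = ggml_fp16_to_fp32(a[i]) * scale1 + ggml_fp16_to_fp32(b[i]) * scale2;
            d[i] = ggml_fp32_to_fp16(x);
        }
        return true;
    }
    return false; // quantized types would need a dequantize/requantize step first
}

Quantized tensor types are not handled in this sketch, which matches the F16-only note printed by usage() in the merge example.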

merge (binary file not shown)