first working version

parent 48582575ab
commit df9fb7e7bf

4 changed files with 121 additions and 52 deletions

.gitignore (vendored) | 1

@@ -72,6 +72,7 @@ models-mnt
 /train-text-from-scratch
 /tokenize
 /vdot
+/merge
 /common/build-info.cpp
 arm_neon.h
 compile_commands.json

@@ -12,32 +12,107 @@
 // usage:
-// ./merge ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2
+// ./merge ./path/model_1 CONFIG1 ./path/model_2 CONFIG2
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2\n\n", executable);
+    printf("usage: %s ./path/model_1 CONFIG1 ./path/model_2 CONFIG2\n\n", executable);
-    printf("  LAYERS must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
+    printf("  CONFIG must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
-    //printf("  Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1\n");
+    printf("  Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1. By default, the scale is 0.5. Layer numbering starts from 0.\n");
-    printf("  The embedding layer of the first model will be used");
+    printf("  The embedding layer of the first model will be used\n");
+    printf("  NOTE: currently, only the F16 model type is supported\n");
     exit(1);
 }
 
+inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
+    size_t pos = 0;
+    std::string token;
+    std::vector<std::string> output;
+    while ((pos = str.find(delimiter)) != std::string::npos) {
+        token = str.substr(0, pos);
+        output.push_back(token);
+        str.erase(0, pos + delimiter.length());
+    }
+    output.push_back(str); // the rest
+    return output;
+}
+
+static std::vector<struct llama_merge_config> parse_config(std::string & input) {
+    std::vector<struct llama_merge_config> configs;
+    auto intervals = str_split(input, ",");
+    for (auto & interval : intervals) {
+        auto components = str_split(interval, "*");
+        if (components.empty()) {
+            throw std::runtime_error("Config is incorrect");
+        }
+        float scale = components.size() == 2
+            ? std::stof(components[1])
+            : 0.5; // by default
+        auto p0p1 = str_split(components[0], "-");
+        if (p0p1.empty()) {
+            throw std::runtime_error("Layer interval is invalid");
+        }
+        int p0 = std::stoi(p0p1[0]);
+        int p1 = p0p1.size() == 2 ? std::stoi(p0p1[1]) : p0;
+        if (p0 > p1) {
+            throw std::runtime_error("Layer interval is invalid: the start layer number is greater than the end layer number (p0 > p1)");
+        }
+        for (int i = p0; i <= p1; i++) {
+            struct llama_merge_config conf{i, scale, scale};
+            configs.push_back(conf);
+        }
+        // TODO: maybe check for overlapping intervals?
+    }
+    return configs;
+}
+
 int main(int argc, char ** argv) {
     llama_backend_init();
-    llama_model_params model_params = llama_model_default_params();
-    std::vector<struct llama_merge_config> configs;
-    for (int i = 0; i < 100; i++) {
-        struct llama_merge_config conf{i, 0.0, 0.0};
-        configs.push_back(conf);
-    }
+    if (argc < 6) {
+        usage(argv[0]);
+    }
+
+    std::string fname_model1(argv[1]);
+    std::string config_model1(argv[2]);
+    std::string fname_model2(argv[3]);
+    std::string config_model2(argv[4]);
+    std::string fname_output(argv[5]);
+
+    // TODO: add try catch
+    auto configs1 = parse_config(config_model1);
+    auto configs2 = parse_config(config_model2);
+    std::vector<struct llama_merge_config> configs;
+
+    if (configs1.size() != configs2.size()) {
+        fprintf(stderr, "Number of layers between the 2 configs does not match: config1 has %zu layers and config2 has %zu layers\n", configs1.size(), configs2.size());
+    }
+
+    // merge the 2 configs
+    printf("Merge configs:\n");
+    for (auto c1 : configs1) {
+        float scale2 = -1;
+        for (auto c2 : configs2) {
+            if (c2.i_layer == c1.i_layer) {
+                scale2 = c2.scale2;
+            }
+        }
+        if (scale2 < 0) {
+            fprintf(stderr, "Cannot find config for layer %d in CONFIG2\n", c1.i_layer);
+            exit(1);
+        }
+        struct llama_merge_config conf{c1.i_layer, c1.scale1, scale2};
+        configs.push_back(conf);
+        printf("  Layer %d: scale1 = %f, scale2 = %f\n", conf.i_layer, conf.scale1, conf.scale2);
+    }
 
     llama_merge_models(
-        "",
+        fname_model1.c_str(),
-        "",
+        fname_model2.c_str(),
         configs.data(),
-        100,
+        configs.size(),
-        "/tmp/dolphin-test-merge.gguf"
+        fname_output.c_str()
     );
-    std::cout << "done\n";
     llama_backend_free();
 }
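
A worked example of the CONFIG grammar above: "0-5*0.5,7,8-12" expands to layers 0 through 5 at scale 0.5, layer 7 at the default scale 0.5, and layers 8 through 12 at the default 0.5. A full invocation might look like this (the model paths here are hypothetical):

    ./merge ./models/model-a.gguf 0-15*0.6,16-31 ./models/model-b.gguf 0-31 ./models/merged.gguf

The sketch below condenses the committed parsing logic into a standalone program that can be compiled and tested on its own; merge_config is a local stand-in for llama_merge_config (declared in llama.h), and the empty-vector checks are dropped because str_split always returns at least one element:

    // Standalone sketch of the CONFIG parser from the diff above.
    // merge_config stands in for llama_merge_config (llama.h).
    #include <cstdio>
    #include <stdexcept>
    #include <string>
    #include <vector>

    struct merge_config {
        int   i_layer;
        float scale1;
        float scale2;
    };

    static std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
        std::vector<std::string> output;
        size_t pos = 0;
        while ((pos = str.find(delimiter)) != std::string::npos) {
            output.push_back(str.substr(0, pos));
            str.erase(0, pos + delimiter.length());
        }
        output.push_back(str); // the rest
        return output;
    }

    static std::vector<merge_config> parse_config(const std::string & input) {
        std::vector<merge_config> configs;
        for (auto & interval : str_split(input, ",")) {
            auto components = str_split(interval, "*");          // "p0-p1*scale" -> {"p0-p1", "scale"}
            float scale = components.size() == 2 ? std::stof(components[1]) : 0.5f; // 0.5 by default
            auto p0p1 = str_split(components[0], "-");           // "p0-p1" -> {"p0", "p1"}
            int p0 = std::stoi(p0p1[0]);
            int p1 = p0p1.size() == 2 ? std::stoi(p0p1[1]) : p0; // single layer: p1 == p0
            if (p0 > p1) {
                throw std::runtime_error("Layer interval is invalid (p0 > p1)");
            }
            for (int i = p0; i <= p1; i++) {
                configs.push_back({i, scale, scale});
            }
        }
        return configs;
    }

    int main() {
        for (auto & c : parse_config("0-5*0.5,7,8-12")) {
            printf("layer %2d: scale = %f\n", c.i_layer, c.scale1);
        }
    }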

llama.cpp | 67

@@ -11126,13 +11126,6 @@ static int32_t llama_merge_models_internal(
 
     // process layers
     for (int i = 0; i < ml1.n_tensors; ++i) {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ 1000u*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx_ggml = ggml_init(params);
-
         struct ggml_tensor * tensor1 = ml1.get_tensor_meta(i);
         std::vector<no_init<uint8_t>> buf1;
         const std::string name = ggml_get_name(tensor1);
@@ -11142,54 +11135,54 @@ static int32_t llama_merge_models_internal(
         std::vector<no_init<uint8_t>> buf2;
         struct ggml_tensor * tensor2 = ml2.get_tensor_meta(idx_ml2);
 
-        struct ggml_tensor * result;
+        // GGML_TYPE_F16
+        std::vector<no_init<uint8_t>> result(tensor_size);
 
         if (llama_format_tensor_shape(tensor1) != llama_format_tensor_shape(tensor2)) {
             LLAMA_LOG_ERROR("Tensor shapes are different\n");
+            return -1;
         }
 
-        int i_layer;
+        int i_layer = -1;
         if (sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
             // non-layer, simply copy
             read_tensor_data(tensor1, ml1, buf1);
-            result = tensor1; // no change
+            memcpy(result.data(), tensor1->data, tensor_size);
         } else {
-            LLAMA_LOG_INFO("i_layer %d\n", i_layer);
+            auto conf = get_config_for_layer(i_layer);
             read_tensor_data(tensor1, ml1, buf1);
             read_tensor_data(tensor2, ml2, buf2);
-            auto conf = get_config_for_layer(i_layer);
+            LLAMA_LOG_INFO("Merge layer %d with scale1 = %f, scale2 = %f\n", i_layer, conf->scale1, conf->scale2);
-            struct ggml_cgraph * gf = ggml_new_graph(ctx_ggml);
-            struct ggml_tensor * t1 = ggml_dup_tensor(ctx_ggml, tensor1);
-            struct ggml_tensor * t2 = ggml_dup_tensor(ctx_ggml, tensor2);
-            t1 = ggml_cpy(ctx_ggml, tensor1, t1);
-            t2 = ggml_cpy(ctx_ggml, tensor2, t2);
-            t1 = ggml_scale(ctx_ggml, t1, conf->scale1);
-            t2 = ggml_scale(ctx_ggml, t2, conf->scale2);
-            result = ggml_add(ctx_ggml, t1, t2);
-            ggml_build_forward_expand(gf, result);
-            ggml_graph_dump_dot(gf, NULL, "/tmp/___cgraph.txt");
-            ggml_graph_compute_with_ctx(ctx_ggml, gf, 1);
-        }
-
-        LLAMA_LOG_INFO("i_layer %d ===\n", i_layer);
+            if (tensor1->type == GGML_TYPE_F16 && tensor2->type == GGML_TYPE_F16) {
+                for (size_t i = 0; i < result.size() / sizeof(float); i++) {
+                    float * t1 = (float *) tensor1->data;
+                    float * t2 = (float *) tensor2->data;
+                    float * dest = (float *) result.data();
+                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
+                }
+            } else if (tensor1->type == GGML_TYPE_F32 && tensor2->type == GGML_TYPE_F32) {
+                for (size_t i = 0; i < result.size() / sizeof(double); i++) {
+                    double * t1 = (double *) tensor1->data;
+                    double * t2 = (double *) tensor2->data;
+                    double * dest = (double *) result.data();
+                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
+                }
+            } else {
+                LLAMA_LOG_ERROR("Only GGML_TYPE_F16 or GGML_TYPE_F32 is supported, current type = %s\n", ggml_type_name(tensor1->type));
+                return -1;
+            }
+        }
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s\n",
             i + 1, ml1.n_tensors,
-            ggml_get_name(result),
+            ggml_get_name(tensor1),
-            llama_format_tensor_shape(result).c_str(),
+            llama_format_tensor_shape(tensor1).c_str(),
-            ggml_type_name(result->type));
+            ggml_type_name(tensor1->type));
 
-        //std::vector<no_init<uint8_t>> tensor_data(tensor_size);
-        //ggml_backend_tensor_get(tensor1, tensor_data.data(), 0, tensor_size);
-
         // write tensor data + padding
-        const char * buf = (const char *) result->data;
-        printf("%d %d\n", buf[0], buf[1]);
-        fout.write((const char *) result->data, tensor_size);
+        fout.write((const char *) result.data(), tensor_size);
         zeros(fout, GGML_PAD(tensor_size, GGUF_DEFAULT_ALIGNMENT) - tensor_size);
-        ggml_free(ctx_ggml);
-
-        if (i > 3) break;
     }
 
     // go back to beginning of file and write the updated meta data
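
One caveat on the new element-wise loop: the intended math for every merged tensor is dest[i] = t1[i] * scale1 + t2[i] * scale2, but the F16 branch walks the raw buffers through float pointers and the F32 branch through double pointers, while ggml stores F16 as 2-byte half floats and F32 as 4-byte floats. A type-correct version might look like the following sketch, which assumes ggml.h's half-float helpers (ggml_fp16_t, ggml_fp16_to_fp32, ggml_fp32_to_fp16); it is a suggested shape for the loop, not the committed code:

    // Sketch: element-wise merge with element sizes matching the ggml types.
    // dst/src1/src2 each hold tensor_size bytes of raw tensor data of the given type.
    static int32_t merge_tensor_data(enum ggml_type type, size_t tensor_size,
                                     uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
                                     float scale1, float scale2) {
        if (type == GGML_TYPE_F16) {
            // F16 elements are 2 bytes: widen to f32, blend, narrow back
            const ggml_fp16_t * t1 = (const ggml_fp16_t *) src1;
            const ggml_fp16_t * t2 = (const ggml_fp16_t *) src2;
            ggml_fp16_t * out = (ggml_fp16_t *) dst;
            for (size_t i = 0; i < tensor_size / sizeof(ggml_fp16_t); i++) {
                out[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(t1[i]) * scale1 +
                                           ggml_fp16_to_fp32(t2[i]) * scale2);
            }
        } else if (type == GGML_TYPE_F32) {
            // F32 elements are 4-byte floats, not doubles
            const float * t1 = (const float *) src1;
            const float * t2 = (const float *) src2;
            float * out = (float *) dst;
            for (size_t i = 0; i < tensor_size / sizeof(float); i++) {
                out[i] = t1[i] * scale1 + t2[i] * scale2;
            }
        } else {
            return -1; // as in the diff: only F16 and F32 are handled
        }
        return 0;
    }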

merge | BIN
Binary file not shown.