first working version
This commit is contained in:
parent 48582575ab
commit df9fb7e7bf
4 changed files with 121 additions and 52 deletions
1 .gitignore (vendored)
@@ -72,6 +72,7 @@ models-mnt
/train-text-from-scratch
/tokenize
/vdot
/merge
/common/build-info.cpp
arm_neon.h
compile_commands.json
@@ -12,32 +12,107 @@
// usage:
// ./merge ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2
// ./merge ./path/model_1 CONFIG1 ./path/model_2 CONFIG2
//
[[noreturn]]
static void usage(const char * executable) {
    printf("usage: %s ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2\n\n", executable);
    printf(" LAYERS must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
    //printf(" Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1\n");
    printf(" The embedding layer of the first model will be used");
    printf("usage: %s ./path/model_1 CONFIG1 ./path/model_2 CONFIG2\n\n", executable);
    printf(" CONFIG must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
    printf(" Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1. By default, the scale is 0.5. Layer numbering starts from 0.\n");
    printf(" The embedding layer of the first model will be used\n");
    printf(" NOTE: currently, only F16 model type is supported\n");
    exit(1);
}

inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
    size_t pos = 0;
    std::string token;
    std::vector<std::string> output;
    while ((pos = str.find(delimiter)) != std::string::npos) {
        token = str.substr(0, pos);
        output.push_back(token);
        str.erase(0, pos + delimiter.length());
    }
    output.push_back(str); // the rest
    return output;
}

static std::vector<struct llama_merge_config> parse_config(std::string & input) {
    std::vector<struct llama_merge_config> configs;
    auto intervals = str_split(input, ",");
    for (auto & interval : intervals) {
        auto components = str_split(interval, "*");
        if (components.empty()) {
            throw std::runtime_error("Config is incorrect");
        }
        float scale = components.size() == 2
            ? std::stof(components[1])
            : 0.5; // by default
        auto p0p1 = str_split(components[0], "-");
        if (p0p1.empty()) {
            throw std::runtime_error("Layer interval is invalid");
        }
        int p0 = std::stoi(p0p1[0]);
        int p1 = p0p1.size() == 2 ? std::stoi(p0p1[1]) : p0;
        if (p0 > p1) {
            throw std::runtime_error("Layer interval is invalid, the end layer number is bigger than the start layer number (p0 > p1)");
        }
        for (int i = p0; i <= p1; i++) {
            struct llama_merge_config conf{i, scale, scale};
            configs.push_back(conf);
        }
        // TODO: maybe check for overlapping intervals?
    }
    return configs;
}

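To make the CONFIG grammar above concrete, here is a minimal, self-contained sketch of how a string such as "0-5*0.5,7" expands into per-layer scales. It mirrors str_split and parse_config, but substitutes a hypothetical LayerScale struct for llama_merge_config so that it compiles on its own:

// Illustrative sketch only: expands a CONFIG string ("p0-p1*scale,...") into
// (layer, scale) pairs. LayerScale is a stand-in for llama_merge_config.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

struct LayerScale { int i_layer; float scale; };

static std::vector<std::string> split(std::string s, const std::string & delim) {
    std::vector<std::string> out;
    size_t pos;
    while ((pos = s.find(delim)) != std::string::npos) {
        out.push_back(s.substr(0, pos));
        s.erase(0, pos + delim.length());
    }
    out.push_back(s); // the rest
    return out;
}

static std::vector<LayerScale> expand(const std::string & config) {
    std::vector<LayerScale> result;
    for (const auto & interval : split(config, ",")) {
        auto parts = split(interval, "*");                            // "p0-p1*scale" or "p0-p1"
        float scale = parts.size() == 2 ? std::stof(parts[1]) : 0.5f; // 0.5 is the default scale
        auto range = split(parts[0], "-");                            // "p0-p1" or a single "p0"
        int p0 = std::stoi(range[0]);
        int p1 = range.size() == 2 ? std::stoi(range[1]) : p0;
        if (p0 > p1) throw std::runtime_error("invalid layer interval");
        for (int i = p0; i <= p1; i++) result.push_back({i, scale});
    }
    return result;
}

int main() {
    // "0-5*0.5,7" -> layers 0..5 scaled by 0.5, layer 7 with the default 0.5
    for (const auto & ls : expand("0-5*0.5,7")) {
        printf("layer %d -> scale %.2f\n", ls.i_layer, ls.scale);
    }
}

As in parse_config, an interval without an explicit *scale falls back to 0.5, and a single number such as 7 is treated as the one-layer range 7-7.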
int main(int argc, char ** argv) {
    llama_backend_init();
    llama_model_params model_params = llama_model_default_params();
    std::vector<struct llama_merge_config> configs;
    for (int i = 0; i < 100; i++) {
        struct llama_merge_config conf{i, 0.0, 0.0};
        configs.push_back(conf);

    if (argc < 6) {
        usage(argv[0]);
    }

    std::string fname_model1(argv[1]);
    std::string config_model1(argv[2]);
    std::string fname_model2(argv[3]);
    std::string config_model2(argv[4]);
    std::string fname_output(argv[5]);

    // TODO: add try catch
    auto configs1 = parse_config(config_model1);
    auto configs2 = parse_config(config_model2);
    std::vector<struct llama_merge_config> configs;

    if (configs1.size() != configs2.size()) {
        fprintf(stderr, "Number of layers between 2 configs does not match, config1 has %ld layers and config2 has %ld layers\n", configs1.size(), configs2.size());
    }

    // merge 2 configs
    printf("Merge configs:\n");
    for (auto c1 : configs1) {
        float scale2 = -1;
        for (auto c2 : configs2) {
            if (c2.i_layer == c1.i_layer) {
                scale2 = c2.scale2;
            }
        }
        if (scale2 < 0) {
            fprintf(stderr, "Cannot find config for layer %d in CONFIG2\n", c1.i_layer);
            exit(1);
        }
        struct llama_merge_config conf{c1.i_layer, c1.scale1, scale2};
        configs.push_back(conf);

        printf(" Layer %d: scale1 = %f, scale2 = %f\n", conf.i_layer, conf.scale1, conf.scale2);
    }

    llama_merge_models(
        "",
        "",
        fname_model1.c_str(),
        fname_model2.c_str(),
        configs.data(),
        100,
        "/tmp/dolphin-test-merge.gguf"
        configs.size(),
        fname_output.c_str()
    );
    std::cout << "done\n";
    llama_backend_free();
}
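For reference, assuming the example builds to the ./merge binary that the .gitignore entry above ignores, an invocation following the new usage string could look like ./merge ./path/model_1 0-31*0.6 ./path/model_2 0-31*0.4 ./path/output.gguf. The 0-31 range is purely illustrative; it has to match the layer count of the two F16 models being merged, and both CONFIG strings must cover the same layers, since each layer takes scale1 from CONFIG1 and scale2 from CONFIG2.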
67 llama.cpp
@@ -11126,13 +11126,6 @@ static int32_t llama_merge_models_internal(
    // process layers
    for (int i = 0; i < ml1.n_tensors; ++i) {
        struct ggml_init_params params = {
            /*.mem_size =*/ 1000u*ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc =*/ true,
        };
        ggml_context * ctx_ggml = ggml_init(params);

        struct ggml_tensor * tensor1 = ml1.get_tensor_meta(i);
        std::vector<no_init<uint8_t>> buf1;
        const std::string name = ggml_get_name(tensor1);
@@ -11142,54 +11135,54 @@
        std::vector<no_init<uint8_t>> buf2;
        struct ggml_tensor * tensor2 = ml2.get_tensor_meta(idx_ml2);

        struct ggml_tensor * result;
        // GGML_TYPE_F16
        std::vector<no_init<uint8_t>> result(tensor_size);

        if (llama_format_tensor_shape(tensor1) != llama_format_tensor_shape(tensor2)) {
            LLAMA_LOG_ERROR("Tensor shapes are different\n");
            return -1;
        }

        int i_layer;
        int i_layer = -1;
        if (sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
            // non-layer, simply copy
            read_tensor_data(tensor1, ml1, buf1);
            result = tensor1; // no change
            memcpy(result.data(), tensor1->data, tensor_size);
        } else {
            LLAMA_LOG_INFO("i_layer %d\n", i_layer);
            auto conf = get_config_for_layer(i_layer);
            read_tensor_data(tensor1, ml1, buf1);
            read_tensor_data(tensor2, ml2, buf2);
            auto conf = get_config_for_layer(i_layer);
            struct ggml_cgraph * gf = ggml_new_graph(ctx_ggml);
            struct ggml_tensor * t1 = ggml_dup_tensor(ctx_ggml, tensor1);
            struct ggml_tensor * t2 = ggml_dup_tensor(ctx_ggml, tensor2);
            t1 = ggml_cpy(ctx_ggml, tensor1, t1);
            t2 = ggml_cpy(ctx_ggml, tensor2, t2);
            t1 = ggml_scale(ctx_ggml, t1, conf->scale1);
            t2 = ggml_scale(ctx_ggml, t2, conf->scale2);
            result = ggml_add(ctx_ggml, t1, t2);
            ggml_build_forward_expand(gf, result);
            ggml_graph_dump_dot(gf, NULL, "/tmp/___cgraph.txt");
            ggml_graph_compute_with_ctx(ctx_ggml, gf, 1);
        }
            LLAMA_LOG_INFO("Merge layer %d with scale1 = %f, scale2 = %f\n", i_layer, conf->scale1, conf->scale2);

        LLAMA_LOG_INFO("i_layer %d ===\n", i_layer);
            if (tensor1->type == GGML_TYPE_F16 && tensor2->type == GGML_TYPE_F16) {
                for (size_t i = 0; i < result.size() / sizeof(float); i++) {
                    float * t1 = (float *) tensor1->data;
                    float * t2 = (float *) tensor2->data;
                    float * dest = (float *) result.data();
                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
                }
            } else if (tensor1->type == GGML_TYPE_F32 && tensor2->type == GGML_TYPE_F32) {
                for (size_t i = 0; i < result.size() / sizeof(double); i++) {
                    double * t1 = (double *) tensor1->data;
                    double * t2 = (double *) tensor2->data;
                    double * dest = (double *) result.data();
                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
                }
            } else {
                LLAMA_LOG_ERROR("Only GGML_TYPE_F16 or GGML_TYPE_F32 is supported, current type = %s\n", ggml_type_name(tensor1->type));
                return -1;
            }
        }

        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s\n",
            i + 1, ml1.n_tensors,
            ggml_get_name(result),
            llama_format_tensor_shape(result).c_str(),
            ggml_type_name(result->type));

        //std::vector<no_init<uint8_t>> tensor_data(tensor_size);
        //ggml_backend_tensor_get(tensor1, tensor_data.data(), 0, tensor_size);
            ggml_get_name(tensor1),
            llama_format_tensor_shape(tensor1).c_str(),
            ggml_type_name(tensor1->type));

        // write tensor data + padding
        const char * buf = (const char *) result->data;
        printf("%d %d\n", buf[0], buf[1]);
        fout.write((const char *) result->data, tensor_size);
        fout.write((const char *) result.data(), tensor_size);
        zeros(fout, GGML_PAD(tensor_size, GGUF_DEFAULT_ALIGNMENT) - tensor_size);
        ggml_free(ctx_ggml);

        if (i > 3) break;
    }

    // go back to beginning of file and write the updated meta data
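The per-layer merge in this hunk boils down to a weighted element-wise sum, dest[i] = t1[i] * scale1 + t2[i] * scale2, applied to the two source tensors. A minimal standalone sketch of that operation on F32 data follows; the function name and the sample values are illustrative and not part of llama.cpp:

#include <cstdio>
#include <vector>

// Weighted element-wise merge of two equally sized buffers:
// dest[i] = a[i] * scale_a + b[i] * scale_b
static void merge_f32(const std::vector<float> & a, const std::vector<float> & b,
                      std::vector<float> & dest, float scale_a, float scale_b) {
    dest.resize(a.size());
    for (size_t i = 0; i < a.size(); i++) {
        dest[i] = a[i] * scale_a + b[i] * scale_b;
    }
}

int main() {
    std::vector<float> a = {1.0f, 2.0f, 3.0f};
    std::vector<float> b = {4.0f, 5.0f, 6.0f};
    std::vector<float> merged;
    merge_f32(a, b, merged, 0.5f, 0.5f);      // 0.5/0.5 is the tool's default blend
    for (float v : merged) printf("%f\n", v); // prints 2.5 3.5 4.5
}

With scale1 = scale2 = 0.5 the result is simply the average of the two models' weights for that layer.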
BIN merge
Binary file not shown.