wip: new format
This commit is contained in:
parent
52186adcbe
commit
65730438aa
8 changed files with 389 additions and 124 deletions
2
Makefile
2
Makefile
|
@ -705,7 +705,7 @@ quantize: examples/quantize/quantize.cpp build-info.o ggml.
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
merge: examples/merge/merge.cpp build-info.o ggml.o llama.o $(OBJS)
|
merge: examples/merge/merge.cpp examples/merge/parser.hpp build-info.o ggml.o llama.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
set(TARGET merge)
|
set(TARGET merge)
|
||||||
add_executable(${TARGET} merge.cpp)
|
add_executable(${TARGET} merge.cpp parser.hpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||||
|
|
|
@ -1,32 +0,0 @@
|
||||||
0,0.1,0,0.9
|
|
||||||
1,0.1,1,0.9
|
|
||||||
2,0.1,2,0.9
|
|
||||||
3,0.1,3,0.9
|
|
||||||
4,0.1,4,0.9
|
|
||||||
5,0.1,5,0.9
|
|
||||||
6,0.1,6,0.9
|
|
||||||
7,0.1,7,0.9
|
|
||||||
8,0.1,8,0.9
|
|
||||||
9,0.1,9,0.9
|
|
||||||
10,0.5,10,0.5
|
|
||||||
11,0.5,11,0.5
|
|
||||||
12,0.5,12,0.5
|
|
||||||
13,0.5,13,0.5
|
|
||||||
14,0.5,14,0.5
|
|
||||||
15,0.5,15,0.5
|
|
||||||
16,0.5,16,0.5
|
|
||||||
17,0.5,17,0.5
|
|
||||||
18,0.5,18,0.5
|
|
||||||
19,0.5,19,0.5
|
|
||||||
20,0.5,20,0.5
|
|
||||||
21,0.5,21,0.5
|
|
||||||
22,0.5,22,0.5
|
|
||||||
23,0.5,23,0.5
|
|
||||||
24,0.5,24,0.5
|
|
||||||
25,0.5,25,0.5
|
|
||||||
26,0.5,26,0.5
|
|
||||||
27,0.5,27,0.5
|
|
||||||
28,0.5,28,0.5
|
|
||||||
29,0.5,29,0.5
|
|
||||||
20,0.5,20,0.5
|
|
||||||
31,0.5,31,0.5
|
|
|
38
examples/merge/config.example.txt
Normal file
38
examples/merge/config.example.txt
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
# GGUF merge instructions
|
||||||
|
#
|
||||||
|
# Lines starting with "#" are comments
|
||||||
|
# Empty lines will be ignored
|
||||||
|
# The "output layer N" instruction starts a new layer of the output model (N starts at 0 and must increase by 1 each time)
|
||||||
|
# Merge instruction is in format: target (space) verb (space) parameters
|
||||||
|
# Supported verbs:
|
||||||
|
# - linear: merge linearly, parameters: source_layer,source_layer,scale,scale
|
||||||
|
# - slerp: spherical linear interpolation, parameters: source_layer,source_layer,t
|
||||||
|
# - repeat: repeat a layer in the same output model (to reduce file size), parameters: output_layer
|
||||||
|
#
|
||||||
|
# For example:
|
||||||
|
#
|
||||||
|
# This is the first layer of output model:
|
||||||
|
# For all tensors, we want slerp(model[0].layer[0], model[1].layer[0], 0.1)
|
||||||
|
# Except for the "attn_output" tensor, where we want t=0.5 instead of t=0.1
|
||||||
|
|
||||||
|
output layer 0
|
||||||
|
all slerp 0,0,0.1
|
||||||
|
attn_output slerp 0,0,0.5
|
||||||
|
|
||||||
|
# For next layer, we want: model[0].layer[1]*0.6 + model[1].layer[1]*0.4
|
||||||
|
# Except for "attn_output" tensor that we want to use slerp with t=0.9
|
||||||
|
|
||||||
|
output layer 1
|
||||||
|
all linear 1,1,0.6,0.4
|
||||||
|
attn_output slerp 1,1,0.9
|
||||||
|
|
||||||
|
output layer 2
|
||||||
|
all linear 2,2,1.0,0.0
|
||||||
|
|
||||||
|
# repeat the first layers defined earlier in this file
|
||||||
|
|
||||||
|
output layer 3
|
||||||
|
all repeat 0
|
||||||
|
|
||||||
|
output layer 4
|
||||||
|
all repeat 1
|
|
@ -1,5 +1,6 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "parser.hpp"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
@ -10,10 +11,14 @@
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
|
static const size_t n_models = 2; // hard-limited to 2 input models for now
|
||||||
|
|
||||||
struct merge_params {
|
struct merge_params {
|
||||||
std::string config_path = "merge.csv";
|
std::string config_path = "config.txt";
|
||||||
std::vector<std::string> model_paths;
|
std::vector<std::string> model_paths;
|
||||||
std::string output_path = "ggml-merged-f16.gguf";
|
std::string output_path = "ggml-merged-f16.gguf";
|
||||||
|
bool only_list_tensors_name = false;
|
||||||
|
bool dry_run = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
[[noreturn]]
|
[[noreturn]]
|
||||||
|
@ -21,21 +26,11 @@ static void usage(const char * executable, int exit_code) {
|
||||||
struct merge_params defaults;
|
struct merge_params defaults;
|
||||||
printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
|
printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Merging 2 models and change layers configuration.\n");
|
printf("Merging multiple models, inspired by mergekit.\n");
|
||||||
printf("Merge config format is CSV, without header, one line represents one layer of the output model, columns in the order below:\n");
|
printf("For more details, see \"config.example.txt\" file.\n");
|
||||||
printf("- Model A layer\n");
|
|
||||||
printf("- Model A scale\n");
|
|
||||||
printf("- Model B layer\n");
|
|
||||||
printf("- Model B scale\n");
|
|
||||||
printf("- ...\n");
|
|
||||||
printf("\n");
|
|
||||||
printf("For example:\n");
|
|
||||||
printf("0,1.0,0,0.0 meaning: output layer 0 = A[0]*1.0 + B[0]*0.0\n");
|
|
||||||
printf("0,1.0,0,0.0 meaning: output layer 1 = A[0]*1.0 + B[0]*0.0\n");
|
|
||||||
printf("1,0.0,2,0.0 meaning: output layer 2 = A[1]*0.0 + B[2]*0.0\n");
|
|
||||||
printf("2,0.5,1,0.5 meaning: output layer 3 = A[2]*0.5 + B[1]*0.5\n");
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("NOTE:\n");
|
printf("NOTE:\n");
|
||||||
|
printf("- Only support merging 2 models.\n");
|
||||||
printf("- The embedding and output layers of the first model will be used.\n");
|
printf("- The embedding and output layers of the first model will be used.\n");
|
||||||
printf("- Currently, we accept both quantized and non-quantized models as input. The output model will be re-quantized into the same format of the first model.\n");
|
printf("- Currently, we accept both quantized and non-quantized models as input. The output model will be re-quantized into the same format of the first model.\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
@ -44,64 +39,13 @@ static void usage(const char * executable, int exit_code) {
|
||||||
printf(" -c, --config CONFIG_FILE Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
|
printf(" -c, --config CONFIG_FILE Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
|
||||||
printf(" -m, --model MODEL_PATH Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
|
printf(" -m, --model MODEL_PATH Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
|
||||||
printf(" -o, --output OUTPUT_FILE Path to the output model (default: %s)\n", defaults.output_path.c_str());
|
printf(" -o, --output OUTPUT_FILE Path to the output model (default: %s)\n", defaults.output_path.c_str());
|
||||||
|
printf(" --dry-run Only print out list of parsed and exit, useful for debugging\n");
|
||||||
|
printf(" --print-list-tensor Only print out list of tensors of the input model, useful for debugging (only one model is accepted)\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Example: ./merge -c config.csv -o output.gguf -m model_a.gguf -m model_b.gguf\n");
|
printf("Example: ./merge -c config.txt -o output.gguf -m model_a.gguf -m model_b.gguf\n");
|
||||||
exit(exit_code);
|
exit(exit_code);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
|
|
||||||
size_t pos = 0;
|
|
||||||
std::string token;
|
|
||||||
std::vector<std::string> output;
|
|
||||||
while ((pos = str.find(delimiter)) != std::string::npos) {
|
|
||||||
token = str.substr(0, pos);
|
|
||||||
output.push_back(token);
|
|
||||||
str.erase(0, pos + delimiter.length());
|
|
||||||
}
|
|
||||||
output.push_back(str); // the rest
|
|
||||||
return output;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::vector<struct llama_merge_layer> parse_config(std::string & config_path, size_t n_models, std::vector<int> & buf_srcs, std::vector<float> & buf_scales) {
|
|
||||||
// read file
|
|
||||||
std::ifstream file(config_path);
|
|
||||||
if (!file.is_open()) {
|
|
||||||
throw std::runtime_error("Unable to open file merge config file");
|
|
||||||
}
|
|
||||||
std::ostringstream content;
|
|
||||||
content << file.rdbuf(); // Read the entire file into the stringstream
|
|
||||||
file.close();
|
|
||||||
|
|
||||||
// allocate memory
|
|
||||||
auto lines = str_split(content.str(), "\n");
|
|
||||||
buf_srcs.resize(lines.size()*n_models);
|
|
||||||
buf_scales.resize(lines.size()*n_models);
|
|
||||||
|
|
||||||
// process line by line, one line is one layer
|
|
||||||
std::cout << "Parsing configurations:\n";
|
|
||||||
std::vector<struct llama_merge_layer> layers;
|
|
||||||
for (size_t i_layer = 0; i_layer < lines.size(); i_layer++) {
|
|
||||||
std::cout << "- Layer " << i_layer << " =" << std::flush;
|
|
||||||
auto columns = str_split(lines[i_layer], ",");
|
|
||||||
if (columns.size() != n_models*2) {
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << "error: line " << i_layer+1 << " is malformed. Expect to have exactly " << n_models*2 << " columns, but got " << columns.size() << " columns";
|
|
||||||
throw std::runtime_error(ss.str());
|
|
||||||
}
|
|
||||||
int * srcs = buf_srcs.data() + i_layer*n_models;
|
|
||||||
float * scales = buf_scales.data() + i_layer*n_models;
|
|
||||||
for (size_t i_model = 0; i_model < n_models; i_model++) {
|
|
||||||
srcs[i_model] = std::stoi(columns[i_model*2]);
|
|
||||||
scales[i_model] = std::stof(columns[i_model*2 + 1]);
|
|
||||||
// debug message
|
|
||||||
std::cout << " + model[" << i_model << "].layer[" << srcs[i_model] << "]*" << scales[i_model] << std::flush;
|
|
||||||
}
|
|
||||||
layers.push_back(llama_merge_layer{srcs, scales, -1});
|
|
||||||
std::cout << "\n";
|
|
||||||
}
|
|
||||||
return layers;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
bool invalid_param = false;
|
bool invalid_param = false;
|
||||||
struct merge_params params;
|
struct merge_params params;
|
||||||
|
@ -129,37 +73,54 @@ int main(int argc, char ** argv) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.output_path = argv[i];
|
params.output_path = argv[i];
|
||||||
|
} else if (arg == "--print-list-tensor") {
|
||||||
|
params.only_list_tensors_name = true;
|
||||||
|
} else if (arg == "--dry-run") {
|
||||||
|
params.dry_run = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (invalid_param) {
|
if (invalid_param) {
|
||||||
|
usage(argv[0], 1);
|
||||||
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
|
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
|
||||||
} else if (params.model_paths.size() < 2) {
|
} else if (!params.only_list_tensors_name && params.model_paths.size() < 2) {
|
||||||
throw std::invalid_argument("error: require at least 2 models");
|
throw std::invalid_argument("error: require at least 2 models");
|
||||||
}
|
}
|
||||||
|
|
||||||
// buffers to hold allocated data
|
if (params.only_list_tensors_name) {
|
||||||
std::vector<int> buf_srcs;
|
if (params.model_paths.size() != 1) {
|
||||||
std::vector<float> buf_scales;
|
throw std::invalid_argument("error: we can only list tensors of one single model");
|
||||||
|
}
|
||||||
|
print_model_tensors_name(params.model_paths[0]);
|
||||||
|
return 0; // exit now
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t n_layers = 0;
|
||||||
|
auto instructions = parse_config(params.config_path, params.model_paths[0], n_layers);
|
||||||
|
|
||||||
|
if (params.dry_run) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
auto layers = parse_config(params.config_path, params.model_paths.size(), buf_srcs, buf_scales);
|
|
||||||
std::vector<const char*> p_model_paths;
|
std::vector<const char*> p_model_paths;
|
||||||
for (auto & m : params.model_paths) {
|
for (auto & m : params.model_paths) {
|
||||||
p_model_paths.push_back(m.data());
|
p_model_paths.push_back(m.data());
|
||||||
}
|
}
|
||||||
const struct llama_merge_config config{
|
struct llama_merge_config config{
|
||||||
p_model_paths.data(),
|
{
|
||||||
p_model_paths.size(),
|
params.model_paths[0].c_str(),
|
||||||
layers.data(),
|
params.model_paths[1].c_str(),
|
||||||
layers.size(),
|
},
|
||||||
params.output_path.data(),
|
instructions.data(),
|
||||||
|
instructions.size(),
|
||||||
|
n_layers,
|
||||||
|
params.output_path.c_str(),
|
||||||
};
|
};
|
||||||
|
|
||||||
llama_merge_models(&config);
|
llama_merge_models(&config);
|
||||||
} catch (const std::exception & ex) {
|
} catch (const std::exception & ex) {
|
||||||
std::cerr << ex.what() << "\n\n";
|
std::cerr << ex.what() << "\n\n";
|
||||||
usage(argv[0], 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
269
examples/merge/parser.hpp
Normal file
269
examples/merge/parser.hpp
Normal file
|
@ -0,0 +1,269 @@
|
||||||
|
#include "common.h"
#include "llama.h"

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string.h>
#include <unordered_map>
#include <vector>
|
||||||
|
|
||||||
|
// Trim whitespace from the beginning and end of a string.
// Returns a new string; the input is left untouched. "Whitespace" is whatever
// isspace() reports (space, \t, \n, \r, \v, \f in the default locale).
static std::string str_trim(const std::string & str) {
    size_t start = 0;
    size_t end   = str.size();
    // isspace() is undefined for negative values other than EOF, so every byte
    // is converted to unsigned char first (matters for bytes >= 0x80).
    while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
        start += 1;
    }
    while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
        end -= 1;
    }
    return str.substr(start, end - start);
}
|
||||||
|
|
||||||
|
// Split `str` on every occurrence of `delimiter`.
// Empty tokens are kept, so the result always has (number of delimiters + 1)
// elements. An empty delimiter cannot match anything meaningful; in that case
// the whole input is returned as a single token.
inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
    std::vector<std::string> output;
    if (delimiter.empty()) {
        // find("") always succeeds at the current position, which would never
        // advance; treat it as "no delimiter present"
        output.push_back(str);
        return output;
    }
    size_t start = 0;
    size_t pos   = 0;
    // walk forward with an offset instead of erasing the consumed prefix
    while ((pos = str.find(delimiter, start)) != std::string::npos) {
        output.push_back(str.substr(start, pos - start));
        start = pos + delimiter.length();
    }
    output.push_back(str.substr(start)); // the rest
    return output;
}
|
||||||
|
|
||||||
|
/////////////////////////////////
|
||||||
|
|
||||||
|
// dump a list of tensor name of the input model
|
||||||
|
static std::vector<std::string> get_list_tensors_name(std::string & model_path) {
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
|
||||||
|
size_t n_tensors = llama_get_all_tensors_name(model, nullptr, 0);
|
||||||
|
std::vector<const char *> list(n_tensors, nullptr);
|
||||||
|
llama_get_all_tensors_name(model, list.data(), list.size());
|
||||||
|
// copy the result
|
||||||
|
std::vector<std::string> results;
|
||||||
|
for (auto & name : list) {
|
||||||
|
results.push_back(std::string(name));
|
||||||
|
}
|
||||||
|
llama_free_model(model);
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_model_tensors_name(std::string & model_path) {
|
||||||
|
auto tensors = get_list_tensors_name(model_path);
|
||||||
|
std::cout << "\n\n===================\n";
|
||||||
|
std::cout << "Total number of tensors: " << tensors.size() << "\n";
|
||||||
|
std::vector<const char *> list(tensors.size(), nullptr);
|
||||||
|
for (size_t i = 0; i < tensors.size(); i++) {
|
||||||
|
char buf[128];
|
||||||
|
sprintf(buf, "%4ld: %s", i, tensors[i].c_str());
|
||||||
|
std::cout << buf << "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/////////////////////////////////
|
||||||
|
|
||||||
|
// get layer index from tensor name, for example "blk.x.attn_norm.weight"
// returns -1 if it is non-layer, i.e. the name does not start with
// "blk.<number>." or the number is negative
static int get_i_layer(std::string tensor_name) {
    int i_layer  = -1;
    int consumed = 0;
    // sscanf() only counts conversions, so a name like "blk.7" would match
    // even without the dot after the number. %n is reached only when that dot
    // was matched too, so `consumed` staying 0 means the prefix was incomplete.
    if (sscanf(tensor_name.c_str(), "blk.%d.%n", &i_layer, &consumed) != 1 || consumed == 0) {
        return -1;
    }
    return i_layer < 0 ? -1 : i_layer;
}
|
||||||
|
|
||||||
|
static void print_inst(struct llama_merge_inst inst) {
|
||||||
|
std::cout << "Output: " << inst.name << "\n";
|
||||||
|
switch (inst.method) {
|
||||||
|
case LLAMA_MERGE_LINEAR:
|
||||||
|
std::cout << " Linear\n";
|
||||||
|
std::cout << " Model A: " << inst.scales[0] << " * " << inst.srcs[0] << "\n";
|
||||||
|
std::cout << " Model B: " << inst.scales[1] << " * " << inst.srcs[1] << "\n";
|
||||||
|
break;
|
||||||
|
case LLAMA_MERGE_SLERP:
|
||||||
|
std::cout << " SLERP\n";
|
||||||
|
std::cout << " t=" << inst.t << "\n";
|
||||||
|
std::cout << " Model A: " << inst.srcs[0] << "\n";
|
||||||
|
std::cout << " Model B: " << inst.srcs[1] << "\n";
|
||||||
|
break;
|
||||||
|
case LLAMA_MERGE_COPY:
|
||||||
|
std::cout << " Copy from model A: "<< inst.srcs[0] << "\n";
|
||||||
|
break;
|
||||||
|
case LLAMA_MERGE_REPEAT:
|
||||||
|
std::cout << " Repeat from output model: " << inst.srcs[0] << "\n";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<struct llama_merge_inst> parse_config(std::string & config_path, std::string & model_path, size_t & n_layers) {
|
||||||
|
std::vector<struct llama_merge_inst> instructions;
|
||||||
|
|
||||||
|
// read file
|
||||||
|
std::ifstream file(config_path);
|
||||||
|
if (!file.is_open()) {
|
||||||
|
throw std::runtime_error("Unable to open file merge config file");
|
||||||
|
}
|
||||||
|
std::ostringstream content;
|
||||||
|
content << file.rdbuf(); // Read the entire file into the stringstream
|
||||||
|
auto lines = str_split(content.str(), "\n");
|
||||||
|
file.close();
|
||||||
|
|
||||||
|
// get list of input tensors
|
||||||
|
auto inp_names = get_list_tensors_name(model_path);
|
||||||
|
std::set<std::string> units; // name of units, for example "attn_output"
|
||||||
|
for (auto & name : inp_names) {
|
||||||
|
int il = get_i_layer(name);
|
||||||
|
if (il < 0) {
|
||||||
|
// non-layer, only copy
|
||||||
|
struct llama_merge_inst ins;
|
||||||
|
ins.method = LLAMA_MERGE_COPY;
|
||||||
|
strcpy(ins.name, name.c_str());
|
||||||
|
strcpy(ins.srcs[0], name.c_str());
|
||||||
|
instructions.push_back(ins);
|
||||||
|
} else {
|
||||||
|
// tensor belong to layer
|
||||||
|
auto parts = str_split(name, ".");
|
||||||
|
units.insert(parts[2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "List of units:\n";
|
||||||
|
for (auto & u : units) std::cout << u << "\n";
|
||||||
|
std::cout << "\n";
|
||||||
|
|
||||||
|
// process line by line, one line is one layer
|
||||||
|
std::unordered_map<std::string, struct llama_merge_inst> layer; // map tensor name to instruction
|
||||||
|
bool is_layer_empty = true;
|
||||||
|
int i_layer = -1;
|
||||||
|
auto get_tensor_name = [&](int layer, std::string unit) {
|
||||||
|
return "blk." + std::to_string(layer) + "." + unit + ".weight";
|
||||||
|
};
|
||||||
|
auto push_output_layer = [&]() {
|
||||||
|
if (!is_layer_empty) {
|
||||||
|
for (auto & it : layer) {
|
||||||
|
instructions.push_back(it.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
layer.clear();
|
||||||
|
is_layer_empty = true;
|
||||||
|
};
|
||||||
|
auto new_output_layer = [&]() {
|
||||||
|
layer.clear();
|
||||||
|
for (auto & u : units) {
|
||||||
|
struct llama_merge_inst ins;
|
||||||
|
strcpy(ins.name, get_tensor_name(i_layer, u).c_str());
|
||||||
|
layer[u] = ins;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto raise_err = [&](size_t i_line, std::string message) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "Parse error: (line " << i_line + 1 << ") " << message;
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
};
|
||||||
|
|
||||||
|
for (size_t i_line = 0 ; i_line < lines.size(); i_line++) {
|
||||||
|
auto line = str_trim(lines[i_line]);
|
||||||
|
if (line.empty() || line.c_str()[0] == '#') {
|
||||||
|
continue; // skip empty line or comment
|
||||||
|
}
|
||||||
|
|
||||||
|
auto parts = str_split(line, " ");
|
||||||
|
if (parts.size() != 3) {
|
||||||
|
raise_err(i_line, "does not follow format: \"target (space) verb (space) arguments\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto target = parts[0];
|
||||||
|
auto verb = parts[1];
|
||||||
|
auto params = str_split(parts[2], ",");
|
||||||
|
|
||||||
|
if (target == "output" && verb == "layer") {
|
||||||
|
int il_curr = std::stoi(params[0]);
|
||||||
|
if (i_layer + 1 != il_curr) {
|
||||||
|
raise_err(i_line, "new layer number must be (last layer number + 1)");
|
||||||
|
}
|
||||||
|
push_output_layer();
|
||||||
|
i_layer = il_curr;
|
||||||
|
new_output_layer();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto linear = [&](struct llama_merge_inst & ins, std::string unit) {
|
||||||
|
if (params.size() != 4) {
|
||||||
|
raise_err(i_line, "verb \"linear\" requires exactly 4 params");
|
||||||
|
}
|
||||||
|
ins.method = LLAMA_MERGE_LINEAR;
|
||||||
|
int src0 = std::stoi(params[0]);
|
||||||
|
int src1 = std::stoi(params[1]);
|
||||||
|
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
|
||||||
|
strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str());
|
||||||
|
ins.scales[0] = std::stof(params[2]);
|
||||||
|
ins.scales[1] = std::stof(params[3]);
|
||||||
|
is_layer_empty = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto slerp = [&](struct llama_merge_inst & ins, std::string unit) {
|
||||||
|
if (params.size() != 3) {
|
||||||
|
raise_err(i_line, "verb \"slerp\" requires exactly 3 params");
|
||||||
|
}
|
||||||
|
ins.method = LLAMA_MERGE_SLERP;
|
||||||
|
int src0 = std::stoi(params[0]);
|
||||||
|
int src1 = std::stoi(params[1]);
|
||||||
|
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
|
||||||
|
strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str());
|
||||||
|
ins.t = std::stof(params[2]);
|
||||||
|
is_layer_empty = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto repeat = [&](struct llama_merge_inst & ins, std::string unit) {
|
||||||
|
if (params.size() != 1) {
|
||||||
|
raise_err(i_line, "verb \"repeat\" requires exactly 1 param");
|
||||||
|
}
|
||||||
|
ins.method = LLAMA_MERGE_REPEAT;
|
||||||
|
int src0 = std::stoi(params[0]);
|
||||||
|
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
|
||||||
|
is_layer_empty = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto apply_verb = [&](struct llama_merge_inst & ins, std::string unit) {
|
||||||
|
if (verb == "linear") {
|
||||||
|
linear(ins, unit);
|
||||||
|
} else if (verb == "slerp") {
|
||||||
|
slerp(ins, unit);
|
||||||
|
} else if (verb == "repeat") {
|
||||||
|
repeat(ins, unit);
|
||||||
|
} else {
|
||||||
|
raise_err(i_line, "invalid verb: " + verb);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if (target == "all") {
|
||||||
|
for (auto & u : units) {
|
||||||
|
apply_verb(layer[u], u);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (units.find(target) == units.end()) {
|
||||||
|
raise_err(i_line, "unit " + target + " does not exist");
|
||||||
|
}
|
||||||
|
apply_verb(layer[target], target);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
push_output_layer();
|
||||||
|
n_layers = i_layer + 1;
|
||||||
|
|
||||||
|
// print all parsed instructions
|
||||||
|
std::cout << "Parsed instructions:\n";
|
||||||
|
for (auto & ins : instructions) {
|
||||||
|
print_inst(ins);
|
||||||
|
}
|
||||||
|
std::cout << "---\n" << "Total output layers: " << n_layers << "\n";
|
||||||
|
|
||||||
|
return instructions;
|
||||||
|
}
|
15
llama.cpp
15
llama.cpp
|
@ -11358,7 +11358,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
|
||||||
#else
|
#else
|
||||||
constexpr bool use_mmap = false;
|
constexpr bool use_mmap = false;
|
||||||
#endif
|
#endif
|
||||||
|
/*
|
||||||
// std::move doesn't work with llama_model and llama_model_loader, why?
|
// std::move doesn't work with llama_model and llama_model_loader, why?
|
||||||
std::vector<std::unique_ptr<llama_model>> models;
|
std::vector<std::unique_ptr<llama_model>> models;
|
||||||
std::vector<std::unique_ptr<llama_model_loader>> mls;
|
std::vector<std::unique_ptr<llama_model_loader>> mls;
|
||||||
|
@ -11610,6 +11610,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
|
||||||
}
|
}
|
||||||
|
|
||||||
clean_up();
|
clean_up();
|
||||||
|
*/
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12479,6 +12480,18 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
|
||||||
return nparams;
|
return nparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size) {
|
||||||
|
size_t i = 0;
|
||||||
|
for (const auto & it : model->tensors_by_name) {
|
||||||
|
if (i == arr_size) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
name_arr[i] = it.first.c_str();
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
return model->tensors_by_name.size();
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
|
struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
|
||||||
auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
|
auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
|
||||||
[name](const std::pair<std::string, struct ggml_tensor *> & it) {
|
[name](const std::pair<std::string, struct ggml_tensor *> & it) {
|
||||||
|
|
34
llama.h
34
llama.h
|
@ -327,18 +327,30 @@ extern "C" {
|
||||||
const char * content;
|
const char * content;
|
||||||
} llama_chat_message;
|
} llama_chat_message;
|
||||||
|
|
||||||
// used to merge models
|
enum llama_merge_method {
|
||||||
struct llama_merge_layer {
|
LLAMA_MERGE_LINEAR,
|
||||||
const int * srcs; // contains n_models elements, if nullptr then we reuse other layer
|
LLAMA_MERGE_SLERP,
|
||||||
const float * scales; // contains n_models elements, if nullptr then we reuse other layer
|
LLAMA_MERGE_REPEAT,
|
||||||
const int i_layer_reuse; // if != -1, then reuse earlier layer in the model to reduce output size
|
LLAMA_MERGE_COPY,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// instruction for merging tensors (model merge)
// One instruction describes how a single output tensor is produced.
// `method` decides which of the remaining fields are meaningful.
struct llama_merge_inst {
    char name[GGML_MAX_NAME]; // name of output tensor (NUL-terminated)
    enum llama_merge_method method;
    // we only support 2 models for now
    // NOTE(review): srcs[0] appears to name a tensor of the first model and
    // srcs[1] one of the second; for LLAMA_MERGE_REPEAT srcs[0] presumably
    // names a tensor of the output model instead - confirm against the merge
    // implementation
    char srcs[2][GGML_MAX_NAME]; // name of input tensors
    float scales[2]; // for linear method
    float t; // for slerp method
};
|
||||||
|
|
||||||
|
// merge models
|
||||||
struct llama_merge_config {
|
struct llama_merge_config {
|
||||||
const char ** model_paths;
|
// we only support 2 models for now
|
||||||
const size_t n_models;
|
const char * model_paths[2];
|
||||||
const struct llama_merge_layer * layers;
|
const struct llama_merge_inst * insts;
|
||||||
const size_t n_layers;
|
const size_t n_insts;
|
||||||
|
const size_t n_layers; // number of output layers
|
||||||
const char * output_path;
|
const char * output_path;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -420,6 +432,9 @@ extern "C" {
|
||||||
// Returns the total number of parameters in the model
|
// Returns the total number of parameters in the model
|
||||||
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
||||||
|
|
||||||
|
// Get a list of model tensor name, returns number of elements
|
||||||
|
LLAMA_API int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size);
|
||||||
|
|
||||||
// Get a llama model tensor
|
// Get a llama model tensor
|
||||||
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
|
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
|
||||||
|
|
||||||
|
@ -429,6 +444,7 @@ extern "C" {
|
||||||
const char * fname_out,
|
const char * fname_out,
|
||||||
const llama_model_quantize_params * params);
|
const llama_model_quantize_params * params);
|
||||||
|
|
||||||
|
// Merge multiple models, inspired by mergekit
|
||||||
LLAMA_API int32_t llama_merge_models(
|
LLAMA_API int32_t llama_merge_models(
|
||||||
const struct llama_merge_config * config);
|
const struct llama_merge_config * config);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue