Xuan Son Nguyen 2024-03-10 16:49:07 +02:00 committed by GitHub
commit ee6854a71e
8 changed files with 940 additions and 2 deletions

.gitignore

@@ -72,6 +72,7 @@ models-mnt
/train-text-from-scratch
/tokenize
/vdot
/merge
/common/build-info.cpp
arm_neon.h
compile_commands.json

Makefile

@@ -1,8 +1,9 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o \
merge
# Binaries only useful for tests
TEST_TARGETS = \
@@ -708,6 +709,10 @@ quantize: examples/quantize/quantize.cpp build-info.o ggml.
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
merge: examples/merge/merge.cpp examples/merge/parser.hpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

examples/merge/CMakeLists.txt

@@ -0,0 +1,6 @@
set(TARGET merge)
add_executable(${TARGET} merge.cpp parser.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/merge/config.example.txt

@@ -0,0 +1,123 @@
# GGUF merge instructions
#
# Lines starting with "#" are comments
# Empty lines are ignored
# The "output layer" instruction adds a new layer to the output model
# A merge instruction has the format: target (space) verb (space) parameters
# Supported verbs:
# - linear: weighted sum of two source layers, parameters: source_layer,source_layer,scale,scale
# - slerp: spherical linear interpolation, parameters: source_layer,source_layer,t
# - copy: copy a layer from one model, parameters: model,layer
#########################
# Example:
# This is the first layer of the output model:
# For all tensors, we want slerp(model[0].layer[0], model[1].layer[0], 0.1)
# Except for the "attn_output" tensor, where we want t=0.5 instead of t=0.1
output layer 0
all slerp 0,0,0.1
attn_output slerp 0,0,0.5
# For the next layer, we want: model[0].layer[1]*0.6 + model[1].layer[1]*0.4
# Except for the "attn_output" tensor, where we use slerp with t=0.9
output layer 1
all linear 1,1,0.6,0.4
attn_output slerp 1,1,0.9
# For the next layer, we want to copy from model[0].layer[2]
output layer 2
all copy 0,2
output layer 3
all copy 0,3
# For the next layer, we want to copy from model[1].layer[4]
output layer 4
all copy 1,4
output layer 5
all copy 1,5
output layer 6
all linear 6,6,0.1,0.9
output layer 7
all linear 7,7,0.1,0.9
output layer 8
all linear 8,8,0.1,0.9
output layer 9
all linear 9,9,0.1,0.9
output layer 10
all linear 10,10,0.1,0.9
output layer 11
all linear 11,11,0.1,0.9
output layer 12
all linear 12,12,0.1,0.9
output layer 13
all linear 13,13,0.3333,0.6666
output layer 14
all linear 14,14,0.3333,0.6666
output layer 15
all linear 15,15,0.3333,0.6666
output layer 16
all linear 16,16,0.3333,0.6666
output layer 17
all linear 17,17,0.3333,0.6666
output layer 18
all linear 18,18,0.3333,0.6666
output layer 19
all linear 19,19,0.3333,0.6666
output layer 20
all slerp 20,20,0.8
output layer 21
all slerp 21,21,0.8
output layer 22
all slerp 22,22,0.8
output layer 23
all slerp 23,23,0.8
output layer 24
all slerp 24,24,0.8
output layer 25
all slerp 25,25,0.8
output layer 26
all slerp 26,26,0.8
output layer 27
all slerp 27,27,0.8
output layer 28
all slerp 28,28,0.8
output layer 29
all slerp 29,29,0.8
output layer 30
all slerp 30,30,0.8
output layer 31
all slerp 31,31,0.8

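For reference, the verbs above map onto straightforward element-wise (linear, copy) or row-wise (slerp) operations on the two source tensors, matching what llama_merge_models() does later in this commit after dequantizing both inputs to F32. A minimal sketch of the per-element math, as standalone C++ (the function and buffer names are illustrative, not part of the API):

#include <cmath>
#include <cstddef>

// "linear src0,src1,scale0,scale1": weighted sum of the two source tensors
static void merge_linear(const float * a, const float * b, float * out,
                         size_t n, float scale_a, float scale_b) {
    for (size_t i = 0; i < n; i++) {
        out[i] = a[i] * scale_a + b[i] * scale_b;
    }
}

// "slerp src0,src1,t": spherical interpolation of one row from each tensor,
// falling back to plain lerp when the rows are nearly colinear
static void merge_slerp_row(const float * a, const float * b, float * out,
                            size_t n, float t) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    dot /= std::sqrt(na) * std::sqrt(nb);   // cosine of the angle between the rows
    if (std::fabs(dot) > 0.9995) {          // ~colinear: fall back to lerp
        for (size_t i = 0; i < n; i++) {
            out[i] = a[i] * (1.0f - t) + b[i] * t;
        }
        return;
    }
    const double theta = std::acos(dot);
    const double s0    = std::sin((1.0 - t) * theta) / std::sin(theta);
    const double s1    = std::sin(t * theta) / std::sin(theta);
    for (size_t i = 0; i < n; i++) {
        out[i] = (float) (a[i] * s0 + b[i] * s1);
    }
}

The copy verb simply takes the selected tensor's data from the chosen model unchanged.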
examples/merge/merge.cpp

@@ -0,0 +1,127 @@
#include "common.h"
#include "llama.h"
#include "parser.hpp"
#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
#include <unordered_map>
#include <fstream>
#include <cmath>
#include <algorithm>
static const size_t n_models = 2; // hard-limited to 2 input models for now
struct merge_params {
std::string config_path = "config.txt";
std::vector<std::string> model_paths;
std::string output_path = "ggml-merged-f16.gguf";
bool only_list_tensors_name = false;
bool dry_run = false;
};
[[noreturn]]
static void usage(const char * executable, int exit_code) {
struct merge_params defaults;
printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
printf("\n");
printf("Merging multiple models, inspired by mergekit.\n");
printf("For more details, see \"config.example.txt\" file.\n");
printf("\n");
printf("NOTE:\n");
printf("- Only support merging 2 models.\n");
printf("- The embedding and output layers of the first model will be used.\n");
printf("- Currently, we accept both quantized and non-quantized models as input. The output model will be re-quantized into the same format of the first model.\n");
printf("\n");
printf("Options:\n");
printf(" -h, --help Show this help message and exit\n");
printf(" -c, --config CONFIG_FILE Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
printf(" -m, --model MODEL_PATH Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
printf(" -o, --output OUTPUT_FILE Path to the output model (default: %s)\n", defaults.output_path.c_str());
printf(" --dry-run Only print out list of parsed and exit, useful for debugging\n");
printf(" --print-list-tensor Only print out list of tensors of the input model, useful for debugging (only one model is accepted)\n");
printf("\n");
printf("Example: ./merge -c config.txt -o output.gguf -m model_a.gguf -m model_b.gguf\n");
exit(exit_code);
}
int main(int argc, char ** argv) {
bool invalid_param = false;
struct merge_params params;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-h" || arg == "--help") {
usage(argv[0], 0);
} else if (arg == "-c" || arg == "--config") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.config_path = argv[i];
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_paths.push_back(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.output_path = argv[i];
} else if (arg == "--print-list-tensor") {
params.only_list_tensors_name = true;
} else if (arg == "--dry-run") {
params.dry_run = true;
}
}
try {
if (invalid_param) {
usage(argv[0], 1);
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
} else if (!params.only_list_tensors_name && params.model_paths.size() < 2) {
throw std::invalid_argument("error: require at least 2 models");
}
if (params.only_list_tensors_name) {
if (params.model_paths.size() != 1) {
throw std::invalid_argument("error: we can only list tensors of one single model");
}
print_model_tensors_name(params.model_paths[0]);
return 0; // exit now
}
size_t n_layers = 0;
auto instructions = parse_config(params.config_path, params.model_paths[0], n_layers);
if (params.dry_run) {
return 0;
}
std::vector<const char*> p_model_paths;
for (auto & m : params.model_paths) {
p_model_paths.push_back(m.data());
}
struct llama_merge_config config{
{
params.model_paths[0].c_str(),
params.model_paths[1].c_str(),
},
instructions.data(),
instructions.size(),
n_layers,
params.output_path.c_str(),
};
llama_merge_models(&config);
} catch (const std::exception & ex) {
std::cerr << ex.what() << "\n\n";
}
return 0;
}

examples/merge/parser.hpp

@@ -0,0 +1,293 @@
#include "common.h"
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
#include <unordered_map>
#include <fstream>
#include <cmath>
#include <algorithm>
#include <set>
#include <string.h>
// trim whitespace from the beginning and end of a string
static std::string str_trim(const std::string & str) {
size_t start = 0;
size_t end = str.size();
while (start < end && isspace(str[start])) {
start += 1;
}
while (end > start && isspace(str[end - 1])) {
end -= 1;
}
return str.substr(start, end - start);
}
inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
size_t pos = 0;
std::string token;
std::vector<std::string> output;
while ((pos = str.find(delimiter)) != std::string::npos) {
token = str.substr(0, pos);
output.push_back(token);
str.erase(0, pos + delimiter.length());
}
output.push_back(str); // the rest
return output;
}
/////////////////////////////////
// dump the list of tensor names of the input model
static std::vector<std::string> get_list_tensors_name(std::string & model_path) {
llama_model_params model_params = llama_model_default_params();
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
size_t n_tensors = llama_get_all_tensors_name(model, nullptr, 0);
std::vector<const char *> list(n_tensors, nullptr);
llama_get_all_tensors_name(model, list.data(), list.size());
// copy the result
std::vector<std::string> results;
for (auto & name : list) {
results.push_back(std::string(name));
}
llama_free_model(model);
return results;
}
static void print_model_tensors_name(std::string & model_path) {
auto tensors = get_list_tensors_name(model_path);
std::cout << "\n\n===================\n";
std::cout << "Total number of tensors: " << tensors.size() << "\n";
std::vector<const char *> list(tensors.size(), nullptr);
for (size_t i = 0; i < tensors.size(); i++) {
char buf[128];
sprintf(buf, "%4ld: %s", i, tensors[i].c_str());
std::cout << buf << "\n";
}
}
/////////////////////////////////
// get the layer index from a tensor name, for example "blk.x.attn_norm.weight"
// returns -1 if it is not a per-layer tensor
static int get_i_layer(std::string tensor_name) {
int i_layer = -1;
return sscanf(tensor_name.c_str(), "blk.%d.", &i_layer) == 1 ? i_layer : -1;
};
static void print_inst(struct llama_merge_inst inst) {
std::cout << "Output: " << inst.name << "\n";
switch (inst.method) {
case LLAMA_MERGE_LINEAR:
std::cout << " Linear\n";
std::cout << " Model A: " << inst.scales[0] << " * " << inst.srcs[0] << "\n";
std::cout << " Model B: " << inst.scales[1] << " * " << inst.srcs[1] << "\n";
break;
case LLAMA_MERGE_SLERP:
std::cout << " SLERP\n";
std::cout << " t=" << inst.t << "\n";
std::cout << " Model A: " << inst.srcs[0] << "\n";
std::cout << " Model B: " << inst.srcs[1] << "\n";
break;
case LLAMA_MERGE_COPY:
std::cout << " Copy from model A: "<< inst.srcs[0] << "\n";
break;
case LLAMA_MERGE_REPEAT:
std::cout << " Repeat from output model: " << inst.srcs[0] << "\n";
break;
default:
break;
}
}
static std::vector<struct llama_merge_inst> parse_config(std::string & config_path, std::string & model_path, size_t & n_layers) {
std::vector<struct llama_merge_inst> instructions;
// read file
std::ifstream file(config_path);
if (!file.is_open()) {
throw std::runtime_error("Unable to open file merge config file");
}
std::ostringstream content;
content << file.rdbuf(); // Read the entire file into the stringstream
auto lines = str_split(content.str(), "\n");
file.close();
// get list of input tensors
auto inp_names = get_list_tensors_name(model_path);
std::set<std::string> units; // name of units, for example "attn_output"
for (auto & name : inp_names) {
int il = get_i_layer(name);
if (il < 0) {
// non-layer, only copy
struct llama_merge_inst ins;
ins.method = LLAMA_MERGE_COPY;
strcpy(ins.name, name.c_str());
strcpy(ins.srcs[0], name.c_str()); // always take the first model
strcpy(ins.srcs[1], "");
instructions.push_back(ins);
} else {
// tensor belongs to a layer
auto parts = str_split(name, ".");
units.insert(parts[2]);
}
}
std::cout << "List of units:\n";
for (auto & u : units) std::cout << u << "\n";
std::cout << "\n";
// process line by line, one line is one layer
std::unordered_map<std::string, struct llama_merge_inst> layer; // map tensor name to instruction
bool is_layer_empty = true;
int i_layer = -1;
auto get_tensor_name = [&](int layer, std::string unit) {
return "blk." + std::to_string(layer) + "." + unit + ".weight";
};
auto push_output_layer = [&]() {
if (!is_layer_empty) {
for (auto & it : layer) {
instructions.push_back(it.second);
}
}
layer.clear();
is_layer_empty = true;
};
auto new_output_layer = [&]() {
layer.clear();
for (auto & u : units) {
struct llama_merge_inst ins;
strcpy(ins.name, get_tensor_name(i_layer, u).c_str());
layer[u] = ins;
}
};
auto raise_err = [&](size_t i_line, std::string message) {
std::stringstream ss;
ss << "Parse error: (line " << i_line + 1 << ") " << message;
throw std::runtime_error(ss.str());
};
for (size_t i_line = 0 ; i_line < lines.size(); i_line++) {
auto line = str_trim(lines[i_line]);
if (line.empty() || line.c_str()[0] == '#') {
continue; // skip empty line or comment
}
auto parts = str_split(line, " ");
if (parts.size() != 3) {
raise_err(i_line, "does not follow format: \"target (space) verb (space) parameters\"");
}
auto target = parts[0];
auto verb = parts[1];
auto params = str_split(parts[2], ",");
if (target == "output" && verb == "layer") {
int il_curr = std::stoi(params[0]);
if (i_layer + 1 != il_curr) {
raise_err(i_line, "new layer number must be (last layer number + 1)");
}
push_output_layer();
i_layer = il_curr;
new_output_layer();
continue;
}
auto linear = [&](struct llama_merge_inst & ins, std::string unit) {
if (params.size() != 4) {
raise_err(i_line, "verb \"linear\" requires exactly 4 parameters");
}
ins.method = LLAMA_MERGE_LINEAR;
int src0 = std::stoi(params[0]);
int src1 = std::stoi(params[1]);
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str());
ins.scales[0] = std::stof(params[2]);
ins.scales[1] = std::stof(params[3]);
is_layer_empty = false;
};
auto slerp = [&](struct llama_merge_inst & ins, std::string unit) {
if (params.size() != 3) {
raise_err(i_line, "verb \"slerp\" requires exactly 3 parameters");
}
ins.method = LLAMA_MERGE_SLERP;
int src0 = std::stoi(params[0]);
int src1 = std::stoi(params[1]);
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str());
ins.t = std::stof(params[2]);
is_layer_empty = false;
};
/*auto repeat = [&](struct llama_merge_inst & ins, std::string unit) {
if (params.size() != 1) {
raise_err(i_line, "verb \"repeat\" requires exactly 1 parameter");
}
ins.method = LLAMA_MERGE_REPEAT;
int src0 = std::stoi(params[0]);
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
is_layer_empty = false;
};*/
auto copy = [&](struct llama_merge_inst & ins, std::string unit) {
if (params.size() != 2) {
raise_err(i_line, "verb \"copy\" requires exactly 2 parameters");
}
ins.method = LLAMA_MERGE_COPY;
int model = std::stoi(params[0]);
int layer = std::stoi(params[1]);
if (model == 0) {
strcpy(ins.srcs[0], get_tensor_name(layer, unit).c_str());
strcpy(ins.srcs[1], "");
} else if (model == 1) {
strcpy(ins.srcs[0], "");
strcpy(ins.srcs[1], get_tensor_name(layer, unit).c_str());
} else {
raise_err(i_line, "can only copy from model 0 or 1");
}
is_layer_empty = false;
};
auto apply_verb = [&](struct llama_merge_inst & ins, std::string unit) {
if (verb == "linear") {
linear(ins, unit);
} else if (verb == "slerp") {
slerp(ins, unit);
} else if (verb == "repeat") {
// repeat(ins, unit);
raise_err(i_line, "repeat is currently not supported");
} else if (verb == "copy") {
copy(ins, unit);
} else {
raise_err(i_line, "invalid verb: " + verb);
}
};
// TODO: what if user does not use "all"? we may miss some tensors?
if (target == "all") {
for (auto & u : units) {
apply_verb(layer[u], u);
}
} else {
if (units.find(target) == units.end()) {
raise_err(i_line, "unit " + target + " does not exist");
}
apply_verb(layer[target], target);
}
}
push_output_layer();
n_layers = i_layer + 1;
// print all parsed instructions
std::cout << "Parsed instructions:\n";
for (auto & ins : instructions) {
print_inst(ins);
}
std::cout << "---\n" << "Total output layers: " << n_layers << "\n";
return instructions;
}

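To make the parser's output concrete: a fragment such as "output layer 0" followed by "all slerp 0,0,0.1" causes parse_config() to emit one llama_merge_inst per unit of that layer (plus plain copy instructions for non-layer tensors such as the embedding and output matrices, which always come from the first model). A hedged sketch of the instruction it would build for one hypothetical unit, written by hand rather than through the parser:

#include <cstring>
#include "llama.h"   // llama_merge_inst, LLAMA_MERGE_SLERP (added later in this commit)

// roughly what "output layer 0" + "all slerp 0,0,0.1" yields for the "attn_norm" unit
static llama_merge_inst make_example_inst() {
    llama_merge_inst ins{};
    ins.method = LLAMA_MERGE_SLERP;
    std::strcpy(ins.name,    "blk.0.attn_norm.weight"); // tensor written to the output model
    std::strcpy(ins.srcs[0], "blk.0.attn_norm.weight"); // layer 0 of model A
    std::strcpy(ins.srcs[1], "blk.0.attn_norm.weight"); // layer 0 of model B
    ins.t = 0.1f;                                       // slerp interpolation factor
    return ins;
}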
llama.cpp

@@ -61,6 +61,7 @@
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <condition_variable>
#include <cmath>
#include <cstdarg>
#include <cstddef>
@@ -86,6 +87,7 @@
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <queue>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -12227,6 +12229,341 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
// TODO: remove this when #5830 is merged
static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
std::mutex mutex;
int counter = 0;
size_t new_size = 0;
if (nthread < 2) {
// single-thread
return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
}
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
std::array<int64_t, 1 << 4> local_hist = {};
const int nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
new_size += local_size;
}
break;
}
lock.unlock();
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
}
};
for (int it = 0; it < nthread - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
return new_size;
}
int32_t llama_merge_models(const struct llama_merge_config * config) {
#if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true;
#else
constexpr bool use_mmap = false;
#endif
// std::move doesn't work with llama_model and llama_model_loader, why?
std::vector<std::unique_ptr<llama_model>> models;
std::vector<std::unique_ptr<llama_model_loader>> mls;
std::vector<no_init<uint8_t>> buf_in;
std::vector<no_init<uint8_t>> buf_out;
std::set<std::string> ref_names; // list of ref_name per layer
std::vector<struct ggml_tensor *> output_tensors;
// output file
struct gguf_context * ctx_out = gguf_init_empty();
std::ofstream fout(config->output_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
// remember to call before exit
auto clean_up = [&]() {
fout.close();
for (auto & tensor : output_tensors) {
free(tensor);
}
gguf_free(ctx_out);
};
// load the input models
static const size_t n_models = 2;
for (size_t i = 0; i < n_models; i++) {
auto model = std::unique_ptr<llama_model>(new llama_model());
auto ml = std::unique_ptr<llama_model_loader>(new llama_model_loader(config->model_paths[i], use_mmap, NULL));
ml->init_mapping(false);
llm_load_arch(*ml, *model);
llm_load_hparams(*ml, *model);
models.push_back(std::move(model));
mls.push_back(std::move(ml));
}
// for verb copy, we want to get the source tensor
auto get_src_tensor_for_copy = [&](const struct llama_merge_inst ins, size_t & i_model) {
i_model = std::string(ins.srcs[0]).empty() ? 1 : 0;
return mls[i_model]->get_tensor_meta(ins.srcs[i_model]);
};
// construct metadata
{
// copy the KV pairs from the input file
gguf_set_kv(ctx_out, mls[0]->ctx_gguf);
// correct layer count for output model
std::stringstream ss;
ss << mls[0]->get_arch_name() << ".block_count";
gguf_set_val_u32(ctx_out, ss.str().c_str(), config->n_layers);
LLAMA_LOG_INFO("====> Set new value of %s = %ld\n", ss.str().c_str(), config->n_layers);
// populate metadata for output tensors
auto push_tensor = [&](struct ggml_tensor * ref, const char * name) {
struct ggml_tensor * out_tensor = (struct ggml_tensor *) malloc(GGML_TENSOR_SIZE);
if (ref != nullptr) {
// copy metadata (shape, type,...)
memcpy(out_tensor, ref, GGML_TENSOR_SIZE);
}
ggml_set_name(out_tensor, name);
gguf_add_tensor(ctx_out, out_tensor);
output_tensors.push_back(out_tensor);
};
for (size_t i = 0; i < config->n_insts; i++) {
const struct llama_merge_inst ins = config->insts[i];
struct ggml_tensor * t0;
struct ggml_tensor * t1;
// TODO: reject non-requantize-able type (one that requires imatrix)
if (ins.method == LLAMA_MERGE_COPY) {
// simply copy from model A
size_t i_model;
t0 = get_src_tensor_for_copy(ins, i_model);
push_tensor(t0, ins.name);
} else if (ins.method == LLAMA_MERGE_LINEAR || ins.method == LLAMA_MERGE_SLERP) {
t0 = mls[0]->get_tensor_meta(ins.srcs[0]);
t1 = mls[1]->get_tensor_meta(ins.srcs[1]);
if (llama_format_tensor_shape(t0) != llama_format_tensor_shape(t1)) {
LLAMA_LOG_ERROR("some tensors does not have the same shape");
clean_up();
return -1;
}
push_tensor(t0, ins.name);
} else if (ins.method == LLAMA_MERGE_REPEAT) {
// TODO: in theory, we can point 2 tensors to the same offset, but here we're unable to do that, because offset is currently managed by gguf_add_tensor()
GGML_ASSERT(false);
/*int idx = nullptr;
std::string search_tensor(ins.srcs[0]);
for (auto & tensor : output_tensors) {
if (std::string(ggml_get_name(tensor)) == search_tensor) {
t0 = tensor;
break;
}
}
if (t0 == nullptr) {
LLAMA_LOG_ERROR("cannot find source tensor to repeat");
clean_up();
return -1;
}
push_tensor(t0, ins.name);*/
} else {
GGML_ASSERT(false); // should never happen
}
}
const size_t meta_size = gguf_get_meta_size(ctx_out);
LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
// placeholder for the meta data
::zeros(fout, meta_size);
}
// load tensor data into buffer
auto read_tensor_data = [&](struct ggml_tensor * tensor, llama_model_loader & ml, std::vector<no_init<uint8_t>> & buf) -> size_t {
if (!ml.use_mmap) {
if (buf.size() < ggml_nbytes(tensor)) {
buf.resize(ggml_nbytes(tensor));
}
tensor->data = buf.data();
}
ml.load_data_for(tensor);
return ggml_nbytes(tensor);
};
size_t n_done = 0;
auto write_output_tensor = [&](const struct ggml_tensor * tensor, void * data) {
// write tensor data + padding
const size_t len = ggml_nbytes(tensor);
fout.write((const char *) data, len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
n_done++;
LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], input type = %6s\n",
n_done, output_tensors.size(),
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
};
// TODO: allow user to set n_threads
const int n_threads = std::thread::hardware_concurrency();
std::vector<std::thread> workers;
workers.reserve(n_threads);
// process instruction one by one
GGML_ASSERT(config->n_insts == output_tensors.size());
for (size_t i = 0; i < config->n_insts; i++) {
const struct llama_merge_inst ins = config->insts[i];
struct ggml_tensor * t0;
struct ggml_tensor * t1;
struct ggml_tensor * out_tensor = output_tensors[i];
const size_t n_elements = ggml_nelements(out_tensor);
std::vector<no_init<uint8_t>> in_buf0;
std::vector<no_init<float>> f32_in_buf0; // dequant it internally
std::vector<no_init<uint8_t>> in_buf1;
std::vector<no_init<float>> f32_in_buf1; // dequant it internally
std::vector<float> f32_out_buf(n_elements, 0.0); // do not resize!
std::vector<uint8_t> out_buf(ggml_nbytes(out_tensor)); // do not resize!
const int n_per_row = out_tensor->ne[0];
const int n_rows = n_elements / n_per_row;
if (ins.method == LLAMA_MERGE_COPY) {
LLAMA_LOG_INFO("copy\n");
size_t i_model;
t0 = get_src_tensor_for_copy(ins, i_model);
read_tensor_data(t0, *mls[i_model], in_buf0);
write_output_tensor(out_tensor, t0->data);
continue;
}
// dequantize the tensor to FP32
auto dequantize = [&](struct ggml_tensor * in_tensor, std::vector<no_init<float>> & f32_in_buf) {
if (in_tensor->type != GGML_TYPE_F32) {
LLAMA_LOG_INFO("dequant ");
llama_convert_tensor_internal(in_tensor, f32_in_buf, workers, n_elements, n_threads);
} else {
// if we already have f32, just copy it
LLAMA_LOG_INFO("f32_copy ");
f32_in_buf.resize(n_elements);
memcpy((void *) f32_in_buf.data(), in_tensor->data, n_elements * sizeof(float));
}
};
// load data and dequantize
if (ins.method == LLAMA_MERGE_LINEAR || ins.method == LLAMA_MERGE_SLERP) {
t0 = mls[0]->get_tensor_meta(ins.srcs[0]);
t1 = mls[1]->get_tensor_meta(ins.srcs[1]);
read_tensor_data(t0, *mls[0], in_buf0);
read_tensor_data(t1, *mls[1], in_buf1);
dequantize(t0, f32_in_buf0);
dequantize(t1, f32_in_buf1);
}
if (ins.method == LLAMA_MERGE_LINEAR) {
LLAMA_LOG_INFO("linear ");
float * in0 = (float *) f32_in_buf0.data();
float * in1 = (float *) f32_in_buf1.data();
float * dest = (float *) f32_out_buf.data();
for (size_t i = 0; i < n_elements; i++) {
dest[i] = in0[i] * ins.scales[0] + in1[i] * ins.scales[1];
}
}
if (ins.method == LLAMA_MERGE_SLERP) {
// Python code: https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c
LLAMA_LOG_INFO("slerp ");
static const float dot_threshold = 0.9995;
auto lerp_row = [](float * in0, float * in1, float * out, size_t nelem, float t) {
for (size_t i = 0; i < nelem; i++) {
out[i] = in0[i] * (1.0 - t) + in1[i] * t;
}
};
auto slerp_row = [&lerp_row](float * in0, float * in1, float * out, size_t nelem, float t) {
float norm0 = std::sqrt(std::inner_product(in0, in0 + nelem, in0, 0.0));
float norm1 = std::sqrt(std::inner_product(in1, in1 + nelem, in1, 0.0));
// Normalize the vectors to get the directions and angles
std::vector<float> v0(nelem);
std::vector<float> v1(nelem);
for (size_t i = 0; i < nelem; i++) {
v0[i] = in0[i] / norm0;
v1[i] = in1[i] / norm1;
}
// Dot product with the normalized vectors
float dot = std::inner_product(v0.begin(), v0.end(), v1.begin(), 0.0);
// If absolute value of dot product is almost 1, vectors are ~colineal, so use lerp
if (std::abs(dot) > dot_threshold) {
return lerp_row(in0, in1, out, nelem, t);
}
// Calculate initial angle between v0 and v1
float theta_0 = std::acos(dot);
float sin_theta_0 = std::sin(theta_0);
// Angle at timestep t
float theta_t = theta_0 * t;
float sin_theta_t = std::sin(theta_t);
// Finish the slerp algorithm
float s0 = std::sin(theta_0 - theta_t) / sin_theta_0;
float s1 = sin_theta_t / sin_theta_0;
for (size_t i = 0; i < nelem; i++) {
out[i] = in0[i] * s0 + in1[i] * s1;
}
};
for (int r = 0; r < n_rows; r++) {
float * in0 = (float *) f32_in_buf0.data();
float * in1 = (float *) f32_in_buf1.data();
float * dest = (float *) f32_out_buf.data();
size_t offset = n_per_row * r;
slerp_row(in0 + offset, in1 + offset, dest + offset, n_per_row, ins.t);
}
}
// re-quantize it
{
LLAMA_LOG_INFO("requant\n");
std::array<int64_t, 1 << 4> hist_cur = {};
static const int min_chunk_size = 32 * 512;
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
size_t new_size = llama_tensor_quantize_internal(
out_tensor->type,
f32_out_buf.data(),
out_buf.data(),
chunk_size,
n_rows,
n_per_row,
hist_cur.data(), // unused for now
nullptr,
workers,
n_threads);
GGML_ASSERT(new_size == out_buf.size());
}
LLAMA_LOG_INFO("===> INPUT %f %f %f\n", f32_in_buf0[0].value, f32_in_buf0[1].value, f32_in_buf0[2].value);
LLAMA_LOG_INFO("===> OUTPUT %f %f %f\n", f32_out_buf[0], f32_out_buf[1], f32_out_buf[2]);
write_output_tensor(out_tensor, out_buf.data());
}
// go back to beginning of file and write the updated meta data
{
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *) data.data(), data.size());
LLAMA_LOG_INFO("===> Written metadata size = %ld bytes\n", data.size());
}
clean_up();
return 0;
}
static int llama_apply_lora_from_file_internal(
const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
) {
@@ -13150,6 +13487,18 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
return nparams;
}
int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size) {
size_t i = 0;
for (const auto & it : model->tensors_by_name) {
if (i == arr_size) {
break;
}
name_arr[i] = it.first.c_str();
i++;
}
return model->tensors_by_name.size();
}
struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
[name](const std::pair<std::string, struct ggml_tensor *> & it) {

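One detail worth highlighting in the writer above: tensor data is streamed right after a zero-filled placeholder for the GGUF metadata, each tensor is padded up to the GGUF alignment, and the real metadata is only written at the end by seeking back to offset 0, once gguf_add_tensor() has assigned every offset. A minimal, self-contained sketch of that file-layout trick using plain std::ofstream (the sizes are illustrative; 32 is GGML's default GGUF alignment):

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <vector>

static void write_zeros(std::ofstream & f, size_t n) {
    std::vector<char> z(n, 0);
    f.write(z.data(), n);
}

int main() {
    const size_t alignment = 32;    // GGUF_DEFAULT_ALIGNMENT
    const size_t meta_size = 128;   // illustrative; gguf_get_meta_size() in the real code

    std::ofstream fout("example.bin", std::ios::binary);
    write_zeros(fout, meta_size);   // 1) placeholder where the metadata will go

    // 2) write each tensor, padded to the alignment boundary
    std::vector<uint8_t> tensor(1000, 0xab);
    fout.write((const char *) tensor.data(), tensor.size());
    const size_t padded = ((tensor.size() + alignment - 1) / alignment) * alignment;
    write_zeros(fout, padded - tensor.size());

    // 3) with all offsets known, seek back and write the real metadata
    std::vector<uint8_t> meta(meta_size, 0x01);   // stand-in for gguf_get_meta_data()
    fout.seekp(0);
    fout.write((const char *) meta.data(), meta.size());
    return 0;
}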
llama.h

@@ -336,6 +336,33 @@ extern "C" {
const char * content;
} llama_chat_message;
enum llama_merge_method {
LLAMA_MERGE_LINEAR,
LLAMA_MERGE_SLERP,
LLAMA_MERGE_REPEAT, // doesn't work for now
LLAMA_MERGE_COPY,
};
// instruction for merging tensors (model merge)
struct llama_merge_inst {
char name[GGML_MAX_NAME]; // name of output tensor
enum llama_merge_method method;
// we only support 2 models for now
char srcs[2][GGML_MAX_NAME]; // name of input tensors. if method == copy, only one src is non-empty
float scales[2]; // for linear method
float t; // for slerp method
};
// merge models
struct llama_merge_config {
// we only support 2 models for now
const char * model_paths[2];
const struct llama_merge_inst * insts;
const size_t n_insts;
const size_t n_layers; // number of output layers
const char * output_path;
};
// Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -415,6 +442,9 @@ extern "C" {
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
// Get the list of model tensor names; returns the total number of tensors in the model
LLAMA_API int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size);
// Get a llama model tensor
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
@@ -424,6 +454,10 @@ extern "C" {
const char * fname_out,
const llama_model_quantize_params * params);
// Merge multiple models, inspired by mergekit
LLAMA_API int32_t llama_merge_models(
const struct llama_merge_config * config);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
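Finally, the header additions are enough to drive a merge from client code without the example binary. A sketch of the two-call pattern for llama_get_all_tensors_name (the same pattern examples/merge/parser.hpp uses: first call with a null array to get the count, second call to fill it), assuming the API lands exactly as declared above:

#include <cstdio>
#include <vector>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s MODEL.gguf\n", argv[0]);
        return 1;
    }
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == nullptr) {
        return 1;
    }

    // first call: query the number of tensors
    const size_t n_tensors = (size_t) llama_get_all_tensors_name(model, nullptr, 0);
    // second call: fill the array with name pointers owned by the model
    std::vector<const char *> names(n_tensors, nullptr);
    llama_get_all_tensors_name(model, names.data(), names.size());

    for (size_t i = 0; i < n_tensors; i++) {
        printf("%4zu: %s\n", i, names[i]);
    }
    llama_free_model(model);
    return 0;
}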