diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index 881f0451c..553a4f1d8 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -32,8 +32,10 @@ struct split_params {
     int n_split_tensors = 128;
     std::string input;
     std::string output;
+    std::string tensor_set_file; // --tensor-set: file listing one tensor-name pattern per line
     bool no_tensor_first_split = false;
     bool dry_run = false;
+    bool customized_split = false; // set when --tensor-set is given
 };
 
 static void split_print_usage(const char * executable) {
@@ -47,6 +49,7 @@ static void split_print_usage(const char * executable) {
     printf("  -h, --help              show this help message and exit\n");
     printf("  --version               show version and build info\n");
     printf("  --split                 split GGUF to multiple GGUF (enabled by default)\n");
+    printf("  --tensor-set FILE       customize tensor set used to split. File contains modules, e.g. 'ffn_up.weight'\n");
     printf("  --merge                 merge multiple GGUF to a single GGUF\n");
     printf("  --split-max-tensors     max tensors in each split (default: %d)\n", default_params.n_split_tensors);
     printf("  --split-max-size N(M|G) max size per split\n");
@@ -121,6 +124,16 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
             params.operation = SPLIT_OP_SPLIT;
         }
 
+        if (arg == "--tensor-set") {
+            arg_found = true;
+            if (++arg_idx >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.tensor_set_file = argv[arg_idx];
+            params.customized_split = true;
+        }
+
         if (is_mode_set) {
             throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
         }
@@ -180,6 +193,28 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
+// Reads the tensor-name patterns (one per line) from tensor_set_file; exits if the file cannot be opened.
+// Trailing whitespace (including the '\r' of CRLF files) is stripped and blank lines are skipped,
+// because an empty pattern would match every tensor name when searched with strstr().
+static std::vector<std::string> read_customized_tensors(const std::string & tensor_set_file) {
+    std::vector<std::string> tensor_set;
+    std::ifstream f_tensor_set(tensor_set_file);
+    if (!f_tensor_set.is_open()) {
+        fprintf(stderr, "error: failed to open tensor set file %s\n", tensor_set_file.c_str());
+        exit(EXIT_FAILURE);
+    }
+    std::string line;
+    while (std::getline(f_tensor_set, line)) {
+        const size_t last = line.find_last_not_of(" \t\r\n");
+        if (last == std::string::npos) {
+            continue; // blank line
+        }
+        line.erase(last + 1);
+        tensor_set.push_back(line);
+    }
+    return tensor_set;
+}
+
 struct split_strategy {
     const split_params params;
     std::ifstream & f_input;
@@ -233,20 +268,45 @@ struct split_strategy {
             new_ctx_out(true);
         }
 
-        // process tensors one by one
-        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
-        for (int i = 0; i < n_tensors; ++i) {
-            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-            // calculate the "imaginary" size = the current size + next tensor size
-            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
-            size_t next_tensors_size = curr_tensors_size + n_bytes;
-            if (should_split(i, next_tensors_size)) {
-                new_ctx_out(false);
-                curr_tensors_size = n_bytes;
-            } else {
-                curr_tensors_size = next_tensors_size;
+        if (!params.customized_split) {
+            // process tensors one by one
+            size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
+            for (int i = 0; i < n_tensors; ++i) {
+                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+                // calculate the "imaginary" size = the current size + next tensor size
+                size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
+                size_t next_tensors_size = curr_tensors_size + n_bytes;
+                if (should_split(i, next_tensors_size)) {
+                    new_ctx_out(false);
+                    curr_tensors_size = n_bytes;
+                } else {
+                    curr_tensors_size = next_tensors_size;
+                }
+                gguf_add_tensor(ctx_out, t);
+            }
+        } else {
+            // custom split: tensors matching the set go to the current split, all others to the next one
+            const std::vector<std::string> tensor_set = read_customized_tensors(params.tensor_set_file);
+            if (tensor_set.empty()) {
+                fprintf(stderr, "error: tensor set is empty\n");
+                exit(EXIT_FAILURE);
+            }
+            for (int i = 0; i < n_tensors; ++i) {
+                const char * t_name = gguf_get_tensor_name(ctx_gguf, i);
+                if (is_tensor_in_customized_set(t_name, tensor_set)) {
+                    struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+                    gguf_add_tensor(ctx_out, t);
+                }
+            }
+            new_ctx_out(false);
+            // add the remaining tensors to the next split
+            for (int i = 0; i < n_tensors; ++i) {
+                const char * t_name = gguf_get_tensor_name(ctx_gguf, i);
+                if (!is_tensor_in_customized_set(t_name, tensor_set)) {
+                    struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+                    gguf_add_tensor(ctx_out, t);
+                }
             }
-            gguf_add_tensor(ctx_out, t);
         }
 
         // push the last ctx_out
@@ -274,6 +334,17 @@ struct split_strategy {
         }
     }
 
+    // returns true when t_name contains at least one entry of tensor_set as a substring
+    bool is_tensor_in_customized_set(const char * t_name, const std::vector<std::string> & tensor_set) const {
+        for (const auto & s : tensor_set) {
+            if (strstr(t_name, s.c_str()) != NULL) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
     void print_info() {
         printf("n_split: %ld\n", ctx_outs.size());
         int i_split = 0;
diff --git a/include/llama.h b/include/llama.h
index 433baf5ec..f45e6c052 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -11,6 +11,11 @@
 #include <stdio.h>
 #include <stdbool.h>
 
+#ifdef _WIN32
+#else
+#include <string> // NOTE(review): header name was lost in this copy of the patch; <string> is assumed - confirm
+#endif // _WIN32
+
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
diff --git a/src/llama.cpp b/src/llama.cpp
index 2ebc75303..1dd263db9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18022,7 +18022,7 @@ bool llama_switch_derived_model(struct llama_context* ctx, const std::string der
     auto& cparams = ctx->cparams;
     cparams.derived_model_name = (ctx->derived_models.find(derived_model_name) == ctx->derived_models.end()) ? BASE_MODEL : derived_model_name;
 
-    LLAMA_LOG_INFO("%s: %s\n", __func__, cparams.derived_model_name);
+    LLAMA_LOG_INFO("%s: %s\n", __func__, cparams.derived_model_name.c_str());
     return true;
 }
 