llama_model_loader: support multiple split/shard GGUFs (#6187)
* split: support in llama_model_loader * avoid copying the entire vector Co-authored-by: slaren <slarengh@gmail.com> * split: move llama_tensor_offset to llama_model_loader * llama_model_loader: PR feedbacks: - use only one gguf_context for metadata only - store all ggml_context in a vector as the files and mappings - store all weights in a vector along with the source tensor - rename ctx_gguf to meta - rename ctx_meta to contexts * avoid copying the entire vector * Simplify this by making these optional, switch some layer creation tensor optional Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Handle optional tensors Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * llama_model_loader: fail if backend cannot allocate buffer * fix mmap buffer management * llama_model_loader: map file to backend buffer if the allocation succeeds only * llama_model_loader: only map tensors included in the context * llama_model_loader: minor, use same variable name for consistency, fix spacing in types cast * llama_model_loader: fail if any of backend buffer cannot be allocated * spacing Co-authored-by: slaren <slarengh@gmail.com> * fix loop over pointer Co-authored-by: slaren <slarengh@gmail.com> * llama_model_loader: if n_tensors declared not equals to loaded tensors in split, throw an exception instead of asserting * llama_model_loader: ensure mappings vector has the expected size * llama_model_loader: use at instead of operator[] if this should never add to the map. * llama_model_loader: immediately add the backend buffer to the model buffers in order to free them if an error occurs in the next allocation. Reserve the expected size. * llama_model_loader: be sure the model mappings has enough capacity before allocating backend buffer * llama_model_loader: fix map -> unordered map * llama_split_prefix: use a clearer version, not pass split path len but dest max len. Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> * llama : minor ggml-ci * llama : introduce some typedef helpers * docs: add model shard in hot topic * llama_model_loader: put mapping in a unique_ptr from the moment it is allocated Co-authored-by: slaren <slarengh@gmail.com> * fix llama_split_prefix --------- Co-authored-by: slaren <slarengh@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
This commit is contained in:
parent
ee804f6223
commit
dba1af6129
4 changed files with 412 additions and 224 deletions
|
@ -1,31 +1,34 @@
|
|||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <ios>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
#include <climits>
|
||||
#include <stdexcept>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#ifndef PATH_MAX
|
||||
#define PATH_MAX MAX_PATH
|
||||
#endif
|
||||
#include <io.h>
|
||||
#endif
|
||||
|
||||
enum split_operation : uint8_t {
|
||||
SPLIT_OP_SPLIT,
|
||||
SPLIT_OP_MERGE,
|
||||
};
|
||||
|
||||
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
|
||||
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";
|
||||
|
||||
static const int SPLIT_FILENAME_MAX = 256;
|
||||
|
||||
static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
|
||||
static const char * const LLM_KV_SPLIT_NO = "split.no";
|
||||
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
|
||||
struct split_params {
|
||||
split_operation operation = SPLIT_OP_SPLIT;
|
||||
|
@ -116,13 +119,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para
|
|||
try {
|
||||
if (!split_params_parse_ex(argc, argv, params)) {
|
||||
split_print_usage(argv[0]);
|
||||
exit(1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
catch (const std::invalid_argument & ex) {
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
split_print_usage(argv[0]);
|
||||
exit(1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -134,12 +137,6 @@ static void zeros(std::ofstream & file, size_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
static std::string split_file_name(const std::string & path, int i_split, int n_split) {
|
||||
char f_split[SPLIT_FILENAME_MAX] = {0};
|
||||
snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
|
||||
return std::string(f_split);
|
||||
}
|
||||
|
||||
struct split_strategy {
|
||||
const split_params params;
|
||||
std::ifstream & f_input;
|
||||
|
@ -180,8 +177,9 @@ struct split_strategy {
|
|||
if (i_split == 0) {
|
||||
gguf_set_kv(ctx_out, ctx_gguf);
|
||||
}
|
||||
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
|
||||
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
|
||||
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
|
||||
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
|
||||
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
|
||||
|
||||
// populate the original tensors, so we get an initial metadata
|
||||
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
|
||||
|
@ -189,10 +187,11 @@ struct split_strategy {
|
|||
gguf_add_tensor(ctx_out, meta);
|
||||
}
|
||||
|
||||
auto split_name = split_file_name(params.output, i_split, n_split);
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
|
||||
|
||||
fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
|
||||
fout = std::ofstream(split_name, std::ios::binary);
|
||||
fprintf(stderr, "%s: %s ...", __func__, split_path);
|
||||
fout = std::ofstream(split_path, std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
|
||||
auto meta_size = gguf_get_meta_size(ctx_out);
|
||||
|
@ -250,19 +249,23 @@ static void gguf_split(const split_params & split_params) {
|
|||
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
|
||||
if (!f_input.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
|
||||
exit(1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
|
||||
if (!ctx_gguf) {
|
||||
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
|
||||
exit(1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
|
||||
|
||||
char first_split_path[PATH_MAX] = {0};
|
||||
llama_split_path(first_split_path, sizeof(first_split_path),
|
||||
split_params.output.c_str(), strategy.i_split, strategy.n_split);
|
||||
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
|
||||
__func__, split_params.input.c_str(),
|
||||
split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
|
||||
first_split_path,
|
||||
split_params.n_split_tensors);
|
||||
|
||||
strategy.split_start();
|
||||
|
@ -298,7 +301,9 @@ static void gguf_merge(const split_params & split_params) {
|
|||
std::vector<ggml_context *> ctx_metas;
|
||||
std::vector<gguf_context *> ctx_ggufs;
|
||||
|
||||
std::string split_prefix;
|
||||
char split_path[PATH_MAX] = {0};
|
||||
strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
|
||||
char split_prefix[PATH_MAX] = {0};
|
||||
|
||||
// First pass to find KV and tensors metadata
|
||||
for (int i_split = 0; i_split < n_split; i_split++) {
|
||||
|
@ -309,89 +314,66 @@ static void gguf_merge(const split_params & split_params) {
|
|||
/*.ctx = */ &ctx_meta,
|
||||
};
|
||||
|
||||
auto split_name = split_params.input;
|
||||
if (i_split > 0) {
|
||||
split_name = split_file_name(split_prefix, i_split, n_split);
|
||||
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
|
||||
}
|
||||
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());
|
||||
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
|
||||
|
||||
auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
|
||||
auto * ctx_gguf = gguf_init_from_file(split_path, params);
|
||||
if (!ctx_gguf) {
|
||||
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
|
||||
exit(1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ctx_ggufs.push_back(ctx_gguf);
|
||||
ctx_metas.push_back(ctx_meta);
|
||||
|
||||
if (i_split == 0) {
|
||||
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
|
||||
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
|
||||
if (key_n_split < 0) {
|
||||
fprintf(stderr,
|
||||
"\n%s: input file does not contain %s metadata\n",
|
||||
__func__,
|
||||
LLM_KV_GENERAL_SPLIT_N_SPLIT);
|
||||
LLM_KV_SPLIT_COUNT);
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
|
||||
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
|
||||
if (n_split < 1) {
|
||||
fprintf(stderr,
|
||||
"\n%s: input file does not contain a valid split count %d\n",
|
||||
__func__,
|
||||
n_split);
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Verify the file naming and extract split_prefix
|
||||
if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
|
||||
fprintf(stderr, "\n%s: unexpected input file name: %s"
|
||||
" i_split=%d"
|
||||
" n_split=%d\n", __func__,
|
||||
split_path, i_split, n_split);
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Do not trigger merge if we try to merge again the output
|
||||
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
|
||||
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
|
||||
|
||||
// Set metadata from the first split
|
||||
gguf_set_kv(ctx_out, ctx_gguf);
|
||||
}
|
||||
|
||||
// Verify the file naming
|
||||
{
|
||||
int i_split_file = 0;
|
||||
int n_split_file = 0;
|
||||
const char * i_split_format = "-00000-of-00000.gguf";
|
||||
|
||||
if (split_name.size() < strlen(i_split_format)) {
|
||||
fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
|
||||
for (auto * _ctx_gguf : ctx_ggufs) {
|
||||
gguf_free(_ctx_gguf);
|
||||
}
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));
|
||||
|
||||
const char * split_name_c_str = split_name.c_str();
|
||||
int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
|
||||
|
||||
if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
|
||||
fprintf(stderr, "\n%s: unexpected input file name: %s"
|
||||
" i_split=%d i_split_file=%d"
|
||||
" n_split=%d n_split_file=%d\n", __func__,
|
||||
split_params.input.c_str(),
|
||||
i_split, i_split_file,
|
||||
n_split, n_split_file);
|
||||
for (auto * _ctx_gguf : ctx_ggufs) {
|
||||
gguf_free(_ctx_gguf);
|
||||
}
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
|
||||
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
|
||||
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
|
||||
|
@ -411,18 +393,19 @@ static void gguf_merge(const split_params & split_params) {
|
|||
|
||||
// Write tensors data
|
||||
for (int i_split = 0; i_split < n_split; i_split++) {
|
||||
auto split_name = split_file_name(split_prefix, i_split, n_split);
|
||||
std::ifstream f_input(split_name.c_str(), std::ios::binary);
|
||||
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
|
||||
std::ifstream f_input(split_path, std::ios::binary);
|
||||
if (!f_input.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
|
||||
for (auto * _ctx_gguf : ctx_ggufs) {
|
||||
gguf_free(_ctx_gguf);
|
||||
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
|
||||
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
|
||||
gguf_free(ctx_ggufs[i]);
|
||||
ggml_free(ctx_metas[i]);
|
||||
}
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());
|
||||
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
|
||||
|
||||
auto * ctx_gguf = ctx_ggufs[i_split];
|
||||
auto * ctx_meta = ctx_metas[i_split];
|
||||
|
@ -481,8 +464,8 @@ int main(int argc, const char ** argv) {
|
|||
break;
|
||||
case SPLIT_OP_MERGE: gguf_merge(params);
|
||||
break;
|
||||
default:split_print_usage(argv[0]);
|
||||
exit(1);
|
||||
default: split_print_usage(argv[0]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue