From a9db9b0048f3c3c9c191743596345b3175215187 Mon Sep 17 00:00:00 2001
From: Aleksei Nikiforov
Date: Tue, 21 Jan 2025 12:16:32 +0100
Subject: [PATCH] Implement --no-byteswap argument to disable byteswapping on
 big endian platform

---
 common/arg.cpp                         |  7 +++++++
 common/common.cpp                      | 13 +++++++-----
 common/common.h                        |  1 +
 .../convert-llama2c-to-ggml.cpp        |  5 +++--
 examples/export-lora/export-lora.cpp   |  5 +++--
 examples/gguf-hash/gguf-hash.cpp       |  5 +++--
 examples/gguf-split/gguf-split.cpp     | 10 ++++++----
 examples/gguf/gguf.cpp                 | 10 ++++++----
 examples/llava/clip.cpp                |  5 +++--
 ggml/include/gguf.h                    |  2 ++
 ggml/src/gguf.cpp                      | 17 ++++++++++++----
 include/llama.h                        |  4 +++-
 src/llama-adapter.cpp                  | 11 +++++-----
 src/llama-model-loader.cpp             | 20 +++++++++++--------
 src/llama-model-loader.h               |  4 +++-
 src/llama-model.cpp                    |  1 +
 src/llama-quant.cpp                    |  2 +-
 src/llama.cpp                          |  2 +-
 tests/test-gguf.cpp                    | 10 ++++++----
 19 files changed, 88 insertions(+), 46 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a6226a34b..ad36a4572 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1438,6 +1438,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = false;
         }
     ).set_env("LLAMA_ARG_NO_MMAP"));
+    add_opt(common_arg(
+        {"--no-byteswap"},
+        "don't byteswap model data on big endian systems (use if model is byteswapped to big endian in advance)",
+        [](common_params & params) {
+            params.no_byteswap = true;
+        }
+    ).set_env("LLAMA_NO_BYTESWAP"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
diff --git a/common/common.cpp b/common/common.cpp
index 6dea8e3d2..83d21470a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -987,7 +987,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str(), mparams.no_byteswap));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -1092,6 +1092,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap      = params.use_mmap;
     mparams.use_mlock     = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+    mparams.no_byteswap   = params.no_byteswap;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1418,8 +1419,9 @@ struct llama_model * common_load_model_from_url(
     int n_split = 0;
     {
         struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
+            /*.no_alloc    = */ true,
+            /*.ctx         = */ NULL,
+            /*.no_byteswap = */ false,
         };
         auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
         if (!ctx_gguf) {
@@ -2063,8 +2065,9 @@ static common_control_vector_data common_control_vector_load_one(const common_co
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ false,
-        /* .ctx      = */ &ctx,
+        /* .no_alloc    = */ false,
+        /* .ctx         = */ &ctx,
+        /* .no_byteswap = */ false,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
     if (!ctx_gguf) {
diff --git a/common/common.h b/common/common.h
index 571260372..1b9d79c79 100644
--- a/common/common.h
+++ b/common/common.h
@@ -307,6 +307,7 @@ struct common_params {
     bool no_kv_offload = false; // disable KV offloading
     bool warmup        = true;  // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_byteswap   = false; // skip byteswapping on big endian systems
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
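For reference, a minimal sketch of the flag's path through the common helpers: common_params.no_byteswap (set by --no-byteswap or the LLAMA_NO_BYTESWAP environment variable) is copied into llama_model_params by common_model_params_to_llama(). The snippet is illustrative only and is not part of the patch; it assumes the usual common.h and llama.h headers.

    #include "common.h"
    #include "llama.h"

    int main() {
        common_params params;
        params.no_byteswap = true; // same effect as --no-byteswap / LLAMA_NO_BYTESWAP=1

        // the helper changed above copies the flag into the C API model parameters
        llama_model_params mparams = common_model_params_to_llama(params);
        return mparams.no_byteswap ? 0 : 1;
    }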
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index bdf0eed2a..2990d6533 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -533,8 +533,9 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
         struct ggml_context * ctx_data = NULL;
 
         struct gguf_init_params params = {
-            /*.no_alloc = */ false,
-            /*.ctx      = */ &ctx_data,
+            /*.no_alloc    = */ false,
+            /*.ctx         = */ &ctx_data,
+            /*.no_byteswap = */ false,
         };
 
         struct gguf_context * ctx = gguf_init_from_file(filename, params);
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 91238e4be..e50dabf08 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -48,8 +48,9 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
 
 static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ ctx_ggml,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ ctx_ggml,
+        /*.no_byteswap = */ false,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
     if (!ctx_gguf) {
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
index 9523ec122..3ef8ca49b 100644
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -288,8 +288,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
+        /*.no_alloc    = */ false,
+        /*.ctx         = */ &ctx_data,
+        /*.no_byteswap = */ false,
     };
 
     // xxh64 init
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index ef3ceb686..bd2c18bd0 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -361,8 +361,9 @@ static void gguf_split(const split_params & split_params) {
     struct ggml_context * ctx_meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ &ctx_meta,
+        /*.no_byteswap = */ false,
     };
 
     std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
@@ -426,8 +427,9 @@ static void gguf_merge(const split_params & split_params) {
     struct ggml_context * ctx_meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ &ctx_meta,
+        /*.no_byteswap = */ false,
     };
 
     if (i_split > 0) {
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index f31989c8c..18c818694 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -85,8 +85,9 @@ static bool gguf_ex_write(const std::string & fname) {
 // just read tensor info
 static bool gguf_ex_read_0(const std::string & fname) {
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ NULL,
+        /*.no_alloc    = */ false,
+        /*.ctx         = */ NULL,
+        /*.no_byteswap = */ false,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
@@ -151,8 +152,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
+        /*.no_alloc    = */ false,
+        /*.ctx         = */ &ctx_data,
+        /*.no_byteswap = */ false,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
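All of the example programs above follow the same pattern: the new gguf_init_params field is initialized to false, so GGUF data is still byteswapped on big endian hosts by default. A small self-contained sketch of that call pattern (the file name is a placeholder, not part of the patch):

    #include "gguf.h"

    // Open a GGUF file and read only its metadata (no tensor data is allocated).
    static struct gguf_context * open_meta(const char * fname) {
        struct ggml_context * ctx_meta = NULL;
        struct gguf_init_params params = {
            /*.no_alloc    = */ true,
            /*.ctx         = */ &ctx_meta,
            /*.no_byteswap = */ false, // true only if the file was byteswapped to big endian in advance
        };
        return gguf_init_from_file(fname, params);
    }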
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 24073c5a9..b3bb83d23 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1122,8 +1122,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &meta,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ &meta,
+        /*.no_byteswap = */ false,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname, params);
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 79ee20206..47a511d34 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -74,6 +74,8 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
+
+        bool no_byteswap;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 602995c57..66906189d 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -218,13 +218,17 @@ struct gguf_context {
 
 struct gguf_reader {
     FILE * file;
+    bool no_byteswap = false;
 
     gguf_reader(FILE * file) : file(file) {}
+    gguf_reader(FILE * file, bool v_no_byteswap) : file(file), no_byteswap(v_no_byteswap) {}
 
     template <typename T>
     bool read(T & dst) const {
         auto res = fread(&dst, 1, sizeof(dst), file);
-        ggml_convert_from_le(&dst);
+        if (!no_byteswap) {
+            ggml_convert_from_le(&dst);
+        }
         return res == sizeof(dst);
     }
@@ -319,7 +323,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector
     std::vector<int8_t> & buf;
+    bool no_byteswap = false;
 
     gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}
@@ -1150,7 +1155,11 @@ struct gguf_writer {
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
             buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            buf.push_back(reinterpret_cast<const int8_t *>(&val)[sizeof(val) - i - 1]);
+            if (!no_byteswap) {
+                buf.push_back(reinterpret_cast<const int8_t *>(&val)[sizeof(val) - i - 1]);
+            } else {
+                buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
+            }
 #else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 #error Unexpected or undefined __BYTE_ORDER__
 #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
@@ -1321,7 +1330,7 @@ struct gguf_writer {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
         auto byteswap = ggml_get_type_traits(info.t.type)->byteswap;
-        if (byteswap != nullptr) {
+        if (byteswap != nullptr && !no_byteswap) {
             byteswap(buf.data() + offset, ggml_nelements(&(info.t)) / ggml_blck_size(info.t.type));
         }
 #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
diff --git a/include/llama.h b/include/llama.h
index 3b75e7607..23ab4c2c2 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -304,6 +304,7 @@ extern "C" {
         bool use_mmap;      // use mmap if possible
         bool use_mlock;     // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
+        bool no_byteswap;   // don't do byteswap, load pre-byteswapped big endian model on big endian system
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -542,7 +543,8 @@ extern "C" {
     // Load a LoRA adapter from file
     LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
-            const char * path_lora);
+            const char * path_lora,
+            bool no_byteswap);
 
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
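The gguf_reader and gguf_writer changes above gate the endianness conversion helpers behind the new flag. As a rough illustration of what that conversion amounts to for a single scalar (this is not the ggml implementation, only a sketch of the idea):

    #include <cstdint>

    // On a big endian host a little endian u32 read from a GGUF file has to be
    // byte-reversed before use; with no_byteswap set, the reader skips this step
    // because the file is assumed to already be in big endian byte order.
    static uint32_t u32_from_le(uint32_t v) {
    #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        return __builtin_bswap32(v);
    #else
        return v; // little endian host: bytes are already in the right order
    #endif
    }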
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 8a0800463..e3c6bac73 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -146,13 +146,14 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter, bool no_byteswap) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
     struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ true,
-        /* .ctx      = */ &ctx_init,
+        /* .no_alloc    = */ true,
+        /* .ctx         = */ &ctx_init,
+        /* .no_byteswap = */ no_byteswap,
     };
 
     gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
@@ -327,11 +328,11 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora, bool no_byteswap) {
     struct llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
-        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter, no_byteswap);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
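From the caller's side, the extended LoRA entry point looks like the sketch below ("adapter.gguf" is a placeholder path and `model` is assumed to be an already loaded llama_model pointer; the snippet is not part of the patch):

    #include "llama.h"

    static struct llama_adapter_lora * load_adapter(struct llama_model * model, bool no_byteswap) {
        // the third argument is new in this patch; pass true only for a pre-byteswapped adapter file
        return llama_adapter_lora_init(model, "adapter.gguf", no_byteswap);
    }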
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index cc98896fc..d2b887df7 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p) {
+        const struct llama_model_kv_override * param_overrides_p,
+        bool no_byteswap) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -460,8 +461,9 @@ llama_model_loader::llama_model_loader(
     // Load the main GGUF
     struct ggml_context * ctx = NULL;
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ &ctx,
+        /*.no_byteswap = */ no_byteswap,
     };
 
     meta.reset(gguf_init_from_file(fname.c_str(), params));
@@ -520,8 +522,9 @@ llama_model_loader::llama_model_loader(
             const char * fname_split = splits[idx].c_str();
 
             struct gguf_init_params split_params = {
-                /*.no_alloc = */ true,
-                /*.ctx      = */ &ctx,
+                /*.no_alloc    = */ true,
+                /*.ctx         = */ &ctx,
+                /*.no_byteswap = */ no_byteswap,
             };
             gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
             if (!ctx_gguf) {
@@ -681,8 +684,9 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
-    this->use_mmap = use_mmap;
+    this->use_mmap      = use_mmap;
     this->check_tensors = check_tensors;
+    this->no_byteswap   = no_byteswap;
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -1027,7 +1031,7 @@ bool llama_model_loader::load_all_data(
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
                 auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
-                if (byteswap != nullptr) {
+                if (byteswap != nullptr && !no_byteswap) {
                     byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type));
                 }
 #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
@@ -1063,7 +1067,7 @@ bool llama_model_loader::load_all_data(
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
                 auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
-                if (byteswap != nullptr) {
+                if (byteswap != nullptr && !no_byteswap) {
                     byteswap(read_buf.data(), read_buf.size() / ggml_blck_size(cur->type));
                 }
 #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index fe35404b2..24fd7f381 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -70,6 +70,7 @@ struct llama_model_loader {
     bool use_mmap = false;
     bool check_tensors;
+    bool no_byteswap = false;
 
     llama_files files;
     llama_ftype ftype;
@@ -95,7 +96,8 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p);
+        const struct llama_model_kv_override * param_overrides_p,
+        bool no_byteswap);
 
     template <typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 031b4c30b..38dd1f918 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3768,6 +3768,7 @@ struct llama_model_params llama_model_default_params() {
         /*.use_mmap      =*/ true,
         /*.use_mlock     =*/ false,
         /*.check_tensors =*/ false,
+        /*.no_byteswap   =*/ false,
     };
 
 #ifdef GGML_USE_METAL
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c91af4cbd..4cf8f3245 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, /*no_byteswap*/ false);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
diff --git a/src/llama.cpp b/src/llama.cpp
index e8cfe5012..6d5af9c3a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.no_byteswap);
 
         ml.print_info();
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
index dc87f5f0a..0b25c5846 100644
--- a/tests/test-gguf.cpp
+++ b/tests/test-gguf.cpp
@@ -758,8 +758,9 @@ static std::pair test_handcrafted_file(const unsigned int seed) {
         struct ggml_context * ctx = nullptr;
 
         struct gguf_init_params gguf_params = {
-            /*no_alloc =*/ false,
-            /*ctx      =*/ hft >= offset_has_data ? &ctx : nullptr,
+            /*no_alloc    =*/ false,
+            /*ctx         =*/ hft >= offset_has_data ? &ctx : nullptr,
+            /*no_byteswap =*/ false,
        };
 
        struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
@@ -1154,8 +1155,9 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned
     struct ggml_context * ctx_1 = nullptr;
 
     struct gguf_init_params gguf_params = {
-        /*no_alloc =*/ false,
-        /*ctx      =*/ only_meta ? nullptr : &ctx_1,
+        /*no_alloc    =*/ false,
+        /*ctx         =*/ only_meta ? nullptr : &ctx_1,
+        /*no_byteswap =*/ false,
     };
 
     struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
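Putting it together, an end-to-end sketch of the C API usage this patch enables; llama_model_load_from_file and llama_model_free are assumed to be the current loader entry points, "model.gguf" is a placeholder path, and the snippet is not part of the patch:

    #include "llama.h"

    int main() {
        struct llama_model_params mparams = llama_model_default_params();
        mparams.no_byteswap = true; // the GGUF file was byteswapped to big endian ahead of time

        struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }
        llama_model_free(model);
        return 0;
    }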