diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58a1805ba..425100ff8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,12 +43,6 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()
 
-if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
-    set(LLAMA_LLAMAFILE_DEFAULT OFF)
-else()
-    set(LLAMA_LLAMAFILE_DEFAULT ON)
-endif()
-
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)
 option(LLAMA_STATIC "llama: static link libraries" OFF)
diff --git a/README.md b/README.md
index 72b5a2c88..bb2ca9dbd 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
 
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
+- [x] LLaMA 3 🦙🦙🦙
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
@@ -119,8 +120,9 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
 - [x] [Xverse](https://huggingface.co/models?search=xverse)
-- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
@@ -135,6 +137,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
+- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 
 **HTTP server**
diff --git a/ci/run.sh b/ci/run.sh
index 085dfd42f..da05f0d48 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -160,7 +160,9 @@ function gg_run_test_scripts_debug {
 
     set -e
 
+    # TODO: too slow, run on dedicated node
     (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
 
     set +e
 }
@@ -184,6 +186,7 @@ function gg_run_test_scripts_release {
     set -e
 
     (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
 
     set +e
 }
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
old mode 100644
new mode 100755
index 879522f7e..57588204d
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -4,16 +4,16 @@
 
 if [ $# -lt 1 ]
 then
-  echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
-  echo "example: $0 ../../build/bin ../../tmp"
-  exit 1
+    echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
+    echo "example: $0 ../../build/bin ../../tmp"
+    exit 1
 fi
 
 if [ $# -gt 1 ]
 then
-  TMP_DIR=$2
+    TMP_DIR=$2
 else
-  TMP_DIR=/tmp
+    TMP_DIR=/tmp
 fi
 
 set -x
@@ -21,7 +21,7 @@ set -x
 SPLIT=$1/gguf-split
 MAIN=$1/main
 WORK_PATH=$TMP_DIR/gguf-split
-CUR_DIR=$(pwd)
+ROOT_DIR=$(realpath $(dirname $0)/../../)
 
 mkdir -p "$WORK_PATH"
 
@@ -30,8 +30,8 @@ rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
 
 # 1. Get a model
 (
-  cd $WORK_PATH
-  "$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+cd $WORK_PATH
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
 )
 echo PASS
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e431c7f70..f568f470c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1325,7 +1325,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
 }
 
 // Linear interpolation between two points
-inline float lerp(float s, float e, float t) {
+inline float clip_lerp(float s, float e, float t) {
     return s + (e - s) * t;
 }
 // Bilinear resize function
@@ -1347,17 +1347,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
             float y_lerp = py - y_floor;
 
             for (int c = 0; c < 3; c++) {
-                float top = lerp(
+                float top = clip_lerp(
                     static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
                     static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
                     x_lerp
                 );
-                float bottom = lerp(
+                float bottom = clip_lerp(
                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
                     x_lerp
                 );
-                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
             }
         }
     }
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 64cb6db19..da1850dfd 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -97,6 +97,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -300,6 +301,8 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+            params.keep_split = true;
         } else {
             usage(argv[0]);
         }
@@ -332,20 +335,28 @@ int main(int argc, char ** argv) {
 
     std::string fname_out;
 
     std::string ftype_str;
+    std::string suffix = ".gguf";
     if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
         arg_idx++;
         if (ftype_str == "COPY") {
             params.only_copy = true;
         }
     } else {
         fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
         arg_idx++;
 
         if (argc <= arg_idx) {
diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh
new file mode 100644
index 000000000..160c12bee
--- /dev/null
+++ b/examples/quantize/tests.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+    echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
+    echo "example: $0 ../../build/bin ../../tmp"
+    exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+    TMP_DIR=$2
+else
+    TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+QUANTIZE=$1/quantize
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/quantize
+ROOT_DIR=$(realpath $(dirname $0)/../../)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+cd $WORK_PATH
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep-split'
+$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test that the requantized model loads properly
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Requant model without '--keep-split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4a. Test that the requantized model loads properly
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index 1c6c5695f..72a0fbad8 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -4,9 +4,8 @@ set -eu
 
 if [ $# -lt 1 ]
 then
-  # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+    # Start @llama.cpp scenario
+    behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 else
-  behave "$@"
+    behave "$@"
 fi
-
diff --git a/ggml-impl.h b/ggml-impl.h
index 2ffacc299..2087f7ded 100644
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -11,6 +11,12 @@
 #include <string.h> // memcpy
 #include <math.h>   // fabsf
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/ggml-quants.c b/ggml-quants.c
index 11e11c219..c147531df 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -14,12 +14,6 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT
 
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 #define UNUSED GGML_UNUSED
 
 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
diff --git a/ggml.c b/ggml.c
index 086db96af..f574351f4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -858,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
diff --git a/llama.cpp b/llama.cpp
index 3a84b4916..0c01a3539 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2999,9 +2999,13 @@ struct llama_model_loader {
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+            }
         }
     };
 
     std::vector<llama_tensor_weight> weights;
@@ -3040,15 +3044,15 @@ struct llama_model_loader {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
+
         // Save tensors data offset of the main file.
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(0, cur->name, meta, cur);
+            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
         }
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
-        contexts.emplace_back(ctx);
-
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
 
@@ -3082,13 +3086,14 @@ struct llama_model_loader {
                 throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
             }
 
-            // Save tensors data offset info of the shard.
-            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(idx, cur->name, ctx_gguf, cur);
-            }
             files.emplace_back(new llama_file(split_path, "rb"));
             contexts.emplace_back(ctx);
 
+            // Save tensors data offset info of the shard.
+            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+            }
+
             gguf_free(ctx_gguf);
         }
@@ -3297,6 +3302,10 @@ struct llama_model_loader {
         return nullptr;
     }
 
+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
     const llama_tensor_weight & require_weight(const char * name) const {
         const llama_tensor_weight * weight = get_weight(name);
         if (!weight) {
@@ -14528,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<uint8_t>> work;
     std::vector<no_init<float>> f32_conv_buf;
 
+    uint16_t n_split = 1;
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (int i = 0; i < ml.n_tensors; ++i) {
+            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        }
+    }
+    std::vector<gguf_context*> ctx_outs(n_split, NULL);
+    ctx_outs[0] = ctx_out;
+
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
-        gguf_add_tensor(ctx_out, meta);
+        auto weight = ml.get_weight(i);
+        uint16_t i_split = params->keep_split ? weight->idx : 0;
+        struct ggml_tensor * tensor = weight->tensor;
+        if (ctx_outs[i_split] == NULL) {
+            ctx_outs[i_split] = gguf_init_empty();
+        }
+        gguf_add_tensor(ctx_outs[i_split], tensor);
     }
 
-    std::ofstream fout(fname_out, std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }
 
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }
 
-    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
-
-    // placeholder for the meta data
-    ::zeros(fout, meta_size);
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
 
     const auto tn = LLM_TN(model.arch);
-
+    new_ofstream(0);
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight->idx);
+        }
 
         const std::string name = ggml_get_name(tensor);
 
@@ -14702,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_new += new_size;
 
         // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
-
-    // go back to beginning of file and write the updated meta data
-    {
-        fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *) data.data(), data.size());
+    close_ofstream();
+    for (auto & c:ctx_outs) {
+        gguf_free(c);
     }
-    fout.close();
-
-    gguf_free(ctx_out);
-
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
@@ -15077,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.quantize_output_tensor =*/ true,
         /*.only_copy              =*/ false,
        /*.pure                   =*/ false,
+        /*.keep_split             =*/ false,
         /*.imatrix                =*/ nullptr,
         /*.kv_overrides           =*/ nullptr,
     };
@@ -16081,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
  * */
 static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+    llama_synchronize(ctx);
+
     // copy rng
     {
         std::ostringstream rng_ss;
@@ -16233,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
 
 // Sets the state reading from the specified source address
 size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
+    llama_synchronize(ctx);
+
     const uint8_t * inp = src;
 
     // set rng
@@ -16537,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
 }
 
 static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+    llama_synchronize(ctx);
+
     const auto & kv_self = ctx->kv_self;
     GGML_ASSERT(!kv_self.recurrent); // not implemented
 
@@ -16654,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
 }
 
 size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+    llama_synchronize(ctx);
+
     auto & kv_self = ctx->kv_self;
     GGML_ASSERT(!kv_self.recurrent); // not implemented
 
diff --git a/llama.h b/llama.h
index 0eb2a1e9a..8aa763672 100644
--- a/llama.h
+++ b/llama.h
@@ -288,6 +288,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                   // quantize all tensors to the default type
+        bool keep_split;             // quantize to the same number of shards
         void * imatrix;              // pointer to importance matrix data
         void * kv_overrides;         // pointer to vector containing overrides
     } llama_model_quantize_params;
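
For reference, the sketch below shows how the new keep_split flag could be driven through the public C API instead of the quantize tool. It is not part of the patch: the file names are placeholders, LLAMA_FTYPE_MOSTLY_Q4_K_M is just one possible target type, and error handling is reduced to the return code.

    // Hypothetical driver for the keep_split option (not part of this patch).
    // Assumes llama.h from this revision; input/output names are placeholders.
    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype            = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
        params.allow_requantize = true;                      // the example input is already quantized (Q8_0)
        params.keep_split       = true;                      // emit one output shard per input shard

        // With keep_split set, the quantizer treats the output name as a prefix and derives
        // per-shard names via llama_split_path(), so no ".gguf" suffix is given here.
        const uint32_t rc = llama_model_quantize(
            "ggml-model-split-00001-of-00006.gguf", // first input shard; the loader finds the rest
            "ggml-model-requant",                   // output prefix
            &params);

        llama_backend_free();
        return rc == 0 ? 0 : 1;
    }

As in examples/quantize/tests.sh above, the resulting shards then load like any other split model, e.g. via main --model ggml-model-requant-00001-of-00006.gguf.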