From b804b1ef77351d2a11be945462c6c251710476cb Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 11 Apr 2024 14:51:07 +0200 Subject: [PATCH 01/12] eval-callback: Example how to use eval callback for debugging (#6576) * gguf-debug: Example how to use ggml callback for debugging * gguf-debug: no mutex, verify type, fix stride. * llama: cv eval: move cb eval field in common gpt_params * ggml_debug: use common gpt_params to pass cb eval. Fix get tensor SIGV random. * ggml_debug: ci: add tests * ggml_debug: EOL in CMakeLists.txt * ggml_debug: Remove unused param n_batch, no batching here * ggml_debug: fix trailing spaces * ggml_debug: fix trailing spaces * common: fix cb_eval and user data not initialized * ci: build revert label * ggml_debug: add main test label * doc: add a model: add a link to ggml-debug * ggml-debug: add to make toolchain * ggml-debug: tests add the main label * ggml-debug: ci add test curl label * common: allow the warmup to be disabled in llama_init_from_gpt_params * ci: add curl test * ggml-debug: better tensor type support * gitignore : ggml-debug * ggml-debug: printing also the sum of each tensor * ggml-debug: remove block size * eval-callback: renamed from ggml-debug * eval-callback: fix make toolchain --------- Co-authored-by: slaren Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 8 +- .gitignore | 1 + Makefile | 6 +- common/common.cpp | 4 +- common/common.h | 4 + docs/HOWTO-add-model.md | 2 + examples/CMakeLists.txt | 1 + examples/eval-callback/CMakeLists.txt | 9 ++ examples/eval-callback/README.md | 95 ++++++++++++ examples/eval-callback/eval-callback.cpp | 185 +++++++++++++++++++++++ examples/imatrix/imatrix.cpp | 24 ++- llama.cpp | 2 +- 12 files changed, 319 insertions(+), 22 deletions(-) create mode 100644 examples/eval-callback/CMakeLists.txt create mode 100644 examples/eval-callback/README.md create mode 100644 examples/eval-callback/eval-callback.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ff7238aba..f10ed4161 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -52,7 +52,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest -L 'main|curl' --verbose --timeout 900 - name: Determine tag name id: tag @@ -209,21 +209,21 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential + sudo apt-get install build-essential libcurl4-openssl-dev - name: Build id: cmake_build run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON cmake --build . 
--config Release -j $(nproc) - name: Test id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest -L 'main|curl' --verbose --timeout 900 - name: Test llama2c conversion id: llama2c_test diff --git a/.gitignore b/.gitignore index 9fb5b80c3..fdc5184a1 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ models-mnt /convert-llama2c-to-ggml /embd-input-test /embedding +/eval-callback /gguf /gguf-llama-simple /gguf-split diff --git a/Makefile b/Makefile index 11b31c5c8..2fd805a97 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests @@ -800,6 +800,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/common/common.cpp b/common/common.cpp index 98fc8388c..dda514785 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.pooling_type = params.pooling_type; cparams.defrag_thold = params.defrag_thold; + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); @@ -2192,7 +2194,7 @@ std::tuple llama_init_from_gpt_par params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } - { + if (params.warmup) { LOG("warming up the model with an empty run\n"); std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; diff --git a/common/common.h b/common/common.h index a7f476c1b..65272b0ba 100644 --- a/common/common.h +++ b/common/common.h @@ -80,6 +80,9 @@ struct gpt_params { int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + ggml_backend_sched_eval_callback cb_eval = nullptr; + void * cb_eval_user_data = nullptr; + ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; @@ -156,6 +159,7 @@ struct gpt_params { bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading + bool warmup = true; // 
warmup run std::string cache_type_k = "f16"; // KV cache data type for the K std::string cache_type_v = "f16"; // KV cache data type for the V diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md index 3581f3e65..a56b78344 100644 --- a/docs/HOWTO-add-model.md +++ b/docs/HOWTO-add-model.md @@ -100,6 +100,8 @@ Have a look to existing implementation like `build_llama`, `build_dbrx` or `buil When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR. +Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback). + ## GGUF specification https://github.com/ggerganov/ggml/blob/master/docs/gguf.md diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 76496bf06..f421769cc 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,6 +19,7 @@ else() add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) + add_subdirectory(eval-callback) add_subdirectory(finetune) add_subdirectory(gritlm) add_subdirectory(gguf-split) diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt new file mode 100644 index 000000000..d53f37422 --- /dev/null +++ b/examples/eval-callback/CMakeLists.txt @@ -0,0 +1,9 @@ +set(TARGET eval-callback) +add_executable(${TARGET} eval-callback.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TEST_TARGET test-eval-callback) +add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42) +set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md new file mode 100644 index 000000000..66a37e878 --- /dev/null +++ b/examples/eval-callback/README.md @@ -0,0 +1,95 @@ +# llama.cpp/examples/eval-callback + +A simple example which demonstrates how to use callback during the inference. +It simply prints to the console all operations and tensor data. + +Usage: + +```shell +eval-callback \ + --hf-repo ggml-org/models \ + --hf-file phi-2/ggml-model-q4_0.gguf \ + --model phi-2-q4_0.gguf \ + --prompt hello \ + --seed 42 \ + -ngl 33 +``` + +Will print: + +```shell +llm_load_tensors: offloaded 33/33 layers to GPU +... +llama_new_context_with_model: n_ctx = 512 +... 
+llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB +llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB +llama_new_context_with_model: graph nodes = 1225 +llama_new_context_with_model: graph splits = 2 +ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.0181, 0.0272, 0.0272, ...], + ], + ] +ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -0.6989, 1.0636, 1.0636, ...], + ], + ] +ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.1800, 0.2817, 0.2632, ...], + ], + ] +ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.1863, 0.2970, 0.2604, ...], + ], + ] +ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1} + [ + [ + [ -1.1238, 1.2876, -1.8086, ...], + ], + ] +ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + [ -0.3608, 0.5076, -1.8866, ...], + [ 1.7643, 0.0273, -2.1065, ...], + ... + ], + ] +ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + [ -0.3608, 0.5076, -1.8866, ...], + [ 1.7643, 0.0273, -2.1065, ...], + ... + ], + ] +``` diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp new file mode 100644 index 000000000..f70d62128 --- /dev/null +++ b/examples/eval-callback/eval-callback.cpp @@ -0,0 +1,185 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include +#include +#include + +/** + * This the arbitrary data which will be passed to each callback. + * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. 
+ */ +struct callback_data { + std::vector data; +}; + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { + float sum = 0; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + printf(" [\n"); + for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) { + printf(" [\n"); + for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) { + printf(" ["); + for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) { + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i); + } else if (type == GGML_TYPE_F32) { + v = *(float *) data + i; + } else if (type == GGML_TYPE_I32) { + v = (float) *(int32_t *) data + i; + } else if (type == GGML_TYPE_I16) { + v = (float) *(int16_t *) data + i; + } else if (type == GGML_TYPE_I8) { + v = (float) *(int8_t *) data + i; + } else { + GGML_ASSERT(false); + } + printf("%8.4f", v); + sum += v; + if (i0 < ne[0] - 1 && i0 < n - 1) printf(", "); + } + if (ne[0] > n) printf(", ..."); + printf("],\n"); + } + if (ne[1] > n) printf(" ...\n"); + printf(" ],\n"); + } + if (ne[2] > n) printf(" ...\n"); + printf(" ]\n"); + printf(" sum = %f\n", sum); + } +} + +/** + * GGML operations callback during the graph execution. + * + * @param t current tensor + * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor + * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. + * see ggml_backend_sched_eval_callback + * @param user_data user data to pass at each call back + * @return true to receive data or continue the graph, false otherwise + */ +static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (callback_data *) user_data; + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + if (ask) { + return true; // Always retrieve data + } + + char src1_str[128] = {0}; + if (src1) { + sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); + } + + printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_name(t->op), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? src1_str : "", + ggml_ne_string(t).c_str()); + + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + if (!is_host) { + auto n_bytes = ggml_nbytes(t); + cb_data->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + } + + if (!ggml_is_quantized(t->type)) { + uint8_t * data = is_host ? 
(uint8_t *) t->data : cb_data->data.data(); + ggml_print_tensor(data, t->type, t->ne, t->nb, 3); + } + + return true; +} + +static bool run(llama_context * ctx, const gpt_params & params) { + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + + return true; +} + +int main(int argc, char ** argv) { + + callback_data cb_data; + + gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + print_build_info(); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(); + llama_numa_init(params.numa); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = ggml_debug; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + + // init + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); + return 1; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); + } + + bool OK = run(ctx, params); + if (!OK) { + return 1; + } + + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 1bf55f90c..ff624c539 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -597,24 +597,18 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model_params mparams = llama_model_params_from_gpt_params(params); - - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return 1; - } - - llama_context_params cparams = llama_context_params_from_gpt_params(params); - // pass the callback to the backend scheduler // it will be executed for each node during the graph computation - cparams.cb_eval = ik_collect_imatrix; - cparams.cb_eval_user_data = NULL; + params.cb_eval = ik_collect_imatrix; + params.cb_eval_user_data = NULL; + params.warmup = false; - llama_context * ctx = llama_new_context_with_model(model, cparams); - if (ctx == NULL) { - fprintf(stderr, "%s: error: unable to create context\n", __func__); + // init + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); return 1; } diff --git a/llama.cpp b/llama.cpp index 9ad9b10cb..b6e2ade91 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11121,7 +11121,7 @@ struct llm_tokenizer_bpe { add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol } - // add the fnished tokens to the final list keeping correct order for next and prev + // add the finished tokens to the final list keeping correct order for next and prev for (auto & sym : symbols) { if (sym.n > 0) { sym.prev = final_prev_index; From f4183afe6a22f356ee222a710686ae7f83dbd949 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 11 Apr 2024 15:22:47 +0200 Subject: [PATCH 
02/12] scripts : add --outdir option to hf.sh (#6600) * scripts : add --outdir option to hf.sh This commit adds an option to the hf.sh script that allows the user to specify an output directory for the downloaded file. The motivation for this changes is that examples that use the hf.sh script to download models from huggingface can now specify the output directory, perhaps to the `models` directory to keep them in one place and not clutter the root directory. Signed-off-by: Daniel Bevenius * squash! scripts : add --outdir option to hf.sh Fix format of the --outdir option in the usage message. Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius --- scripts/hf.sh | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/scripts/hf.sh b/scripts/hf.sh index 1e9e5a6ea..58f83d6fe 100755 --- a/scripts/hf.sh +++ b/scripts/hf.sh @@ -3,9 +3,9 @@ # Shortcut for downloading HF models # # Usage: -# ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) -# ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) -# ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) # # all logs go to stderr @@ -14,7 +14,7 @@ function log { } function usage { - log "Usage: $0 [[--url] ] [--repo ] [--file ] [-h|--help]" + log "Usage: $0 [[--url] ] [--repo ] [--file ] [--outdir [-h|--help]" exit 1 } @@ -26,9 +26,9 @@ function has_cmd { } if has_cmd wget; then - cmd="wget -q --show-progress -c -O %s %s" + cmd="wget -q --show-progress -c -O %s/%s %s" elif has_cmd curl; then - cmd="curl -C - -f -o %s -L %s" + cmd="curl -C - -f --output-dir %s -o %s -L %s" else log "[E] curl or wget not found" exit 1 @@ -37,6 +37,7 @@ fi url="" repo="" file="" +outdir="." # parse args while [[ $# -gt 0 ]]; do @@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do file="$2" shift 2 ;; + --outdir) + outdir="$2" + shift 2 + ;; -h|--help) usage ;; @@ -94,10 +99,10 @@ basename=$(basename $url) log "[+] attempting to download $basename" if [ -n "$cmd" ]; then - cmd=$(printf "$cmd" "$basename" "$url") + cmd=$(printf "$cmd" "$outdir" "$basename" "$url") log "[+] $cmd" if $cmd; then - echo $basename + echo $outdir/$basename exit 0 fi fi From 1bbdaf6ecda6f0a360dfb307b256fcb6838c560b Mon Sep 17 00:00:00 2001 From: Hugo Roussel Date: Thu, 11 Apr 2024 19:52:21 +0200 Subject: [PATCH 03/12] ci: download artifacts to release directory (#6612) When action download-artifact was updated to v4, the default download path changed. This fix binaries not being uploaded to releases. 
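As an aside on the `--outdir` option added to `scripts/hf.sh` in the patch above, here is a minimal usage sketch (illustrative only; the repo and file names are simply the ones already quoted in the script's own usage comments):

```sh
# download the model into ./models instead of the current directory
./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF \
                --file mixtral-8x7b-v0.1.Q4_K_M.gguf \
                --outdir ./models

# on success the script echoes the downloaded path (outdir/basename),
# so it can be passed straight to ./main as before
./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf --outdir ./models)
```
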
--- .github/workflows/build.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f10ed4161..0527899a1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -938,6 +938,12 @@ jobs: - name: Download artifacts id: download-artifact uses: actions/download-artifact@v4 + with: + path: ./artifact + + - name: Move artifacts + id: move_artifacts + run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release - name: Create release id: create_release @@ -956,7 +962,7 @@ jobs: const path = require('path'); const fs = require('fs'); const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact')) { + for (let file of await fs.readdirSync('./artifact/release')) { if (path.extname(file) === '.zip') { console.log('uploadReleaseAsset', file); await github.repos.uploadReleaseAsset({ @@ -964,7 +970,7 @@ jobs: repo: context.repo.repo, release_id: release_id, name: file, - data: await fs.readFileSync(`./artifact/${file}`) + data: await fs.readFileSync(`./artifact/release/${file}`) }); } } From cbaadc92942c50aab599a9e4c163afc1f44f7c26 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Thu, 11 Apr 2024 19:47:34 +0100 Subject: [PATCH 04/12] grammars: 1.5x faster inference w/ complex grammars (vector reserves / reuses) (#6609) * grammars: reserve rejects & next candidates * grammars: reuse new_stacks * grammars: fix missing sig change in llama.h * grammars: fix test (api changed) * grammars: update gbnf-validator.cpp * grammars: simpler syntax (no swap) --- examples/gbnf-validator/gbnf-validator.cpp | 2 +- llama.cpp | 16 ++++++++++------ llama.h | 5 +++-- tests/test-grammar-integration.cpp | 6 +++--- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index e4c0c1689..091069ffa 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st size_t pos = 0; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { auto prev_stacks = grammar->stacks; - grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); if (grammar->stacks.empty()) { error_pos = pos; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; diff --git a/llama.cpp b/llama.cpp index b6e2ade91..ad07059c4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11912,12 +11912,13 @@ static void llama_grammar_advance_stack( // be positioned at a character range (see `llama_grammar_advance_stack`), and // produces the N possible stacks if the given char is accepted at those // positions -std::vector> llama_grammar_accept( +void llama_grammar_accept( const std::vector> & rules, const std::vector> & stacks, - const uint32_t chr) { + const uint32_t chr, + std::vector> & new_stacks) { - std::vector> new_stacks; + new_stacks.clear(); for (const auto & stack : stacks) { if (stack.empty()) { @@ -11936,8 +11937,6 @@ std::vector> llama_grammar_accept( llama_grammar_advance_stack(rules, new_stack, new_stacks); } } - - return new_stacks; } static std::vector llama_grammar_reject_candidates( @@ -11951,6 +11950,7 @@ static std::vector llama_grammar_reject_candidates_for_ const std::vector & candidates) { std::vector rejects; + 
rejects.reserve(candidates.size()); if (stack.empty()) { for (const auto & tok : candidates) { @@ -11964,6 +11964,8 @@ static std::vector llama_grammar_reject_candidates_for_ const llama_grammar_element * stack_pos = stack.back(); std::vector next_candidates; + next_candidates.reserve(candidates.size()); + for (const auto & tok : candidates) { if (*tok.code_points == 0) { // reached end of full codepoints in token, reject iff it ended in a partial sequence @@ -12771,8 +12773,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar // Note terminating 0 in decoded string const auto decoded = decode_utf8(piece, grammar->partial_utf8); const auto & code_points = decoded.first; + std::vector> tmp_new_stacks; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks); + grammar->stacks = tmp_new_stacks; } grammar->partial_utf8 = decoded.second; GGML_ASSERT(!grammar->stacks.empty()); diff --git a/llama.h b/llama.h index b770a275f..b5da686f7 100644 --- a/llama.h +++ b/llama.h @@ -1097,10 +1097,11 @@ const std::vector> & llama_internal struct llama_context * ctx ); -std::vector> llama_grammar_accept( +void llama_grammar_accept( const std::vector> & rules, const std::vector> & stacks, - const uint32_t chr); + const uint32_t chr, + std::vector> & new_stacks); std::pair, llama_partial_utf8> decode_utf8( const std::string & src, diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 0a9c3b6f5..2d8f228e3 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -38,7 +38,7 @@ number ::= [0-9]+)"""; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { auto prev_stacks = grammar->stacks; - grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); assert(!grammar->stacks.empty()); } @@ -138,7 +138,7 @@ ws ::= [ \t\n\r]?)"""; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { ++pos; auto prev_stacks = grammar->stacks; - grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); // Expect that each code point will not cause the grammar to fail if (grammar->stacks.empty()) { @@ -173,7 +173,7 @@ ws ::= [ \t\n\r]?)"""; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { auto prev_stacks = grammar->stacks; - grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); if (grammar->stacks.empty()) { parse_failed = true; break; From a474f50ebb3e10be3371562f75f3f573f1a86b5f Mon Sep 17 00:00:00 2001 From: Nikolas <127742645+nneubacher@users.noreply.github.com> Date: Thu, 11 Apr 2024 21:56:29 +0200 Subject: [PATCH 05/12] Refactor Error Handling for CUDA (#6575) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Refactor Error Handling for CUDA Add guidance for setting CUDA_DOCKER_ARCH to match GPU compute capability for CUDA versions < 11.7. Include link to NVIDIA's CUDA GPUs documentation for compute capability reference. 
* Update Makefile Improved wording Co-authored-by: Johannes Gäßler --------- Co-authored-by: Johannes Gäßler --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2fd805a97..7a69ad1b3 100644 --- a/Makefile +++ b/Makefile @@ -646,7 +646,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])' ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH ifndef CUDA_POWER_ARCH -$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH) +$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus ) endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) From f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7 Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Thu, 11 Apr 2024 17:44:48 -0400 Subject: [PATCH 06/12] As suggested by @slaren, disabling Metal for test to fix CI build on OSX from #6576 (#6619) --- examples/eval-callback/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index d53f37422..c56ba780b 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -5,5 +5,5 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42) +add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) From 04a5ac211ef40936295980b7cdf0ba6e97093146 Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Thu, 11 Apr 2024 21:44:50 -0400 Subject: [PATCH 07/12] Optimization: eliminate addition of redundant stacks when advancing grammar. 
(#6616) --- llama.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index ad07059c4..73ff60706 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11861,7 +11861,9 @@ static void llama_grammar_advance_stack( std::vector> & new_stacks) { if (stack.empty()) { - new_stacks.emplace_back(stack); + if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { + new_stacks.emplace_back(stack); + } return; } @@ -11898,7 +11900,10 @@ static void llama_grammar_advance_stack( } case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR_NOT: - new_stacks.emplace_back(stack); + if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { + // only add the stack if it's not a duplicate of one we already have + new_stacks.emplace_back(stack); + } break; default: // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range From 9ed2737acc233716374860e6b2ea7399c4aae29e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Apr 2024 11:15:05 +0300 Subject: [PATCH 08/12] ci : disable Metal for macOS-latest-cmake-x64 (#6628) --- .github/workflows/build.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0527899a1..63143bc94 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -101,7 +101,9 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. + # Metal is disabled due to intermittent failures with Github runners not having a GPU: + # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test From 81da18e71ccfc196d4516fbea5dc3a6a1f92dccb Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 12 Apr 2024 10:26:47 +0200 Subject: [PATCH 09/12] eval-callback: use ggml_op_desc to pretty print unary operator name (#6631) --- examples/eval-callback/eval-callback.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index f70d62128..05f7d6ab1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -93,7 +93,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_name(t->op), + t->name, ggml_type_name(t->type), ggml_op_desc(t), src0->name, ggml_ne_string(src0).c_str(), src1 ? src1_str : "", ggml_ne_string(t).c_str()); From dee7f8d6928cc680cc969f7d93f98c3e24dcad41 Mon Sep 17 00:00:00 2001 From: MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com> Date: Fri, 12 Apr 2024 16:28:12 +0800 Subject: [PATCH 10/12] Correct free memory and total memory. 
(#6630) Co-authored-by: MasterYi --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 73ff60706..dad2c4fbf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1638,17 +1638,17 @@ static size_t llama_get_device_memory(int device) { #if defined(GGML_USE_CUDA) size_t total; size_t free; - ggml_backend_cuda_get_device_memory(device, &total, &free); + ggml_backend_cuda_get_device_memory(device, &free, &total); return free; #elif defined(GGML_USE_SYCL) size_t total; size_t free; - ggml_backend_sycl_get_device_memory(device, &total, &free); + ggml_backend_sycl_get_device_memory(device, &free, &total); return free; #elif defined(GGML_USE_VULKAN) size_t total; size_t free; - ggml_backend_vk_get_device_memory(device, &total, &free); + ggml_backend_vk_get_device_memory(device, &free, &total); return free; #else return 1; From ef21ce4ccb41164cb52997bd2210d92bc6a6c5d1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 Apr 2024 11:49:58 +0300 Subject: [PATCH 11/12] imatrix : remove invalid assert (#6632) --- examples/imatrix/imatrix.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index ff624c539..73609d3e6 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -107,9 +107,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // the top-k selected expert ids are stored in the ids tensor // for simplicity, always copy ids to host, because it is small - // take into account that ids is not contiguous! GGML_ASSERT(ids->ne[1] == src1->ne[1]); - GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int))); m_ids.resize(ggml_nbytes(ids)/sizeof(int)); ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); From 5c4d767ac028c0f9c31cba3fceaf765c6097abfc Mon Sep 17 00:00:00 2001 From: Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:52:36 +0200 Subject: [PATCH 12/12] chore: Fix markdown warnings (#6625) --- README-sycl.md | 94 +++++++++++++++--------------- README.md | 38 ++++++------ SECURITY.md | 6 +- examples/llava/MobileVLM-README.md | 2 +- examples/llava/README.md | 2 +- examples/main/README.md | 2 +- examples/perplexity/README.md | 31 +++++----- examples/quantize/README.md | 22 +++---- 8 files changed, 98 insertions(+), 99 deletions(-) diff --git a/README-sycl.md b/README-sycl.md index 4372a32e3..169d2ca0b 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -8,9 +8,9 @@ - [Linux](#linux) - [Windows](#windows) - [Environment Variable](#environment-variable) -- [Known Issue](#known-issue) -- [Q&A](#q&a) -- [Todo](#todo) +- [Known Issue](#known-issues) +- [Q&A](#qa) +- [TODO](#todo) ## Background @@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, ## OS -|OS|Status|Verified| -|-|-|-| -|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39| -|Windows|Support|Windows 11| +| OS | Status | Verified | +|---------|---------|------------------------------------| +| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 | +| Windows | Support | Windows 11 | ## Hardware @@ -66,13 +66,13 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, **Verified devices** -|Intel GPU| Status | Verified Model| -|-|-|-| -|Intel Data Center Max Series| Support| Max 1550| -|Intel Data Center Flex Series| Support| Flex 170| -|Intel Arc Series| Support| Arc 770, 730M| -|Intel built-in Arc GPU| Support| 
built-in Arc GPU in Meteor Lake| -|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7| +| Intel GPU | Status | Verified Model | +|-------------------------------|---------|---------------------------------------| +| Intel Data Center Max Series | Support | Max 1550 | +| Intel Data Center Flex Series | Support | Flex 170 | +| Intel Arc Series | Support | Arc 770, 730M | +| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | +| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 | *Notes:* @@ -89,10 +89,10 @@ The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvi **Verified devices** -|Nvidia GPU| Status | Verified Model| -|-|-|-| -|Ampere Series| Support| A100, A4000| -|Ampere Series *(Mobile)*| Support| RTX 40 Series| +| Nvidia GPU | Status | Verified Model | +|--------------------------|---------|----------------| +| Ampere Series | Support | A100, A4000 | +| Ampere Series *(Mobile)* | Support | RTX 40 Series | *Notes:* - Support for Nvidia targets through oneAPI is currently limited to Linux platforms. @@ -167,7 +167,7 @@ Platform #0: Intel(R) OpenCL HD Graphics - **Nvidia GPU** -In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed. +In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed. Installation can be verified by running the following: ```sh nvidia-smi @@ -313,10 +313,10 @@ found 6 SYCL devices: | 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -|Attribute|Note| -|-|-| -|compute capability 1.3|Level-zero driver/runtime, recommended | -|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases| +| Attribute | Note | +|------------------------|-------------------------------------------------------------| +| compute capability 1.3 | Level-zero driver/runtime, recommended | +| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases | 4. Launch inference @@ -325,10 +325,10 @@ There are two device selection modes: - Single device: Use one device target specified by the user. - Multiple devices: Automatically select the devices with the same largest Max compute-units. -|Device selection|Parameter| -|-|-| -|Single device|--split-mode none --main-gpu DEVICE_ID | -|Multiple devices|--split-mode layer (default)| +| Device selection | Parameter | +|------------------|----------------------------------------| +| Single device | --split-mode none --main-gpu DEVICE_ID | +| Multiple devices | --split-mode layer (default) | Examples: @@ -486,10 +486,10 @@ found 6 SYCL devices: ``` -|Attribute|Note| -|-|-| -|compute capability 1.3|Level-zero running time, recommended | -|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| +| Attribute | Note | +|------------------------|-----------------------------------------------------------| +| compute capability 1.3 | Level-zero running time, recommended | +| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases | 4. Launch inference @@ -499,10 +499,10 @@ There are two device selection modes: - Single device: Use one device assigned by user. - Multiple devices: Automatically choose the devices with the same biggest Max compute units. 
-|Device selection|Parameter| -|-|-| -|Single device|--split-mode none --main-gpu DEVICE_ID | -|Multiple devices|--split-mode layer (default)| +| Device selection | Parameter | +|------------------|----------------------------------------| +| Single device | --split-mode none --main-gpu DEVICE_ID | +| Multiple devices | --split-mode layer (default) | Examples: @@ -540,20 +540,20 @@ use 1 SYCL GPUs: [0] with Max compute units:512 #### Build -|Name|Value|Function| -|-|-|-| -|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.| -|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.| -|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.| -|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.| -|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.| +| Name | Value | Function | +|--------------------|-----------------------------------|---------------------------------------------| +| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. | +| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | +| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. | +| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | #### Runtime -|Name|Value|Function| -|-|-|-| -|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG| -|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer| +| Name | Value | Function | +|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| +| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | +| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | ## Known Issues @@ -591,6 +591,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512 ### **GitHub contribution**: Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay. -## Todo +## TODO - Support row layer split for multiple card runs. diff --git a/README.md b/README.md index 462b1b180..00a487fc6 100644 --- a/README.md +++ b/README.md @@ -485,14 +485,14 @@ Building the program with BLAS support may lead to some performance improvements The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: - | Option | Legal values | Default | Description | - |--------------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | - | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | + | Option | Legal values | Default | Description | + |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. 
| + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | + | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | - #### hipBLAS @@ -534,11 +534,11 @@ Building the program with BLAS support may lead to some performance improvements If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): - | Option | Legal values | Default | Description | - |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | Option | Legal values | Default | Description | + |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. 
| - #### CLBlast @@ -746,11 +746,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. | Model | Original size | Quantized size (Q4_0) | -|------:|--------------:|-----------------------:| -| 7B | 13 GB | 3.9 GB | -| 13B | 24 GB | 7.8 GB | -| 30B | 60 GB | 19.5 GB | -| 65B | 120 GB | 38.5 GB | +|------:|--------------:|----------------------:| +| 7B | 13 GB | 3.9 GB | +| 13B | 24 GB | 7.8 GB | +| 30B | 60 GB | 19.5 GB | +| 65B | 120 GB | 38.5 GB | ### Quantization @@ -758,7 +758,7 @@ Several quantization methods are supported. They differ in the resulting model d *(outdated)* -| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | +| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:| | 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 | | 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G | diff --git a/SECURITY.md b/SECURITY.md index 14504b1bf..f4322c6ee 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your 1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity. -1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring. +2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring. -1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk. +3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk. -1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time. +4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time. ## Reporting a vulnerability diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 96b048525..413e433dd 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. 
For example: ## Model conversion -- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally: +1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally: ```sh git clone https://huggingface.co/mtgv/MobileVLM-1.7B diff --git a/examples/llava/README.md b/examples/llava/README.md index 67cb0f22b..d4810d42e 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example: ## LLaVA 1.5 -- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: +1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: ```sh git clone https://huggingface.co/liuhaotian/llava-v1.5-7b diff --git a/examples/main/README.md b/examples/main/README.md index bb696b562..10a589ceb 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -310,7 +310,7 @@ These options help improve the performance and memory usage of the LLaMA models. ### Quantization -For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run). +For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize). ## Additional Options diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md index 50e1af011..1a8c0dd64 100644 --- a/examples/perplexity/README.md +++ b/examples/perplexity/README.md @@ -3,19 +3,18 @@ TODO ## Llama 2 70B Scorechart -Quantization | Model size (GiB) | Perplexity | Delta to fp16 --- | -- | -- | -- -Q4_0 | 36.20 | 3.5550 | 3.61% -Q4_1 | 40.20 | 3.5125 | 2.37% -Q5_0 | 44.20 | 3.4744 | 1.26% -Q2_K | 27.27 | 3.7339 | 8.82% -Q3_K_S | 27.86 | 3.7019 | 7.89% -Q3_K_M | 30.83 | 3.5932 | 4.72% -Q3_K_L | 33.67 | 3.5617 | 3.80% -Q4_K_S | 36.39 | 3.4852 | 1.57% -Q4_K_M | 38.54 | 3.4725 | 1.20% -Q5_K_S | 44.20 | 3.4483 | 0.50% -Q5_K_M | 45.41 | 3.4451 | 0.40% -Q6_K | 52.70 | 3.4367 | 0.16% -fp16 | 128.5 | 3.4313 | - - +| Quantization | Model size (GiB) | Perplexity | Delta to fp16 | +|--------------|------------------|------------|---------------| +| Q4_0 | 36.20 | 3.5550 | 3.61% | +| Q4_1 | 40.20 | 3.5125 | 2.37% | +| Q5_0 | 44.20 | 3.4744 | 1.26% | +| Q2_K | 27.27 | 3.7339 | 8.82% | +| Q3_K_S | 27.86 | 3.7019 | 7.89% | +| Q3_K_M | 30.83 | 3.5932 | 4.72% | +| Q3_K_L | 33.67 | 3.5617 | 3.80% | +| Q4_K_S | 36.39 | 3.4852 | 1.57% | +| Q4_K_M | 38.54 | 3.4725 | 1.20% | +| Q5_K_S | 44.20 | 3.4483 | 0.50% | +| Q5_K_M | 45.41 | 3.4451 | 0.40% | +| Q6_K | 52.70 | 3.4367 | 0.16% | +| fp16 | 128.5 | 3.4313 | - | diff --git a/examples/quantize/README.md b/examples/quantize/README.md index c8b9a27a0..8a10365c0 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -4,17 +4,17 @@ TODO ## Llama 2 7B -Quantization | Bits per Weight (BPW) --- | -- -Q2_K | 3.35 -Q3_K_S | 3.50 -Q3_K_M | 3.91 -Q3_K_L | 4.27 -Q4_K_S | 4.58 -Q4_K_M | 4.84 -Q5_K_S | 5.52 -Q5_K_M | 5.68 -Q6_K | 6.56 +| Quantization | Bits per Weight (BPW) | +|--------------|-----------------------| +| Q2_K | 3.35 | +| Q3_K_S | 3.50 | +| Q3_K_M | 3.91 | +| Q3_K_L | 4.27 | +| Q4_K_S | 4.58 | +| Q4_K_M | 4.84 | +| Q5_K_S | 5.52 | +| Q5_K_M | 5.68 | +| Q6_K | 6.56 | ## Llama 2 13B Quantization | Bits per Weight (BPW)