From 8228b66dbc16290c5cbd70e80ab47c068e2569d8 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 10 Apr 2024 20:16:48 +0200 Subject: [PATCH 1/2] gguf : add option to not check tensor data (#6582) This commit adds an option to the gguf example to not check the tensor data. The motivation for this is that it can be nice to use the gguf tool to read other .gguf files that were not created by the gguf tool. Signed-off-by: Daniel Bevenius --- examples/gguf/gguf.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 5444503a5..575143771 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) { } // read and create ggml_context containing the tensors and their data -static bool gguf_ex_read_1(const std::string & fname) { +static bool gguf_ex_read_1(const std::string & fname, bool check_data) { struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname) { printf("\n\n"); // check data - { + if (check_data) { const float * data = (const float *) cur->data; for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { @@ -229,9 +229,16 @@ static bool gguf_ex_read_1(const std::string & fname) { int main(int argc, char ** argv) { if (argc < 3) { - printf("usage: %s data.gguf r|w\n", argv[0]); + printf("usage: %s data.gguf r|w [n]\n", argv[0]); + printf("r: read data.gguf file\n"); + printf("w: write data.gguf file\n"); + printf("n: no check of tensor data\n"); return -1; } + bool check_data = true; + if (argc == 4) { + check_data = false; + } const std::string fname(argv[1]); const std::string mode (argv[2]); @@ -242,7 +249,7 @@ int main(int argc, char ** argv) { GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file"); } else if (mode == "r") { GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file"); - GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file"); + GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file"); } return 0; From b804b1ef77351d2a11be945462c6c251710476cb Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 11 Apr 2024 14:51:07 +0200 Subject: [PATCH 2/2] eval-callback: Example how to use eval callback for debugging (#6576) * gguf-debug: Example how to use ggml callback for debugging * gguf-debug: no mutex, verify type, fix stride. * llama: cv eval: move cb eval field in common gpt_params * ggml_debug: use common gpt_params to pass cb eval. Fix get tensor SIGV random. 
* ggml_debug: ci: add tests * ggml_debug: EOL in CMakeLists.txt * ggml_debug: Remove unused param n_batch, no batching here * ggml_debug: fix trailing spaces * ggml_debug: fix trailing spaces * common: fix cb_eval and user data not initialized * ci: build revert label * ggml_debug: add main test label * doc: add a model: add a link to ggml-debug * ggml-debug: add to make toolchain * ggml-debug: tests add the main label * ggml-debug: ci add test curl label * common: allow the warmup to be disabled in llama_init_from_gpt_params * ci: add curl test * ggml-debug: better tensor type support * gitignore : ggml-debug * ggml-debug: printing also the sum of each tensor * ggml-debug: remove block size * eval-callback: renamed from ggml-debug * eval-callback: fix make toolchain --------- Co-authored-by: slaren Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 8 +- .gitignore | 1 + Makefile | 6 +- common/common.cpp | 4 +- common/common.h | 4 + docs/HOWTO-add-model.md | 2 + examples/CMakeLists.txt | 1 + examples/eval-callback/CMakeLists.txt | 9 ++ examples/eval-callback/README.md | 95 ++++++++++++ examples/eval-callback/eval-callback.cpp | 185 +++++++++++++++++++++++ examples/imatrix/imatrix.cpp | 24 ++- llama.cpp | 2 +- 12 files changed, 319 insertions(+), 22 deletions(-) create mode 100644 examples/eval-callback/CMakeLists.txt create mode 100644 examples/eval-callback/README.md create mode 100644 examples/eval-callback/eval-callback.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ff7238aba..f10ed4161 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -52,7 +52,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest -L 'main|curl' --verbose --timeout 900 - name: Determine tag name id: tag @@ -209,21 +209,21 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential + sudo apt-get install build-essential libcurl4-openssl-dev - name: Build id: cmake_build run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON cmake --build . 
--config Release -j $(nproc) - name: Test id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest -L 'main|curl' --verbose --timeout 900 - name: Test llama2c conversion id: llama2c_test diff --git a/.gitignore b/.gitignore index 9fb5b80c3..fdc5184a1 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ models-mnt /convert-llama2c-to-ggml /embd-input-test /embedding +/eval-callback /gguf /gguf-llama-simple /gguf-split diff --git a/Makefile b/Makefile index 11b31c5c8..2fd805a97 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests @@ -800,6 +800,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/common/common.cpp b/common/common.cpp index 98fc8388c..dda514785 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.pooling_type = params.pooling_type; cparams.defrag_thold = params.defrag_thold; + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); @@ -2192,7 +2194,7 @@ std::tuple llama_init_from_gpt_par params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } - { + if (params.warmup) { LOG("warming up the model with an empty run\n"); std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; diff --git a/common/common.h b/common/common.h index a7f476c1b..65272b0ba 100644 --- a/common/common.h +++ b/common/common.h @@ -80,6 +80,9 @@ struct gpt_params { int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + ggml_backend_sched_eval_callback cb_eval = nullptr; + void * cb_eval_user_data = nullptr; + ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; @@ -156,6 +159,7 @@ struct gpt_params { bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading + bool warmup = true; // 
warmup run std::string cache_type_k = "f16"; // KV cache data type for the K std::string cache_type_v = "f16"; // KV cache data type for the V diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md index 3581f3e65..a56b78344 100644 --- a/docs/HOWTO-add-model.md +++ b/docs/HOWTO-add-model.md @@ -100,6 +100,8 @@ Have a look to existing implementation like `build_llama`, `build_dbrx` or `buil When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR. +Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback). + ## GGUF specification https://github.com/ggerganov/ggml/blob/master/docs/gguf.md diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 76496bf06..f421769cc 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,6 +19,7 @@ else() add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) + add_subdirectory(eval-callback) add_subdirectory(finetune) add_subdirectory(gritlm) add_subdirectory(gguf-split) diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt new file mode 100644 index 000000000..d53f37422 --- /dev/null +++ b/examples/eval-callback/CMakeLists.txt @@ -0,0 +1,9 @@ +set(TARGET eval-callback) +add_executable(${TARGET} eval-callback.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TEST_TARGET test-eval-callback) +add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42) +set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md new file mode 100644 index 000000000..66a37e878 --- /dev/null +++ b/examples/eval-callback/README.md @@ -0,0 +1,95 @@ +# llama.cpp/examples/eval-callback + +A simple example which demonstrates how to use callback during the inference. +It simply prints to the console all operations and tensor data. + +Usage: + +```shell +eval-callback \ + --hf-repo ggml-org/models \ + --hf-file phi-2/ggml-model-q4_0.gguf \ + --model phi-2-q4_0.gguf \ + --prompt hello \ + --seed 42 \ + -ngl 33 +``` + +Will print: + +```shell +llm_load_tensors: offloaded 33/33 layers to GPU +... +llama_new_context_with_model: n_ctx = 512 +... 
+llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB +llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB +llama_new_context_with_model: graph nodes = 1225 +llama_new_context_with_model: graph splits = 2 +ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.0181, 0.0272, 0.0272, ...], + ], + ] +ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -0.6989, 1.0636, 1.0636, ...], + ], + ] +ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.1800, 0.2817, 0.2632, ...], + ], + ] +ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.1863, 0.2970, 0.2604, ...], + ], + ] +ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1} + [ + [ + [ -1.1238, 1.2876, -1.8086, ...], + ], + ] +ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + [ -0.3608, 0.5076, -1.8866, ...], + [ 1.7643, 0.0273, -2.1065, ...], + ... + ], + ] +ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + [ -0.3608, 0.5076, -1.8866, ...], + [ 1.7643, 0.0273, -2.1065, ...], + ... + ], + ] +``` diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp new file mode 100644 index 000000000..f70d62128 --- /dev/null +++ b/examples/eval-callback/eval-callback.cpp @@ -0,0 +1,185 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include +#include +#include + +/** + * This the arbitrary data which will be passed to each callback. + * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. 
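+ * For now it only holds a host-side buffer that the callback uses to copy tensor data out of device memory before printing.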
+ */ +struct callback_data { + std::vector data; +}; + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { + float sum = 0; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + printf(" [\n"); + for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) { + printf(" [\n"); + for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) { + printf(" ["); + for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) { + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i); + } else if (type == GGML_TYPE_F32) { + v = *(float *) data + i; + } else if (type == GGML_TYPE_I32) { + v = (float) *(int32_t *) data + i; + } else if (type == GGML_TYPE_I16) { + v = (float) *(int16_t *) data + i; + } else if (type == GGML_TYPE_I8) { + v = (float) *(int8_t *) data + i; + } else { + GGML_ASSERT(false); + } + printf("%8.4f", v); + sum += v; + if (i0 < ne[0] - 1 && i0 < n - 1) printf(", "); + } + if (ne[0] > n) printf(", ..."); + printf("],\n"); + } + if (ne[1] > n) printf(" ...\n"); + printf(" ],\n"); + } + if (ne[2] > n) printf(" ...\n"); + printf(" ]\n"); + printf(" sum = %f\n", sum); + } +} + +/** + * GGML operations callback during the graph execution. + * + * @param t current tensor + * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor + * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. + * see ggml_backend_sched_eval_callback + * @param user_data user data to pass at each call back + * @return true to receive data or continue the graph, false otherwise + */ +static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (callback_data *) user_data; + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + if (ask) { + return true; // Always retrieve data + } + + char src1_str[128] = {0}; + if (src1) { + sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); + } + + printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_name(t->op), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? src1_str : "", + ggml_ne_string(t).c_str()); + + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + if (!is_host) { + auto n_bytes = ggml_nbytes(t); + cb_data->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + } + + if (!ggml_is_quantized(t->type)) { + uint8_t * data = is_host ? 
(uint8_t *) t->data : cb_data->data.data(); + ggml_print_tensor(data, t->type, t->ne, t->nb, 3); + } + + return true; +} + +static bool run(llama_context * ctx, const gpt_params & params) { + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + + return true; +} + +int main(int argc, char ** argv) { + + callback_data cb_data; + + gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + print_build_info(); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(); + llama_numa_init(params.numa); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = ggml_debug; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + + // init + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); + return 1; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); + } + + bool OK = run(ctx, params); + if (!OK) { + return 1; + } + + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 1bf55f90c..ff624c539 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -597,24 +597,18 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model_params mparams = llama_model_params_from_gpt_params(params); - - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return 1; - } - - llama_context_params cparams = llama_context_params_from_gpt_params(params); - // pass the callback to the backend scheduler // it will be executed for each node during the graph computation - cparams.cb_eval = ik_collect_imatrix; - cparams.cb_eval_user_data = NULL; + params.cb_eval = ik_collect_imatrix; + params.cb_eval_user_data = NULL; + params.warmup = false; - llama_context * ctx = llama_new_context_with_model(model, cparams); - if (ctx == NULL) { - fprintf(stderr, "%s: error: unable to create context\n", __func__); + // init + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); return 1; } diff --git a/llama.cpp b/llama.cpp index 9ad9b10cb..b6e2ade91 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11121,7 +11121,7 @@ struct llm_tokenizer_bpe { add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol } - // add the fnished tokens to the final list keeping correct order for next and prev + // add the finished tokens to the final list keeping correct order for next and prev for (auto & sym : symbols) { if (sym.n > 0) { sym.prev = final_prev_index;
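
Below is a minimal, self-contained sketch of how another tool can hook into graph evaluation through the `cb_eval`, `cb_eval_user_data` and `warmup` fields that this patch adds to the common `gpt_params` (mirroring what `eval-callback.cpp` and the updated `imatrix.cpp` do above). The callback name, the `my_cb_data` struct and the `"MUL_MAT"` filter are illustrative and not part of the patch; the sketch assumes the common/llama API as it exists at the time of these commits.

```cpp
#include "common.h"
#include "llama.h"
#include "ggml.h"

#include <cstdio>
#include <cstring>
#include <tuple>
#include <vector>

// Illustrative user data: only print ops whose name matches this filter (NULL = print all).
struct my_cb_data {
    const char * op_filter;
};

// ggml_backend_sched_eval_callback: called once per node with ask=true
// (return true to request the computed data) and once more with ask=false
// after the node has been evaluated.
static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb = (my_cb_data *) user_data;
    if (ask) {
        return true; // yes, we want the follow-up call for this node
    }
    if (cb->op_filter == NULL || strcmp(ggml_op_name(t->op), cb->op_filter) == 0) {
        printf("%s: %s (%s)\n", ggml_op_name(t->op), t->name, ggml_type_name(t->type));
    }
    return true; // returning false aborts the graph computation
}

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    my_cb_data cb_data = { "MUL_MAT" };

    // the fields added to the common gpt_params by this patch
    params.cb_eval           = my_eval_cb;
    params.cb_eval_user_data = &cb_data;
    params.warmup            = false; // skip the warmup run so the callback only sees the real prompt

    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model   * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "failed to init model/context\n");
        return 1;
    }

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
        fprintf(stderr, "failed to eval\n");
        return 1;
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

Run with a model and a short prompt (for example `-m model.gguf -p hello`); the scheduler then invokes the callback for every node of the compute graph, and only nodes whose op matches the filter are printed.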