From 99b71c068f624521ad977e08e41589e2971fa1c7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 13 Mar 2024 11:39:11 +0100 Subject: [PATCH 01/56] Server: Use multi-task for embeddings endpoint (#6001) * use multitask for embd endpoint * specify types * remove redundant {"n_predict", 0} --- examples/server/server.cpp | 76 ++++++++++++++------------------------ examples/server/utils.hpp | 12 +++++- 2 files changed, 38 insertions(+), 50 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b63a6f243..3172d96dd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2763,6 +2763,7 @@ int main(int argc, char ** argv) { res.set_header("Access-Control-Allow-Credentials", "true"); res.set_header("Access-Control-Allow-Methods", "POST"); res.set_header("Access-Control-Allow-Headers", "*"); + return res.set_content("", "application/json; charset=utf-8"); }); svr->set_logger(log_server_request); @@ -3371,44 +3372,37 @@ int main(int argc, char ** argv) { const json body = json::parse(req.body); bool is_openai = false; - // an input prompt can string or a list of tokens (integer) - std::vector prompts; + // an input prompt can be a string or a list of tokens (integer) + json prompt; if (body.count("input") != 0) { is_openai = true; - if (body["input"].is_array()) { - // support multiple prompts - for (const json & elem : body["input"]) { - prompts.push_back(elem); - } - } else { - // single input prompt - prompts.push_back(body["input"]); - } + prompt = body["input"]; } else if (body.count("content") != 0) { - // only support single prompt here - std::string content = body["content"]; - prompts.push_back(content); + // with "content", we only support single prompt + prompt = std::vector{body["content"]}; } else { res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); return; } - // process all prompts - json responses = json::array(); - for (auto & prompt : 
prompts) { - // TODO @ngxson : maybe support multitask for this endpoint? - // create and queue the task + // create and queue the task + json responses; + { const int id_task = ctx_server.queue_tasks.get_new_id(); - ctx_server.queue_results.add_waiting_task_id(id_task); - ctx_server.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0}}, false, true); + ctx_server.request_completion(id_task, -1, {{"prompt", prompt}}, false, true); // get the result server_task_result result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (!result.error) { - // append to the responses - responses.push_back(result.data); + if (result.data.count("results")) { + // result for multi-task + responses = result.data["results"]; + } else { + // result for single task + responses = std::vector{result.data}; + } } else { // error received, ignore everything else res_error(res, result.data); @@ -3417,24 +3411,19 @@ int main(int argc, char ** argv) { } // write JSON response - json root; - if (is_openai) { - json res_oai = json::array(); - int i = 0; - for (auto & elem : responses) { - res_oai.push_back(json{ - {"embedding", json_value(elem, "embedding", json::array())}, - {"index", i++}, - {"object", "embedding"} - }); - } - root = format_embeddings_response_oaicompat(body, res_oai); - } else { - root = responses[0]; - } + json root = is_openai + ? 
format_embeddings_response_oaicompat(body, responses) + : responses[0]; return res.set_content(root.dump(), "application/json; charset=utf-8"); }; + auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) { + return [content, len, mime_type](const httplib::Request &, httplib::Response & res) { + res.set_content(reinterpret_cast(content), len, mime_type); + return false; + }; + }; + // // Router // @@ -3446,17 +3435,6 @@ int main(int argc, char ** argv) { } // using embedded static files - auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) { - return [content, len, mime_type](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(content), len, mime_type); - return false; - }; - }; - - svr->Options(R"(/.*)", [](const httplib::Request &, httplib::Response & res) { - // TODO @ngxson : I have no idea what it is... maybe this is redundant? - return res.set_content("", "application/json; charset=utf-8"); - }); svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 48aeef4eb..2ddb2cd21 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -529,6 +529,16 @@ static std::vector format_partial_response_oaicompat(json result, const st } static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { + json data = json::array(); + int i = 0; + for (auto & elem : embeddings) { + data.push_back(json{ + {"embedding", json_value(elem, "embedding", json::array())}, + {"index", i++}, + {"object", "embedding"} + }); + } + json res = json { {"model", json_value(request, "model", 
std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", "list"}, @@ -536,7 +546,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso {"prompt_tokens", 0}, {"total_tokens", 0} }}, - {"data", embeddings} + {"data", data} }; return res; From b3d978600f07f22e94f2e797f18a8b5f6df23c89 Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:17:54 +0000 Subject: [PATCH 02/56] Update get version (#6025) --- ggml-sycl.cpp | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cfb09934d..c2ab13034 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -202,24 +202,29 @@ namespace dpct // Version string has the following format: // a. OpenCL // b. + // c. e.g gfx1030 std::string ver; ver = dev.get_info(); std::string::size_type i = 0; - while (i < ver.size()) - { - if (isdigit(ver[i])) - break; - i++; + while (i < ver.size()) { + if (isdigit(ver[i])) + break; + i++; } major = std::stoi(&(ver[i])); - while (i < ver.size()) - { - if (ver[i] == '.') - break; - i++; + while (i < ver.size()) { + if (ver[i] == '.') + break; + i++; + } + if (i < ver.size()) { + // a. and b. + i++; + minor = std::stoi(&(ver[i])); + } else { + // c. 
+ minor = 0; } - i++; - minor = std::stoi(&(ver[i])); } template From d8fd0ccf6ac8b07791ffd1575eed436930854ae3 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 13 Mar 2024 14:58:30 +0100 Subject: [PATCH 03/56] test-backend-ops : skip CPU backend by default (#6028) --- tests/test-backend-ops.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index fc5edcc4b..c2916c3e4 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2222,8 +2222,8 @@ static void usage(char ** argv) { int main(int argc, char ** argv) { test_mode mode = MODE_TEST; - const char * op_name = NULL; - const char * backend = NULL; + const char * op_name_filter = NULL; + const char * backend_filter = NULL; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "test") == 0) { @@ -2232,14 +2232,14 @@ int main(int argc, char ** argv) { mode = MODE_PERF; } else if (strcmp(argv[i], "-o") == 0) { if (i + 1 < argc) { - op_name = argv[++i]; + op_name_filter = argv[++i]; } else { usage(argv); return 1; } } else if (strcmp(argv[i], "-b") == 0) { if (i + 1 < argc) { - backend = argv[++i]; + backend_filter = argv[++i]; } else { usage(argv); return 1; @@ -2258,7 +2258,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) { printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i)); - if (backend != NULL && strcmp(backend, ggml_backend_reg_get_name(i)) != 0) { + if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) { printf(" Skipping\n"); n_ok++; continue; @@ -2266,9 +2266,17 @@ int main(int argc, char ** argv) { ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL); GGML_ASSERT(backend != NULL); + + if (backend_filter == NULL && ggml_backend_is_cpu(backend)) { + printf(" Skipping CPU backend\n"); + ggml_backend_free(backend); + n_ok++; + continue; + } + printf(" Backend 
name: %s\n", ggml_backend_name(backend)); - bool ok = test_backend(backend, mode, op_name); + bool ok = test_backend(backend, mode, op_name_filter); printf(" Backend %s: ", ggml_backend_name(backend)); if (ok) { From f30ea47a87ed4446ad55adb265755dc9102956a2 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 13 Mar 2024 18:54:21 +0100 Subject: [PATCH 04/56] llama : add pipeline parallelism support (#6017) * llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs ggml-ci * server : add -ub, --ubatch-size parameter * fix server embedding test * llama : fix Mamba inference for pipeline parallelism Tested to work correctly with both `main` and `parallel` examples. * llama : limit max batch size to n_batch * add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism default increase to 4 (from 2) changing this value may improve performance for some systems, but increases memory usage * fix hip build * fix sycl build (disable cpy_tensor_async) * fix hip build * llama : limit n_batch and n_ubatch to n_ctx during context creation * llama : fix norm backend * batched-bench : sync after decode * swiftui : sync after decode * ggml : allow ggml_get_rows to use multiple threads if they are available * check n_ubatch >= n_tokens with non-causal attention * llama : do not limit n_batch to n_ctx with non-causal attn * server : construct batch with size of llama_n_batch * ggml_backend_cpu_graph_compute : fix return value when alloc fails * llama : better n_batch and n_ubatch comment * fix merge * small fix * reduce default n_batch to 2048 --------- Co-authored-by: Francis Couture-Harpin Co-authored-by: Georgi Gerganov --- CMakeLists.txt | 3 + Makefile | 4 + common/common.cpp | 14 +- common/common.h | 3 +- examples/batched-bench/batched-bench.cpp | 2 + examples/embedding/embedding.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 53 +- .../llama.cpp.swift/LibLlama.swift | 2 + examples/perplexity/perplexity.cpp | 3 +- 
examples/server/server.cpp | 32 +- .../server/tests/features/embeddings.feature | 1 + examples/server/tests/features/steps/steps.py | 8 + ggml-alloc.c | 109 +- ggml-alloc.h | 18 +- ggml-backend-impl.h | 17 +- ggml-backend.c | 517 +++++-- ggml-backend.h | 58 +- ggml-cuda.cu | 175 ++- ggml-kompute.cpp | 5 + ggml-metal.m | 5 + ggml-sycl.cpp | 7 +- ggml-vulkan.cpp | 5 + ggml.c | 113 +- llama.cpp | 1189 +++++++++-------- llama.h | 9 +- 25 files changed, 1467 insertions(+), 887 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ab13cbd5..a8abf4088 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,6 +118,7 @@ option(LLAMA_SYCL "llama: use SYCL" option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF) +set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism") option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -147,6 +148,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) include(CheckCXXCompilerFlag) +add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES}) + # enable libstdc++ assertions for debug builds if (CMAKE_SYSTEM_NAME MATCHES "Linux") add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) diff --git a/Makefile b/Makefile index c8fd3f5c5..db9968efb 100644 --- a/Makefile +++ b/Makefile @@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD) MK_CPPFLAGS += -D_BSD_SOURCE endif +ifdef LLAMA_SCHED_MAX_COPIES + MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES) +endif + ifdef LLAMA_DEBUG MK_CFLAGS += -O0 -g MK_CXXFLAGS += -O0 -g diff --git a/common/common.cpp b/common/common.cpp index 2f38ac632..73b1b61ba 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -483,6 +483,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params 
& params) { break; } params.n_batch = std::stoi(argv[i]); + } else if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ubatch = std::stoi(argv[i]); } else if (arg == "--keep") { if (++i >= argc) { invalid_param = true; @@ -977,7 +983,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" binary file containing multiple choice tasks.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); + printf(" -ub N, --ubatch-size N\n"); + printf(" physical maximum batch size (default: %d)\n", params.n_ubatch); printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); printf(" (default: %s)\n", sampler_type_names.c_str()); printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); @@ -1287,8 +1295,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param auto cparams = llama_context_default_params(); cparams.n_ctx = params.n_ctx; - cparams.n_batch = params.n_batch; cparams.n_seq_max = params.n_parallel; + cparams.n_batch = params.n_batch; + cparams.n_ubatch = params.n_ubatch; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; cparams.seed = params.seed; @@ -1379,6 +1388,7 @@ std::tuple llama_init_from_gpt_par std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); + llama_synchronize(lctx); llama_reset_timings(lctx); } diff --git a/common/common.h b/common/common.h index f8d82b871..0f178b9eb 100644 --- a/common/common.h +++ b/common/common.h @@ -51,7 +51,8 @@ struct gpt_params { int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_draft = 5; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 22bc93bca..19674dfd3 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -138,6 +138,8 @@ int main(int argc, char ** argv) { LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } + + llama_synchronize(ctx); } return true; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index a553ae1c3..49302a199 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -107,7 +107,7 @@ int main(int argc, char ** argv) { // max batch size const uint64_t n_batch = params.n_batch; - GGML_ASSERT(params.n_batch == params.n_ctx); + GGML_ASSERT(params.n_batch >= params.n_ctx); // 
tokenize the prompts and trim std::vector> inputs; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2ff86ef6f..bf94e7e7a 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -164,6 +164,7 @@ struct cmd_params { std::vector n_prompt; std::vector n_gen; std::vector n_batch; + std::vector n_ubatch; std::vector type_k; std::vector type_v; std::vector n_threads; @@ -183,7 +184,8 @@ static const cmd_params cmd_params_defaults = { /* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* n_prompt */ {512}, /* n_gen */ {128}, - /* n_batch */ {512}, + /* n_batch */ {2048}, + /* n_ubatch */ {512}, /* type_k */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, @@ -208,6 +210,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub N, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); @@ -217,7 +220,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); - printf(" -ts, 
--tensor_split (default: 0)\n"); + printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); @@ -297,6 +300,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); + } else if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); } else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; @@ -455,6 +465,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } + if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } @@ -474,6 +485,7 @@ struct cmd_params_instance { int n_prompt; int n_gen; int n_batch; + int n_ubatch; ggml_type type_k; ggml_type type_v; int n_threads; @@ -511,6 +523,7 @@ struct cmd_params_instance { cparams.n_ctx = n_prompt + n_gen; cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; cparams.type_k = type_k; cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; @@ -532,6 +545,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & mmp : params.use_mmap) for (const auto & 
embd : params.embeddings) for (const auto & nb : params.n_batch) + for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) for (const auto & nkvo : params.no_kv_offload) @@ -545,6 +559,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ n_prompt, /* .n_gen = */ 0, /* .n_batch = */ nb, + /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, @@ -568,6 +583,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ 0, /* .n_gen = */ n_gen, /* .n_batch = */ nb, + /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, @@ -604,6 +620,7 @@ struct test { uint64_t model_size; uint64_t model_n_params; int n_batch; + int n_ubatch; int n_threads; ggml_type type_k; ggml_type type_v; @@ -627,6 +644,7 @@ struct test { model_size = llama_model_size(lmodel); model_n_params = llama_model_n_params(lmodel); n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; n_threads = inst.n_threads; type_k = inst.type_k; type_v = inst.type_v; @@ -705,7 +723,8 @@ struct test { "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_threads", "type_k", "type_v", + "n_batch", "n_ubatch", + "n_threads", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "tensor_split", "use_mmap", "embeddings", @@ -719,7 +738,8 @@ struct test { enum field_type {STRING, BOOL, INT, FLOAT}; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_threads" || + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || + field == "n_threads" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || @@ -759,7 +779,8 @@ 
struct test { std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), + std::to_string(n_batch), std::to_string(n_ubatch), + std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), @@ -957,6 +978,9 @@ struct markdown_printer : public printer { if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { fields.emplace_back("n_batch"); } + if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) { + fields.emplace_back("n_ubatch"); + } if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { fields.emplace_back("type_k"); } @@ -1096,25 +1120,32 @@ struct sql_printer : public printer { }; static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { + llama_set_n_threads(ctx, n_threads, n_threads); + + //std::vector tokens(n_prompt, llama_token_bos(llama_get_model(ctx))); + //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0)); + //GGML_UNUSED(n_batch); + std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); int n_processed = 0; - llama_set_n_threads(ctx, n_threads, n_threads); - while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); n_processed += n_tokens; } + + llama_synchronize(ctx); } static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { - llama_token token = llama_token_bos(llama_get_model(ctx)); - llama_set_n_threads(ctx, 
n_threads, n_threads); + llama_token token = llama_token_bos(llama_get_model(ctx)); + for (int i = 0; i < n_gen; i++) { llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); + llama_synchronize(ctx); } } @@ -1203,7 +1234,8 @@ int main(int argc, char ** argv) { // warmup run if (t.n_prompt > 0) { - test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads); + //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); + test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); } if (t.n_gen > 0) { test_gen(ctx, 1, 0, t.n_threads); @@ -1219,6 +1251,7 @@ int main(int argc, char ** argv) { if (t.n_gen > 0) { test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); } + uint64_t t_ns = get_time_ns() - t_start; t.samples_ns.push_back(t_ns); } diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 58fcf40c6..c249291ae 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -221,6 +221,7 @@ actor LlamaContext { if llama_decode(context, batch) != 0 { print("llama_decode() failed during prompt") } + llama_synchronize(context) let t_pp_end = ggml_time_us() @@ -240,6 +241,7 @@ actor LlamaContext { if llama_decode(context, batch) != 0 { print("llama_decode() failed during text generation") } + llama_synchronize(context) } let t_tg_end = ggml_time_us() diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index fdfc8f5dc..d766aef6a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -589,9 +589,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } } - const auto t_end = std::chrono::high_resolution_clock::now(); if (i == 0) { + llama_synchronize(ctx); + const auto t_end = std::chrono::high_resolution_clock::now(); const float t_total = std::chrono::duration(t_end - t_start).count(); 
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total*n_chunk/n_seq); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3172d96dd..895d608fd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -147,7 +147,7 @@ struct server_slot { int32_t n_decoded = 0; int32_t n_remaining = -1; int32_t i_batch = -1; - int32_t n_predict = -1; + int32_t n_predict = -1; // TODO: disambiguate from params.n_predict int32_t n_prompt_tokens = 0; int32_t n_prompt_tokens_processed = 0; @@ -739,7 +739,13 @@ struct server_context { default_generation_settings_for_props = get_formated_generation(slots.front()); default_generation_settings_for_props["seed"] = -1; - batch = llama_batch_init(n_ctx, 0, params.n_parallel); + // the update_slots() logic will always submit a maximum of n_batch tokens + // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) + { + const int32_t n_batch = llama_n_batch(ctx); + + batch = llama_batch_init(n_batch, 0, params.n_parallel); + } metrics.init(); } @@ -1036,8 +1042,10 @@ struct server_context { llama_batch_add(batch, system_tokens[i], i, { 0 }, false); } - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) { - const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i)); + const int32_t n_batch = llama_n_batch(ctx); + + for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i); llama_batch batch_view = { n_tokens, batch.token + i, @@ -1226,7 +1234,7 @@ struct server_context { {"mirostat_eta", slot.sparams.mirostat_eta}, {"penalize_nl", slot.sparams.penalize_nl}, {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, + {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict {"n_keep", params.n_keep}, {"ignore_eos", ignore_eos}, {"stream", 
slot.params.stream}, @@ -1738,7 +1746,8 @@ struct server_context { } // process in chunks of params.n_batch - int32_t n_batch = params.n_batch; + int32_t n_batch = llama_n_batch(ctx); + int32_t n_ubatch = llama_n_ubatch(ctx); // next, batch any pending prompts without exceeding n_batch if (params.cont_batching || batch.n_tokens == 0) { @@ -1811,7 +1820,7 @@ struct server_context { if (slot.embedding) { // this prompt is too large to process - discard it - if (slot.n_prompt_tokens > n_batch) { + if (slot.n_prompt_tokens > n_ubatch) { slot.state = SLOT_STATE_PROCESSING; slot.command = SLOT_COMMAND_NONE; slot.release(); @@ -2157,7 +2166,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n"); printf(" -dt N, --defrag-thold N\n"); printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); + printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_supports_mlock()) { @@ -2424,6 +2434,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } params.n_batch = std::stoi(argv[i]); + } else if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ubatch = std::stoi(argv[i]); } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; diff --git a/examples/server/tests/features/embeddings.feature 
b/examples/server/tests/features/embeddings.feature index b47661e94..57359b267 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -9,6 +9,7 @@ Feature: llama.cpp server And 42 as server seed And 2 slots And 1024 as batch size + And 1024 as ubatch size And 2048 KV cache size And embeddings extraction Then the server is starting diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 98c2b6174..cfa9f96ec 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -33,6 +33,7 @@ def step_server_config(context, server_fqdn, server_port): context.model_alias = None context.n_batch = None + context.n_ubatch = None context.n_ctx = None context.n_ga = None context.n_ga_w = None @@ -278,6 +279,11 @@ def step_n_batch(context, n_batch): context.n_batch = n_batch +@step('{n_ubatch:d} as ubatch size') +def step_n_ubatch(context, n_ubatch): + context.n_ubatch = n_ubatch + + @step('{seed:d} as seed') def step_seed(context, seed): context.seed = seed @@ -1029,6 +1035,8 @@ def start_server_background(context): ] if context.n_batch: server_args.extend(['--batch-size', context.n_batch]) + if context.n_ubatch: + server_args.extend(['--ubatch-size', context.n_ubatch]) if context.n_gpu_layer: server_args.extend(['--n-gpu-layers', context.n_gpu_layer]) if context.server_continuous_batching: diff --git a/ggml-alloc.c b/ggml-alloc.c index e675306c8..8ac1d3e51 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) { } } -// TODO: GGML_PAD ? 
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { assert(alignment && !(alignment & (alignment - 1))); // power of 2 size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; @@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen } // tallocr -struct ggml_tallocr { - ggml_backend_buffer_t buffer; - void * base; - size_t alignment; - size_t offset; -}; - -ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) { - ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr)); - if (talloc == NULL) { - return NULL; - } +struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) { void * base = ggml_backend_buffer_get_base(buffer); size_t align = ggml_backend_buffer_get_alignment(buffer); assert(align && !(align & (align - 1))); // power of 2 - *talloc = (struct ggml_tallocr) { + struct ggml_tallocr talloc = (struct ggml_tallocr) { /*.buffer = */ buffer, /*.base = */ base, /*.alignment = */ align, @@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) { return talloc; } -void ggml_tallocr_free(ggml_tallocr_t talloc) { - free(talloc); -} - -void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) { +void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) { size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); size = GGML_PAD(size, talloc->alignment); @@ -354,12 +338,16 @@ struct hash_node { bool allocated; }; -// struct tensor_alloc { size_t offset; size_t size_max; // 0 = pre-allocated, unused, or view }; +struct leaf_alloc { + int buffer_id; + struct tensor_alloc leaf; +}; + struct node_alloc { int buffer_id; struct tensor_alloc dst; @@ -378,7 +366,7 @@ struct ggml_gallocr { struct node_alloc * node_allocs; // [n_nodes] int n_nodes; - struct tensor_alloc * leaf_allocs; // [n_leafs] + struct leaf_alloc * leaf_allocs; // [n_leafs] int n_leafs; }; @@ -543,13 
+531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) { return node_buffer_ids ? node_buffer_ids[i] : 0; } -static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { +static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { // clear hash tables memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); + // allocate leafs + // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes + for (int i = 0; i < graph->n_leafs; i++) { + struct ggml_tensor * leaf = graph->leafs[i]; + ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i)); + } + // count number of children and views - // allocate all graph inputs and leafs first to avoid overwriting them + // allocate other graph inputs and leafs first to avoid overwriting them for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } - // allocate the remaining leafs that are unused on the graph - // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes - for (int i = 0; i < graph->n_leafs; i++) { - struct ggml_tensor * leaf = graph->leafs[i]; - struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); - - if (hn->n_children == 0) { - assert(!hn->allocated); - // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer - ggml_gallocr_allocate_node(galloc, leaf, 0); - } - } - // allocate tensors for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -652,7 
+634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } -bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { size_t hash_size = graph->visited_hash_table.size; // initialize hash table @@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } // allocate in hash table - ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids); + ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids); // set the node_allocs from the hash table if (galloc->n_nodes < graph->n_nodes) { @@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } if (galloc->n_leafs < graph->n_leafs) { free(galloc->leaf_allocs); - galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs); + galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs); GGML_ASSERT(galloc->leaf_allocs != NULL); } galloc->n_leafs = graph->n_leafs; for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); - galloc->leaf_allocs[i].offset = hn->offset; - galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); + galloc->leaf_allocs[i].buffer_id = hn->buffer_id; + galloc->leaf_allocs[i].leaf.offset = hn->offset; + galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); } // reallocate buffers if needed @@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c size_t cur_size = galloc->buffers[i] ? 
ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); - if (new_size > cur_size) { + // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views + if (new_size > cur_size || galloc->buffers[i] == NULL) { #ifndef NDEBUG fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif @@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { - return ggml_gallocr_reserve_n(galloc, graph, NULL); + return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); } -static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) { - assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max); +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) { + assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); - if (node->view_src != NULL) { - if (node->buffer == NULL) { + if (tensor->view_src != NULL) { + if (tensor->buffer == NULL) { assert(tensor_alloc->offset == SIZE_MAX); - if (node->view_src->buffer == NULL) { + if (tensor->view_src->buffer == NULL) { // this tensor was allocated without ggml-backend return; } - ggml_backend_view_init(galloc->buffers[buffer_id], node); + ggml_backend_view_init(galloc->buffers[buffer_id], tensor); } } else { - if (node->data == NULL) { + if (tensor->data == NULL) { assert(tensor_alloc->offset != SIZE_MAX); - 
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max); + assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]); void * addr = (char *)base + tensor_alloc->offset; - ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr); + ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr); } else { - if (node->buffer == NULL) { + if (tensor->buffer == NULL) { // this tensor was allocated without ggml-backend return; } @@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) // reset buffers for (int i = 0; i < galloc->n_buffers; i++) { - // zero size buffers are not allocated if (galloc->buffers[i] != NULL) { ggml_backend_buffer_reset(galloc->buffers[i]); } } // allocate the graph tensors from the previous assignments + // leafs + for (int i = 0; i < graph->n_leafs; i++) { + struct ggml_tensor * leaf = graph->leafs[i]; + struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i]; + ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf); + } // nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) } ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); } - // leafs - for (int i = 0; i < graph->n_leafs; i++) { - struct ggml_tensor * leaf = graph->leafs[i]; - struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i]; - ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc); - } return true; } @@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return false; } - struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer); + struct ggml_tallocr tallocr = ggml_tallocr_new(buffer); for (struct ggml_tensor * t = first; t != last; t = 
ggml_get_next_tensor(ctx, t)) { if (t->data == NULL) { if (t->view_src == NULL) { - ggml_tallocr_alloc(tallocr, t); + ggml_tallocr_alloc(&tallocr, t); } else if (t->buffer == NULL) { ggml_backend_view_init(buffer, t); } @@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx, } } - ggml_tallocr_free(tallocr); - *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1)); (*buffers)[(*n_buffers)++] = buffer; diff --git a/ggml-alloc.h b/ggml-alloc.h index 1d9085d15..434c13b34 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t; typedef struct ggml_backend * ggml_backend_t; // Tensor allocator -typedef struct ggml_tallocr * ggml_tallocr_t; +struct ggml_tallocr { + ggml_backend_buffer_t buffer; + void * base; + size_t alignment; + size_t offset; +}; -GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer); -GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc); -GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor); +GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer); +GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); // Graph allocator /* @@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed // returns false if the buffer allocation failed GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); -GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids); +GGML_API bool ggml_gallocr_reserve_n( + ggml_gallocr_t galloc, + struct ggml_cgraph * graph, + const int * node_buffer_ids, + const int * leaf_buffer_ids); // automatic reallocation if the topology changes when using a single buffer // returns false if using 
multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index 2e9ba58a9..e475e20e5 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -86,12 +86,12 @@ extern "C" { // (optional) asynchronous tensor data access void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); + bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); // (optional) complete all pending operations void (*GGML_CALL synchronize)(ggml_backend_t backend); - // create a plan for ggml_cgraph and free it + // compute graph with a plan (not used currently) ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); @@ -102,16 +102,27 @@ extern "C" { // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + + // (optional) event synchronization + ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); + void (*GGML_CALL event_free) (ggml_backend_event_t event); + void (*GGML_CALL event_record) (ggml_backend_event_t event); + void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); }; struct ggml_backend { ggml_guid_t guid; struct ggml_backend_i iface; - ggml_backend_context_t context; }; + struct ggml_backend_event { + ggml_backend_t 
backend; + void * context; + }; + // // Backend registry // diff --git a/ggml-backend.c b/ggml-backend.c index d60d98414..31f8d5a6d 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -221,29 +221,29 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(buf != NULL && "tensor buffer not set"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (!size) { return; } - tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); + buf->iface.set_tensor(buf, tensor, data, offset, size); } GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; + GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); if (!size) { return; } - tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); + buf->iface.get_tensor(buf, tensor, data, offset, size); } void ggml_backend_synchronize(ggml_backend_t backend) { @@ -255,18 +255,30 @@ void ggml_backend_synchronize(ggml_backend_t backend) { } ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + GGML_ASSERT(backend->iface.graph_plan_create != NULL); + return backend->iface.graph_plan_create(backend, cgraph); } void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + GGML_ASSERT(backend->iface.graph_plan_free != NULL); + backend->iface.graph_plan_free(backend, plan); } enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + GGML_ASSERT(backend->iface.graph_plan_compute != NULL); + return backend->iface.graph_plan_compute(backend, plan); } enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); + ggml_backend_synchronize(backend); + return err; +} + +bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_compute(backend, cgraph); } @@ -314,34 +326,68 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst } } -void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { +void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) { 
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); if (src == dst) { return; } - if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) { - if (backend->iface.cpy_tensor_async != NULL) { - if (backend->iface.cpy_tensor_async(backend, src, dst)) { - return; - } + if (backend_dst->iface.cpy_tensor_async != NULL) { + if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) { + return; } } - size_t nbytes = ggml_nbytes(src); + // an async copy would normally happen after all the queued operations on both backends are completed + // sync src, set_async dst if (ggml_backend_buffer_is_host(src->buffer)) { - ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes); - } - else { + ggml_backend_synchronize(backend_src); + ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src)); + } else { + ggml_backend_synchronize(backend_src); ggml_backend_tensor_copy(src, dst); + ggml_backend_synchronize(backend_dst); } } +// events + +ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) { + if (backend->iface.event_new == NULL) { + return NULL; + } + return backend->iface.event_new(backend); +} + +void ggml_backend_event_free(ggml_backend_event_t event) { + if (event == NULL) { + return; + } + event->backend->iface.event_free(event); +} + +void ggml_backend_event_record(ggml_backend_event_t event) { + GGML_ASSERT(event->backend->iface.event_record != NULL); + + event->backend->iface.event_record(event); +} + +void ggml_backend_event_synchronize(ggml_backend_event_t event) { + GGML_ASSERT(event->backend->iface.event_synchronize != NULL); + + event->backend->iface.event_synchronize(event); +} + +void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + GGML_ASSERT(backend->iface.event_wait != NULL); + + backend->iface.event_wait(backend, event); +} // backend registry -#define 
GGML_MAX_BACKENDS_REG 16 +#define GGML_REG_MAX_BACKENDS 16 struct ggml_backend_reg { char name[128]; @@ -350,7 +396,7 @@ struct ggml_backend_reg { void * user_data; }; -static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG]; +static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS]; static size_t ggml_backend_registry_count = 0; GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); @@ -395,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) { } GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { - GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG); + GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS); size_t id = ggml_backend_registry_count; @@ -746,8 +792,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); if (cpu_ctx->work_size < cplan.work_size) { - // TODO: may be faster to free and use malloc to avoid the copy - cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); + free(cpu_ctx->work_data); + cpu_ctx->work_data = malloc(cplan.work_size); + if (cpu_ctx->work_data == NULL) { + cpu_ctx->work_size = 0; + return GGML_STATUS_ALLOC_FAILED; + } cpu_ctx->work_size = cplan.work_size; } cplan.work_data = cpu_ctx->work_data; @@ -784,6 +834,11 @@ static struct ggml_backend_i cpu_backend_i = { /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_cpu_guid(void) { @@ -939,15 +994,27 @@ static bool ggml_is_view_op(enum ggml_op op) { // 
scheduler -#define GGML_MAX_BACKENDS 16 -#define GGML_MAX_SPLITS 256 -#define GGML_MAX_SPLIT_INPUTS 16 +#ifndef GGML_SCHED_MAX_BACKENDS +#define GGML_SCHED_MAX_BACKENDS 16 +#endif + +#ifndef GGML_SCHED_MAX_SPLITS +#define GGML_SCHED_MAX_SPLITS 256 +#endif + +#ifndef GGML_SCHED_MAX_SPLIT_INPUTS +#define GGML_SCHED_MAX_SPLIT_INPUTS 16 +#endif + +#ifndef GGML_SCHED_MAX_COPIES +#define GGML_SCHED_MAX_COPIES 4 +#endif struct ggml_backend_sched_split { int backend_id; int i_start; int i_end; - struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; + struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS]; int n_inputs; // graph view of this split struct ggml_cgraph graph; @@ -955,45 +1022,53 @@ struct ggml_backend_sched_split { struct ggml_backend_sched { bool is_reset; // true if the scheduler has been reset since the last graph split + bool is_alloc; int n_backends; - ggml_backend_t backends[GGML_MAX_BACKENDS]; - ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS]; + ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS]; + ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS]; ggml_gallocr_t galloc; // hash keys of the nodes in the graph struct ggml_hash_set hash_set; // hash values int * tensor_backend_id; - struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS]; + struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]; - int * node_backend_ids; // [n_nodes] - int n_nodes; + int * node_backend_ids; // [graph_size] + int * leaf_backend_ids; // [graph_size] // copy of the graph with modified inputs struct ggml_cgraph * graph; - struct ggml_backend_sched_split splits[GGML_MAX_SPLITS]; + // graph splits + struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS]; int n_splits; + // pipeline parallelism support + int n_copies; + int cur_copy; + ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]; + struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS]; + int n_graph_inputs; + struct ggml_context * ctx; 
ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; // align context_buffer to GGML_MEM_ALIGN - #ifdef _MSC_VER +#ifdef _MSC_VER __declspec(align(GGML_MEM_ALIGN)) - #else +#else __attribute__((aligned(GGML_MEM_ALIGN))) - #endif - char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; +#endif + char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; }; -#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node) -#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)] -#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)]) +#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor) +#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)] // returns the priority of the backend, lower id is higher priority static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { @@ -1005,7 +1080,8 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen return -1; } -static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { +static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) { + ggml_backend_buffer_t buffer = tensor->buffer; if (buffer == NULL) { return -1; } @@ -1016,12 +1092,16 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg return i; } } - GGML_ASSERT(false && "tensor buffer type not supported by any backend"); - return -1; // silence warning + + fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n", + __func__, ggml_backend_buffer_name(buffer), tensor->name); + GGML_ASSERT(false); + + return -1; } #if 0 -static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + 
GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only +static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else @@ -1035,19 +1115,28 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // assign pre-allocated nodes to their backend // dst - int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer); + int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor); if (cur_backend != -1) { - SET_CAUSE(node, "1.dst"); + SET_CAUSE(tensor, "1.dst"); return cur_backend; } + // view_src if (tensor->view_src != NULL) { - cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer); + cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); if (cur_backend != -1) { - SET_CAUSE(node, "1.vsrc"); + SET_CAUSE(tensor, "1.vsrc"); return cur_backend; } } + + // input + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + cur_backend = sched->n_backends - 1; // last backend (assumed CPU) + SET_CAUSE(tensor, "1.inp"); + return cur_backend; + } + // assign nodes that use weights to the backend of the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; @@ -1055,9 +1144,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer); + int src_backend = ggml_backend_sched_backend_from_buffer(sched, src); // operations with weights are always run on the same backend as the weights - SET_CAUSE(node, "1.wgt%d", i); + SET_CAUSE(tensor, "1.wgt%d", i); return src_backend; } } @@ -1093,7 +1182,7 @@ static void 
ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str if (ggml_is_view_op(node->op)) { continue; } - ggml_backend_t tensor_backend = tensor_backend(node); + ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -1101,7 +1190,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str if (src == NULL) { continue; } - ggml_backend_t src_backend = tensor_backend(src); + ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } @@ -1118,6 +1207,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { // reset splits sched->n_splits = 0; + sched->n_graph_inputs = 0; sched->is_reset = false; struct ggml_init_params params = { @@ -1163,7 +1253,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } #ifdef DEBUG_PASS1 - fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); #endif // pass 2: expand current backend assignments @@ -1171,28 +1261,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // expand gpu backends (i.e. 
non last prio) up and down, ignoring cpu (the lowest priority backend) // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops - // pass 2.1 expand gpu up - { - int cur_backend_id = -1; - for (int i = graph->n_nodes - 1; i >= 0; i--) { - struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view_op(node->op)) { - continue; - } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - if (tensor_backend_id == sched->n_backends - 1) { - // skip cpu (lowest prio backend) - cur_backend_id = -1; - } else { - cur_backend_id = tensor_backend_id; - } - } else { - tensor_backend_id(node) = cur_backend_id; - SET_CAUSE(node, "2.1"); - } - } - } // pass 2.2 expand gpu down { @@ -1217,7 +1285,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - // pass 2.3 expand rest up + // pass 2.1 expand gpu up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1227,14 +1295,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } int tensor_backend_id = tensor_backend_id(node); if (tensor_backend_id != -1) { - cur_backend_id = tensor_backend_id; + if (tensor_backend_id == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_backend_id = -1; + } else { + cur_backend_id = tensor_backend_id; + } } else { tensor_backend_id(node) = cur_backend_id; - SET_CAUSE(node, "2.3"); + SET_CAUSE(node, "2.1"); } } } + // pass 2.4 expand rest down { int cur_backend_id = -1; @@ -1252,8 +1326,26 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } + // pass 2.3 expand rest up + { + int cur_backend_id = -1; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + cur_backend_id = tensor_backend_id; + } else { + 
tensor_backend_id(node) = cur_backend_id; + SET_CAUSE(node, "2.3"); + } + } + } + #ifdef DEBUG_PASS2 - fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); #endif // pass 3: assign backends to remaining src from dst and view_src @@ -1283,7 +1375,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } #ifdef DEBUG_PASS3 - fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); #endif // pass 4: split graph, find tensors that need to be copied @@ -1315,7 +1407,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (tensor_backend_id != cur_backend_id) { sched->splits[cur_split].i_end = i; cur_split++; - GGML_ASSERT(cur_split < GGML_MAX_SPLITS); + GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS); sched->splits[cur_split].backend_id = tensor_backend_id; sched->splits[cur_split].i_start = i; sched->splits[cur_split].n_inputs = 0; @@ -1328,25 +1420,57 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src == NULL) { continue; } + int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now + + if (src->flags & GGML_TENSOR_FLAG_INPUT) { + size_t id = hash_id(src); + if (sched->tensor_copies[id][src_backend_id][0] == NULL) { + ggml_backend_t backend = sched->backends[src_backend_id]; + for (int c = 0; c < sched->n_copies; c++) { + struct ggml_tensor * tensor_copy; + if (c == sched->cur_copy) { + tensor_copy = src; // use the original tensor as the current copy + } else { + tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); + ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c); + } + if (sched->n_copies > 1) { + ggml_set_input(tensor_copy); + 
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor + } + sched->tensor_copies[id][src_backend_id][c] = tensor_copy; + tensor_backend_id(tensor_copy) = src_backend_id; + SET_CAUSE(tensor_copy, "4.cpy"); + } + int n_graph_inputs = sched->n_graph_inputs++; + GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); + sched->graph_inputs[n_graph_inputs] = src; + } + } + if (src_backend_id != tensor_backend_id) { // create a copy of the input in the split's backend size_t id = hash_id(src); - if (sched->tensor_copies[id][cur_backend_id] == NULL) { + if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[cur_backend_id]; - struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); - ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); - - sched->tensor_copies[id][cur_backend_id] = tensor_copy; - tensor_backend_id(tensor_copy) = cur_backend_id; - SET_CAUSE(tensor_copy, "4.cpy"); - + for (int c = 0; c < sched->n_copies; c++) { + struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); + ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c); + if (sched->n_copies > 1) { + ggml_set_input(tensor_copy); + ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor + } + sched->tensor_copies[id][cur_backend_id][c] = tensor_copy; + tensor_backend_id(tensor_copy) = cur_backend_id; + SET_CAUSE(tensor_copy, "4.cpy"); + } int n_inputs = sched->splits[cur_split].n_inputs++; - GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); + GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); sched->splits[cur_split].inputs[n_inputs] = src; } - node->src[j] = sched->tensor_copies[id][cur_backend_id]; + node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy]; } } } @@ -1354,37 +1478,39 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->n_splits = cur_split + 1; 
} #ifdef DEBUG_PASS4 - fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); + fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); #endif #ifndef NDEBUG // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_backend_t tensor_backend = tensor_backend(node); + ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); if (tensor_backend == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } - if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) { + if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) { fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", - node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL"); + node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ? + ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL"); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } - ggml_backend_t src_backend = tensor_backend(src); + ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); if (src_backend != tensor_backend /* && src_backend != NULL */) { fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", j, src->name, src_backend ? 
ggml_backend_name(src_backend) : "NULL"); } - if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) { + if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) { fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", src->name, src_backend ? ggml_backend_name(src_backend) : "NULL", - src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL"); + src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ? + ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL"); } } } @@ -1392,18 +1518,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg #endif // create copies of the graph for each split - // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way - struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false); + // TODO: avoid this copy + struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false); for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); + // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id]; + struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy]; // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); + input_dep->src[0] = input; 
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input); graph_copy->nodes[graph_copy->n_nodes++] = input_dep; @@ -1417,18 +1545,56 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; } } + + if (sched->n_copies > 1) { + // add input copies as leafs so that they are allocated first + for (int i = 0; i < sched->n_graph_inputs; i++) { + struct ggml_tensor * input = sched->graph_inputs[i]; + size_t id = hash_id(input); + int backend_id = tensor_backend_id(input); + for (int c = 0; c < sched->n_copies; c++) { + struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; + sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; + graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; + } + } + + for (int i = 0; i < sched->n_splits; i++) { + struct ggml_backend_sched_split * split = &sched->splits[i]; + int backend_id = split->backend_id; + for (int j = 0; j < split->n_inputs; j++) { + struct ggml_tensor * input = split->inputs[j]; + size_t id = hash_id(input); + for (int c = 0; c < sched->n_copies; c++) { + struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; + sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; + graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; + } + } + } + } + + // add leafs from the original graph + for (int i = 0; i < graph->n_leafs; i++) { + struct ggml_tensor * leaf = graph->leafs[i]; + sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf); + graph_copy->leafs[graph_copy->n_leafs++] = leaf; + } + sched->graph = graph_copy; } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { - // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); + // allocate graph if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + // the re-allocation may cause the split inputs to be moved to a different address + 
ggml_backend_sched_synchronize(sched); #ifndef NDEBUG - fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n"); + fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__); #endif - ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); + ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { - fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n"); + fprintf(stderr, "%s: failed to allocate graph\n", __func__); return false; } } @@ -1437,9 +1603,6 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { } static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { - uint64_t copy_us[GGML_MAX_BACKENDS] = {0}; - uint64_t compute_us[GGML_MAX_BACKENDS] = {0}; - struct ggml_backend_sched_split * splits = sched->splits; for (int i = 0; i < sched->n_splits; i++) { @@ -1448,34 +1611,36 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s ggml_backend_t split_backend = sched->backends[split_backend_id]; // copy the input tensors to the split backend - uint64_t copy_start_us = ggml_time_us(); for (int j = 0; j < split->n_inputs; j++) { + ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id]; + struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy]; - GGML_ASSERT(input->buffer != NULL); - GGML_ASSERT(input_cpy->buffer != NULL); + if (input->flags & GGML_TENSOR_FLAG_INPUT) { + // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done + if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + 
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); + } else { + ggml_backend_synchronize(split_backend); + } + ggml_backend_tensor_copy(input, input_cpy); + } else { + if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); + } else { + ggml_backend_synchronize(split_backend); + ggml_backend_synchronize(input_backend); + } - ggml_backend_tensor_copy_async(split_backend, input, input_cpy); + ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); + } } - //ggml_backend_synchronize(split_backend); // necessary to measure copy time - int64_t copy_end_us = ggml_time_us(); - copy_us[split_backend_id] += copy_end_us - copy_start_us; -#if 0 - char split_filename[GGML_MAX_NAME]; - snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend)); - ggml_graph_dump_dot(split->graph, NULL, split_filename); -#endif - - - uint64_t compute_start_us = ggml_time_us(); if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute(split_backend, &split->graph); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { return ec; } - //ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { // similar to ggml_backend_compare_graph_backend for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { @@ -1494,11 +1659,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute(split_backend, &gv); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); if (ec != GGML_STATUS_SUCCESS) { return ec; } + // TODO: pass backend to the callback, then the user can decide if they want to synchronize + ggml_backend_synchronize(split_backend); + if 
(need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { break; } @@ -1506,39 +1674,54 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s j0 = j1; } } - uint64_t compute_end_us = ggml_time_us(); - compute_us[split_backend_id] += compute_end_us - compute_start_us; - } -#if 0 - // per-backend timings - fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits); - for (int i = 0; i < sched->n_backends; i++) { - if (copy_us[i] > 0 || compute_us[i] > 0) { - fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]); + // record the event of this copy + if (split->n_inputs > 0) { + if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]); + } } } -#endif + + sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies; return GGML_STATUS_SUCCESS; } -ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { +ggml_backend_sched_t ggml_backend_sched_new( + ggml_backend_t * backends, + ggml_backend_buffer_type_t * bufts, + int n_backends, + size_t graph_size, + bool parallel) { GGML_ASSERT(n_backends > 0); - GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); + GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); + GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS); sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); 
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size); + sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size); sched->n_backends = n_backends; - for (int i = 0; i < n_backends; i++) { - sched->backends[i] = backends[i]; - sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]); + + sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; + + GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES); + + for (int b = 0; b < n_backends; b++) { + sched->backends[b] = backends[b]; + sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); + GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b])); + if (sched->n_copies > 1) { + for (int c = 0; c < sched->n_copies; c++) { + sched->events[b][c] = ggml_backend_event_new(backends[b]); + } + } } sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); @@ -1552,12 +1735,18 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { if (sched == NULL) { return; } + for (int b = 0; b < sched->n_backends; b++) { + for (int c = 0; c < sched->n_copies; c++) { + ggml_backend_event_free(sched->events[b][c]); + } + } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); free(sched->hash_set.keys); free(sched->tensor_backend_id); free(sched->tensor_copies); free(sched->node_backend_ids); + free(sched->leaf_backend_ids); free(sched); } @@ -1569,34 +1758,63 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); sched->is_reset = true; + sched->is_alloc = false; } bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { ggml_backend_sched_split_graph(sched, measure_graph); - if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) { + // TODO: extract this to a separate function + if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, 
sched->node_backend_ids, sched->leaf_backend_ids)) { return false; } ggml_backend_sched_reset(sched); + ggml_backend_sched_synchronize(sched); + + return true; +} + +bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS); + + ggml_backend_sched_split_graph(sched, graph); + + if (!ggml_backend_sched_alloc_splits(sched)) { + return false; + } + + sched->is_alloc = true; + return true; } enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph); + ggml_backend_sched_synchronize(sched); + return err; +} - if (!sched->is_reset) { +enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { + if (!sched->is_reset && !sched->is_alloc) { ggml_backend_sched_reset(sched); } - ggml_backend_sched_split_graph(sched, graph); - if (!ggml_backend_sched_alloc_splits(sched)) { - return GGML_STATUS_ALLOC_FAILED; + if (!sched->is_alloc) { + if (!ggml_backend_sched_alloc_graph(sched, graph)) { + return GGML_STATUS_ALLOC_FAILED; + } } return ggml_backend_sched_compute_splits(sched); } +void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { + for (int i = 0; i < sched->n_backends; i++) { + ggml_backend_synchronize(sched->backends[i]); + } +} + void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { sched->callback_eval = callback; sched->callback_eval_user_data = user_data; @@ -1606,19 +1824,24 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { return sched->n_splits; } +int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) { + return 
sched->n_copies; +} + size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); + return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } -void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { +void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; } -ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { +ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { int backend_index = tensor_backend_id(node); if (backend_index == -1) { return NULL; diff --git a/ggml-backend.h b/ggml-backend.h index 8bed22578..099d9c258 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -9,6 +9,7 @@ extern "C" { typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t; + typedef struct ggml_backend_event * ggml_backend_event_t; typedef struct ggml_backend * ggml_backend_t; typedef void * ggml_backend_graph_plan_t; @@ -72,11 +73,24 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends GGML_API void 
ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); - GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy + + // asynchronous copy + // the copy is performed after all the currently queued operations in backend_src + // backend_dst will wait for the copy to complete before performing other operations + // automatic fallback to sync copy if async is not supported + GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); + + // events + GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend); + GGML_API void ggml_backend_event_free (ggml_backend_event_t event); + GGML_API void ggml_backend_event_record (ggml_backend_event_t event); + GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); + GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event // // CPU backend @@ -123,27 +137,31 @@ extern "C" { /* Example usage: - sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends); - // sched is initialized with measure allocators and cannot be used until allocated with a measure graph + // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned + // preferrably to run on the same backend as the buffer + ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - // initialize buffers from a measure graph - measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed + sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false); - // in build_graph: - build_graph(...) 
{ - // manually assign nodes to a backend (optional, should not be needed in most cases) - struct ggml_tensor * node = ggml_mul_mat(ctx, ...); - ggml_backend_sched_set_node_backend(sched, node, backend_gpu); - } + // initialize buffers from a max size graph (optional) + reserve_graph = build_graph(sched, max_batch_size); - // allocate backend buffers from measure graph - ggml_backend_sched_init_measure(sched, measure_graph); + // manually assign nodes to a backend (optional, should not be needed in most cases) + struct ggml_tensor * node = ggml_mul_mat(ctx, ...); + ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu); - // the scheduler is now ready to compute graphs + ggml_backend_sched_reserve(sched, reserve_graph); // compute graph = build_graph(sched); ggml_backend_sched_graph_compute(sched, graph); + + // if there are graph inputs: + ggml_backend_sched_reset(sched); + ggml_backend_sched_alloc_graph(sched, graph); + ggml_backend_tensor_set(input_tensor, ...); + ggml_backend_sched_graph_compute(sched, graph); + } */ struct ggml_backend_sched; @@ -158,20 +176,26 @@ extern "C" { typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + // Initialize backend buffers from a measure graph GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + // Get the number of splits of the last graph GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); + GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); GGML_API 
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); - GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); - GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); + GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); // Allocate and compute graph on the backend scheduler + GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); // Reset all assignments and allocators - must be called before changing the node backends GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b8834ed05..d1b5e52ba 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -72,6 +72,7 @@ #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize #define cudaEvent_t hipEvent_t #define cudaEventDestroy hipEventDestroy #define cudaFree hipFree @@ -81,6 +82,7 @@ #define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#define cudaLaunchHostFunc hipLaunchHostFunc #ifdef GGML_HIP_UMA #define cudaMalloc hipMallocManaged #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) @@ -104,6 +106,7 @@ 
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamFireAndForget hipStreamFireAndForget #define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamPerThread hipStreamPerThread #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) #define cudaStream_t hipStream_t @@ -10641,8 +10644,20 @@ GGML_CALL void ggml_cuda_get_device_description(int device, char * description, #define UNUSED GGML_UNUSED struct ggml_backend_cuda_context { + explicit ggml_backend_cuda_context(int device) : + device(device), + name(GGML_CUDA_NAME + std::to_string(device)) { + } + + ~ggml_backend_cuda_context() { + if (copy_event != nullptr) { + CUDA_CHECK(cudaEventDestroy(copy_event)); + } + } + int device; std::string name; + cudaEvent_t copy_event = nullptr; }; // cuda buffer @@ -10732,9 +10747,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -10743,26 +10757,25 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); - 
CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; - ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - - ggml_cuda_set_device(src_ctx->device); - CUDA_CHECK(cudaDeviceSynchronize()); - ggml_cuda_set_device(dst_ctx->device); - CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaDeviceSynchronize()); - + ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; + if (src_ctx->device == dst_ctx->device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + } else { + CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread)); + } + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); return true; } return false; + + UNUSED(buffer); } GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -11007,7 +11020,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf } const char * buf_host = (const char *)data + offset_split; - CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + } + + for (int id = 0; id < g_device_count; ++id) { + 
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } } @@ -11041,7 +11058,11 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf } char * buf_host = (char *)data + offset_split; - CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); + } + + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } } @@ -11220,6 +11241,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { return &ggml_backend_cuda_buffer_type_host; } +//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) { +// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name; +//} + // backend GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) { @@ -11243,8 +11268,9 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; - GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); + GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); @@ -11252,22 +11278,61 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; - GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); + GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } -GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; +GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { + GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst)); - if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, 
g_cudaStreams[cuda_ctx->device][0])); - return true; + ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; + ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; + + if (!ggml_backend_buffer_is_cuda(src->buffer)) { + return false; } - return false; + if (!ggml_backend_buffer_is_cuda(dst->buffer)) { + return false; + } + + // device -> device + ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context; + ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context; + + if (backend_src != backend_dst) { + ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; + ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; + + GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device); + GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device); + + if (!cuda_ctx_src->copy_event) { + ggml_cuda_set_device(cuda_ctx_src->device); + CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming)); + } + + // copy on src stream + if (cuda_ctx_src->device == cuda_ctx_dst->device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0])); + } else { + CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0])); + } + + // record event on src stream + CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0])); + + // wait on dst stream for the copy to complete + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0)); + } else { + // src and dst are on the same backend + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, 
g_cudaStreams[cuda_ctx_dst->device][0])); + } + return true; } GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { @@ -11444,6 +11509,52 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons UNUSED(backend); } +static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + ggml_cuda_set_device(cuda_ctx->device); + + cudaEvent_t event; + CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + + return new ggml_backend_event { + /* .backend = */ backend, + /* .context = */ event, + }; +} + +static void ggml_backend_cuda_event_free(ggml_backend_event_t event) { + CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context)); + + delete event; +} + +static void ggml_backend_cuda_event_record(ggml_backend_event_t event) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context; + + CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0])); +} + +static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + if (ggml_backend_is_cuda(event->backend)) { + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0)); + } else { + // untested + auto wait_fn = [](void * user_data) { + ggml_backend_event_t event = (ggml_backend_event_t)user_data; + ggml_backend_event_synchronize(event); + }; + + CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event)); + } +} + +static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) { + CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); +} + static ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, @@ 
-11457,6 +11568,11 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .event_new = */ ggml_backend_cuda_event_new, + /* .event_free = */ ggml_backend_cuda_event_free, + /* .event_record = */ ggml_backend_cuda_event_record, + /* .event_wait = */ ggml_backend_cuda_event_wait, + /* .event_synchronize = */ ggml_backend_cuda_event_synchronize, }; static ggml_guid_t ggml_backend_cuda_guid() { @@ -11475,10 +11591,11 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { // not strictly necessary, but it may reduce the overhead of the first graph_compute ggml_cuda_set_main_device(device); - ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context { - /* .device = */ device, - /* .name = */ GGML_CUDA_NAME + std::to_string(device), - }; + ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device); + if (ctx == nullptr) { + fprintf(stderr, "%s: error: failed to allocate context\n", __func__); + return nullptr; + } ggml_backend_t cuda_backend = new ggml_backend { /* .guid = */ ggml_backend_cuda_guid(), diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 83a7822fd..4caf2c9e7 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .supports_op = */ ggml_backend_kompute_supports_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_kompute_guid() { diff --git a/ggml-metal.m b/ggml-metal.m index 1825d3320..3a5476c52 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2820,6 +2820,11 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ 
ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, }; void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index c2ab13034..9f6506383 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17249,13 +17249,18 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type, /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async, /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async, - /* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async, + /* .cpy_tensor_async = */ NULL, //ggml_backend_sycl_cpy_tensor_async, // TODO: update for the new interface /* .synchronize = */ ggml_backend_sycl_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_sycl_guid() { diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index d41aa7d22..7cce616ba 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5693,6 +5693,11 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, }; static ggml_guid_t ggml_backend_vk_guid() { diff --git a/ggml.c b/ggml.c index 9a7bd1d8c..fbc66f65b 100644 --- a/ggml.c +++ b/ggml.c @@ -11560,8 +11560,6 
@@ static void ggml_compute_forward_get_rows_q( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - assert(params->ith == 0); - if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11569,7 +11567,7 @@ static void ggml_compute_forward_get_rows_q( GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr); + const int64_t nr = ggml_nelements(src1); const enum ggml_type type = src0->type; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; @@ -11579,17 +11577,25 @@ static void ggml_compute_forward_get_rows_q( assert(nb00 == ggml_type_size(type)); assert(ggml_nrows(dst) == nr); - // TODO: multi-thread - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - for (int64_t i10 = 0; i10 < ne10; ++i10) { - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + const int ith = params->ith; + const int nth = params->nth; - dequantize_row_q( - (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); - } - } + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + dequantize_row_q( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } } @@ -11600,8 +11606,6 @@ static void ggml_compute_forward_get_rows_f16( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - 
assert(params->ith == 0); - if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11609,24 +11613,32 @@ static void ggml_compute_forward_get_rows_f16( GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr); + const int64_t nr = ggml_nelements(src1); assert(ne0 == nc); assert(ne02 == ne11); assert(nb00 == sizeof(ggml_fp16_t)); assert(ggml_nrows(dst) == nr); - // TODO: multi-thread - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - for (int64_t i10 = 0; i10 < ne10; ++i10) { - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + const int ith = params->ith; + const int nth = params->nth; - ggml_fp16_to_fp32_row( - (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); - } - } + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + ggml_fp16_to_fp32_row( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } } @@ -11637,8 +11649,6 @@ static void ggml_compute_forward_get_rows_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - assert(params->ith == 0); - if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11646,24 +11656,32 @@ static void ggml_compute_forward_get_rows_f32( GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; - const int64_t nr = 
ggml_nelements(src1); GGML_UNUSED(nr); + const int64_t nr = ggml_nelements(src1); assert(ne0 == nc); assert(ne02 == ne11); assert(nb00 == sizeof(float)); assert(ggml_nrows(dst) == nr); - // TODO: multi-thread - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - for (int64_t i10 = 0; i10 < ne10; ++i10) { - const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + const int ith = params->ith; + const int nth = params->nth; - ggml_vec_cpy_f32(nc, - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), - (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); - } - } + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), + (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); } } @@ -17796,7 +17814,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const node->perf_time_us += time_us_cur; } -static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { +static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) { int n_tasks = 0; switch (node->op) { @@ -17877,6 +17895,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; + case GGML_OP_GET_ROWS: + { + // FIXME: the cost of launching additional threads decreases performance with GPU offloading + //n_tasks = MIN(n_threads, ggml_nelements(node->src[1])); + n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1])); + } break; case GGML_OP_SCALE: 
case GGML_OP_SET: case GGML_OP_CONT: @@ -17884,7 +17908,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: { @@ -18102,7 +18125,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* FINALIZE */ struct ggml_tensor * node = cgraph->nodes[node_n]; if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = ggml_get_n_tasks(node, n_threads); + params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); ggml_compute_forward(¶ms, node); } ggml_graph_compute_perf_stats_node(node, state->shared); @@ -18112,7 +18135,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { while (++node_n < cgraph->n_nodes) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = ggml_get_n_tasks(node, n_threads); + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); @@ -18160,7 +18183,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* INIT & COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = ggml_get_n_tasks(node, n_threads); + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); struct ggml_compute_params params = { /*.type =*/ GGML_TASK_TYPE_INIT, @@ -18225,7 +18248,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - const int n_tasks = ggml_get_n_tasks(node, n_threads); + const int n_tasks = ggml_get_n_tasks(node, n_threads, 1); max_tasks = MAX(max_tasks, n_tasks); diff --git a/llama.cpp b/llama.cpp index ad7b7b7d4..38e7036a7 100644 --- a/llama.cpp +++ 
b/llama.cpp @@ -978,21 +978,6 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { } } -// -// ggml helpers -// - -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - // // llama helpers // @@ -1728,6 +1713,7 @@ struct llama_hparams { struct llama_cparams { uint32_t n_ctx; // context size used during inference uint32_t n_batch; + uint32_t n_ubatch; uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing @@ -2024,8 +2010,7 @@ struct llama_context { ggml_vk_free_cpu_assist(); #endif - ggml_backend_buffer_free(buf_input); - ggml_free(ctx_input); + ggml_backend_buffer_free(buf_output); } llama_cparams cparams; @@ -2051,12 +2036,20 @@ struct llama_context { int64_t t_p_eval_us = 0; int64_t t_eval_us = 0; + int64_t t_compute_start_us = 0; + int64_t n_queued_tokens = 0; + int32_t n_sample = 0; // number of tokens sampled int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) int32_t n_eval = 0; // number of eval calls - // logits output (2-dimensional array: [n_tokens][n_vocab]) - std::vector logits; + // host buffer for the model output (logits and embeddings) + ggml_backend_buffer_t buf_output = nullptr; + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + size_t logits_size = 0; + float * logits = nullptr; + #ifndef NDEBUG // guard against access to unset logits std::vector logits_valid; @@ -2065,7 +2058,8 @@ struct llama_context { // embeddings output (2-dimensional array: [n_tokens][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE - std::vector embd; + size_t embd_size = 0; + float * embd = nullptr; // sequence embeddings output (map of [n_embd] 
vectors) // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE @@ -2079,8 +2073,6 @@ struct llama_context { void * abort_callback_data = nullptr; // input tensors - ggml_backend_buffer_t buf_input = nullptr; - ggml_context * ctx_input = nullptr; struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] @@ -2090,7 +2082,7 @@ struct llama_context { struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [1, kv_size] struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] #ifdef GGML_USE_MPI @@ -4005,6 +3997,7 @@ static bool llm_load_tensors( // there is very little benefit to offloading the input layer, so always keep it on the CPU model.buft_input = llama_default_buffer_type_cpu(true); + //model.buft_input = llama_default_buffer_type_offload(main_gpu); model.buft_layer.resize(n_layer); @@ -5094,29 +5087,32 @@ enum llm_norm_type { static struct ggml_tensor * llm_build_inp_embd( struct ggml_context * ctx, + struct llama_context & lctx, const llama_hparams & hparams, const llama_batch & batch, struct ggml_tensor * tok_embd, - struct ggml_tensor * inp_tokens, - struct ggml_tensor * inp_embd, const llm_build_cb & cb) { const int64_t n_embd = hparams.n_embd; struct ggml_tensor * inpL; if (batch.token) { - struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0); - cb(inp_tokens, "inp_tokens", -1); + lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + cb(lctx.inp_tokens, "inp_tokens", -1); + ggml_set_input(lctx.inp_tokens); - inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v); + inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); } else { #ifdef GGML_USE_MPI GGML_ASSERT(false && "not implemented"); #endif - - inpL = 
ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0); + lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); + inpL = lctx.inp_embd; + ggml_set_input(lctx.inp_embd); } + cb(inpL, "inp_embd", -1); + return inpL; } @@ -5420,7 +5416,7 @@ static struct ggml_tensor * llm_build_kv( struct llm_build_context { const llama_model & model; - const llama_context & lctx; + llama_context & lctx; const llama_hparams & hparams; const llama_cparams & cparams; const llama_batch & batch; @@ -5513,6 +5509,18 @@ struct llm_build_context { }; ctx0 = ggml_init(params); + + lctx.inp_tokens = nullptr; + lctx.inp_embd = nullptr; + lctx.inp_pos = nullptr; + lctx.inp_KQ_mask = nullptr; + lctx.inp_KQ_pos = nullptr; + lctx.inp_K_shift = nullptr; + lctx.inp_mean = nullptr; + lctx.inp_cls = nullptr; + lctx.inp_s_copy = nullptr; + lctx.inp_s_mask = nullptr; + lctx.inp_s_seq = nullptr; } void free() { @@ -5527,6 +5535,10 @@ struct llm_build_context { GGML_ASSERT(kv_self.size == n_ctx); + lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + cb(lctx.inp_K_shift, "K_shift", -1); + ggml_set_input(lctx.inp_K_shift); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions @@ -5550,12 +5562,14 @@ struct llm_build_context { GGML_ASSERT(kv_self.recurrent); + struct ggml_tensor * state_copy = build_inp_s_copy(); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); - conv_states = ggml_get_rows(ctx0, conv_states, lctx.inp_s_copy); - ssm_states = ggml_get_rows(ctx0, ssm_states, lctx.inp_s_copy); + conv_states = ggml_get_rows(ctx0, conv_states, state_copy); + ssm_states = ggml_get_rows(ctx0, ssm_states, state_copy); // TODO: name the intermediate tensors with cb() @@ -5615,6 +5629,66 @@ 
struct llm_build_context { return gf; } + struct ggml_tensor * build_inp_pos() { + lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(lctx.inp_pos, "inp_pos", -1); + ggml_set_input(lctx.inp_pos); + return lctx.inp_pos; + } + + struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { + if (causal) { + lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens); + } else { + lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + } + cb(lctx.inp_KQ_mask, "KQ_mask", -1); + ggml_set_input(lctx.inp_KQ_mask); + return lctx.inp_KQ_mask; + } + + struct ggml_tensor * build_inp_KQ_pos() { + lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv); + cb(lctx.inp_KQ_pos, "KQ_pos", -1); + ggml_set_input(lctx.inp_KQ_pos); + return lctx.inp_KQ_pos; + } + + struct ggml_tensor * build_inp_mean() { + lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + cb(lctx.inp_mean, "inp_mean", -1); + ggml_set_input(lctx.inp_mean); + return lctx.inp_mean; + } + + struct ggml_tensor * build_inp_cls() { + lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(lctx.inp_cls, "inp_cls", -1); + ggml_set_input(lctx.inp_cls); + return lctx.inp_cls; + } + + struct ggml_tensor * build_inp_s_copy() { + lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, kv_self.size); + cb(lctx.inp_s_copy, "inp_s_copy", -1); + ggml_set_input(lctx.inp_s_copy); + return lctx.inp_s_copy; + } + + struct ggml_tensor * build_inp_s_mask() { + lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + cb(lctx.inp_s_mask, "inp_s_mask", -1); + ggml_set_input(lctx.inp_s_mask); + return lctx.inp_s_mask; + } + + struct ggml_tensor * build_inp_s_seq() { + lctx.inp_s_seq = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + cb(lctx.inp_s_seq, "inp_s_seq", -1); + ggml_set_input(lctx.inp_s_seq); + return lctx.inp_s_seq; + } + struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -5625,16 +5699,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5686,7 +5757,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -5804,20 +5874,16 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); 
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); // positions of the tokens in the KV cache - struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); - cb(KQ_pos, "KQ_pos", -1); + struct ggml_tensor * KQ_pos = build_inp_KQ_pos(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5865,7 +5931,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -5921,16 +5986,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5984,7 +6046,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = cur; @@ -6035,21 +6096,17 @@ struct llm_build_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; - struct ggml_tensor * pos; struct ggml_tensor * 
inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); inpL = ggml_add(ctx0, inpL, pos); @@ -6083,7 +6140,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } // add the input @@ -6135,16 +6191,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < 
n_layer; ++il) { struct ggml_tensor * residual = inpL; @@ -6284,7 +6337,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); @@ -6338,16 +6390,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); // positions of the tokens in the KV cache - struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); - cb(KQ_pos, "KQ_pos", -1); + struct ggml_tensor * KQ_pos = build_inp_KQ_pos(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6377,7 +6426,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -6433,15 +6481,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - // get input vectors with right size - const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type); - - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0); - 
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); + struct ggml_tensor * inp_pos = build_inp_pos(); + struct ggml_tensor * inp_mean = build_inp_mean(); + struct ggml_tensor * inp_cls = build_inp_cls(); // construct input embeddings (token, type, position) - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); @@ -6456,8 +6501,7 @@ struct llm_build_context { cb(inpL, "inp_norm", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0)); - cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens] + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -6619,16 +6663,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); // positions of the tokens in the KV cache - struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); - cb(KQ_pos, "KQ_pos", -1); + struct ggml_tensor * KQ_pos = build_inp_KQ_pos(); inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, @@ -6664,7 +6705,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, 
model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } // Add the input @@ -6716,16 +6756,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); // positions of the tokens in the KV cache - struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); - cb(KQ_pos, "KQ_pos", -1); + struct ggml_tensor * KQ_pos = build_inp_KQ_pos(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -6766,7 +6803,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } // Add the input @@ -6821,16 +6857,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = 
ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6883,7 +6916,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -6939,16 +6971,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6993,7 +7022,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -7048,16 +7076,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, 
"inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7109,7 +7134,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -7164,16 +7188,13 @@ struct llm_build_context { struct ggml_tensor * ffn_output; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { attn_norm_output = llm_build_norm(ctx0, inpL, hparams, @@ -7231,7 +7252,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, 
model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); - cb(cur, "kqv_out", il); } // FF @@ -7281,16 +7301,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { @@ -7329,7 +7346,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * sa_out = cur; @@ -7383,16 +7399,13 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 
0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7428,7 +7441,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } // add the input @@ -7481,16 +7493,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { cur = llm_build_norm(ctx0, inpL, hparams, @@ -7532,7 +7541,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } // add the input @@ -7584,16 +7592,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct 
ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7645,7 +7650,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -7698,16 +7702,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7759,7 +7760,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct 
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -7821,20 +7821,17 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // scale the input embeddings inpL = ggml_scale(ctx0, inpL, scale_embd); cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7886,7 +7883,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } // scale_res - scale the hidden states for residual connection @@ -7953,22 +7949,18 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted 
to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, @@ -8005,7 +7997,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); @@ -8060,16 +8051,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - cb(inp_pos, "inp_pos", -1); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); - cb(KQ_mask, "KQ_mask", -1); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8178,11 +8166,10 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); - cb(inpL, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - struct ggml_tensor * state_mask = ggml_view_2d(ctx0, lctx.inp_s_mask, 1, n_kv, lctx.inp_s_mask->nb[0], 0); - struct ggml_tensor * state_seq = ggml_view_2d(ctx0, 
lctx.inp_s_seq, n_kv, n_tokens, n_kv*ggml_element_size(lctx.inp_s_seq), 0); + struct ggml_tensor * state_mask = build_inp_s_mask(); + struct ggml_tensor * state_seq = build_inp_s_seq(); for (int il = 0; il < n_layer; ++il) { // (ab)using the KV cache to store the states @@ -8234,7 +8221,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_2d(ctx0, x_conv, d_conv - 1, d_inner*n_kv, d_conv*ggml_element_size(x_conv), (1+d_inner*n_tokens)*ggml_element_size(x_conv)), - ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_self.head*(d_conv - 1)*(d_inner)*ggml_element_size(x_conv)))); + ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(x_conv)))); // extract x from x_conv x = ggml_view_2d(ctx0, x_conv, d_inner, n_tokens, d_inner*ggml_element_size(x_conv), 0); @@ -8268,7 +8255,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*ggml_element_size(y_ssm_states)), - ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_self.head*d_state*d_inner*ggml_element_size(ssm_states)))); + ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_head*d_state*d_inner*ggml_element_size(ssm_states)))); struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0); @@ -8372,7 +8359,18 @@ static struct ggml_cgraph * llama_build_graph( if (!lctx.cparams.offload_kqv) { if (strcmp(name, "kqv_merged_cont") == 0) { // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu); + ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu); + } + } + + // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends + // to fix this, we assign the norm layer manually 
to the backend of its layer + if (il != -1 && strcmp(name, "norm") == 0) { + for (auto * backend : lctx.backends) { + if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { + ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + break; + } } } }; @@ -8528,7 +8526,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); } - if (batch.pos) { + if (batch.pos && lctx.inp_pos) { const int64_t n_tokens = batch.n_tokens; ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); @@ -8539,61 +8537,63 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { "non-causal attention with generative models is not supported" ); - // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. - if (cparams.causal_attn) { - const int64_t n_kv = kv_self.n; - const int64_t n_tokens = batch.n_tokens; + if (lctx.inp_KQ_mask) { + // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. + if (cparams.causal_attn) { + const int64_t n_kv = kv_self.n; + const int64_t n_tokens = batch.n_tokens; - assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; + float * data = (float *) lctx.inp_KQ_mask->data; - // For causal attention, use only the previous KV cells - // of the correct sequence for each token of the batch. - // It's assumed that if a token in the batch has multiple sequences, they are equivalent. 
- for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; + // For causal attention, use only the previous KV cells + // of the correct sequence for each token of the batch. + // It's assumed that if a token in the batch has multiple sequences, they are equivalent. + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; - for (int i = 0; i < n_kv; ++i) { - float f; - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - f = -INFINITY; - } else { - f = 0.0f; + for (int i = 0; i < n_kv; ++i) { + float f; + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + f = -INFINITY; + } else { + f = 0.0f; + } + data[h*(n_kv*n_tokens) + j*n_kv + i] = f; } - data[h*(n_kv*n_tokens) + j*n_kv + i] = f; } } - } - } else { - // when using kv cache, the mask needs to match the kv cache size - const int64_t n_tokens = batch.n_tokens; - const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens; + } else { + // when using kv cache, the mask needs to match the kv cache size + const int64_t n_tokens = batch.n_tokens; + const int64_t n_stride = hparams.causal_attn ? 
kv_self.n : n_tokens; - assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; + float * data = (float *) lctx.inp_KQ_mask->data; - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_seq_id seq_id = batch.seq_id[j][0]; + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_seq_id seq_id = batch.seq_id[j][0]; - for (int i = 0; i < n_tokens; ++i) { - float f = -INFINITY; - for (int s = 0; s < batch.n_seq_id[i]; ++s) { - if (batch.seq_id[i][s] == seq_id) { - f = 0.0f; - break; + for (int i = 0; i < n_tokens; ++i) { + float f = -INFINITY; + for (int s = 0; s < batch.n_seq_id[i]; ++s) { + if (batch.seq_id[i][s] == seq_id) { + f = 0.0f; + break; + } } + + data[h*(n_tokens*n_tokens) + j*n_stride + i] = f; } - data[h*(n_tokens*n_tokens) + j*n_stride + i] = f; - } - - for (int i = n_tokens; i < n_stride; ++i) { - data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY; + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY; + } } } } @@ -8602,7 +8602,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { if (hparams.need_kq_pos) { const int64_t n_kv = kv_self.n; - assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer)); + GGML_ASSERT(lctx.inp_KQ_pos); + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer)); float * data = (float *) lctx.inp_KQ_pos->data; @@ -8614,6 +8615,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; + GGML_ASSERT(lctx.inp_mean); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); float * data = (float *) lctx.inp_mean->data; @@ -8645,6 +8647,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { if 
(cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { const int64_t n_tokens = batch.n_tokens; + GGML_ASSERT(lctx.inp_cls); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); uint32_t * data = (uint32_t *) lctx.inp_cls->data; @@ -8665,7 +8668,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; - { + if (lctx.inp_s_mask) { GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); float * data = (float *) lctx.inp_s_mask->data; @@ -8687,7 +8690,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // update the correct state(s)/sequence(s) for each token of the batch. // Like with the KQ_mask, if a token in the batch has multiple sequences, // they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv). - { + if (lctx.inp_s_seq) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer)); @@ -8730,7 +8733,7 @@ static void llama_graph_compute( ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } - ggml_backend_sched_graph_compute(lctx.sched, gf); + ggml_backend_sched_graph_compute_async(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); @@ -8750,10 +8753,11 @@ static void llama_graph_compute( // static int llama_decode_internal( llama_context & lctx, - llama_batch batch) { - const uint32_t n_tokens = batch.n_tokens; + llama_batch batch_all) { // TODO: rename back to batch - if (n_tokens == 0) { + const uint32_t n_tokens_all = batch_all.n_tokens; + + if (n_tokens_all == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); return -1; } @@ -8762,14 +8766,16 @@ static int llama_decode_internal( const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; - const auto n_batch = cparams.n_batch; + GGML_ASSERT((!batch_all.token && batch_all.embd) || 
(batch_all.token && !batch_all.embd)); // NOLINT - GGML_ASSERT(n_tokens <= n_batch); - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + GGML_ASSERT(n_tokens_all <= cparams.n_batch); - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); - const int64_t t_start_us = ggml_time_us(); + if (lctx.t_compute_start_us == 0) { + lctx.t_compute_start_us = ggml_time_us(); + } + lctx.n_queued_tokens += n_tokens_all; #ifdef GGML_USE_MPI // TODO: needs fix after #3228 @@ -8777,128 +8783,261 @@ static int llama_decode_internal( //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); #endif - GGML_ASSERT(n_threads > 0); - auto & kv_self = lctx.kv_self; const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; - // helpers for smoother batch API transition - // after deprecating the llama_eval calls, these will be removed - std::vector pos; + auto * logits_out = lctx.logits; + +#ifndef NDEBUG + auto & logits_valid = lctx.logits_valid; + logits_valid.clear(); + logits_valid.resize(n_tokens_all); + + memset(logits_out, 0, lctx.logits_size*sizeof(float)); +#endif + + const auto n_ubatch = cparams.n_ubatch; + + std::vector pos; std::vector n_seq_id; std::vector seq_id_arr; std::vector> seq_id; - if (batch.pos == nullptr) { - pos.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - pos[i] = batch.all_pos_0 + i*batch.all_pos_1; - } + for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) { + const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token); + llama_batch u_batch = { + /* .n_tokens = */ (int32_t) n_tokens, + /* .token = */ batch_all.token ? batch_all.token + cur_token : nullptr, + /* .embd = */ batch_all.embd ? batch_all.embd + cur_token*n_embd : nullptr, + /* .pos = */ batch_all.pos ? 
batch_all.pos + cur_token : nullptr, + /* .n_seq_id = */ batch_all.n_seq_id ? batch_all.n_seq_id + cur_token : nullptr, + /* .seq_id = */ batch_all.seq_id ? batch_all.seq_id + cur_token : nullptr, + /* .logits = */ batch_all.logits ? batch_all.logits + cur_token : nullptr, + /* .all_pos_0 = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1, + /* .all_pos_1 = */ batch_all.all_pos_1, + /* .all_seq_id = */ batch_all.all_seq_id, + }; - batch.pos = pos.data(); - } + int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + GGML_ASSERT(n_threads > 0); - if (batch.seq_id == nullptr) { - n_seq_id.resize(n_tokens); - seq_id.resize(n_tokens); - seq_id_arr.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - n_seq_id[i] = 1; - seq_id[i].resize(1); - seq_id[i][0] = batch.all_seq_id; - seq_id_arr[i] = seq_id[i].data(); - } - - batch.n_seq_id = n_seq_id.data(); - batch.seq_id = seq_id_arr.data(); - } - - // non-causal masks do not use the KV cache - if (hparams.causal_attn) { - llama_kv_cache_update(&lctx); - - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*n_tokens) { - kv_self.head = 0; - } - - if (!llama_kv_cache_find_slot(kv_self, batch)) { - return 1; - } - - if (!kv_self.recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); - } - } - - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - - ggml_backend_sched_reset(lctx.sched); - ggml_backend_sched_set_eval_callback(lctx.sched, 
lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, batch, false); - - // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; - - if (!hparams.causal_attn) { - res = nullptr; // do not extract logits for embedding models such as BERT - - // token or sequence embeddings - embd = gf->nodes[gf->n_nodes - 1]; - - GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0); - } else { - if (strcmp(res->name, "result_output") == 0) { - // the token embeddings could be the second to last tensor, or the third to last tensor - if (strcmp(embd->name, "result_norm") != 0) { - embd = gf->nodes[gf->n_nodes - 3]; - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); + // helpers for smoother batch API transition + // after deprecating the llama_eval calls, these will be removed + if (u_batch.pos == nullptr) { + pos.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + pos[i] = u_batch.all_pos_0 + i*u_batch.all_pos_1; } + + u_batch.pos = pos.data(); + } + + if (u_batch.seq_id == nullptr) { + n_seq_id.resize(n_tokens); + seq_id.resize(n_tokens); + seq_id_arr.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + n_seq_id[i] = 1; + seq_id[i].resize(1); + seq_id[i][0] = u_batch.all_seq_id; + seq_id_arr[i] = seq_id[i].data(); + } + + u_batch.n_seq_id = n_seq_id.data(); + u_batch.seq_id = seq_id_arr.data(); + } + + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + llama_kv_cache_update(&lctx); + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*n_tokens) { + kv_self.head = 0; + } + + if (!llama_kv_cache_find_slot(kv_self, u_batch)) { + return 1; + } + + if (!kv_self.recurrent) { + // a heuristic, to avoid 
attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); + } + } + + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + + ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); + + ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false); + + // the output is always the last tensor in the graph + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; + + if (!hparams.causal_attn) { + res = nullptr; // do not extract logits for embedding models such as BERT + + // token or sequence embeddings + embd = gf->nodes[gf->n_nodes - 1]; + + GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0); } else { - GGML_ASSERT(false && "missing result_output tensor"); + if (strcmp(res->name, "result_output") == 0) { + // the token embeddings could be the second to last tensor, or the third to last tensor + if (strcmp(embd->name, "result_norm") != 0) { + embd = gf->nodes[gf->n_nodes - 3]; + GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); + } + } else { + GGML_ASSERT(false && "missing result_output tensor"); + } + } + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well + // 
we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering + // with the BLAS calls. need a better solution + // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is + // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. + if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + n_threads = std::min(4, n_threads); + } + + ggml_backend_sched_alloc_graph(lctx.sched, gf); + + llama_set_inputs(lctx, u_batch); + + llama_graph_compute(lctx, gf, n_threads); + + // update the kv ring buffer + { + kv_self.head += n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + +#ifdef GGML_PERF + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + ggml_graph_print(gf); +#endif + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + // TODO: do not compute and extract logits if only embeddings are needed + // update the graphs to skip "result_output" if logits are not needed + if (res) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); + GGML_ASSERT(backend_res != nullptr); + if (u_batch.logits) { + int32_t i_first = -1; + for (uint32_t i = 0; i < n_tokens; i++) { + if (u_batch.logits[i] && i_first == -1) { + i_first = (int32_t) i; + } + if (u_batch.logits[i] == 0 || i == n_tokens - 1) { + if (i_first != -1) { + int i_last = u_batch.logits[i] == 0 ? 
i : i + 1; + // extract logits for the range [i_first, i_last) + // group the requests to minimize the number of calls to the backend + ggml_backend_tensor_get_async(backend_res, res, + logits_out + n_vocab*(cur_token + i_first), + i_first*n_vocab*sizeof(float), + (i_last - i_first)*n_vocab*sizeof(float)); + i_first = -1; + } + } +#ifndef NDEBUG + logits_valid[cur_token + i] = u_batch.logits[i] != 0;; +#endif + } + } else if (lctx.logits_all) { + ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float)); +#ifndef NDEBUG + std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true); +#endif + } else { + if (cur_token + n_tokens >= n_tokens_all) { + ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float)); +#ifndef NDEBUG + logits_valid[0] = true; +#endif + } + } + } + + // extract embeddings + if (cparams.embeddings && embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + auto & embd_out = lctx.embd; + + if (u_batch.logits) { + //embd_out.resize(n_embd * n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + if (u_batch.logits[i] == 0) { + continue; + } + ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float)); + } + } + } break; + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_MEAN: + { + GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0); + + // extract sequence embeddings + auto & embd_seq_out = lctx.embd_seq; + embd_seq_out.clear(); + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = u_batch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + 
embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ASSERT(false && "unknown pooling type"); + } break; + } } } - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well - // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering - // with the BLAS calls. need a better solution - // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is - // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. - if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - n_threads = std::min(4, n_threads); - } - - llama_set_inputs(lctx, batch); - - llama_graph_compute(lctx, gf, n_threads); - - // update the kv ring buffer - { - kv_self.head += n_tokens; - - // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; - } - } + // wait for the computation to finish (automatically done when obtaining the model output) + //llama_synchronize(&lctx); // decide if we need to defrag the kv cache if (cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f; + const float fragmentation = kv_self.n >= 128 ? 
1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { @@ -8908,141 +9047,10 @@ static int llama_decode_internal( } } -#ifdef GGML_PERF - // print timing information per ggml operation (for debugging purposes) - // requires GGML_PERF to be defined - ggml_graph_print(gf); -#endif - - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} - - // extract logits - // TODO: do not compute and extract logits if only embeddings are needed - // need to update the graphs to skip "result_output" - if (res) { - auto & logits_out = lctx.logits; - -#ifndef NDEBUG - auto & logits_valid = lctx.logits_valid; - logits_valid.clear(); - logits_valid.resize(n_tokens); - - logits_out.clear(); -#endif - - ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res); - GGML_ASSERT(backend_res != nullptr); - - if (batch.logits) { - logits_out.resize(n_vocab * n_tokens); - int32_t i_first = -1; - for (uint32_t i = 0; i < n_tokens; i++) { - if (batch.logits[i] && i_first == -1) { - i_first = (int32_t) i; - } - if (batch.logits[i] == 0 || i == n_tokens - 1) { - if (i_first != -1) { - int i_last = batch.logits[i] == 0 ? 
i : i + 1; - // extract logits for the range [i_first, i_last) - // group the requests to minimize the number of calls to the backend - ggml_backend_tensor_get_async(backend_res, res, - logits_out.data() + (n_vocab*i_first), - (n_vocab*i_first)*sizeof(float), - (i_last - i_first)*n_vocab*sizeof(float)); - i_first = -1; - } - } -#ifndef NDEBUG - logits_valid[i] = batch.logits[i] != 0; -#endif - } - } else if (lctx.logits_all) { - logits_out.resize(n_vocab*n_tokens); - ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); -#ifndef NDEBUG - std::fill(logits_valid.begin(), logits_valid.end(), true); -#endif - } else { - logits_out.resize(n_vocab); - ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); -#ifndef NDEBUG - logits_valid[0] = true; -#endif - } - ggml_backend_synchronize(backend_res); - } - - // extract embeddings - if (cparams.embeddings && embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd); - GGML_ASSERT(backend_embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - auto & embd_out = lctx.embd; - - if (batch.logits) { - embd_out.resize(n_embd * n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - if (batch.logits[i] == 0) { - continue; - } - - ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float)); - } - } - } break; - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_MEAN: - { - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0); - - // extract sequence embeddings - auto & embd_seq_out = lctx.embd_seq; - embd_seq_out.clear(); - - for (uint32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - 
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ASSERT(false && "unknown pooling type"); - } break; - } - ggml_backend_synchronize(backend_embd); - } - - // measure the performance only for the single-token evals - if (n_tokens == 1) { - lctx.t_eval_us += ggml_time_us() - t_start_us; - lctx.n_eval++; - } - else if (n_tokens > 1) { - lctx.t_p_eval_us += ggml_time_us() - t_start_us; - lctx.n_p_eval += n_tokens; - } - - // get a more accurate load time, upon first eval - // TODO: fix this - if (!lctx.has_evaluated_once) { - lctx.t_load_us = ggml_time_us() - lctx.t_start_us; - lctx.has_evaluated_once = true; - } - return 0; } + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { auto & kv_self = lctx.kv_self; @@ -9242,6 +9250,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { #else // ggml_graph defrag + ggml_backend_sched_reset(lctx.sched); + ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); llama_graph_compute(lctx, gf, lctx.cparams.n_threads); @@ -9253,14 +9263,22 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { } static void llama_kv_cache_update_internal(struct llama_context & lctx) { + bool need_reserve = false; + // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { - llama_set_k_shift(lctx); - { + ggml_backend_sched_reset(lctx.sched); + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + ggml_backend_sched_alloc_graph(lctx.sched, gf); + + llama_set_k_shift(lctx); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + + need_reserve = true; } { @@ -9275,12 +9293,18 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } if 
(lctx.kv_self.recurrent && lctx.kv_self.do_copy) { - llama_set_s_copy(lctx); - { + ggml_backend_sched_reset(lctx.sched); + ggml_cgraph * gf = llama_build_graph_s_copy(lctx); + ggml_backend_sched_alloc_graph(lctx.sched, gf); + + llama_set_s_copy(lctx); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + + need_reserve = true; } { @@ -9298,8 +9322,26 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { if (lctx.kv_self.do_defrag) { llama_kv_cache_defrag_internal(lctx); + need_reserve = true; + lctx.kv_self.do_defrag = false; } + + // reserve a worst case graph again + if (need_reserve) { + // TODO: extract to a function + // build worst-case graph + int n_tokens = (int)std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); + int n_past = lctx.cparams.n_ctx - n_tokens; + llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + ggml_cgraph * gf = llama_build_graph(lctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched); + if (!ggml_backend_sched_reserve(lctx.sched, gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + } } // @@ -12537,7 +12579,8 @@ struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.seed =*/ LLAMA_DEFAULT_SEED, /*.n_ctx =*/ 512, - /*.n_batch =*/ 512, + /*.n_batch =*/ 2048, + /*.n_ubatch =*/ 512, /*.n_seq_max =*/ 1, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, @@ -12691,6 +12734,17 @@ struct llama_context * llama_new_context_with_model( struct llama_context_params params) { if (!model) { + LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__); + return nullptr; + } + + if (params.n_batch == 0 && params.n_ubatch == 0) { + LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be 
zero\n", __func__); + return nullptr; + } + + if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) { + LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__); return nullptr; } @@ -12699,7 +12753,6 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - cparams.n_batch = params.n_batch; // TODO: maybe add n_seq_max here too cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; @@ -12716,6 +12769,11 @@ struct llama_context * llama_new_context_with_model( cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : hparams.n_yarn_orig_ctx != 0 ? 
hparams.n_yarn_orig_ctx : hparams.n_ctx_train; @@ -12751,6 +12809,8 @@ struct llama_context * llama_new_context_with_model( } LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); @@ -12895,54 +12955,31 @@ struct llama_context * llama_new_context_with_model( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - // resized during inference, reserve maximum - ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); - - if (params.embeddings) { - ctx->embd.reserve(hparams.n_embd*cparams.n_batch); - } - - // graph inputs + // graph outputs buffer { - ggml_init_params init_params = { - /* .mem_size */ ggml_tensor_overhead()*(8 + 3*(ctx->kv_self.recurrent)), - /* .mem_buffer */ nullptr, - /* .no_alloc */ true, - }; - ctx->ctx_input = ggml_init(init_params); + // resized during inference, reserve maximum + ctx->logits_size = hparams.n_vocab*cparams.n_batch; + ctx->embd_size = params.embeddings ? 
hparams.n_embd*cparams.n_batch : 0; - ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); - ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, kv_size, cparams.n_batch); - ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); - ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); - ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); - ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - if (ctx->kv_self.recurrent) { - ctx->inp_s_copy = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); - ctx->inp_s_mask = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); - ctx->inp_s_seq = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_I32, kv_size, cparams.n_batch); + const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float); + + ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size); + if (ctx->buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__); + llama_free(ctx); + return nullptr; + } + ggml_backend_buffer_clear(ctx->buf_output, 0); + + + ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output); + if (params.embeddings) { + ctx->embd = ctx->logits + ctx->logits_size; } - ggml_set_name(ctx->inp_tokens, "inp_tokens"); - ggml_set_name(ctx->inp_embd, "inp_embd"); - ggml_set_name(ctx->inp_pos, "inp_pos"); - ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); - ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos"); - ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); - ggml_set_name(ctx->inp_mean, "inp_mean"); - ggml_set_name(ctx->inp_cls, "inp_cls"); - if (ctx->kv_self.recurrent) { - 
ggml_set_name(ctx->inp_s_copy, "inp_s_copy"); - ggml_set_name(ctx->inp_s_mask, "inp_s_mask"); - ggml_set_name(ctx->inp_s_seq, "inp_s_seq"); - } - - ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); - LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(ctx->buf_input), - ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name(ctx->buf_output), + ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0); } // scheduler and compute buffers @@ -12961,10 +12998,21 @@ struct llama_context * llama_new_context_with_model( // buffer used to store the computation graph and the tensor meta data ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false)); - ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES); + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER; +#ifndef GGML_USE_CUBLAS + // pipeline parallelism requires support for async compute and events + // currently this is only implemented in the CUDA backend + pipeline_parallel = false; +#endif + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched)); + } // build worst-case graph - int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); + int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch); int n_past = cparams.n_ctx - 
n_tokens; llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true); @@ -12987,7 +13035,7 @@ struct llama_context * llama_new_context_with_model( // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); + LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits); } } @@ -13024,6 +13072,10 @@ uint32_t llama_n_batch(const struct llama_context * ctx) { return ctx->cparams.n_batch; } +uint32_t llama_n_ubatch(const struct llama_context * ctx) { + return ctx->cparams.n_ubatch; +} + uint32_t llama_n_seq_max(const struct llama_context * ctx) { return ctx->kv_self.size; } @@ -13347,9 +13399,9 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_rng = LLAMA_MAX_RNG_STATE; const size_t s_logits_size = sizeof(size_t); // assume worst case for logits although only currently set ones are serialized - const size_t s_logits = ctx->logits.capacity() * sizeof(float); + const size_t s_logits = ctx->logits_size * sizeof(float); const size_t s_embedding_size = sizeof(size_t); - const size_t s_embedding = ctx->embd.capacity() * sizeof(float); + const size_t s_embedding = ctx->embd_size * sizeof(float); const size_t s_kv_buf_size = sizeof(size_t); const size_t s_kv_head = sizeof(uint32_t); const size_t s_kv_size = sizeof(uint32_t); @@ -13447,23 +13499,23 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat // copy logits { - const size_t logits_size = ctx->logits.size(); + const size_t logits_size = ctx->logits_size; data_ctx->write(&logits_size, sizeof(logits_size)); if (logits_size) { - data_ctx->write(ctx->logits.data(), logits_size * sizeof(float)); + 
data_ctx->write(ctx->logits, logits_size * sizeof(float)); } } // copy embeddings { - const size_t embeddings_size = ctx->embd.size(); + const size_t embeddings_size = ctx->embd_size; data_ctx->write(&embeddings_size, sizeof(embeddings_size)); if (embeddings_size) { - data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float)); + data_ctx->write(ctx->embd, embeddings_size * sizeof(float)); } } @@ -13566,12 +13618,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - GGML_ASSERT(ctx->logits.capacity() >= logits_size); + GGML_ASSERT(ctx->logits_size >= logits_size); if (logits_size) { - ctx->logits.resize(logits_size); - - memcpy(ctx->logits.data(), inp, logits_size * sizeof(float)); + memcpy(ctx->logits, inp, logits_size * sizeof(float)); inp += logits_size * sizeof(float); } } @@ -13582,12 +13632,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size); - GGML_ASSERT(ctx->embd.capacity() == embeddings_size); + GGML_ASSERT(ctx->embd_size == embeddings_size); if (embeddings_size) { - ctx->embd.resize(embeddings_size); - - memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float)); + memcpy(ctx->embd, inp, embeddings_size * sizeof(float)); inp += embeddings_size * sizeof(float); } } @@ -13842,24 +13890,61 @@ int32_t llama_decode( return ret; } +void llama_synchronize(struct llama_context * ctx) { + ggml_backend_sched_synchronize(ctx->sched); + + // FIXME: if multiple single tokens are evaluated without a synchronization, + // the stats will be added to the prompt evaluation stats + // this should only happen when using batch size 1 to evaluate a batch + + // add the evaluation to the stats + if (ctx->n_queued_tokens == 1) { + ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + ctx->n_eval++; + } else if (ctx->n_queued_tokens > 
1) { + ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + ctx->n_p_eval += ctx->n_queued_tokens; + } + + // get a more accurate load time, upon first eval + if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) { + ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->has_evaluated_once = true; + } + + ctx->n_queued_tokens = 0; + ctx->t_compute_start_us = 0; +} + float * llama_get_logits(struct llama_context * ctx) { - return ctx->logits.data(); + llama_synchronize(ctx); + + return ctx->logits; } float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { assert(ctx->logits_valid.at(i)); - return ctx->logits.data() + i*ctx->model.hparams.n_vocab; + + llama_synchronize(ctx); + + return ctx->logits + i*ctx->model.hparams.n_vocab; } float * llama_get_embeddings(struct llama_context * ctx) { - return ctx->embd.data(); + llama_synchronize(ctx); + + return ctx->embd; } float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { - return ctx->embd.data() + i*ctx->model.hparams.n_embd; + llama_synchronize(ctx); + + return ctx->embd + i*ctx->model.hparams.n_embd; } float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { + llama_synchronize(ctx); + auto it = ctx->embd_seq.find(seq_id); if (it == ctx->embd_seq.end()) { return nullptr; diff --git a/llama.h b/llama.h index 446899da6..2d16cc9b9 100644 --- a/llama.h +++ b/llama.h @@ -234,7 +234,8 @@ extern "C" { struct llama_context_params { uint32_t seed; // RNG seed, -1 for random uint32_t n_ctx; // text context, 0 = from model - uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode + uint32_t n_ubatch; // physical maximum batch size uint32_t n_seq_max; // max number of sequences (i.e. 
distinct states for recurrent models) uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing @@ -377,6 +378,7 @@ extern "C" { LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); @@ -650,6 +652,11 @@ extern "C" { // Set abort callback LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); + // Wait until all computations are finished + // This is automatically done when using one of the functions below to obtain the computation results + // and is not necessary to call it explicitly in most cases + LLAMA_API void llama_synchronize(struct llama_context * ctx); + // Token logits obtained from the last call to llama_decode() // The logits for the last token are stored in the last row // Logits for which llama_batch.logits[i] == 0 are undefined From 463628372d5fe3a0c1e5864aa5fc57deb7387039 Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Wed, 13 Mar 2024 14:10:40 -0400 Subject: [PATCH 05/56] grammar : handle missing "root" node (#6004) --- common/sampling.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/sampling.cpp b/common/sampling.cpp index 823031feb..5a5450982 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -17,6 +17,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ return nullptr; } + // Ensure that there is a "root" node. 
+ if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) { + fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__); + delete result; + return nullptr; + } + std::vector grammar_rules(result->parsed_grammar.c_rules()); result->grammar = llama_grammar_init( From 76a936c8939c249a7c3e8e66dfefbab13eae194f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 13 Mar 2024 20:33:56 +0200 Subject: [PATCH 06/56] readme : update API changes and hot topics --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 54bf84bec..80037782f 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,14 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Recent API changes +- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 ### Hot topics +- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 From 19885d205e768579ab090d1e99281cae58c21b54 Mon Sep 17 00:00:00 2001 From: Linwei Wang Date: Thu, 14 Mar 2024 02:34:40 +0800 Subject: [PATCH 07/56] readme : update details about running llama in Termux on Android (#6039) --- README.md | 26 
+++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 80037782f..61bedc3f8 100644 --- a/README.md +++ b/README.md @@ -904,6 +904,9 @@ First, install the essential packages for termux: pkg install clang wget git cmake ``` Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake: + +You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux. + ``` $ mkdir build-android $ cd build-android @@ -912,7 +915,28 @@ $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROI $ make ``` Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card. -Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone: +Finally, copy these built `llama` binaries and the model file to your device storage. 
Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: + +(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) +``` +$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ +$cd /data/data/com.termux/files/home/bin +$chmod +x ./* +``` + +Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` + +``` +$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/ +``` + +Now, you can start chatting: +``` +$cd /data/data/com.termux/files/home/bin +$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml +``` + +Here is a demo of an interactive session running on Pixel 5 phone: https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 From 0fd6c1f015f6cccf3b527f7dbd8386a434728126 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 10:12:29 +0200 Subject: [PATCH 08/56] embedding : print cosine similarity (#899) --- common/common.cpp | 13 +++++++++++++ common/common.h | 1 + examples/embedding/embedding.cpp | 21 ++++++++++++++++----- examples/gritlm/gritlm.cpp | 26 ++++++-------------------- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 73b1b61ba..58fbd05aa 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1877,3 +1877,16 @@ void llama_embd_normalize(const float * inp, float * out, int n) { } } +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){ + double sum = 0.0; + double sum1 = 0.0; + double sum2 = 0.0; + + for (int i = 0; i < n; i++) { + sum += embd1[i] * embd2[i]; 
+ sum1 += embd1[i] * embd1[i]; + sum2 += embd2[i] * embd2[i]; + } + + return sum / (sqrt(sum1) * sqrt(sum2)); +} diff --git a/common/common.h b/common/common.h index 0f178b9eb..d250eef8b 100644 --- a/common/common.h +++ b/common/common.h @@ -268,3 +268,4 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40 void llama_embd_normalize(const float * inp, float * out, int n); +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 49302a199..f390c4061 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -168,14 +168,25 @@ int main(int argc, char ** argv) { batch_decode(ctx, batch, out, s, n_embd); // print first 3 embeddings + fprintf(stdout, "\n"); for (int j = 0; j < std::min(3, n_prompts); j++) { - fprintf(stderr, "embedding %d: ", j); - for (int i = 0; i < n_embd; i++) { - fprintf(stderr, "%f ", emb[j * n_embd + i]); + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < std::min(16, n_embd); i++) { + fprintf(stdout, "%f ", emb[j * n_embd + i]); } - fprintf(stderr, "\n\n"); + fprintf(stdout, "\n"); + } + + // print cosine similarity matrix + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "\n"); } - fprintf(stderr, "\n"); // clean up llama_print_timings(ctx); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 3d4b085d6..52fd719b3 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -6,22 +6,6 @@ // #define GRIT_DEBUG -static float dot_product(const std::vector & v1, const std::vector & v2) { - float dot = 0.0f; - for (uint64_t i = 0; i < v1.size(); ++i) { - dot += v1[i] * v2[i]; - } - return dot; -} - 
-static float norm(const std::vector & v) { - return std::sqrt(dot_product(v, v)); -} - -static float cosine_similarity(const std::vector & v1, const std::vector & v2) { - return dot_product(v1, v2) / (norm(v1) * norm(v2)); -} - static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { std::vector> result; @@ -203,10 +187,12 @@ int main(int argc, char * argv[]) { const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); - const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]); - const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]); - const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]); - const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]); + const int n_embd = llama_n_embd(mdl); + + const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); From 381da2d9f0940d7009e3e918bed36338c8ff2fbb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 11:55:23 +0200 Subject: [PATCH 09/56] metal : build metallib + fix embed path (#6015) * metal : build metallib + fix embed path ggml-ci * metal : fix embed build + update library load logic ggml-ci * metal : fix embeded library build ggml-ci * ci : fix iOS builds to use embedded library --- 
.github/workflows/build.yml | 2 ++ .gitignore | 2 ++ CMakeLists.txt | 70 ++++++++++++++++++++----------------- Makefile | 15 ++++---- ggml-metal.m | 49 +++++++++++++++++--------- ggml-metal.metal | 3 -- 6 files changed, 83 insertions(+), 58 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d39cd6bc3..0da01d5ba 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -333,6 +333,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DLLAMA_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -361,6 +362,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DLLAMA_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ diff --git a/.gitignore b/.gitignore index d28f4d1b8..1ad8d929b 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,8 @@ .vscode/ .idea/ +ggml-metal-embed.metal + lcov-report/ gcovr-report/ diff --git a/CMakeLists.txt b/CMakeLists.txt index a8abf4088..3ac2804a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,9 +200,6 @@ if (LLAMA_METAL) add_compile_definitions(GGML_METAL_NDEBUG) endif() - # get full path to the file - #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") - # copy ggml-common.h and ggml-metal.metal to bin directory configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) @@ -211,53 +208,62 @@ if (LLAMA_METAL) enable_language(ASM) add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h") set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") - set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s") + + # merge ggml-common.h and ggml-metal.metal into a 
single file + set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") + set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") add_custom_command( - OUTPUT ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY} - COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY} - DEPENDS ${METALLIB_SOURCE} + OUTPUT ${METALLIB_EMBED_ASM} + COMMAND echo "Embedding Metal library" + COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED} + COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + DEPENDS ggml-metal.metal ggml-common.h COMMENT "Generate assembly for embedded Metal library" ) - set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY}) - endif() - - if (LLAMA_METAL_SHADER_DEBUG) - # custom command to do the following: - # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib - # - # note: this is the only way I found to disable fast-math in Metal. 
it's ugly, but at least it works - # disabling fast math is needed in order to pass tests/test-backend-ops - # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 - # note: unfortunately, we have to call it default.metallib instead of ggml.metallib - # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) - if (LLAMA_QKK_64) - set(XC_FLAGS ${XC_FLAGS} -DQK_K=64) + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + else() + if (LLAMA_METAL_SHADER_DEBUG) + # custom command to do the following: + # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air + # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib + # + # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works + # disabling fast math is needed in order to pass tests/test-backend-ops + # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 + # note: unfortunately, we have to call it default.metallib instead of ggml.metallib + # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 + set(XC_FLAGS -fno-fast-math -fno-inline -g) + else() + set(XC_FLAGS -O3) endif() add_custom_command( OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - DEPENDS ggml-metal.metal + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal + DEPENDS ggml-metal.metal ggml-common.h COMMENT "Compiling Metal kernels" - ) + ) add_custom_target( ggml-metal ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - ) - 
endif() + ) + endif() # LLAMA_METAL_EMBED_LIBRARY set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${FOUNDATION_LIBRARY} diff --git a/Makefile b/Makefile index db9968efb..cb597b209 100644 --- a/Makefile +++ b/Makefile @@ -557,15 +557,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ ifdef LLAMA_METAL_EMBED_LIBRARY -ggml-metal-embed.o: ggml-metal.metal +ggml-metal-embed.o: ggml-metal.metal ggml-common.h @echo "Embedding Metal library" + @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) @$(AS) $(TEMP_ASSEMBLY) -o $@ @rm -f ${TEMP_ASSEMBLY} endif diff --git a/ggml-metal.m b/ggml-metal.m index 3a5476c52..c3451a79b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { id metal_library; // load library + // + // - first check if the library is embedded + // - then check if the library is in the bundle + // - if not found, load the source and compile it + // - if that fails, return NULL { NSBundle * bundle = nil; #ifdef SWIFT_PACKAGE @@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { #else bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; #endif + NSError * error = nil; - 
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; - if (libPath != nil) { + +#if GGML_METAL_EMBED_LIBRARY + const bool try_metallib = false; +#else + const bool try_metallib = true; +#endif + + NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (try_metallib && path_lib != nil) { // pre-compiled library found - NSURL * libURL = [NSURL fileURLWithPath:libPath]; - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); + NSURL * libURL = [NSURL fileURLWithPath:path_lib]; + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); + metal_library = [ctx->device newLibraryWithURL:libURL error:&error]; if (error) { GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); @@ -305,31 +319,34 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { extern const char ggml_metallib_start[]; extern const char ggml_metallib_end[]; - NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; + NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; #else GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); - NSString * sourcePath; - NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; + NSString * path_source; + NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; - GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil"); + GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? 
[path_resource UTF8String] : "nil"); - if (ggmlMetalPathResources) { - sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; + if (path_resource) { + path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"]; } else { - sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; } - if (sourcePath == nil) { + + if (path_source == nil) { GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__); - sourcePath = @"ggml-metal.metal"; + path_source = @"ggml-metal.metal"; } - GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); - NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; + + GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); + + NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; if (error) { GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } -#endif +#endif // GGML_METAL_EMBED_LIBRARY @autoreleasepool { // dictionary of preprocessor macros diff --git a/ggml-metal.metal b/ggml-metal.metal index ebf2f5b47..63de56325 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -4,9 +4,6 @@ #include -#define GGML_COMMON_IMPL_METAL -#include "ggml-common.h" - using namespace metal; #define MAX(x, y) ((x) > (y) ? 
(x) : (y)) From 68265ebfc6a1bed022973ea0c3145be1450b7e70 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 12:37:20 +0200 Subject: [PATCH 10/56] embedding : print all resulting embeddings (#899) --- examples/embedding/embedding.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index f390c4061..895469a31 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -167,9 +167,9 @@ int main(int argc, char ** argv) { float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd); - // print first 3 embeddings + // print the first part of the embeddings fprintf(stdout, "\n"); - for (int j = 0; j < std::min(3, n_prompts); j++) { + for (int j = 0; j < n_prompts; j++) { fprintf(stdout, "embedding %d: ", j); for (int i = 0; i < std::min(16, n_embd); i++) { fprintf(stdout, "%f ", emb[j * n_embd + i]); From 3fe8d7a17f84bd721cd4d8db35365da44b69f68b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 12:38:37 +0200 Subject: [PATCH 11/56] ggml : designate enum vals for integer types (#6050) --- ggml.h | 64 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/ggml.h b/ggml.h index 1171088a9..ab26c8f59 100644 --- a/ggml.h +++ b/ggml.h @@ -337,24 +337,24 @@ extern "C" { struct ggml_object; struct ggml_context; + // NOTE: always add types at the end of the enum to keep backward compatibility enum ggml_type { - GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, // GGML_TYPE_Q4_2 = 4, support has been removed - // GGML_TYPE_Q4_3 (5) support has been removed - GGML_TYPE_Q5_0 = 6, - GGML_TYPE_Q5_1 = 7, - GGML_TYPE_Q8_0 = 8, - GGML_TYPE_Q8_1 = 9, - // k-quantizations - GGML_TYPE_Q2_K = 10, - GGML_TYPE_Q3_K = 11, - GGML_TYPE_Q4_K = 12, - 
GGML_TYPE_Q5_K = 13, - GGML_TYPE_Q6_K = 14, - GGML_TYPE_Q8_K = 15, + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, GGML_TYPE_IQ2_XXS = 16, GGML_TYPE_IQ2_XS = 17, GGML_TYPE_IQ3_XXS = 18, @@ -363,9 +363,9 @@ extern "C" { GGML_TYPE_IQ3_S = 21, GGML_TYPE_IQ2_S = 22, GGML_TYPE_IQ4_XS = 23, - GGML_TYPE_I8, - GGML_TYPE_I16, - GGML_TYPE_I32, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, GGML_TYPE_COUNT, }; @@ -383,20 +383,20 @@ extern "C" { // model file types enum ggml_ftype { - GGML_FTYPE_UNKNOWN = -1, - GGML_FTYPE_ALL_F32 = 0, - GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors - GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors - GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + 
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors From 3ca23481dd309bd51cc31c73a4cc34f922cc372f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= Date: Thu, 14 Mar 2024 04:40:14 -0600 Subject: [PATCH 12/56] gguf-py : add support for I8, I16 and I32 (#6045) * Refactor dtype handling to be extensible This code is equivalent as before, but now it is prepared to easily add more NumPy dtypes. * Add support for I8, I16 and I32 These types are allowed in the GGUF specification. * Add support for I8, I16 and I32 to gguf_writer * Add support for I8, I16, I32 to gguf_reader --- gguf-py/gguf/constants.py | 6 ++++++ gguf-py/gguf/gguf_reader.py | 9 +++++++++ gguf-py/gguf/gguf_writer.py | 16 ++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b23badb10..99f71f0a1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -661,6 +661,9 @@ class GGMLQuantizationType(IntEnum): IQ3_S = 21 IQ2_S = 22 IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 class GGUFEndian(IntEnum): @@ -727,6 +730,9 @@ GGML_QUANT_SIZES = { GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), } diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 5b6d4ba6b..1c10f5753 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -248,6 +248,15 @@ class GGUFReader: elif ggml_type == GGMLQuantizationType.F16: item_count = n_elems item_type = np.float16 + elif ggml_type == GGMLQuantizationType.I8: + item_count = n_elems + item_type = 
np.int8 + elif ggml_type == GGMLQuantizationType.I16: + item_count = n_elems + item_type = np.int16 + elif ggml_type == GGMLQuantizationType.I32: + item_count = n_elems + item_type = np.int32 else: item_count = n_bytes item_type = np.uint8 diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index e49c5db68..9c1eeac31 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -196,9 +196,6 @@ class GGUFWriter: if self.state is not WriterState.EMPTY: raise ValueError(f'Expected output file to be empty, got {self.state}') - if raw_dtype is None and tensor_dtype not in (np.float32, np.float16): - raise ValueError("Only F32 and F16 tensors are supported for now") - encoded_name = name.encode("utf8") self.ti_data += self._pack("Q", len(encoded_name)) self.ti_data += encoded_name @@ -207,7 +204,18 @@ class GGUFWriter: for i in range(n_dims): self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: - dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 + if tensor_shape == np.float32: + dtype = GGMLQuantizationType.F32 + elif tensor_dtype == np.float16: + dtype = GGMLQuantizationType.F16 + elif tensor_dtype == np.int8: + dtype = GGMLQuantizationType.I8 + elif tensor_dtype == np.int16: + dtype = GGMLQuantizationType.I16 + elif tensor_dtype == np.int32: + dtype = GGMLQuantizationType.I32 + else: + raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now") else: dtype = raw_dtype self.ti_data += self._pack("I", dtype) From 2c4fb69246834503db7b78bcbedcef506bbc60c4 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Thu, 14 Mar 2024 11:56:48 +0100 Subject: [PATCH 13/56] llama : optimize defrag moves + fix fragmentation calculation (#6037) * attempt to reduce the impact of a worst-case scenario * fragmentation calculation fix * Update llama.cpp --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 30 +++++++++++++++++++----------- 1 file changed, 19 
insertions(+), 11 deletions(-) diff --git a/llama.cpp b/llama.cpp index 38e7036a7..ff467c575 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9036,8 +9036,8 @@ static int llama_decode_internal( //llama_synchronize(&lctx); // decide if we need to defrag the kv cache - if (cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f; + if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { + const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { @@ -9069,6 +9069,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // number of cells moved uint32_t n_moves = 0; + // each move requires 6*n_layer tensors (see build_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + // determine which KV cells to move where // // cell i moves to ids[i] @@ -9095,15 +9100,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { nh++; } - // each move requires 6*n_layer tensors (see build_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - // - if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) { - // the graph is too big, we cannot move more cells - break; - } - uint32_t nf = 0; uint32_t is = n_kv - 1; @@ -9133,11 +9129,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // are we moving a continuous block of memory? bool cont = false; + // should we stop searching for the next move? 
+ bool stop = false; + // go back and move the nf cells to the hole for (; i1 < n_kv; ++i1) { auto & cell1 = kv_self.cells[i1]; if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + cont = false; continue; } @@ -9164,6 +9168,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { } } + if (stop || n_moves == max_moves) { + break; + } + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); i0 += nh - 1; From a44bc969e4cd62ca9f4332e17fe3c51f2093e7c6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 13:13:06 +0200 Subject: [PATCH 14/56] llama : fix typo --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index ff467c575..eb48b1e90 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3932,7 +3932,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); - LLAMA_LOG_INFO("%s: causal attm = %d\n", __func__, hparams.causal_attn); + LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn); LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); From 43241adf22e8231ffaf3827d2c9310cc0ffd5ac5 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 14 Mar 2024 12:15:39 +0100 Subject: [PATCH 15/56] server: disable debug release type sanitizer, simplify trigger (#6047) - increase time out for server - do not fail fast --- .github/workflows/server.yml | 17 ++++++++--------- examples/server/tests/features/steps/steps.py | 9 ++++++++- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git 
a/.github/workflows/server.yml b/.github/workflows/server.yml index e385e03f3..5e38b3547 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -25,17 +25,14 @@ jobs: strategy: matrix: sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] + build_type: [Debug] include: - build_type: Release sanitizer: "" - exclude: - - build_type: Release - sanitizer: ADDRESS - - build_type: Release + - build_type: Debug sanitizer: THREAD - - build_type: Release - sanitizer: UNDEFINED + disabled_on_pr: true + fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken container: image: ubuntu:latest @@ -81,13 +78,14 @@ jobs: - name: Tests id: server_integration_tests + if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests PORT=8888 ./tests.sh - name: Slow tests id: server_integration_tests_slow - if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }} + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow @@ -124,13 +122,14 @@ jobs: - name: Tests id: server_integration_tests + if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests id: server_integration_tests_slow - if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }} + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests behave.exe --stop --no-skipped --no-capture --tags slow diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index cfa9f96ec..a59a52d21 100644 --- 
a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -119,6 +119,10 @@ def step_server_metrics(context): def step_start_server(context): start_server_background(context) attempts = 0 + max_attempts = 20 + if 'GITHUB_ACTIONS' in os.environ: + max_attempts *= 2 + while True: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: result = sock.connect_ex((context.server_fqdn, context.server_port)) @@ -126,7 +130,7 @@ def step_start_server(context): print("\x1b[33;46mserver started!\x1b[0m") return attempts += 1 - if attempts > 20: + if attempts > max_attempts: assert False, "server not started" print(f"waiting for server to start, connect error code = {result}...") time.sleep(0.1) @@ -943,6 +947,9 @@ async def wait_for_health_status(context, print(f"Starting checking for health for expected_health_status={expected_health_status}\n") interval = 0.5 counter = 0 + if 'GITHUB_ACTIONS' in os.environ: + timeout *= 2 + async with aiohttp.ClientSession() as session: while True: async with await session.get(f'{base_url}/health', params=params) as health_response: From 15a333260ab637a040ed0864c206a2ceaf806bb8 Mon Sep 17 00:00:00 2001 From: Jian Liao Date: Thu, 14 Mar 2024 04:18:23 -0700 Subject: [PATCH 16/56] readme : improve readme for Llava-1.6 example (#6044) Co-authored-by: Jian Liao --- examples/llava/README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 35e6d9e5d..67cb0f22b 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director ```console git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b ``` -2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: + +2) Install the required Python packages: + +```sh +pip install -r 
examples/llava/requirements.txt +``` + +3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: ```console python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ ``` - you will find a llava.projector and a llava.clip file in your model directory -3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: + +4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: ```console mkdir vit cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin @@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json ``` -4) Create the visual gguf model: +5) Create the visual gguf model: ```console python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision ``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP -5) Then convert the model to gguf format: +6) Then convert the model to gguf format: ```console python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` -6) And finally we can run the llava-cli using the 1.6 model version: +7) And finally we can run the llava-cli using the 1.6 model version: ```console ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 ``` From 77178eedc83d49f31bf757d8e12315d76460be78 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 13:32:14 +0200 Subject: [PATCH 17/56] gguf-py : fix dtype check (#6045) --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py 
b/gguf-py/gguf/gguf_writer.py index 9c1eeac31..4d389be95 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -204,7 +204,7 @@ class GGUFWriter: for i in range(n_dims): self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: - if tensor_shape == np.float32: + if tensor_dtype == np.float32: dtype = GGMLQuantizationType.F32 elif tensor_dtype == np.float16: dtype = GGMLQuantizationType.F16 From 044ec4b2a567f649459ccd20af2f387c784faa51 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 15:14:14 +0200 Subject: [PATCH 18/56] embedding : add EOS token if not present (#899) --- examples/embedding/embedding.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 895469a31..cbf9aa2b5 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -112,13 +112,20 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim std::vector> inputs; for (const auto & prompt : prompts) { - auto inp = ::llama_tokenize(ctx, prompt, true); + auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { inp.resize(n_batch); } inputs.push_back(inp); } + // add eos if not present + for (auto & inp : inputs) { + if (inp.empty() || inp.back() != llama_token_eos(model)) { + inp.push_back(llama_token_eos(model)); + } + } + // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) inputs.size(); i++) { @@ -172,7 +179,7 @@ int main(int argc, char ** argv) { for (int j = 0; j < n_prompts; j++) { fprintf(stdout, "embedding %d: ", j); for (int i = 0; i < std::min(16, n_embd); i++) { - fprintf(stdout, "%f ", emb[j * n_embd + i]); + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); } fprintf(stdout, "\n"); } From 69ff61397d2b7b550dcdda4a35b35128892408b0 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Thu, 14 Mar 2024 17:21:56 +0100 Subject: [PATCH 
19/56] llama : support models without vocabulary (#5798) * additional methods to read model and ctx parameters * vocab size as a part of a model metadata * models without vocabulary, convert.py part * models without vocabulary, llama.cpp part * PR clean up * converter scrypt fixes * llama_vocab_type update (renamed the new key) * pr review fixes * revert function renaming * one more NoVocab assert --- convert.py | 126 +++++++++++++++++++++--------------- gguf-py/gguf/constants.py | 2 + gguf-py/gguf/gguf_writer.py | 3 + llama.cpp | 92 +++++++++++++++++--------- llama.h | 7 +- 5 files changed, 142 insertions(+), 88 deletions(-) diff --git a/convert.py b/convert.py index c15f8c47e..161430f3e 100755 --- a/convert.py +++ b/convert.py @@ -332,6 +332,9 @@ class Params: # class BpeVocab: + tokenizer_model = "gpt2" + name = "bpe" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) if isinstance(self.bpe_tokenizer.get('model'), dict): @@ -390,6 +393,9 @@ class BpeVocab: class SentencePieceVocab: + tokenizer_model = "llama" + name = "spm" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: dict[str, int] @@ -453,6 +459,9 @@ class SentencePieceVocab: class HfVocab: + tokenizer_model = "llama" + name = "hfft" + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: try: from transformers import AutoTokenizer @@ -553,7 +562,15 @@ class HfVocab: return f"" -Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab" +class NoVocab: + tokenizer_model = "no_vocab" + name = "no_vocab" + + def __repr__(self) -> str: + return "" + + +Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab" # @@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N # 
Handle special case where the model's vocab size is not set if params.n_vocab == -1: raise ValueError( - f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?" + f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}" ) + if isinstance(vocab, NoVocab): + return # model has no vocab # Check for a vocab size mismatch if params.n_vocab == vocab.vocab_size: @@ -977,6 +996,7 @@ class OutputFile: name = str(params.path_model.parent).split('/')[-1] self.gguf.add_name (name) + self.gguf.add_vocab_size (params.n_vocab) self.gguf.add_context_length (params.n_ctx) self.gguf.add_embedding_length (params.n_embd) self.gguf.add_block_count (params.n_layer) @@ -1013,21 +1033,9 @@ class OutputFile: if params.ftype is not None: self.gguf.add_file_type(params.ftype) - def handle_tokenizer_model(self, vocab: Vocab) -> str: - # Map the vocab types to the supported tokenizer models - tokenizer_model = { - SentencePieceVocab: "llama", - HfVocab: "llama", - BpeVocab: "gpt2", - }.get(type(vocab)) - - # Block if vocab type is not predefined - if tokenizer_model is None: - raise ValueError("Unknown vocab type: Not supported") - - return tokenizer_model - def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: + assert not isinstance(vocab, NoVocab) + tokens = [] scores = [] toktypes = [] @@ -1043,11 +1051,8 @@ class OutputFile: return tokens, scores, toktypes def add_meta_vocab(self, vocab: Vocab) -> None: - # Handle the tokenizer model - tokenizer_model = self.handle_tokenizer_model(vocab) - # Ensure that tokenizer_model is added to the GGUF model - self.gguf.add_tokenizer_model(tokenizer_model) + self.gguf.add_tokenizer_model(vocab.tokenizer_model) # Extract model vocabulary for model conversion tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab) @@ -1074,6 +1079,26 @@ 
class OutputFile: def write_tensor_info(self) -> None: self.gguf.write_ti_data_to_file() + def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None: + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency) + if ftype == GGMLFileType.MostlyQ8_0: + ndarrays = bounded_parallel_map( + OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, + use_processpool_executor=True, + ) + else: + ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) + + start = time.time() + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + elapsed = time.time() - start + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) + padi = len(str(len(model))) + print( + f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" + ) + self.gguf.write_tensor_data(ndarray) + def close(self) -> None: self.gguf.close() @@ -1082,7 +1107,7 @@ class OutputFile: fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: - check_vocab_size(params, vocab, pad_vocab = pad_vocab) + check_vocab_size(params, vocab, pad_vocab=pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1120,8 +1145,11 @@ class OutputFile: # meta data of.add_meta_arch(params) - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) + if isinstance(vocab, NoVocab): + of.gguf.add_tokenizer_model(vocab.tokenizer_model) + else: + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) # tensor info for name, lazy_tensor in model.items(): @@ -1131,24 +1159,7 @@ class OutputFile: of.write_tensor_info() # tensor data - ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) - if ftype == GGMLFileType.MostlyQ8_0: - ndarrays = 
bounded_parallel_map( - OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, - use_processpool_executor=True, - ) - else: - ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) - - start = time.time() - for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): - elapsed = time.time() - start - size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) - padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" - ) - of.gguf.write_tensor_data(ndarray) + of.write_tensor_data(ftype, model, concurrency) of.close() @@ -1309,8 +1320,8 @@ class VocabFactory: return vtype, path raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}") - def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: - load_merges = vocabtype == "bpe" + def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab: + load_merges = vocab.name == "bpe" n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None return gguf.SpecialVocab( model_parent_path, @@ -1319,30 +1330,34 @@ class VocabFactory: n_vocab=n_vocab, ) - def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab: vocab_type, path = self._select_file(vocab_types) print(f"Loading vocab file {path!r}, type {vocab_type!r}") added_tokens_path = path.parent / "added_tokens.json" - vocab: Vocab if vocab_type == "bpe": - vocab = BpeVocab( + return BpeVocab( path, added_tokens_path if added_tokens_path.exists() else None ) - elif vocab_type == "spm": - vocab = SentencePieceVocab( + if vocab_type == "spm": + return SentencePieceVocab( path, added_tokens_path if added_tokens_path.exists() else None ) - elif vocab_type == 
"hfft": - vocab = HfVocab( + if vocab_type == "hfft": + return HfVocab( path.parent, added_tokens_path if added_tokens_path.exists() else None ) + raise ValueError(vocab_type) + + def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + vocab: Vocab + if len(vocab_types) == 1 and "no_vocab" in vocab_types: + vocab = NoVocab() else: - raise ValueError(vocab_type) + vocab = self._create_vocab_by_path(vocab_types) # FIXME: Respect --vocab-dir? special_vocab = self._create_special_vocab( vocab, - vocab_type, model_parent_path, ) return vocab, special_vocab @@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab") parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft") @@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") args = parser.parse_args(args_in) + if args.no_vocab: + if args.vocab_only: + raise ValueError("no need to specify --vocab-only if using --no-vocab") + args.vocab_type = "no_vocab" if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) 
-> None: print(f"Wrote {outfile}") return - if model_plus.vocab is not None and args.vocab_dir is None: + if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: vocab = model_plus.vocab print(f"Vocab info: {vocab}") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 99f71f0a1..2d7cf16c1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -32,6 +32,7 @@ class Keys: FILE_TYPE = "general.file_type" class LLM: + VOCAB_SIZE = "{arch}.vocab_size" CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" BLOCK_COUNT = "{arch}.block_count" @@ -752,6 +753,7 @@ KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE # LLM +KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4d389be95..81b2eb884 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -321,6 +321,9 @@ class GGUFWriter: self.data_alignment = alignment self.add_uint32(Keys.General.ALIGNMENT, alignment) + def add_vocab_size(self, size: int) -> None: + self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size) + def add_context_length(self, length: int) -> None: self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length) diff --git a/llama.cpp b/llama.cpp index eb48b1e90..10fd53469 100644 --- a/llama.cpp +++ b/llama.cpp @@ -258,6 +258,7 @@ enum llm_kv { LLM_KV_GENERAL_SOURCE_URL, LLM_KV_GENERAL_SOURCE_HF_REPO, + LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, LLM_KV_BLOCK_COUNT, @@ -321,6 +322,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" }, { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" }, + { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, { 
LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, @@ -3242,10 +3244,11 @@ static const char * llama_model_type_name(e_model type) { static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_NONE: return "no vocab"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; } } @@ -3277,14 +3280,14 @@ static void llm_load_hparams( ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); - ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); - ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); - ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); - ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); + ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); + ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); + ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert); @@ -3645,30 +3648,25 @@ static void llm_load_vocab( const auto kv = LLM_KV(model.arch); - 
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); - if (token_idx == -1) { - throw std::runtime_error("cannot find tokenizer vocab in model file\n"); - } - - const float * scores = nullptr; - const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); - if (score_idx != -1) { - scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - } - - const int * toktypes = nullptr; - const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); - if (toktype_idx != -1) { - toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); - } - // determine vocab type { std::string tokenizer_name; ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name); - if (tokenizer_name == "llama") { + if (tokenizer_name == "no_vocab") { + vocab.type = LLAMA_VOCAB_TYPE_NONE; + + // default special tokens + vocab.special_bos_id = -1; + vocab.special_eos_id = -1; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.linefeed_id = -1; + + return; + } else if (tokenizer_name == "llama") { vocab.type = LLAMA_VOCAB_TYPE_SPM; // default special tokens @@ -3734,6 +3732,23 @@ static void llm_load_vocab( } } + const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); + if (token_idx == -1) { + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + } + + const float * scores = nullptr; + const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); + if (score_idx != -1) { + scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + } + + const int * toktypes = nullptr; + const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); + if (toktype_idx != -1) { + toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + } + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); vocab.id_to_token.resize(n_vocab); @@ -5023,7 +5038,8 @@ static int llama_model_load(const std::string & fname, llama_model & 
model, llam llm_load_print_meta(ml, model); - if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { + if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE && + model.hparams.n_vocab != model.vocab.id_to_token.size()) { throw std::runtime_error("vocab size mismatch"); } @@ -9361,26 +9377,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) { } static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL; } static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN; } static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; } static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; } static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) { + GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; } static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { + GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(llama_is_byte_token(vocab, id)); const auto& token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { @@ -9401,6 +9423,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { } static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { + GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); static const char * hex = "0123456789ABCDEF"; switch (llama_vocab_get_type(vocab)) { case 
LLAMA_VOCAB_TYPE_SPM: { @@ -10232,6 +10255,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & } } } break; + case LLAMA_VOCAB_TYPE_NONE: + GGML_ASSERT(false); } return output; @@ -13138,7 +13163,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { } int32_t llama_n_vocab(const struct llama_model * model) { - return model->vocab.id_to_token.size(); + return model->hparams.n_vocab; } int32_t llama_n_ctx_train(const struct llama_model * model) { @@ -13962,14 +13987,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id } const char * llama_token_get_text(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].text.c_str(); } float llama_token_get_score(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].score; } llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { + GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); return model->vocab.id_to_token[token].type; } diff --git a/llama.h b/llama.h index 2d16cc9b9..90aa5372e 100644 --- a/llama.h +++ b/llama.h @@ -59,9 +59,10 @@ extern "C" { typedef int32_t llama_seq_id; enum llama_vocab_type { - LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece - LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding - LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece + LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab + LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece + LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding + LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece }; // note: these values should be synchronized with ggml_rope From 727107707a73b3dc8a497cf9fc9405722c16dd2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= Date: Thu, 14 Mar 2024 11:57:31 -0600 Subject: [PATCH 20/56] gguf-py : bump version to 0.8.0 (#6060) --- 
gguf-py/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 9789c2c87..96396e04e 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.7.0" +version = "0.8.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ From 6e0438da3cc95b89cdbf55f45fa4e324d9076792 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 14 Mar 2024 14:29:32 -0400 Subject: [PATCH 21/56] gguf : fix resource leaks (#6061) There several places where a gguf context is allocated. A call to gguf_free is missing in some error paths. Also on linux, llama-bench was missing a fclose. --- examples/gguf/gguf.cpp | 1 + examples/llama-bench/llama-bench.cpp | 1 + examples/llava/clip.cpp | 4 ++++ examples/train-text-from-scratch/train-text-from-scratch.cpp | 1 + 4 files changed, 7 insertions(+) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index e67be4fb2..5444503a5 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) { for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]); + gguf_free(ctx); return false; } } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index bf94e7e7a..d6e5e0497 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -103,6 +103,7 @@ static std::string get_cpu_info() { } } } + fclose(f); } #endif // TODO: other platforms diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 6653b815d..2035554ea 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!new_clip->ctx_data) { fprintf(stderr, "%s: 
ggml_init() failed\n", __func__); clip_free(new_clip); + gguf_free(ctx); return nullptr; } @@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!fin) { printf("cannot open model file for loading tensors\n"); clip_free(new_clip); + gguf_free(ctx); return nullptr; } @@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (!fin) { printf("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); + gguf_free(ctx); return nullptr; } int num_bytes = ggml_nbytes(cur); @@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i break; default: printf("Please use an input file in f32 or f16\n"); + gguf_free(ctx_out); return false; } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7eafe8515..7d06e401b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model * load_checkpoint_gguf(fctx, f_ggml_ctx, model, train); + gguf_free(fctx); return true; } From 4755afd1cbd40d93c017e5b98c39796f52345314 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 Mar 2024 22:58:41 +0200 Subject: [PATCH 22/56] llama : fix integer overflow during quantization (#6063) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 10fd53469..2c3841974 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11977,7 +11977,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n return new_type; } -static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector & workers, 
const int nthread) { +static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector & workers, const int nthread) { std::mutex mutex; int counter = 0; size_t new_size = 0; From b0bc9f4a9da7c19f4779106ea83b23feca747566 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 15 Mar 2024 09:22:24 +0100 Subject: [PATCH 23/56] llama-bench : use random tokens to improve accuracy with mixtral (#6069) --- examples/llama-bench/llama-bench.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d6e5e0497..32eea7869 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1123,15 +1124,19 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - //std::vector tokens(n_prompt, llama_token_bos(llama_get_model(ctx))); - //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0)); - //GGML_UNUSED(n_batch); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + std::vector tokens(n_batch); - std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); int n_processed = 0; while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? 
llama_token_bos(model) : std::rand() % n_vocab; + for (int i = 1; i < n_tokens; i++) { + tokens[i] = std::rand() % n_vocab; + } llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); n_processed += n_tokens; } @@ -1142,11 +1147,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - llama_token token = llama_token_bos(llama_get_model(ctx)); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); llama_synchronize(ctx); + token = std::rand() % n_vocab; } } From aab606a11fc0a9740a7f297521c3eef851dfb351 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 15 Mar 2024 09:44:57 +0100 Subject: [PATCH 24/56] llama : add Orion chat template (#6066) --- llama.cpp | 20 ++++++++++++++++++++ tests/test-chat-template.cpp | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/llama.cpp b/llama.cpp index 2c3841974..b8a8d2723 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14242,6 +14242,26 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "model\n"; } + } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) { + // OrionStarAI/Orion-14B-Chat + std::string system_prompt = ""; + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // there is no system message support, we will merge it with user prompt + system_prompt = message->content; + continue; + } else if (role == "user") { + ss << "Human: "; + if (!system_prompt.empty()) { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << message->content << "\n\nAssistant: "; + } else { 
+ ss << message->content << ""; + } + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index fa2eb577b..6e9e4bd1e 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -31,6 +31,8 @@ int main(void) { "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}", // google/gemma-7b-it "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\\n' + message['content'] | trim + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\\n'}}{% endif %}", + // OrionStarAI/Orion-14B-Chat + "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}", }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -45,6 +47,8 @@ int main(void) { "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother question\nassistant\n", // google/gemma-7b-it "user\nYou are a helpful assistant\n\nHello\nmodel\nHi there\nuser\nWho are you\nmodel\nI am an assistant\nuser\nAnother question\nmodel\n", + // OrionStarAI/Orion-14B-Chat + "Human: You are a helpful assistant\n\nHello\n\nAssistant: Hi thereHuman: Who are you\n\nAssistant: I am an assistant Human: Another question\n\nAssistant: 
", }; std::vector formatted_chat(1024); int32_t res; From 7ce2c77f88e1ca66ec48417e56f91746bac018c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= Date: Fri, 15 Mar 2024 02:46:51 -0600 Subject: [PATCH 25/56] gguf : add support for I64 and F64 arrays (#6062) * gguf : add support for I64 and F64 arrays GGML currently does not support I64 or F64 arrays and they are not often used in machine learning, however if in the future the need arises, it would be nice to add them now, so that the types are next to the other types I8, I16, I32 in the enums, and it also reserves their type number. Furthermore, with this addition the GGUF format becomes very usable for most computational applications of NumPy (being compatible with the most common NumPy dtypes: i8, i16, i32, i64, f32, f64), providing a faster, and more versatile alternative to the `npz` format, and a simpler alternative to the `hdf5` format. The change in this PR seems small, not significantly increasing the maintenance burden. I tested this from Python using GGUFWriter/Reader and `gguf-dump`, as well as from C, everything seems to work. 
* Fix compiler warnings --- ggml.c | 17 +++++++++++++++++ ggml.h | 2 ++ gguf-py/gguf/constants.py | 4 ++++ gguf-py/gguf/gguf_reader.py | 14 ++++++++++---- gguf-py/gguf/gguf_writer.py | 12 ++++++++---- 5 files changed, 41 insertions(+), 8 deletions(-) diff --git a/ggml.c b/ggml.c index fbc66f65b..c94006e51 100644 --- a/ggml.c +++ b/ggml.c @@ -470,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(int32_t), .is_quantized = false, }, + [GGML_TYPE_I64] = { + .type_name = "i64", + .blck_size = 1, + .type_size = sizeof(int64_t), + .is_quantized = false, + }, + [GGML_TYPE_F64] = { + .type_name = "f64", + .blck_size = 1, + .type_size = sizeof(double), + .is_quantized = false, + .nrows = 1, + }, [GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, @@ -12418,6 +12431,8 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: case GGML_TYPE_COUNT: { GGML_ASSERT(false); @@ -12504,6 +12519,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: case GGML_TYPE_COUNT: { GGML_ASSERT(false); diff --git a/ggml.h b/ggml.h index ab26c8f59..c937d4a53 100644 --- a/ggml.h +++ b/ggml.h @@ -366,6 +366,8 @@ extern "C" { GGML_TYPE_I8 = 24, GGML_TYPE_I16 = 25, GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, GGML_TYPE_COUNT, }; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2d7cf16c1..458a641dc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -665,6 +665,8 @@ class GGMLQuantizationType(IntEnum): I8 = 24 I16 = 25 I32 = 26 + I64 = 27 + F64 = 28 class GGUFEndian(IntEnum): @@ -734,6 +736,8 @@ GGML_QUANT_SIZES = { GGMLQuantizationType.I8: (1, 1), GGMLQuantizationType.I16: (1, 2), GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), } diff --git 
a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 1c10f5753..33afac552 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -242,12 +242,15 @@ class GGUFReader: n_bytes = n_elems * type_size // block_size data_offs = int(start_offs + offset_tensor[0]) item_type: npt.DTypeLike - if ggml_type == GGMLQuantizationType.F32: - item_count = n_elems - item_type = np.float32 - elif ggml_type == GGMLQuantizationType.F16: + if ggml_type == GGMLQuantizationType.F16: item_count = n_elems item_type = np.float16 + elif ggml_type == GGMLQuantizationType.F32: + item_count = n_elems + item_type = np.float32 + elif ggml_type == GGMLQuantizationType.F64: + item_count = n_elems + item_type = np.float64 elif ggml_type == GGMLQuantizationType.I8: item_count = n_elems item_type = np.int8 @@ -257,6 +260,9 @@ class GGUFReader: elif ggml_type == GGMLQuantizationType.I32: item_count = n_elems item_type = np.int32 + elif ggml_type == GGMLQuantizationType.I64: + item_count = n_elems + item_type = np.int64 else: item_count = n_bytes item_type = np.uint8 diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 81b2eb884..1967b633c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -204,18 +204,22 @@ class GGUFWriter: for i in range(n_dims): self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: - if tensor_dtype == np.float32: - dtype = GGMLQuantizationType.F32 - elif tensor_dtype == np.float16: + if tensor_dtype == np.float16: dtype = GGMLQuantizationType.F16 + elif tensor_dtype == np.float32: + dtype = GGMLQuantizationType.F32 + elif tensor_dtype == np.float64: + dtype = GGMLQuantizationType.F64 elif tensor_dtype == np.int8: dtype = GGMLQuantizationType.I8 elif tensor_dtype == np.int16: dtype = GGMLQuantizationType.I16 elif tensor_dtype == np.int32: dtype = GGMLQuantizationType.I32 + elif tensor_dtype == np.int64: + dtype = GGMLQuantizationType.I64 else: - raise ValueError("Only 
F32, F16, I8, I16, I32 tensors are supported for now") + raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now") else: dtype = raw_dtype self.ti_data += self._pack("I", dtype) From 753e36f650fa2a5869f89188d9ee745dc74cf14b Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Fri, 15 Mar 2024 09:26:20 +0000 Subject: [PATCH 26/56] [SYCL] Fix non-intel device selection (#6042) * Fix non-intel device selection * Update ggml-sycl.cpp Co-authored-by: Neo Zhang Jianyu * Update ggml-sycl.cpp Co-authored-by: Neo Zhang Jianyu --------- Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Co-authored-by: Neo Zhang Jianyu --- ggml-sycl.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 9f6506383..a1ca6aba5 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3451,7 +3451,7 @@ class sycl_gpu_mgr { dpct::device_info prop; dpct::get_device_info(prop, device); if (max_compute_units == prop.get_max_compute_units() && - prop.get_major_version() == 1) { + is_ext_oneapi_device(device)) { gpus.push_back(id); devices.push_back(device); work_group_size = prop.get_max_work_group_size(); @@ -3484,6 +3484,15 @@ class sycl_gpu_mgr { assert(false); return -1; } + + bool is_ext_oneapi_device(const sycl::device &dev) { + sycl::backend dev_backend = dev.get_backend(); + if (dev_backend == sycl::backend::ext_oneapi_level_zero || + dev_backend == sycl::backend::ext_oneapi_cuda || + dev_backend == sycl::backend::ext_oneapi_hip) + return true; + return false; + } }; static sycl_gpu_mgr *g_sycl_gpu_mgr = NULL; From 131b0584096ee9df4d07cb28759dfea6efe6475f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 15 Mar 2024 11:36:50 +0200 Subject: [PATCH 27/56] make : ggml-metal.o depends on ggml.h --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cb597b209..c0f125036 100644 
--- a/Makefile +++ b/Makefile @@ -553,7 +553,7 @@ endif endif # LLAMA_METAL ifdef LLAMA_METAL -ggml-metal.o: ggml-metal.m ggml-metal.h +ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h $(CC) $(CFLAGS) -c $< -o $@ ifdef LLAMA_METAL_EMBED_LIBRARY From 46acb3676718b983157058aecf729a2064fc7d34 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Fri, 15 Mar 2024 18:53:53 +0800 Subject: [PATCH 28/56] fix set main gpu error (#6073) --- examples/sycl/build.sh | 5 +- examples/sycl/run-llama2.sh | 16 +- ggml-sycl.cpp | 332 ++++++++++++++++++++++++++---------- ggml-sycl.h | 5 + llama.cpp | 23 ++- 5 files changed, 282 insertions(+), 99 deletions(-) diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index 26ad2f7da..f20391d7a 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh #for FP32 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -#build example/main only +#build example/main #cmake --build . --config Release --target main +#build example/llama-bench +#cmake --build . --config Release --target llama-bench + #build all binary cmake --build . --config Release -v diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index 52f7c01a4..c979a52f6 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh if [ $# -gt 0 ]; then GGML_SYCL_DEVICE=$1 + GGML_SYCL_SINGLE_GPU=1 else GGML_SYCL_DEVICE=0 fi -echo "use $GGML_SYCL_DEVICE as main GPU" + #export GGML_SYCL_DEBUG=1 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. 
-#use all GPUs with same max compute units -ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then + echo "use $GGML_SYCL_DEVICE as main GPU" + #use signle GPU only + ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +else + #use multiple GPUs with same max compute units + ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +fi #use main GPU only #ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +#use multiple GPUs with same max compute units +#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index a1ca6aba5..6dc5eb20c 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,10 +25,9 @@ #include #include #include - #include #include - +#include #include #include @@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp #define __dpct_noinline__ __attribute__((noinline)) #endif + +std::string get_device_type_name(const sycl::device &Device) { + auto DeviceType = Device.get_info(); + switch (DeviceType) { + case sycl::info::device_type::cpu: + return "cpu"; + case sycl::info::device_type::gpu: + return "gpu"; + case sycl::info::device_type::host: + return "host"; + case sycl::info::device_type::accelerator: + return "acc"; + default: + return "unknown"; + } +} + +std::string get_device_backend_and_type(const sycl::device &device) { + std::stringstream device_type; + sycl::backend backend = device.get_backend(); + device_type << backend << ":" << get_device_type_name(device); + return device_type.str(); +} + namespace dpct { typedef 
sycl::queue *queue_ptr; @@ -942,17 +966,65 @@ namespace dpct private: mutable std::recursive_mutex m_mutex; + static bool compare_dev(sycl::device &device1, sycl::device &device2) + { + dpct::device_info prop1; + dpct::get_device_info(prop1, device1); + dpct::device_info prop2; + dpct::get_device_info(prop2, device2); + return prop1.get_max_compute_units() > prop2.get_max_compute_units(); + } + static int convert_backend_index(std::string & backend) { + if (backend == "ext_oneapi_level_zero:gpu") return 0; + if (backend == "opencl:gpu") return 1; + if (backend == "opencl:cpu") return 2; + if (backend == "opencl:acc") return 3; + printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); + GGML_ASSERT(false); + } + static bool compare_backend(std::string &backend1, std::string &backend2) { + return convert_backend_index(backend1) < convert_backend_index(backend2); + } dev_mgr() { sycl::device default_device = sycl::device(sycl::default_selector_v); _devs.push_back(std::make_shared(default_device)); - std::vector sycl_all_devs = - sycl::device::get_devices(sycl::info::device_type::all); + std::vector sycl_all_devs; // Collect other devices except for the default device. 
if (default_device.is_cpu()) _cpu_device = 0; + + auto Platforms = sycl::platform::get_platforms(); + // Keep track of the number of devices per backend + std::map DeviceNums; + std::map> backend_devices; + + while (!Platforms.empty()) { + auto Platform = Platforms.back(); + Platforms.pop_back(); + auto devices = Platform.get_devices(); + std::string backend_type = get_device_backend_and_type(devices[0]); + for (const auto &device : devices) { + backend_devices[backend_type].push_back(device); + } + } + + std::vector keys; + for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) { + keys.push_back(it->first); + } + std::sort(keys.begin(), keys.end(), compare_backend); + + for (auto &key : keys) { + std::vector devs = backend_devices[key]; + std::sort(devs.begin(), devs.end(), compare_dev); + for (const auto &dev : devs) { + sycl_all_devs.push_back(dev); + } + } + for (auto &dev : sycl_all_devs) { if (dev == default_device) @@ -3202,6 +3274,11 @@ static int g_work_group_size = 0; #define GGML_SYCL_MMV_Y 1 #endif +enum ggml_sycl_backend_gpu_mode { + SYCL_UNSET_GPU_MODE = -1, + SYCL_SINGLE_GPU_MODE = 0, + SYCL_MUL_GPU_MODE +}; static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -3401,12 +3478,31 @@ class sycl_gpu_mgr { int work_group_size = 0; std::string gpus_list = ""; + /* + Use all GPUs with same top max compute units + */ sycl_gpu_mgr() { detect_sycl_gpu_list_with_max_cu(); get_allow_gpus(); create_context_with_gpus(); } + /* + Only use the assigned GPU + */ + sycl_gpu_mgr(int main_gpu_id) { + sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id); + dpct::device_info prop; + dpct::get_device_info(prop, device); + gpus.push_back(main_gpu_id); + devices.push_back(device); + work_group_size = prop.get_max_work_group_size(); + max_compute_units = prop.get_max_compute_units(); + + get_allow_gpus(); + create_context_with_gpus(); + } + void create_context_with_gpus() { sycl::context ctx = 
sycl::context(devices); assert(gpus.size() > 0); @@ -3422,7 +3518,7 @@ class sycl_gpu_mgr { gpus_list += std::to_string(gpus[i]); gpus_list += ","; } - if (gpus_list.length() > 2) { + if (gpus_list.length() > 1) { gpus_list.pop_back(); } } @@ -3471,8 +3567,8 @@ class sycl_gpu_mgr { if (gpus[i] == id) return i; } - assert(false); - return -1; + printf("miss to get device index by id=%d\n", id); + GGML_ASSERT(false); } int get_next_index(int id) { @@ -3481,8 +3577,7 @@ class sycl_gpu_mgr { if (gpus[i] == id) return i; } - assert(false); - return -1; + GGML_ASSERT(false); } bool is_ext_oneapi_device(const sycl::device &dev) { @@ -3500,11 +3595,14 @@ static int g_device_count = -1; static int g_all_sycl_device_count = -1; static int g_main_device = -1; static int g_main_device_id = -1; +static bool g_ggml_backend_sycl_buffer_type_initialized = false; static std::array g_default_tensor_split = {}; static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0}; +static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE; + struct sycl_device_capabilities { int cc; // compute capability bool vmm; // virtual memory support @@ -13008,17 +13106,20 @@ bool ggml_sycl_loaded(void) { return g_sycl_loaded; } -void print_device_detail(int id) { +void print_device_detail(int id, sycl::device &device, std::string device_type) { + dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR( - dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id)))); - sycl::device cur_device = dpct::dev_mgr::instance().get_device(id); + dpct::get_device_info(prop, device))); + std::string version; version += std::to_string(prop.get_major_version()); version += "."; version += std::to_string(prop.get_minor_version()); - fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id, + device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); + + fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(), prop.get_name(), 
version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), prop.get_global_mem_size()); @@ -13026,19 +13127,35 @@ void print_device_detail(int id) { void ggml_backend_sycl_print_sycl_devices() { int device_count = dpct::dev_mgr::instance().device_count(); + std::map DeviceNums; fprintf(stderr, "found %d SYCL devices:\n", device_count); - fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n"); - fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n"); + fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n"); + fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n"); + fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n"); for (int id = 0; id < device_count; ++id) { - print_device_detail(id); + sycl::device device = dpct::dev_mgr::instance().get_device(id); + sycl::backend backend = device.get_backend(); + std::string backend_type = get_device_backend_and_type(device); + int type_id=DeviceNums[backend_type]++; + std::stringstream device_type; + device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; + print_device_detail(id, device, device_type.str()); } } void print_gpu_device_list() { - fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n", - g_sycl_gpu_mgr->get_gpu_count(), - g_sycl_gpu_mgr->gpus_list.c_str(), - g_sycl_gpu_mgr->max_compute_units); + GGML_ASSERT(g_sycl_gpu_mgr); + + char* hint=NULL; + if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) { + hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n"; + } else { + hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n"; + } + fprintf(stderr, hint, + g_sycl_gpu_mgr->get_gpu_count(), + 
g_sycl_gpu_mgr->gpus_list.c_str(), + g_sycl_gpu_mgr->max_compute_units); } int get_sycl_env(const char *env_name, int default_val) { @@ -13074,23 +13191,6 @@ void ggml_init_sycl() try { #else fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); #endif - if (CHECK_TRY_ERROR(g_all_sycl_device_count = - dpct::dev_mgr::instance().device_count()) != 0) { - initialized = true; - g_sycl_loaded = false; - return; - } - GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); - ggml_backend_sycl_print_sycl_devices(); - - if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); - - g_device_count = g_sycl_gpu_mgr->get_gpu_count(); - g_work_group_size = g_sycl_gpu_mgr->work_group_size; - - print_gpu_device_list(); - - int64_t total_vram = 0; /* NOT REMOVE, keep it for next optimize for XMX. #if defined(SYCL_USE_XMX) @@ -13099,49 +13199,15 @@ void ggml_init_sycl() try { fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); #endif */ - for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { - g_device_caps[id].vmm = 0; - g_device_caps[id].device_id = -1; - g_device_caps[id].cc = 0; - g_tensor_split[id] = 0; - g_default_tensor_split[id] = 0; + + if (CHECK_TRY_ERROR(g_all_sycl_device_count = + dpct::dev_mgr::instance().device_count()) != 0) { + initialized = true; + g_sycl_loaded = false; + return; } - - for (int i = 0; i < g_device_count; ++i) { - int device_id = g_sycl_gpu_mgr->gpus[i]; - g_device_caps[i].vmm = 0; - - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(device_id)))); - - g_default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); - - g_device_caps[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - } - - for (int i = 0; i < g_device_count; ++i) { - g_default_tensor_split[i] /= total_vram; - } - - for (int i = 0; i < g_device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); - - // create sycl streams - for (int is = 0; is < MAX_STREAMS; 
++is) { - SYCL_CHECK(CHECK_TRY_ERROR( - g_syclStreams[i][is] = - dpct::get_current_device().create_queue( - g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); - } - - const dpct::queue_ptr stream = g_syclStreams[i][0]; - // create sycl handle - SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); - } - + GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); + ggml_backend_sycl_print_sycl_devices(); initialized = true; g_sycl_loaded = true; } @@ -13152,6 +13218,63 @@ catch (sycl::exception const &exc) { std::exit(1); } +void ggml_init_by_gpus(int device_count) try { + g_device_count = device_count; + g_work_group_size = g_sycl_gpu_mgr->work_group_size; + + int64_t total_vram = 0; + + print_gpu_device_list(); + + for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { + g_device_caps[id].vmm = 0; + g_device_caps[id].device_id = -1; + g_device_caps[id].cc = 0; + g_tensor_split[id] = 0; + g_default_tensor_split[id] = 0; + } + + for (int i = 0; i < g_device_count; ++i) { + int device_id = g_sycl_gpu_mgr->gpus[i]; + g_device_caps[i].vmm = 0; + + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(device_id)))); + + g_default_tensor_split[i] = total_vram; + total_vram += prop.get_global_mem_size(); + + g_device_caps[i].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); + } + + for (int i = 0; i < g_device_count; ++i) { + g_default_tensor_split[i] /= total_vram; + } + + for (int i = 0; i < g_device_count; ++i) { + SYCL_CHECK(ggml_sycl_set_device(i)); + + // create sycl streams + for (int is = 0; is < MAX_STREAMS; ++is) { + SYCL_CHECK(CHECK_TRY_ERROR( + g_syclStreams[i][is] = + dpct::get_current_device().create_queue( + g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); + } + + const dpct::queue_ptr stream = g_syclStreams[i][0]; + // create sycl handle + SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); + } +} +catch (sycl::exception const &exc) { + 
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + void *ggml_sycl_host_malloc(size_t size) try { if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { return nullptr; @@ -16551,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .is_host = */ nullptr, }; -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { +ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) { + if (device_index>=g_device_count or device_index<0) { + printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device_index, g_device_count-1); + GGML_ASSERT(device_indexgpus[i])}, }; } - ggml_backend_sycl_buffer_type_initialized = true; + g_ggml_backend_sycl_buffer_type_initialized = true; } - - return &ggml_backend_sycl_buffer_types[device]; + return &ggml_backend_sycl_buffer_types[device_index]; } // sycl split buffer type @@ -17319,11 +17444,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) { return g_sycl_gpu_mgr->get_index(device_id); } +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) { + return g_sycl_gpu_mgr->gpus[device_index]; +} + +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) { + GGML_ASSERT(main_gpu_idget_gpu_count()); + g_ggml_backend_sycl_buffer_type_initialized = false; +} + +GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() { + if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) { + return; + } + + fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n"); + + if (g_sycl_gpu_mgr) { + delete g_sycl_gpu_mgr; + } + g_sycl_gpu_mgr = new sycl_gpu_mgr(); + g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE; + ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count()); + g_ggml_backend_sycl_buffer_type_initialized = false; +} + extern "C" int 
ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { - if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); - g_device_count = g_sycl_gpu_mgr->get_gpu_count(); + ggml_backend_sycl_set_mul_device_mode(); assert(g_device_count>0); for (int i = 0; i < g_device_count; i++) { int id = g_sycl_gpu_mgr->gpus[i]; diff --git a/ggml-sycl.h b/ggml-sycl.h index bf5b11b36..c549a64a1 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); +// TODO: these are temporary +// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670 +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index); +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); +GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index b8a8d2723..8e185d4bf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5064,6 +5064,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam } #endif +#ifdef GGML_USE_SYCL + if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { + ggml_backend_sycl_set_single_device_mode(params.main_gpu); + //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index. 
+ params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu); + } else { + ggml_backend_sycl_set_mul_device_mode(); + } +#endif + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data @@ -12921,23 +12931,22 @@ struct llama_context * llama_new_context_with_model( if (model->n_gpu_layers > 0) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu); - ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index); + ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index); + int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); } else { // LLAMA_SPLIT_LAYER requires a backend for each GPU - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { - int device_id = id_list[i]; ggml_backend_t backend = ggml_backend_sycl_init(i); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i); + int id_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); llama_free(ctx); return nullptr; } From 3020327f6cd6d2ce50528dd65f4b199d2ea8b1ae Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 
15 Mar 2024 13:24:03 +0100 Subject: [PATCH 29/56] cuda : disable unused cudaLaunchHostFunc code (#6078) --- ggml-cuda.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d1b5e52ba..db595409a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -11541,6 +11541,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev if (ggml_backend_is_cuda(event->backend)) { CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0)); } else { +#if 0 // untested auto wait_fn = [](void * user_data) { ggml_backend_event_t event = (ggml_backend_event_t)user_data; @@ -11548,6 +11549,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev }; CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event)); +#endif + GGML_ASSERT(false); } } From 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc Mon Sep 17 00:00:00 2001 From: Ting Lou Date: Fri, 15 Mar 2024 22:31:05 +0800 Subject: [PATCH 30/56] llava : change API to pure C style for Rust FFI bindgen (#6079) Co-authored-by: Lou Ting --- examples/llava/clip.cpp | 36 ++++++++++++++++++------------------ examples/llava/clip.h | 6 +++--- examples/llava/llava.cpp | 2 +- examples/llava/llava.h | 4 ++-- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 2035554ea..a0ed82d7e 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1235,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() { void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } -void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) { - if (batch.size > 0) { - delete[] batch.data; - batch.size = 0; +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; } } -void 
clip_image_f32_batch_free(struct clip_image_f32_batch & batch) { - if (batch.size > 0) { - delete[] batch.data; - batch.size = 0; +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; } } @@ -1497,7 +1497,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1509,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli pad_to_square = false; } // free the previous res_imgs if any set - if (res_imgs.size > 0) { + if (res_imgs->size > 0) { clip_image_f32_batch_free(res_imgs); } - res_imgs.data = nullptr; - res_imgs.size = 0; + res_imgs->data = nullptr; + res_imgs->size = 0; // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -1568,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square patches.insert(patches.begin(), image_original_resize); // clip_image_f32_batch_init(patches.size()); - res_imgs.size = patches.size(); - res_imgs.data = new clip_image_f32[res_imgs.size]; + 
res_imgs->size = patches.size(); + res_imgs->data = new clip_image_f32[res_imgs->size]; int num=0; for (auto& patch : patches) { - normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std); num++; } @@ -1660,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // } // res_imgs.push_back(res); - res_imgs.size = 1; - res_imgs.data = new clip_image_f32[res_imgs.size]; - res_imgs.data[0] = *res; + res_imgs->size = 1; + res_imgs->data = new clip_image_f32[res_imgs->size]; + res_imgs->data[0] = *res; clip_image_f32_free(res); return true; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index e5bd54924..45bdad689 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); -CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch); -CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch); +CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); @@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ); +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct 
clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 980128166..29764757a 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli clip_image_f32_batch img_res_v; img_res_v.size = 0; img_res_v.data = nullptr; - if (!clip_image_preprocess(ctx_clip, img, img_res_v)) { + if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { fprintf(stderr, "%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 2d40f3f1d..19212f6e9 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -29,9 +29,9 @@ struct llava_image_embed { }; /** sanity check for clip <-> llava embed size match */ -LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip); -LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); +LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); /** build an image embed from image file bytes */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); From 12247f4c69a173b9482f68aaa174ec37fc909ccf Mon Sep 17 00:00:00 2001 From: Andrew Canis Date: Fri, 15 Mar 2024 16:41:22 -0400 Subject: [PATCH 31/56] llama : add Command-R support (#6033) Information about the Command-R 35B model (128k context) can be 
found at: https://huggingface.co/CohereForAI/c4ai-command-r-v01 Based on the llama2 model with a few changes: 1) New hyper parameter to scale output logits (logit_scale) 2) Uses LayerNorm instead of RMSNorm 3) Transformer layers have a single shared LayerNorm that feeds into both the self-attention and FFN layers in parallel. There is no post-attention LayerNorm. 4) No support for Rotary Position Embeddings (RoPE) scaling 5) No biases used Find GGUF files here: https://huggingface.co/andrewcanis/c4ai-command-r-v01-GGUF To convert model to GGUF format yourself: 1) Download Command-R Hugging Face safetensors: git lfs install git clone https://huggingface.co/CohereForAI/c4ai-command-r-v01 2) Run: python3 convert-hf-to-gguf.py --outtype f16 ./c4ai-command-r-v01 --- README.md | 1 + convert-hf-to-gguf.py | 17 ++++ gguf-py/gguf/constants.py | 15 +++ gguf-py/gguf/gguf_writer.py | 3 + llama.cpp | 183 ++++++++++++++++++++++++++++++++++++ 5 files changed, 219 insertions(+) diff --git a/README.md b/README.md index 61bedc3f8..5cbdf7e47 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) - [x] [Mamba](https://github.com/state-spaces/mamba) +- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01) **Multimodal models:** diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5eee32016..cf1f98d66 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1965,6 +1965,23 @@ class MambaModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("CohereForCausalLM") +class CommandR2Model(Model): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 458a641dc..4a4facb06 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -42,6 +42,7 @@ class Keys: EXPERT_COUNT = "{arch}.expert_count" EXPERT_USED_COUNT = "{arch}.expert_used_count" POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -121,6 +122,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() STARCODER2 = auto() MAMBA = auto() + COMMAND_R = auto() class MODEL_TENSOR(IntEnum): @@ -187,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.COMMAND_R: "command-r", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -579,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_D, 
MODEL_TENSOR.SSM_OUT, ], + MODEL_ARCH.COMMAND_R: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 1967b633c..2ae6c814b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -361,6 +361,9 @@ class GGUFWriter: def add_clamp_kqv(self, value: float) -> None: self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + def add_logit_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) + def add_expert_count(self, count: int) -> None: self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) diff --git a/llama.cpp b/llama.cpp index 8e185d4bf..fc5dd5cb4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -214,6 +214,7 @@ enum llm_arch { LLM_ARCH_GEMMA, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, + LLM_ARCH_COMMAND_R, LLM_ARCH_UNKNOWN, }; @@ -243,6 +244,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -268,6 +270,7 @@ enum llm_kv { LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_USED_COUNT, LLM_KV_POOLING_TYPE, + LLM_KV_LOGIT_SCALE, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -332,6 +335,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, + { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -838,6 +842,21 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_SSM_OUT, 
"blk.%d.ssm_out" }, }, }, + { + LLM_ARCH_COMMAND_R, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1597,6 +1616,7 @@ enum e_model { MODEL_20B, MODEL_30B, MODEL_34B, + MODEL_35B, MODEL_40B, MODEL_65B, MODEL_70B, @@ -1643,6 +1663,7 @@ struct llama_hparams { float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; + float f_logit_scale = 0.0f; bool causal_attn = true; bool need_kq_pos = false; @@ -3231,6 +3252,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_20B: return "20B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; + case MODEL_35B: return "35B"; case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; @@ -3623,6 +3645,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_COMMAND_R: + { + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_35B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3944,6 +3975,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); + LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: n_ff 
= %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); @@ -4918,6 +4950,37 @@ static bool llm_load_tensors( layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}); } } break; + case LLM_ARCH_COMMAND_R: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + // init output from the input tok embed + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -8315,6 +8378,121 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_command_r() { + + 
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur 
= llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = llm_build_ffn(ctx0, ffn_inp, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -8497,6 +8675,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_mamba(); } break; + case LLM_ARCH_COMMAND_R: + { + result = llm.build_command_r(); + } break; default: GGML_ASSERT(false); } @@ -13147,6 +13329,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_ORION: case LLM_ARCH_INTERNLM2: case LLM_ARCH_MINICPM: + case LLM_ARCH_COMMAND_R: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 From 877b4d0c628cc70dddb5df72ed8fc14d126ca7e8 Mon Sep 17 00:00:00 2001 From: Theia Vogel Date: Fri, 15 Mar 2024 13:43:02 -0700 Subject: [PATCH 32/56] llama : add support for control vectors (#5970) * control vector api and implementation * control-vectors : minor code style updates * disable control vector when data == nullptr use -1 for 
disabled range (also on init) in case we ever support controlling layer 0 (embeddings) --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 215 ++++++++++++++++++++++++++++++++++++++++++++++ common/common.h | 31 ++++++- llama.cpp | 128 +++++++++++++++++++++++++++ llama.h | 23 ++++- 4 files changed, 392 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 58fbd05aa..4912237e0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -568,6 +568,34 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.lora_base = argv[i]; + } else if (arg == "--control-vector") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ 1.0f, argv[i], }); + } else if (arg == "--control-vector-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + const char * fname = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + } else if (arg == "--control-vector-layer-range") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_end = std::stoi(argv[i]); } else if (arg == "--mmproj") { if (++i >= argc) { invalid_param = true; @@ -1095,6 +1123,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --control-vector FNAME\n"); + printf(" add a control vector\n"); + printf(" --control-vector-scaled FNAME S\n"); + printf(" add a control vector with user defined scaling S\n"); + printf(" --control-vector-layer-range 
START END\n"); + printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); @@ -1360,6 +1394,30 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } + if (!params.control_vectors.empty()) { + if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + + const auto cvec = llama_control_vector_load(params.control_vectors); + if (cvec.n_embd == -1) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + int err = llama_control_vector_apply(lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); + if (err) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -1890,3 +1948,160 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) return sum / (sqrt(sum1) * sqrt(sum2)); } + +// +// Control vector utils +// + +static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { + int32_t n_tensors; + + size_t n_bytes = 0; + + uint32_t max_direction_layer = 0; + + llama_control_vector_data result = { -1, {} }; + + // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer + { + struct ggml_init_params meta_params = { + /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ true, + }; + ggml_context * meta_ctx = 
ggml_init(meta_params); + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &meta_ctx, + }; + struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!meta_ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + return result; + } + + n_tensors = gguf_get_n_tensors(meta_ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); + + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + uint32_t layer = std::stoi(name.substr(dotpos + 1)); + if (layer == 0) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (layer > max_direction_layer) { + max_direction_layer = layer; + } + } catch (...) 
{ + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + } + + struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); + if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (result.n_embd == -1) { + result.n_embd = ggml_nelements(tensor_meta); + } else if (ggml_nelements(tensor_meta) != result.n_embd) { + fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + n_bytes += ggml_nbytes(tensor_meta); + } + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + } + + if (n_tensors == 0) { + fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + return result; + } + + // load and scale tensors into final control vector context + struct ggml_init_params ggml_params = { + /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(ctx); + return result; + } + + // do not store data for layer 0 (it's not used) + result.data.resize(result.n_embd * max_direction_layer); + + for (uint32_t il = 1; il <= max_direction_layer; il++) { + const std::string name = "direction." 
+ std::to_string(il); + const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + + float * dst = result.data.data() + result.n_embd * (il - 1); + + if (tensor) { + const float * src = (const float *) tensor->data; + for (int j = 0; j < result.n_embd; j++) { + dst[j] = src[j] * load_info.strength; + } + } else { + for (int j = 0; j < result.n_embd; j++) { + dst[j] = 0.0f; + } + } + } + + return result; +} + +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { + llama_control_vector_data result = { -1, {} }; + + for (const auto & info : load_infos) { + auto cur = llama_control_vector_load_one(info); + + if (cur.n_embd == -1) { + return result; + } + if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) { + fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str()); + return result; + } + + if (result.n_embd == -1) { + result = std::move(cur); + } else { + for (size_t i = 0; i < cur.data.size(); i++) { + result.data[i] += cur.data[i]; + } + } + } + + if (result.n_embd == -1) { + fprintf(stderr, "%s: no vectors passed\n", __func__); + } + + return result; +} diff --git a/common/common.h b/common/common.h index d250eef8b..687f3425e 100644 --- a/common/common.h +++ b/common/common.h @@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT; extern char const *LLAMA_COMPILER; extern char const *LLAMA_BUILD_TARGET; +struct llama_control_vector_load_info; + +int32_t get_num_physical_cores(); + // // CLI argument parsing // -int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed @@ -103,6 +106,11 @@ struct gpt_params { std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter + std::vector control_vectors; // control vector with user defined scale + + int32_t control_vector_layer_start = -1; // layer range 
for control vector + int32_t control_vector_layer_end = -1; // layer range for control vector + int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) @@ -269,3 +277,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40 void llama_embd_normalize(const float * inp, float * out, int n); float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); + +// +// Control vector utils +// + +struct llama_control_vector_data { + int n_embd; + + // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd + std::vector data; +}; + +struct llama_control_vector_load_info { + float strength; + + std::string fname; +}; + +// Load control vectors, scale each by strength, and add them together. +// On error, returns {-1, empty} +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); diff --git a/llama.cpp b/llama.cpp index fc5dd5cb4..52bd718ba 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1894,6 +1894,31 @@ struct llama_kv_cache { } }; +struct llama_control_vector { + std::vector tensors; // per layer + std::vector ctxs; + std::vector bufs; + + int32_t layer_start = -1; + int32_t layer_end = -1; + + ggml_tensor * tensor_for(int il) const { + if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { + return nullptr; + } + return tensors[il]; + } + + ~llama_control_vector() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + } +}; + struct llama_vocab { using id = int32_t; using token = std::string; @@ -2108,6 +2133,9 @@ struct llama_context { struct ggml_tensor * inp_s_mask; // F32 [1, kv_size] struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] + 
// control vectors + struct llama_control_vector cvec; + #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; #endif @@ -5931,6 +5959,12 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } cb(cur, "l_out", il); // input for next layer @@ -13366,6 +13400,10 @@ int32_t llama_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } +int32_t llama_n_layer(const struct llama_model * model) { + return model->hparams.n_layer; +} + float llama_rope_freq_scale_train(const struct llama_model * model) { return model->hparams.rope_freq_scale_train; } @@ -13465,6 +13503,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } } +static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { + GGML_ASSERT(cvec.tensors.empty()); + GGML_ASSERT(cvec.ctxs.empty()); + GGML_ASSERT(cvec.bufs.empty()); + + // count layer buffer types + std::map buft_layer_count; + for (int64_t i = 0; i < model.hparams.n_layer; i++) { + buft_layer_count[model.buft_layer[i].buft]++; + } + + // allocate contexts + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ n_layers * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); + return 1; + } + ctx_map[it.first] = ctx; + } + + // make tensors + cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 + for (size_t il = 1; il < model.hparams.n_layer; il++) { + struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft); + ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd); + 
cvec.tensors.push_back(tensor); + } + + // allocate tensors / buffers and zero + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + cvec.ctxs.push_back(ctx); + cvec.bufs.push_back(buf); + } + + return true; +} + +int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) { + const llama_model & model = lctx->model; + llama_control_vector & cvec = lctx->cvec; + + if (data == nullptr) { + // disable the current control vector (but leave allocated for later) + cvec.layer_start = -1; + cvec.layer_end = -1; + return 0; + } + + if (n_embd != (int) model.hparams.n_embd) { + LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); + return 1; + } + + if (cvec.tensors.empty()) { + if (!llama_control_vector_init(cvec, model)) { + return 1; + } + } + + cvec.layer_start = il_start; + cvec.layer_end = il_end; + + for (size_t il = 1; il < model.hparams.n_layer; il++) { + assert(cvec.tensors[il] != nullptr); + + const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present + if (off + n_embd <= len) { + ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il])); + } + } + + return 0; +} + struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { struct llama_kv_cache_view result = { /*.n_cells = */ 0, diff --git a/llama.h b/llama.h index 90aa5372e..40dcf54e3 100644 --- a/llama.h +++ b/llama.h @@ -388,6 +388,7 @@ extern "C" { LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model 
* model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model); + LLAMA_API int32_t llama_n_layer (const struct llama_model * model); // Get the model's RoPE frequency scaling factor LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); @@ -435,10 +436,24 @@ extern "C" { // Returns 0 on success LLAMA_API int32_t llama_model_apply_lora_from_file( const struct llama_model * model, - const char * path_lora, - float scale, - const char * path_base_model, - int32_t n_threads); + const char * path_lora, + float scale, + const char * path_base_model, + int32_t n_threads); + + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); // // KV cache From d84c48505f60bcd358b82a751d40418c4d235643 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 15 Mar 2024 22:14:16 +0100 Subject: [PATCH 33/56] llama : fix Baichuan2 13B (#6092) --- llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 52bd718ba..e4db288dd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6000,7 +6000,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + struct ggml_tensor * inp_pos = model.type == MODEL_7B ? 
build_inp_pos() : nullptr; // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); @@ -6050,7 +6050,6 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); From a56d09a4407f29c21e149b44fd5308f83aa1cb09 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 13:20:53 +0100 Subject: [PATCH 34/56] ci : close inactive issue with workflow (#6053) * issues: ci - close inactive issue with workflow * ci: close issue, change workflow schedule time --- .github/workflows/close-issue.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/close-issue.yml diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml new file mode 100644 index 000000000..bc08a72d0 --- /dev/null +++ b/.github/workflows/close-issue.yml @@ -0,0 +1,22 @@ +name: Close inactive issues +on: + schedule: + - cron: "42 0 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + days-before-issue-stale: 30 + days-before-issue-close: 14 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} From 15961ec04dbd59d21d8984d42e4c0f7e7e7d320a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Sat, 16 Mar 2024 11:39:15 -0400 Subject: [PATCH 35/56] common : refactor nested if causing error C1061 on MSVC (#6101) * Refactor nested if causing error C1061 on MSVC. 
* Revert back and remove else's. * Add flag to track found arguments. --- common/common.cpp | 475 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 356 insertions(+), 119 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4912237e0..1b0ba8493 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -151,13 +151,17 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::replace(arg.begin(), arg.end(), '_', '-'); } + bool arg_found = false; if (arg == "-s" || arg == "--seed") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.seed = std::stoul(argv[i]); - } else if (arg == "-t" || arg == "--threads") { + } + if (arg == "-t" || arg == "--threads") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -166,7 +170,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); } - } else if (arg == "-tb" || arg == "--threads-batch") { + } + if (arg == "-tb" || arg == "--threads-batch") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -175,7 +181,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); } - } else if (arg == "-td" || arg == "--threads-draft") { + } + if (arg == "-td" || arg == "--threads-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -184,7 +192,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); } - } else if (arg == "-tbd" || arg == "--threads-batch-draft") { + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -193,25 +203,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, 
gpt_params & params) { if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); } - } else if (arg == "-p" || arg == "--prompt") { + } + if (arg == "-p" || arg == "--prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.prompt = argv[i]; - } else if (arg == "-e" || arg == "--escape") { + } + if (arg == "-e" || arg == "--escape") { + arg_found = true; params.escape = true; - } else if (arg == "--prompt-cache") { + } + if (arg == "--prompt-cache") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.path_prompt_cache = argv[i]; - } else if (arg == "--prompt-cache-all") { + } + if (arg == "--prompt-cache-all") { + arg_found = true; params.prompt_cache_all = true; - } else if (arg == "--prompt-cache-ro") { + } + if (arg == "--prompt-cache-ro") { + arg_found = true; params.prompt_cache_ro = true; - } else if (arg == "-bf" || arg == "--binary-file") { + } + if (arg == "-bf" || arg == "--binary-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -228,7 +250,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { ss << file.rdbuf(); params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - } else if (arg == "-f" || arg == "--file") { + } + if (arg == "-f" || arg == "--file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -245,51 +269,67 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!params.prompt.empty() && params.prompt.back() == '\n') { params.prompt.pop_back(); } - } else if (arg == "-n" || arg == "--n-predict") { + } + if (arg == "-n" || arg == "--n-predict") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_predict = std::stoi(argv[i]); - } else if (arg == "--top-k") { + } + if (arg == "--top-k") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; 
} sparams.top_k = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx-size") { + } + if (arg == "-c" || arg == "--ctx-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--grp-attn-n" || arg == "-gan") { + } + if (arg == "--grp-attn-n" || arg == "-gan") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.grp_attn_n = std::stoi(argv[i]); - } else if (arg == "--grp-attn-w" || arg == "-gaw") { + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.grp_attn_w = std::stoi(argv[i]); - } else if (arg == "--rope-freq-base") { + } + if (arg == "--rope-freq-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { + } + if (arg == "--rope-freq-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--rope-scaling") { + } + if (arg == "--rope-scaling") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -299,43 +339,57 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } - } else if (arg == "--rope-scale") { + } + if (arg == "--rope-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } else if (arg == "--yarn-orig-ctx") { + } + if (arg == "--yarn-orig-ctx") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_orig_ctx = std::stoi(argv[i]); - } else if (arg == "--yarn-ext-factor") { + } + if (arg == "--yarn-ext-factor") { + 
arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_ext_factor = std::stof(argv[i]); - } else if (arg == "--yarn-attn-factor") { + } + if (arg == "--yarn-attn-factor") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_attn_factor = std::stof(argv[i]); - } else if (arg == "--yarn-beta-fast") { + } + if (arg == "--yarn-beta-fast") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_beta_fast = std::stof(argv[i]); - } else if (arg == "--yarn-beta-slow") { + } + if (arg == "--yarn-beta-slow") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--pooling") { + } + if (arg == "--pooling") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -345,118 +399,156 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } else { invalid_param = true; break; } - } else if (arg == "--defrag-thold" || arg == "-dt") { + } + if (arg == "--defrag-thold" || arg == "-dt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.defrag_thold = std::stof(argv[i]); - } else if (arg == "--samplers") { + } + if (arg == "--samplers") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } const auto sampler_names = string_split(argv[i], ';'); sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); - } else if (arg == "--sampling-seq") { + } + if (arg == "--sampling-seq") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.samplers_sequence = sampler_types_from_chars(argv[i]); - } else if (arg == "--top-p") { + } + if (arg == "--top-p") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.top_p = std::stof(argv[i]); - } 
else if (arg == "--min-p") { + } + if (arg == "--min-p") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.min_p = std::stof(argv[i]); - } else if (arg == "--temp") { + } + if (arg == "--temp") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.temp = std::stof(argv[i]); sparams.temp = std::max(sparams.temp, 0.0f); - } else if (arg == "--tfs") { + } + if (arg == "--tfs") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.tfs_z = std::stof(argv[i]); - } else if (arg == "--typical") { + } + if (arg == "--typical") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.typical_p = std::stof(argv[i]); - } else if (arg == "--repeat-last-n") { + } + if (arg == "--repeat-last-n") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_last_n = std::stoi(argv[i]); sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - } else if (arg == "--repeat-penalty") { + } + if (arg == "--repeat-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_repeat = std::stof(argv[i]); - } else if (arg == "--frequency-penalty") { + } + if (arg == "--frequency-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_freq = std::stof(argv[i]); - } else if (arg == "--presence-penalty") { + } + if (arg == "--presence-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_present = std::stof(argv[i]); - } else if (arg == "--dynatemp-range") { + } + if (arg == "--dynatemp-range") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.dynatemp_range = std::stof(argv[i]); - } else if (arg == "--dynatemp-exp") { + } + if (arg == "--dynatemp-exp") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.dynatemp_exponent = std::stof(argv[i]); - } else if (arg == 
"--mirostat") { + } + if (arg == "--mirostat") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat = std::stoi(argv[i]); - } else if (arg == "--mirostat-lr") { + } + if (arg == "--mirostat-lr") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat_eta = std::stof(argv[i]); - } else if (arg == "--mirostat-ent") { + } + if (arg == "--mirostat-ent") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat_tau = std::stof(argv[i]); - } else if (arg == "--cfg-negative-prompt") { + } + if (arg == "--cfg-negative-prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.cfg_negative_prompt = argv[i]; - } else if (arg == "--cfg-negative-prompt-file") { + } + if (arg == "--cfg-negative-prompt-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -471,86 +563,114 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { sparams.cfg_negative_prompt.pop_back(); } - } else if (arg == "--cfg-scale") { + } + if (arg == "--cfg-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.cfg_scale = std::stof(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { + } + if (arg == "-b" || arg == "--batch-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_batch = std::stoi(argv[i]); - } else if (arg == "-ub" || arg == "--ubatch-size") { + } + if (arg == "-ub" || arg == "--ubatch-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_ubatch = std::stoi(argv[i]); - } else if (arg == "--keep") { + } + if (arg == "--keep") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_keep = std::stoi(argv[i]); - } else if (arg == "--draft") { + } + if (arg == "--draft") { + arg_found = true; if (++i >= 
argc) { invalid_param = true; break; } params.n_draft = std::stoi(argv[i]); - } else if (arg == "--chunks") { + } + if (arg == "--chunks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_chunks = std::stoi(argv[i]); - } else if (arg == "-np" || arg == "--parallel") { + } + if (arg == "-np" || arg == "--parallel") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-ns" || arg == "--sequences") { + } + if (arg == "-ns" || arg == "--sequences") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_sequences = std::stoi(argv[i]); - } else if (arg == "--p-split" || arg == "-ps") { + } + if (arg == "--p-split" || arg == "-ps") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.p_split = std::stof(argv[i]); - } else if (arg == "-m" || arg == "--model") { + } + if (arg == "-m" || arg == "--model") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model = argv[i]; - } else if (arg == "-md" || arg == "--model-draft") { + } + if (arg == "-md" || arg == "--model-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model_draft = argv[i]; - } else if (arg == "-a" || arg == "--alias") { + } + if (arg == "-a" || arg == "--alias") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model_alias = argv[i]; - } else if (arg == "--lora") { + } + if (arg == "--lora") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; - } else if (arg == "--lora-scaled") { + } + if (arg == "--lora-scaled") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -562,19 +682,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = 
false; - } else if (arg == "--lora-base") { + } + if (arg == "--lora-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.lora_base = argv[i]; - } else if (arg == "--control-vector") { + } + if (arg == "--control-vector") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.control_vectors.push_back({ 1.0f, argv[i], }); - } else if (arg == "--control-vector-scaled") { + } + if (arg == "--control-vector-scaled") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -585,7 +711,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.control_vectors.push_back({ std::stof(argv[i]), fname, }); - } else if (arg == "--control-vector-layer-range") { + } + if (arg == "--control-vector-layer-range") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -596,49 +724,85 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.control_vector_layer_end = std::stoi(argv[i]); - } else if (arg == "--mmproj") { + } + if (arg == "--mmproj") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.mmproj = argv[i]; - } else if (arg == "--image") { + } + if (arg == "--image") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.image = argv[i]; - } else if (arg == "-i" || arg == "--interactive") { + } + if (arg == "-i" || arg == "--interactive") { + arg_found = true; params.interactive = true; - } else if (arg == "--embedding") { + } + if (arg == "--embedding") { + arg_found = true; params.embedding = true; - } else if (arg == "--interactive-first") { + } + if (arg == "--interactive-first") { + arg_found = true; params.interactive_first = true; - } else if (arg == "-ins" || arg == "--instruct") { + } + if (arg == "-ins" || arg == "--instruct") { + arg_found = true; params.instruct = true; - } else if (arg == "-cml" || arg == "--chatml") { + } + if (arg == "-cml" || arg == 
"--chatml") { + arg_found = true; params.chatml = true; - } else if (arg == "--infill") { + } + if (arg == "--infill") { + arg_found = true; params.infill = true; - } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + arg_found = true; params.dump_kv_cache = true; - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + arg_found = true; params.no_kv_offload = true; - } else if (arg == "-ctk" || arg == "--cache-type-k") { + } + if (arg == "-ctk" || arg == "--cache-type-k") { + arg_found = true; params.cache_type_k = argv[++i]; - } else if (arg == "-ctv" || arg == "--cache-type-v") { + } + if (arg == "-ctv" || arg == "--cache-type-v") { + arg_found = true; params.cache_type_v = argv[++i]; - } else if (arg == "--multiline-input") { + } + if (arg == "--multiline-input") { + arg_found = true; params.multiline_input = true; - } else if (arg == "--simple-io") { + } + if (arg == "--simple-io") { + arg_found = true; params.simple_io = true; - } else if (arg == "-cb" || arg == "--cont-batching") { + } + if (arg == "-cb" || arg == "--cont-batching") { + arg_found = true; params.cont_batching = true; - } else if (arg == "--color") { + } + if (arg == "--color") { + arg_found = true; params.use_color = true; - } else if (arg == "--mlock") { + } + if (arg == "--mlock") { + arg_found = true; params.use_mlock = true; - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + } + if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -648,7 +812,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - } else if (arg == 
"--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + } + if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -658,7 +824,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - } else if (arg == "--main-gpu" || arg == "-mg") { + } + if (arg == "--main-gpu" || arg == "-mg") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -667,7 +835,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #ifndef GGML_USE_CUBLAS_SYCL fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--split-mode" || arg == "-sm") { + } + if (arg == "--split-mode" || arg == "-sm") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -691,7 +861,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--tensor-split" || arg == "-ts") { + } + if (arg == "--tensor-split" || arg == "-ts") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -716,9 +888,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #ifndef GGML_USE_CUBLAS_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. 
Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--no-mmap") { + } + if (arg == "--no-mmap") { + arg_found = true; params.use_mmap = false; - } else if (arg == "--numa") { + } + if (arg == "--numa") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -728,17 +904,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } - } else if (arg == "--verbose-prompt") { + } + if (arg == "--verbose-prompt") { + arg_found = true; params.verbose_prompt = true; - } else if (arg == "--no-display-prompt") { + } + if (arg == "--no-display-prompt") { + arg_found = true; params.display_prompt = false; - } else if (arg == "-r" || arg == "--reverse-prompt") { + } + if (arg == "-r" || arg == "--reverse-prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.antiprompt.emplace_back(argv[i]); - } else if (arg == "-ld" || arg == "--logdir") { + } + if (arg == "-ld" || arg == "--logdir") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -748,63 +932,93 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } - } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + } + if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.logits_file = argv[i]; - } else if (arg == "--perplexity" || arg == "--all-logits") { + } + if (arg == "--perplexity" || arg == "--all-logits") { + arg_found = true; params.logits_all = true; - } else if (arg == "--ppl-stride") { + } + if (arg == "--ppl-stride") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } 
params.ppl_stride = std::stoi(argv[i]); - } else if (arg == "-ptc" || arg == "--print-token-count") { + } + if (arg == "-ptc" || arg == "--print-token-count") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_print = std::stoi(argv[i]); - } else if (arg == "--ppl-output-type") { + } + if (arg == "--ppl-output-type") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.ppl_output_type = std::stoi(argv[i]); - } else if (arg == "--hellaswag") { + } + if (arg == "--hellaswag") { + arg_found = true; params.hellaswag = true; - } else if (arg == "--hellaswag-tasks") { + } + if (arg == "--hellaswag-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.hellaswag_tasks = std::stoi(argv[i]); - } else if (arg == "--winogrande") { + } + if (arg == "--winogrande") { + arg_found = true; params.winogrande = true; - } else if (arg == "--winogrande-tasks") { + } + if (arg == "--winogrande-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.winogrande_tasks = std::stoi(argv[i]); - } else if (arg == "--multiple-choice") { + } + if (arg == "--multiple-choice") { + arg_found = true; params.multiple_choice = true; - } else if (arg == "--multiple-choice-tasks") { + } + if (arg == "--multiple-choice-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.multiple_choice_tasks = std::stoi(argv[i]); - } else if (arg == "--kl-divergence") { + } + if (arg == "--kl-divergence") { + arg_found = true; params.kl_divergence = true; - } else if (arg == "--ignore-eos") { + } + if (arg == "--ignore-eos") { + arg_found = true; params.ignore_eos = true; - } else if (arg == "--no-penalize-nl") { + } + if (arg == "--no-penalize-nl") { + arg_found = true; sparams.penalize_nl = false; - } else if (arg == "-l" || arg == "--logit-bias") { + } + if (arg == "-l" || arg == "--logit-bias") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; 
@@ -823,36 +1037,51 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - } else if (arg == "-h" || arg == "--help") { + } + if (arg == "-h" || arg == "--help") { + arg_found = true; return false; - - } else if (arg == "--version") { + } + if (arg == "--version") { + arg_found = true; fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); - } else if (arg == "--random-prompt") { + } + if (arg == "--random-prompt") { + arg_found = true; params.random_prompt = true; - } else if (arg == "--in-prefix-bos") { + } + if (arg == "--in-prefix-bos") { + arg_found = true; params.input_prefix_bos = true; - } else if (arg == "--in-prefix") { + } + if (arg == "--in-prefix") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.input_prefix = argv[i]; - } else if (arg == "--in-suffix") { + } + if (arg == "--in-suffix") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.input_suffix = argv[i]; - } else if (arg == "--grammar") { + } + if (arg == "--grammar") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.grammar = argv[i]; - } else if (arg == "--grammar-file") { + } + if (arg == "--grammar-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -868,7 +1097,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::istreambuf_iterator(), std::back_inserter(sparams.grammar) ); - } else if (arg == "--override-kv") { + } + if (arg == "--override-kv") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -911,10 +1142,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.kv_overrides.push_back(kvo); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters - } else if ( log_param_single_parse( argv[i] ) ) { + } + if ( log_param_single_parse( 
argv[i] ) ) { + arg_found = true; // Do nothing, log_param_single_parse automatically does it's thing // and returns if a match was found and parsed. - } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + } + if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + arg_found = true; // We have a matching known parameter requiring an argument, // now we need to check if there is anything after this argv // and flag invalid_param or parse it. @@ -928,7 +1163,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS - } else { + } + + if (!arg_found) { throw std::invalid_argument("error: unknown argument: " + arg); } } From dfbfdd60f90207404039c6578d709231496831d9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 16 Mar 2024 16:42:08 +0100 Subject: [PATCH 36/56] readme : add wllama as a wasm binding (#6100) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5cbdf7e47..c2f3342f0 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ Typically finetunes of the base models below are supported as well. 
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) +- TypeScript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) From b5f4ae09c3244ae1644b67c03ed9f4227ab25ad2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sat, 16 Mar 2024 16:46:29 +0100 Subject: [PATCH 37/56] gritlm : add initial README.md (#6086) * gritlm: add initial README.md to examples/gritlm This commit adds a suggestion for an initial README.md for the gritlm example. Signed-off-by: Daniel Bevenius * squash! gritlm: add initial README.md to examples/gritlm Use the `scripts/hf.sh` script to download the model file. Signed-off-by: Daniel Bevenius * squash! gritlm: add initial README.md to examples/gritlm Fix editorconfig-checker error in examples/gritlm/README.md. Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius --- examples/gritlm/README.md | 62 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 examples/gritlm/README.md diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md new file mode 100644 index 000000000..64cc19204 --- /dev/null +++ b/examples/gritlm/README.md @@ -0,0 +1,62 @@ +## Generative Representational Instruction Tuning (GRIT) Example +[gritlm] is a model which can generate embeddings as well as perform "normal" text +generation, depending on the instructions in the prompt.
+ +* Paper: https://arxiv.org/pdf/2402.09906.pdf + +### Retrieval-Augmented Generation (RAG) use case +One use case for `gritlm` is to use it with RAG. Recall that RAG works by +taking documents that we want to use as context, to ground the large +language model (LLM), and creating token embeddings for them. We then store +these token embeddings in a vector database. + +When we perform a query, i.e. prompt the LLM, we will first create token embeddings +for the query and then search the vector database to retrieve the most +similar vectors, and return those documents so they can be passed to the LLM as +context. Then the query and the context will be passed to the LLM, which will +have to _again_ create token embeddings for the query. But because gritlm is used, +the first query can be cached and the second round of token embedding generation does +not have to be performed at all. + +### Running the example +Download a Grit model: +```console +$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf +``` + +Run the example using the downloaded model: +```console +$ ./gritlm -m gritlm-7b_q4_1.gguf + +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 +Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112 +Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547 + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +When shadows lurk and ghosts do roam, +And darkness reigns, a fearsome sight.
+ +Thou didst set out, with heart aglow, +To conquer this mountain, so high, +And reach the summit, where the stars do glow, +And the moon shines bright, up in the sky. + +Through the mist and fog, thou didst press on, +With steadfast courage, and a steadfast will, +Through the darkness, thou didst not be gone, +But didst climb on, with a steadfast skill. + +At last, thou didst reach the summit's crest, +And gazed upon the world below, +And saw the beauty of the night's best, +And felt the peace, that only nature knows. + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +Thou art a hero, in the eyes of all, +For thou didst conquer this mountain, so bright. +``` + +[gritlm]: https://github.com/ContextualAI/gritlm From c47cf414efafb8f60596edc7edb5a2d68065e992 Mon Sep 17 00:00:00 2001 From: AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> Date: Sat, 16 Mar 2024 11:52:02 -0400 Subject: [PATCH 38/56] ggml : add AVX512F SIMD (#6088) --- ggml.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/ggml.c b/ggml.c index c94006e51..fa23cb3c4 100644 --- a/ggml.c +++ b/ggml.c @@ -931,6 +931,101 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE #endif +#elif defined(__AVX512F__) + +#define GGML_SIMD + +// F32 AVX512 + +#define GGML_F32_STEP 64 +#define GGML_F32_EPR 16 + +#define GGML_F32x16 __m512 +#define GGML_F32x16_ZERO _mm512_setzero_ps() +#define GGML_F32x16_SET1(x) _mm512_set1_ps(x) +#define GGML_F32x16_LOAD _mm512_loadu_ps +#define GGML_F32x16_STORE _mm512_storeu_ps +// _mm512_fmadd_ps is defined in AVX512F so no guard is required +#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32x16_ADD _mm512_add_ps +#define GGML_F32x16_MUL _mm512_mul_ps +#define GGML_F32x16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], 
x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +// TODO: is this optimal ? + +#define GGML_F32_VEC GGML_F32x16 +#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x16_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD +#define GGML_F32_VEC_STORE GGML_F32x16_STORE +#define GGML_F32_VEC_FMA GGML_F32x16_FMA +#define GGML_F32_VEC_ADD GGML_F32x16_ADD +#define GGML_F32_VEC_MUL GGML_F32x16_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE + +// F16 AVX512 + +// F16 AVX + +#define GGML_F16_STEP 64 +#define GGML_F16_EPR 16 + +// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead + +#define GGML_F32Cx16 __m512 +#define GGML_F32Cx16_ZERO _mm512_setzero_ps() +#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) + +// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F +// so F16C guard isn't required +#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x))) +#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) + +#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32Cx16_ADD _mm512_add_ps +#define GGML_F32Cx16_MUL _mm512_mul_ps +#define GGML_F32Cx16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +#define GGML_F16_VEC GGML_F32Cx16 +#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO +#define 
GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE + #elif defined(__AVX__) #define GGML_SIMD From dc0f6125487dcfbff913360f9d877bc0ccf6aa57 Mon Sep 17 00:00:00 2001 From: GainLee Date: Mon, 18 Mar 2024 01:12:22 +0800 Subject: [PATCH 39/56] ggml:fix finding transfer queue family index error (#6094) Co-authored-by: GainLee --- ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 7cce616ba..698b31496 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -710,6 +710,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector= 0) { + return compute_index; + } + std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl; for(auto &q_family : queue_family_props) { From cd776c37c945bf58efc8fe44b370456680cb1b59 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Mar 2024 19:51:57 +0200 Subject: [PATCH 40/56] ci : close all stale issues at once (#6115) --- .github/workflows/close-issue.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index bc08a72d0..2682f308c 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -19,4 +19,5 @@ jobs: close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 
days-before-pr-stale: -1 days-before-pr-close: -1 + operations-per-run: 1000 repo-token: ${{ secrets.GITHUB_TOKEN }} From d01b3c4c32357567f3531d4e6ceffc5d23e87583 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 17 Mar 2024 19:12:37 +0100 Subject: [PATCH 41/56] common: llama_load_model_from_url using --model-url (#6098) * common: llama_load_model_from_url with libcurl dependency Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 22 ++ .github/workflows/server.yml | 20 +- CMakeLists.txt | 1 + Makefile | 5 + common/CMakeLists.txt | 13 +- common/common.cpp | 238 +++++++++++++++++- common/common.h | 4 + examples/main/README.md | 1 + examples/server/README.md | 1 + examples/server/server.cpp | 8 + examples/server/tests/README.md | 2 +- .../server/tests/features/embeddings.feature | 3 +- examples/server/tests/features/environment.py | 93 +++---- examples/server/tests/features/server.feature | 3 +- examples/server/tests/features/steps/steps.py | 37 ++- examples/server/tests/requirements.txt | 1 + 16 files changed, 397 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0da01d5ba..945df42f8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,6 +48,28 @@ jobs: CC=gcc-8 make tests -j $(nproc) make test -j $(nproc) + ubuntu-focal-make-curl: + runs-on: ubuntu-20.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev + + - name: Build + id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 + LLAMA_CURL: 1 + run: | + CC=gcc-8 make -j $(nproc) + ubuntu-latest-cmake: runs-on: ubuntu-latest diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 5e38b3547..4ea09115a 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -57,7 +57,8 @@ jobs: cmake \ python3-pip \ wget \ - 
language-pack-en + language-pack-en \ + libcurl4-openssl-dev - name: Build id: cmake_build @@ -67,6 +68,7 @@ jobs: cmake .. \ -DLLAMA_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server @@ -101,12 +103,21 @@ jobs: with: fetch-depth: 0 + - name: libCURL + id: get_libcurl + env: + CURL_VERSION: 8.6.0_6 + run: | + curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" + mkdir $env:RUNNER_TEMP/libcurl + tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl + - name: Build id: cmake_build run: | mkdir build cd build - cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ; + cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup @@ -120,6 +131,11 @@ jobs: run: | pip install -r examples/server/tests/requirements.txt + - name: Copy Libcurl + id: prepare_libcurl + run: | + cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll + - name: Tests id: server_integration_tests if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ac2804a6..fc4cff28f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. 
batch size for using peer access") +option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) diff --git a/Makefile b/Makefile index c0f125036..838daf5c0 100644 --- a/Makefile +++ b/Makefile @@ -595,6 +595,11 @@ include scripts/get-flags.mk CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic endif +ifdef LLAMA_CURL +override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL +override LDFLAGS := $(LDFLAGS) -lcurl +endif + # # Print build information # diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 350bbdf7f..af2629a46 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -68,6 +68,17 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() +set(LLAMA_COMMON_EXTRA_LIBS build_info) + +# Use curl to download model url +if (LLAMA_CURL) + find_package(CURL REQUIRED) + add_definitions(-DLLAMA_USE_CURL) + include_directories(${CURL_INCLUDE_DIRS}) + find_library(CURL_LIBRARY curl REQUIRED) + set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) +endif () + target_include_directories(${TARGET} PUBLIC .) 
target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama) diff --git a/common/common.cpp b/common/common.cpp index 1b0ba8493..2f5d965d6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -37,6 +37,9 @@ #include #include #endif +#if defined(LLAMA_USE_CURL) +#include +#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -50,6 +53,18 @@ #define GGML_USE_CUBLAS_SYCL_VULKAN #endif +#if defined(LLAMA_USE_CURL) +#ifdef __linux__ +#include +#elif defined(_WIN32) +#define PATH_MAX MAX_PATH +#else +#include +#endif +#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX +#define LLAMA_CURL_MAX_HEADER_LENGTH 256 +#endif // LLAMA_USE_CURL + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -644,6 +659,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } params.model = argv[i]; } + if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_url = argv[i]; + } if (arg == "-md" || arg == "--model-draft") { arg_found = true; if (++i >= argc) { @@ -1368,6 +1390,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); + printf(" model download url (default: %s)\n", params.model_url.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); printf(" draft model for speculative decoding\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); @@ -1613,10 +1637,222 @@ void llama_batch_add( batch.n_tokens++; } +#ifdef LLAMA_USE_CURL + +struct llama_model * llama_load_model_from_url(const char * model_url, const 
char * path_model, + struct llama_model_params params) { + // Basic validation of the model_url + if (!model_url || strlen(model_url) == 0) { + fprintf(stderr, "%s: invalid model_url\n", __func__); + return NULL; + } + + // Initialize libcurl globally + auto curl = curl_easy_init(); + + if (!curl) { + fprintf(stderr, "%s: error initializing libcurl\n", __func__); + return NULL; + } + + // Set the URL, allow to follow http redirection + curl_easy_setopt(curl, CURLOPT_URL, model_url); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); +#if defined(_WIN32) + // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of + // operating system. Currently implemented under MS-Windows. + curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + + // Check if the file already exists locally + struct stat model_file_info; + auto file_exists = (stat(path_model, &model_file_info) == 0); + + // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files + char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model); + + char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model); + + if (file_exists) { + auto * f_etag = fopen(etag_path, "r"); + if (f_etag) { + if (!fgets(etag, sizeof(etag), f_etag)) { + fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); + } else { + fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag); + } + fclose(f_etag); + } + + auto * f_last_modified = fopen(last_modified_path, "r"); + if (f_last_modified) { + if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) { + fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path); + } else { + fprintf(stderr, "%s: previous 
model file found %s: %s\n", __func__, last_modified_path, + last_modified); + } + fclose(f_last_modified); + } + } + + // Send a HEAD request to retrieve the etag and last-modified headers + struct llama_load_model_from_url_headers { + char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + }; + llama_load_model_from_url_headers headers; + { + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { + llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; + + const char * etag_prefix = "etag: "; + if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { + strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF + } + + const char * last_modified_prefix = "last-modified: "; + if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { + strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), + n_items - strlen(last_modified_prefix) - 2); // Remove CRLF + } + return n_items; + }; + + curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + return NULL; + } + + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code != 200) { + // HEAD not supported, we don't know if the file has changed + // force trigger downloading + file_exists = false; + fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", 
__func__, http_code); + } + } + + // If the ETag or the Last-Modified headers are different: trigger a new download + if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { + char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model); + if (file_exists) { + fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model); + if (remove(path_model) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model); + return NULL; + } + } + + // Set the output file + auto * outfile = fopen(path_model_temporary, "wb"); + if (!outfile) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); + return NULL; + } + + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); + auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { + return fwrite(data, size, nmemb, (FILE *)fd); + }; + curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); + + // display download progress + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + + // start the download + fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + model_url, path_model, headers.etag, headers.last_modified); + auto res = curl_easy_perform(curl); + if (res != CURLE_OK) { + fclose(outfile); + curl_easy_cleanup(curl); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + return NULL; + } + + long http_code = 0; + curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code < 200 || http_code >= 400) { + fclose(outfile); + 
curl_easy_cleanup(curl); + fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); + return NULL; + } + + // Clean up + fclose(outfile); + + // Write the new ETag to the .etag file + if (strlen(headers.etag) > 0) { + auto * etag_file = fopen(etag_path, "w"); + if (etag_file) { + fputs(headers.etag, etag_file); + fclose(etag_file); + fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag); + } + } + + // Write the new lastModified to the .etag file + if (strlen(headers.last_modified) > 0) { + auto * last_modified_file = fopen(last_modified_path, "w"); + if (last_modified_file) { + fputs(headers.last_modified, last_modified_file); + fclose(last_modified_file); + fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path, + headers.last_modified); + } + } + + if (rename(path_model_temporary, path_model) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model); + return NULL; + } + } + + curl_easy_cleanup(curl); + + return llama_load_model_from_file(path_model, params); +} + +#else + +struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/, + struct llama_model_params /*params*/) { + fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + return nullptr; +} + +#endif // LLAMA_USE_CURL + std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + llama_model * model = nullptr; + if (!params.model_url.empty()) { + model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); + } else { + model = llama_load_model_from_file(params.model.c_str(), mparams); + } if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", 
__func__, params.model.c_str()); return std::make_tuple(nullptr, nullptr); diff --git a/common/common.h b/common/common.h index 687f3425e..8dd8a3edc 100644 --- a/common/common.h +++ b/common/common.h @@ -89,6 +89,7 @@ struct gpt_params { struct llama_sampling_params sparams; std::string model = "models/7B/ggml-model-f16.gguf"; // model path + std::string model_url = ""; // model url to download std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string prompt = ""; @@ -191,6 +192,9 @@ std::tuple llama_init_from_gpt_par struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, + struct llama_model_params params); + // Batch utils void llama_batch_clear(struct llama_batch & batch); diff --git a/examples/main/README.md b/examples/main/README.md index 7f84e4262..6a8d1e1c5 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. 
Adjusting this value can influence the length of the generated text. diff --git a/examples/server/README.md b/examples/server/README.md index 8f8454aff..755e1d538 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 895d608fd..d2a8e541d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co } printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); + printf(" model download url (default: %s)\n", params.model_url.c_str()); printf(" -a ALIAS, --alias ALIAS\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); @@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } params.model = argv[i]; + } else if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_url = argv[i]; } else if (arg == "-a" || arg == "--alias") { if (++i >= argc) { invalid_param = true; diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 95a0353b6..feb2b1d6c 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de To run a scenario annotated with `@bug`, start: ```shell -DEBUG=ON ./tests.sh --no-skipped --tags bug +DEBUG=ON ./tests.sh --no-skipped --tags bug --stop ``` After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated. 
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index 57359b267..dcf1434f9 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -4,7 +4,8 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models + And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf + And a model file ggml-model-f16.gguf And a model alias bert-bge-small And 42 as server seed And 2 slots diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 8ad987e1b..82104e920 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -1,10 +1,12 @@ -import errno import os -import socket -import subprocess -import time -from contextlib import closing import signal +import socket +import sys +import time +import traceback +from contextlib import closing + +import psutil def before_scenario(context, scenario): @@ -20,33 +22,40 @@ def before_scenario(context, scenario): def after_scenario(context, scenario): - if context.server_process is None: - return - if scenario.status == "failed": - if 'GITHUB_ACTIONS' in os.environ: - print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") - if os.path.isfile('llama.log'): - with closing(open('llama.log', 'r')) as f: - for line in f: - print(line) - if not is_server_listening(context.server_fqdn, context.server_port): - print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") + try: + if 'server_process' not in context or context.server_process is None: + return + if scenario.status == "failed": + if 'GITHUB_ACTIONS' in os.environ: + print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") + if 
os.path.isfile('llama.log'): + with closing(open('llama.log', 'r')) as f: + for line in f: + print(line) + if not is_server_listening(context.server_fqdn, context.server_port): + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") - if not pid_exists(context.server_process.pid): - assert False, f"Server not running pid={context.server_process.pid} ..." + if not pid_exists(context.server_process.pid): + assert False, f"Server not running pid={context.server_process.pid} ..." - server_graceful_shutdown(context) + server_graceful_shutdown(context) - # Wait few for socket to free up - time.sleep(0.05) + # Wait few for socket to free up + time.sleep(0.05) - attempts = 0 - while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port): - server_kill(context) - time.sleep(0.1) - attempts += 1 - if attempts > 5: - server_kill_hard(context) + attempts = 0 + while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port): + server_kill(context) + time.sleep(0.1) + attempts += 1 + if attempts > 5: + server_kill_hard(context) + except: + exc = sys.exception() + print("error in after scenario: \n") + print(exc) + print("*** print_tb: \n") + traceback.print_tb(exc.__traceback__, file=sys.stdout) def server_graceful_shutdown(context): @@ -67,11 +76,11 @@ def server_kill_hard(context): path = context.server_path print(f"Server dangling exits, hard killing force {pid}={path}...\n") - if os.name == 'nt': - process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode() - print(process) - else: - os.kill(-pid, signal.SIGKILL) + try: + psutil.Process(pid).kill() + except psutil.NoSuchProcess: + return False + return True def is_server_listening(server_fqdn, server_port): @@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port): def pid_exists(pid): - """Check whether pid exists in the current process table.""" - if pid < 0: + try: + psutil.Process(pid) + 
except psutil.NoSuchProcess: return False - if os.name == 'nt': - output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode() - print(output) - return "No tasks are running" not in output - else: - try: - os.kill(pid, 0) - except OSError as e: - return e.errno == errno.EPERM - else: - return True + return True + diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 5014f326d..7448986e7 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -4,7 +4,8 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf + And a model file stories260K.gguf And a model alias tinyllama-2 And 42 as server seed # KV Cache corresponds to the total amount of tokens diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index a59a52d21..9e348d5fc 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -5,6 +5,8 @@ import os import re import socket import subprocess +import sys +import threading import time from contextlib import closing from re import RegexFlag @@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port): context.base_url = f'http://{context.server_fqdn}:{context.server_port}' context.model_alias = None + context.model_file = None + context.model_url = None context.n_batch = None context.n_ubatch = None context.n_ctx = None @@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo): print(f"model file: {context.model_file}\n") +@step('a model file {model_file}') +def step_model_file(context, model_file): + context.model_file = model_file + + +@step('a model url {model_url}') +def 
step_model_url(context, model_url): + context.model_url = model_url + + @step('a model alias {model_alias}') def step_model_alias(context, model_alias): context.model_alias = model_alias @@ -141,7 +155,8 @@ def step_start_server(context): async def step_wait_for_the_server_to_be_started(context, expecting_status): match expecting_status: case 'healthy': - await wait_for_health_status(context, context.base_url, 200, 'ok') + await wait_for_health_status(context, context.base_url, 200, 'ok', + timeout=30) case 'ready' | 'idle': await wait_for_health_status(context, context.base_url, 200, 'ok', @@ -1038,8 +1053,11 @@ def start_server_background(context): server_args = [ '--host', server_listen_addr, '--port', context.server_port, - '--model', context.model_file ] + if context.model_file: + server_args.extend(['--model', context.model_file]) + if context.model_url: + server_args.extend(['--model-url', context.model_url]) if context.n_batch: server_args.extend(['--batch-size', context.n_batch]) if context.n_ubatch: @@ -1079,8 +1097,23 @@ def start_server_background(context): pkwargs = { 'creationflags': flags, + 'stdout': subprocess.PIPE, + 'stderr': subprocess.PIPE } context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], **pkwargs) + + def log_stdout(process): + for line in iter(process.stdout.readline, b''): + print(line.decode('utf-8'), end='') + thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,)) + thread_stdout.start() + + def log_stderr(process): + for line in iter(process.stderr.readline, b''): + print(line.decode('utf-8'), end='', file=sys.stderr) + thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,)) + thread_stderr.start() + print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}") diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 2e4f42ad2..c2c960102 100644 --- 
a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -3,4 +3,5 @@ behave~=1.2.6 huggingface_hub~=0.20.3 numpy~=1.24.4 openai~=0.25.0 +psutil~=5.9.8 prometheus-client~=0.20.0 From 3a6efdd03c46c5ba08e43880d34260c02dd9999b Mon Sep 17 00:00:00 2001 From: Romain D <90720+Artefact2@users.noreply.github.com> Date: Mon, 18 Mar 2024 09:04:41 +0100 Subject: [PATCH 42/56] convert : use f32 outtype for bf16 tensors (#6106) The old behaviour is to use f16, but bf16 to f16 is not a lossless conversion. Change the outtype to f32 to default to a lossless conversion. --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 161430f3e..817cb6612 100755 --- a/convert.py +++ b/convert.py @@ -1167,9 +1167,9 @@ class OutputFile: def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type - if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): + if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): return GGMLFileType.AllF32 - if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): return GGMLFileType.MostlyF16 if output_type_str == "q8_0": return GGMLFileType.MostlyQ8_0 From 9b03719ad712e2dc36c5c0c20f352bf3e4bda332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9rence?= <13496987+Royalphax@users.noreply.github.com> Date: Mon, 18 Mar 2024 09:17:00 +0100 Subject: [PATCH 43/56] convert : add support for CamembertModel architecture (#6119) Adding support for CamembertModel architecture used by : https://huggingface.co/dangvantuan/sentence-camembert-large --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 
cf1f98d66..1e49d56c1 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1634,7 +1634,7 @@ in chat mode so that the conversation can end normally.") self.post_write_tensors(tensor_map, name, data_torch) -@Model.register("BertModel") +@Model.register("BertModel", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT From 496bc79bc2b79bfd6124b8687a8dbd6a646e9b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Mon, 18 Mar 2024 04:27:44 -0400 Subject: [PATCH 44/56] common : tidy-up argument parsing (#6105) * Tidy-up argument parsing. * Missing ref. * common : minor * common : add static classifier --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 2057 +++++++++++++++++++++++---------------------- 1 file changed, 1035 insertions(+), 1022 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 2f5d965d6..919182862 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -154,6 +154,1040 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } +static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int & i, bool & invalid_param) { + std::string arg = argv[i]; + llama_sampling_params& sparams = params.sparams; + + if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.seed = std::stoul(argv[i]); + return true; + } + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-tb" || arg == "--threads-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_batch = std::stoi(argv[i]); + if (params.n_threads_batch <= 0) { + params.n_threads_batch = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-td" || arg == 
"--threads-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.prompt = argv[i]; + return true; + } + if (arg == "-e" || arg == "--escape") { + params.escape = true; + return true; + } + if (arg == "--prompt-cache") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.path_prompt_cache = argv[i]; + return true; + } + if (arg == "--prompt-cache-all") { + params.prompt_cache_all = true; + return true; + } + if (arg == "--prompt-cache-ro") { + params.prompt_cache_ro = true; + return true; + } + if (arg == "-bf" || arg == "--binary-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); + return true; + } + if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + 
params.prompt_file = argv[i]; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + return true; + } + if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_predict = std::stoi(argv[i]); + return true; + } + if (arg == "--top-k") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.top_k = std::stoi(argv[i]); + return true; + } + if (arg == "-c" || arg == "--ctx-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-n" || arg == "-gan") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.grp_attn_n = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.grp_attn_w = std::stoi(argv[i]); + return true; + } + if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_base = std::stof(argv[i]); + return true; + } + if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_scale = std::stof(argv[i]); + return true; + } + if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { invalid_param = true; } + return true; + } + if (arg == "--rope-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_scale = 1.0f / 
std::stof(argv[i]); + return true; + } + if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_orig_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_ext_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_attn_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_beta_fast = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_beta_slow = std::stof(argv[i]); + return true; + } + if (arg == "--pooling") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else { invalid_param = true; } + return true; + } + if (arg == "--defrag-thold" || arg == "-dt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.defrag_thold = std::stof(argv[i]); + return true; + } + if (arg == "--samplers") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); + return true; + } + if (arg == "--sampling-seq") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.samplers_sequence = sampler_types_from_chars(argv[i]); + return true; + } + if (arg == "--top-p") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.top_p = 
std::stof(argv[i]); + return true; + } + if (arg == "--min-p") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.min_p = std::stof(argv[i]); + return true; + } + if (arg == "--temp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); + return true; + } + if (arg == "--tfs") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.tfs_z = std::stof(argv[i]); + return true; + } + if (arg == "--typical") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.typical_p = std::stof(argv[i]); + return true; + } + if (arg == "--repeat-last-n") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + return true; + } + if (arg == "--repeat-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_repeat = std::stof(argv[i]); + return true; + } + if (arg == "--frequency-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_freq = std::stof(argv[i]); + return true; + } + if (arg == "--presence-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_present = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-range") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.dynatemp_range = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-exp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.dynatemp_exponent = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.mirostat = std::stoi(argv[i]); + return true; + } + if (arg == "--mirostat-lr") { + if (++i >= argc) { + invalid_param = true; + return true; + } + 
sparams.mirostat_eta = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat-ent") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.mirostat_tau = std::stof(argv[i]); + return true; + } + if (arg == "--cfg-negative-prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.cfg_negative_prompt = argv[i]; + return true; + } + if (arg == "--cfg-negative-prompt-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); + if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { + sparams.cfg_negative_prompt.pop_back(); + } + return true; + } + if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.cfg_scale = std::stof(argv[i]); + return true; + } + if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_batch = std::stoi(argv[i]); + return true; + } + if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_ubatch = std::stoi(argv[i]); + return true; + } + if (arg == "--keep") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_keep = std::stoi(argv[i]); + return true; + } + if (arg == "--draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_draft = std::stoi(argv[i]); + return true; + } + if (arg == "--chunks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_chunks = std::stoi(argv[i]); + return true; + } + if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_parallel = 
std::stoi(argv[i]); + return true; + } + if (arg == "-ns" || arg == "--sequences") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_sequences = std::stoi(argv[i]); + return true; + } + if (arg == "--p-split" || arg == "-ps") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.p_split = std::stof(argv[i]); + return true; + } + if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model = argv[i]; + return true; + } + if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_url = argv[i]; + return true; + } + if (arg == "-md" || arg == "--model-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_draft = argv[i]; + return true; + } + if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_alias = argv[i]; + return true; + } + if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_adapter.emplace_back(argv[i], 1.0f); + params.use_mmap = false; + return true; + } + if (arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const char* lora_adapter = argv[i]; + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.use_mmap = false; + return true; + } + if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_base = argv[i]; + return true; + } + if (arg == "--control-vector") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vectors.push_back({ 1.0f, argv[i], }); + return true; + } + if (arg == "--control-vector-scaled") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const char* fname = argv[i]; + if (++i >= argc) { + invalid_param = 
true; + return true; + } + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + return true; + } + if (arg == "--control-vector-layer-range") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vector_layer_end = std::stoi(argv[i]); + return true; + } + if (arg == "--mmproj") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.mmproj = argv[i]; + return true; + } + if (arg == "--image") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.image = argv[i]; + return true; + } + if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + return true; + } + if (arg == "--embedding") { + params.embedding = true; + return true; + } + if (arg == "--interactive-first") { + params.interactive_first = true; + return true; + } + if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; + return true; + } + if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; + return true; + } + if (arg == "--infill") { + params.infill = true; + return true; + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + params.dump_kv_cache = true; + return true; + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + return true; + } + if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + return true; + } + if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; + return true; + } + if (arg == "--multiline-input") { + params.multiline_input = true; + return true; + } + if (arg == "--simple-io") { + params.simple_io = true; + return true; + } + if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + return true; + } + if (arg == "--color") { + params.use_color = true; + return true; + } + if (arg == "--mlock") { + params.use_mlock = 
true; + return true; + } + if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_gpu_layers = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + return true; + } + if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_gpu_layers_draft = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + return true; + } + if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.main_gpu = std::stoi(argv[i]); +#ifndef GGML_USE_CUBLAS_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL + return true; + } + if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } + else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. 
It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + invalid_param = true; + return true; + } +#ifndef GGML_USE_CUBLAS_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL + return true; + } + if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + invalid_param = true; + return true; + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } + else { + params.tensor_split[i] = 0.0f; + } + } +#ifndef GGML_USE_CUBLAS_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. 
Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL + return true; + } + if (arg == "--no-mmap") { + params.use_mmap = false; + return true; + } + if (arg == "--numa") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; } + return true; + } + if (arg == "--verbose-prompt") { + params.verbose_prompt = true; + return true; + } + if (arg == "--no-display-prompt") { + params.display_prompt = false; + return true; + } + if (arg == "-r" || arg == "--reverse-prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.antiprompt.emplace_back(argv[i]); + return true; + } + if (arg == "-ld" || arg == "--logdir") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.logdir = argv[i]; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + return true; + } + if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.logits_file = argv[i]; + return true; + } + if (arg == "--perplexity" || arg == "--all-logits") { + params.logits_all = true; + return true; + } + if (arg == "--ppl-stride") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ppl_stride = std::stoi(argv[i]); + return true; + } + if (arg == "-ptc" || arg == "--print-token-count") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_print = std::stoi(argv[i]); + return true; + } + if (arg == "--ppl-output-type") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ppl_output_type = std::stoi(argv[i]); + return true; + } + if (arg == 
"--hellaswag") { + params.hellaswag = true; + return true; + } + if (arg == "--hellaswag-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hellaswag_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--winogrande") { + params.winogrande = true; + return true; + } + if (arg == "--winogrande-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.winogrande_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--multiple-choice") { + params.multiple_choice = true; + return true; + } + if (arg == "--multiple-choice-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.multiple_choice_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--kl-divergence") { + params.kl_divergence = true; + return true; + } + if (arg == "--ignore-eos") { + params.ignore_eos = true; + return true; + } + if (arg == "--no-penalize-nl") { + sparams.penalize_nl = false; + return true; + } + if (arg == "-l" || arg == "--logit-bias") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::stringstream ss(argv[i]); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? 
-1.0f : 1.0f); + } + else { + throw std::exception(); + } + } + catch (const std::exception&) { + invalid_param = true; + return true; + } + return true; + } + if (arg == "-h" || arg == "--help") { + return false; + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + if (arg == "--random-prompt") { + params.random_prompt = true; + return true; + } + if (arg == "--in-prefix-bos") { + params.input_prefix_bos = true; + return true; + } + if (arg == "--in-prefix") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.input_prefix = argv[i]; + return true; + } + if (arg == "--in-suffix") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.input_suffix = argv[i]; + return true; + } + if (arg == "--grammar") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.grammar = argv[i]; + return true; + } + if (arg == "--grammar-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(sparams.grammar) + ); + return true; + } + if (arg == "--override-kv") { + if (++i >= argc) { + invalid_param = true; + return true; + } + char* sep = strchr(argv[i], '='); + if (sep == nullptr || sep - argv[i] >= 128) { + fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + struct llama_model_kv_override kvo; + std::strncpy(kvo.key, argv[i], sep - argv[i]); + kvo.key[sep - argv[i]] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = std::atol(sep); + } + else if (strncmp(sep, "float:", 6) == 0) { + 
sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.float_value = std::atof(sep); + } + else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } + else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } + else { + fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + } + else { + fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + params.kv_overrides.push_back(kvo); + return true; + } +#ifndef LOG_DISABLE_LOGS + // Parse args for logging parameters + if (log_param_single_parse(argv[i])) { + // Do nothing, log_param_single_parse automatically does it's thing + // and returns if a match was found and parsed. + return true; + } + if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { + // We have a matching known parameter requiring an argument, + // now we need to check if there is anything after this argv + // and flag invalid_param or parse it. 
+ if (++i >= argc) { + invalid_param = true; + return true; + } + if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { + invalid_param = true; + return true; + } + return true; + } + // End of Parse args for logging parameters +#endif // LOG_DISABLE_LOGS + + return false; +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -166,1028 +1200,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::replace(arg.begin(), arg.end(), '_', '-'); } - bool arg_found = false; - if (arg == "-s" || arg == "--seed") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.seed = std::stoul(argv[i]); - } - if (arg == "-t" || arg == "--threads") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); - } - } - if (arg == "-tb" || arg == "--threads-batch") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); - } - } - if (arg == "-td" || arg == "--threads-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); - } - } - if (arg == "-tbd" || arg == "--threads-batch-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - params.n_threads_batch_draft = std::thread::hardware_concurrency(); - } - } - if (arg == "-p" || arg == "--prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; 
- break; - } - params.prompt = argv[i]; - } - if (arg == "-e" || arg == "--escape") { - arg_found = true; - params.escape = true; - } - if (arg == "--prompt-cache") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.path_prompt_cache = argv[i]; - } - if (arg == "--prompt-cache-all") { - arg_found = true; - params.prompt_cache_all = true; - } - if (arg == "--prompt-cache-ro") { - arg_found = true; - params.prompt_cache_ro = true; - } - if (arg == "-bf" || arg == "--binary-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - } - if (arg == "-f" || arg == "--file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } - if (arg == "-n" || arg == "--n-predict") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_predict = std::stoi(argv[i]); - } - if (arg == "--top-k") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.top_k = std::stoi(argv[i]); - } - if (arg == "-c" || arg == "--ctx-size") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - 
params.n_ctx = std::stoi(argv[i]); - } - if (arg == "--grp-attn-n" || arg == "-gan") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_n = std::stoi(argv[i]); - } - if (arg == "--grp-attn-w" || arg == "-gaw") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_w = std::stoi(argv[i]); - } - if (arg == "--rope-freq-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_base = std::stof(argv[i]); - } - if (arg == "--rope-freq-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = std::stof(argv[i]); - } - if (arg == "--rope-scaling") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; break; } - } - if (arg == "--rope-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } - if (arg == "--yarn-orig-ctx") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_orig_ctx = std::stoi(argv[i]); - } - if (arg == "--yarn-ext-factor") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_ext_factor = std::stof(argv[i]); - } - if (arg == "--yarn-attn-factor") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_attn_factor = std::stof(argv[i]); - } - if (arg == "--yarn-beta-fast") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_fast = std::stof(argv[i]); - } - if 
(arg == "--yarn-beta-slow") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_slow = std::stof(argv[i]); - } - if (arg == "--pooling") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else { invalid_param = true; break; } - } - if (arg == "--defrag-thold" || arg == "-dt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.defrag_thold = std::stof(argv[i]); - } - if (arg == "--samplers") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); - } - if (arg == "--sampling-seq") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.samplers_sequence = sampler_types_from_chars(argv[i]); - } - if (arg == "--top-p") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.top_p = std::stof(argv[i]); - } - if (arg == "--min-p") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.min_p = std::stof(argv[i]); - } - if (arg == "--temp") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.temp = std::stof(argv[i]); - sparams.temp = std::max(sparams.temp, 0.0f); - } - if (arg == "--tfs") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.tfs_z = std::stof(argv[i]); - } - if (arg == "--typical") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.typical_p = std::stof(argv[i]); - } - if (arg == "--repeat-last-n") { - arg_found = 
true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_last_n = std::stoi(argv[i]); - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - } - if (arg == "--repeat-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_repeat = std::stof(argv[i]); - } - if (arg == "--frequency-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_freq = std::stof(argv[i]); - } - if (arg == "--presence-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_present = std::stof(argv[i]); - } - if (arg == "--dynatemp-range") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.dynatemp_range = std::stof(argv[i]); - } - if (arg == "--dynatemp-exp") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.dynatemp_exponent = std::stof(argv[i]); - } - if (arg == "--mirostat") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat = std::stoi(argv[i]); - } - if (arg == "--mirostat-lr") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat_eta = std::stof(argv[i]); - } - if (arg == "--mirostat-ent") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat_tau = std::stof(argv[i]); - } - if (arg == "--cfg-negative-prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.cfg_negative_prompt = argv[i]; - } - if (arg == "--cfg-negative-prompt-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), 
back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); - } - } - if (arg == "--cfg-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.cfg_scale = std::stof(argv[i]); - } - if (arg == "-b" || arg == "--batch-size") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - } - if (arg == "-ub" || arg == "--ubatch-size") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ubatch = std::stoi(argv[i]); - } - if (arg == "--keep") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_keep = std::stoi(argv[i]); - } - if (arg == "--draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_draft = std::stoi(argv[i]); - } - if (arg == "--chunks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_chunks = std::stoi(argv[i]); - } - if (arg == "-np" || arg == "--parallel") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_parallel = std::stoi(argv[i]); - } - if (arg == "-ns" || arg == "--sequences") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_sequences = std::stoi(argv[i]); - } - if (arg == "--p-split" || arg == "-ps") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.p_split = std::stof(argv[i]); - } - if (arg == "-m" || arg == "--model") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } - if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_url = argv[i]; - } - if (arg == "-md" || arg == "--model-draft") { - arg_found = true; - if (++i >= argc) { 
- invalid_param = true; - break; - } - params.model_draft = argv[i]; - } - if (arg == "-a" || arg == "--alias") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } - if (arg == "--lora") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; - } - if (arg == "--lora-scaled") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - const char * lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; - } - if (arg == "--lora-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } - if (arg == "--control-vector") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vectors.push_back({ 1.0f, argv[i], }); - } - if (arg == "--control-vector-scaled") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - const char * fname = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vectors.push_back({ std::stof(argv[i]), fname, }); - } - if (arg == "--control-vector-layer-range") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vector_layer_start = std::stoi(argv[i]); - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vector_layer_end = std::stoi(argv[i]); - } - if (arg == "--mmproj") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.mmproj = argv[i]; - } - if (arg == "--image") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.image = argv[i]; - } - if (arg == "-i" || arg == "--interactive") { - arg_found = true; - params.interactive = true; - } - if (arg 
== "--embedding") { - arg_found = true; - params.embedding = true; - } - if (arg == "--interactive-first") { - arg_found = true; - params.interactive_first = true; - } - if (arg == "-ins" || arg == "--instruct") { - arg_found = true; - params.instruct = true; - } - if (arg == "-cml" || arg == "--chatml") { - arg_found = true; - params.chatml = true; - } - if (arg == "--infill") { - arg_found = true; - params.infill = true; - } - if (arg == "-dkvc" || arg == "--dump-kv-cache") { - arg_found = true; - params.dump_kv_cache = true; - } - if (arg == "-nkvo" || arg == "--no-kv-offload") { - arg_found = true; - params.no_kv_offload = true; - } - if (arg == "-ctk" || arg == "--cache-type-k") { - arg_found = true; - params.cache_type_k = argv[++i]; - } - if (arg == "-ctv" || arg == "--cache-type-v") { - arg_found = true; - params.cache_type_v = argv[++i]; - } - if (arg == "--multiline-input") { - arg_found = true; - params.multiline_input = true; - } - if (arg == "--simple-io") { - arg_found = true; - params.simple_io = true; - } - if (arg == "-cb" || arg == "--cont-batching") { - arg_found = true; - params.cont_batching = true; - } - if (arg == "--color") { - arg_found = true; - params.use_color = true; - } - if (arg == "--mlock") { - arg_found = true; - params.use_mlock = true; - } - if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_gpu_layers = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_gpu_layers_draft = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) 
{ - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - if (arg == "--main-gpu" || arg == "-mg") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL - } - if (arg == "--split-mode" || arg == "-sm") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } else { - invalid_param = true; - break; - } -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. 
Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL - - } - if (arg == "--tensor-split" || arg == "-ts") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - if (split_arg.size() >= llama_max_devices()) { - invalid_param = true; - break; - } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } else { - params.tensor_split[i] = 0.0f; - } - } -#ifndef GGML_USE_CUBLAS_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL - } - if (arg == "--no-mmap") { - arg_found = true; - params.use_mmap = false; - } - if (arg == "--numa") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } - } - if (arg == "--verbose-prompt") { - arg_found = true; - params.verbose_prompt = true; - } - if (arg == "--no-display-prompt") { - arg_found = true; - params.display_prompt = false; - } - if (arg == "-r" || arg == "--reverse-prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.antiprompt.emplace_back(argv[i]); - } - if (arg == "-ld" || arg == "--logdir") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.logdir = argv[i]; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += 
DIRECTORY_SEPARATOR; - } - } - if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.logits_file = argv[i]; - } - if (arg == "--perplexity" || arg == "--all-logits") { - arg_found = true; - params.logits_all = true; - } - if (arg == "--ppl-stride") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.ppl_stride = std::stoi(argv[i]); - } - if (arg == "-ptc" || arg == "--print-token-count") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_print = std::stoi(argv[i]); - } - if (arg == "--ppl-output-type") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.ppl_output_type = std::stoi(argv[i]); - } - if (arg == "--hellaswag") { - arg_found = true; - params.hellaswag = true; - } - if (arg == "--hellaswag-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.hellaswag_tasks = std::stoi(argv[i]); - } - if (arg == "--winogrande") { - arg_found = true; - params.winogrande = true; - } - if (arg == "--winogrande-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.winogrande_tasks = std::stoi(argv[i]); - } - if (arg == "--multiple-choice") { - arg_found = true; - params.multiple_choice = true; - } - if (arg == "--multiple-choice-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.multiple_choice_tasks = std::stoi(argv[i]); - } - if (arg == "--kl-divergence") { - arg_found = true; - params.kl_divergence = true; - } - if (arg == "--ignore-eos") { - arg_found = true; - params.ignore_eos = true; - } - if (arg == "--no-penalize-nl") { - arg_found = true; - sparams.penalize_nl = false; - } - if (arg == "-l" || arg == "--logit-bias") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::stringstream 
ss(argv[i]); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - } else { - throw std::exception(); - } - } catch (const std::exception&) { - invalid_param = true; - break; - } - } - if (arg == "-h" || arg == "--help") { - arg_found = true; - return false; - } - if (arg == "--version") { - arg_found = true; - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - if (arg == "--random-prompt") { - arg_found = true; - params.random_prompt = true; - } - if (arg == "--in-prefix-bos") { - arg_found = true; - params.input_prefix_bos = true; - } - if (arg == "--in-prefix") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.input_prefix = argv[i]; - } - if (arg == "--in-suffix") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.input_suffix = argv[i]; - } - if (arg == "--grammar") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.grammar = argv[i]; - } - if (arg == "--grammar-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) - ); - } - if (arg == "--override-kv") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - char * sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - struct llama_model_kv_override 
kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - params.kv_overrides.push_back(kvo); -#ifndef LOG_DISABLE_LOGS - // Parse args for logging parameters - } - if ( log_param_single_parse( argv[i] ) ) { - arg_found = true; - // Do nothing, log_param_single_parse automatically does it's thing - // and returns if a match was found and parsed. - } - if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { - arg_found = true; - // We have a matching known parameter requiring an argument, - // now we need to check if there is anything after this argv - // and flag invalid_param or parse it. 
- if (++i >= argc) { - invalid_param = true; - break; - } - if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) { - invalid_param = true; - break; - } - // End of Parse args for logging parameters -#endif // LOG_DISABLE_LOGS - } - - if (!arg_found) { + if (!gpt_params_find_arg(argc, argv, params, i, invalid_param)) { throw std::invalid_argument("error: unknown argument: " + arg); } } From 2bf8d0f7c4cc1235755ad06961ca761e458c5e55 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 18 Mar 2024 11:03:04 +0100 Subject: [PATCH 45/56] backend : offload large batches to GPU (#6083) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * backend : offload large batches to GPU * fix hip * code cleanup * fix CUDA split buffers * Update ggml-backend-impl.h Co-authored-by: Johannes Gäßler * cuda : fix memset without set_device * imatrix : remove sched affix from weight names * sched : add a new split if the current one has too many inputs reduce max inputs per split more cleanup * update backends ggml-ci --------- Co-authored-by: Johannes Gäßler --- examples/imatrix/imatrix.cpp | 32 ++- examples/llama-bench/llama-bench.cpp | 4 +- ggml-alloc.c | 10 +- ggml-backend-impl.h | 5 + ggml-backend.c | 278 ++++++++++++++----------- ggml-backend.h | 8 +- ggml-cuda.cu | 297 +++++++++------------------ ggml-cuda.h | 21 +- ggml-kompute.cpp | 1 + ggml-metal.m | 1 + ggml-sycl.cpp | 1 + ggml-vulkan.cpp | 1 + ggml.c | 19 +- llama.cpp | 67 +++--- 14 files changed, 349 insertions(+), 396 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index f21bc48f3..ea79b9062 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const struct ggml_tensor * src0 = t->src[0]; const struct ggml_tensor * src1 = t->src[1]; + std::string wname; + { + // remove any prefix and suffixes from the name + 
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight + const char * p = strchr(src0->name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = src0->name; + } + } + // when ask is true, the scheduler wants to know if we are interested in data from this tensor // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications if (t->op != GGML_OP_MUL_MAT) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; return true; } @@ -94,12 +112,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // this is necessary to guarantee equal number of "ncall" for each tensor for (int ex = 0; ex < n_as; ++ex) { src0 = t->src[2 + ex]; - auto& e = m_stats[src0->name]; + auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ASSERT(false); } // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger @@ -107,7 +125,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; ++e.ncall; if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", 
__func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { const int excur = m_ids[row*n_as + idx]; @@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } } } else { - auto& e = m_stats[src0->name]; + auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ASSERT(false); } ++e.ncall; if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { const float * x = data + row * src1->ne[0]; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 32eea7869..4cb230804 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -114,10 +114,10 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::string id; #ifdef GGML_USE_CUBLAS - int count = ggml_cuda_get_device_count(); + int count = ggml_backend_cuda_get_device_count(); for (int i = 0; i < count; i++) { char buf[128]; - ggml_cuda_get_device_description(i, buf, sizeof(buf)); + ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); id += buf; if (i 
< count - 1) { id += "/"; diff --git a/ggml-alloc.c b/ggml-alloc.c index 8ac1d3e51..643b2e55f 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view(node)) { + // TODO: better way to add external dependencies + // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to + // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node + // itself is never used and should not be considered a dependency + if (ggml_is_view(node) && node->op != GGML_OP_NONE) { struct ggml_tensor * view_src = node->view_src; ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; } @@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr ggml_gallocr_hash_get(galloc, src)->n_children += 1; - // allocate explicit inputs and leafs - if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) { + // allocate explicit inputs + if (src->flags & GGML_TENSOR_FLAG_INPUT) { ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i)); } } diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index e475e20e5..f121e1de4 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -103,6 +103,11 @@ extern "C" { // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer + // these should be expensive operations with large batch sizes that may benefit from running on this backend + // even if the weight has to be copied from the CPU temporarily + bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // (optional) event 
synchronization ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); void (*GGML_CALL event_free) (ggml_backend_event_t event); diff --git a/ggml-backend.c b/ggml-backend.c index 31f8d5a6d..9f0084df7 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_ return err; } -bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_compute(backend, cgraph); } @@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * return backend->iface.supports_op(backend, op); } +bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { + if (backend->iface.offload_op != NULL) { + return backend->iface.offload_op(backend, op); + } + return false; +} + // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg if (cpu_plan->cplan.work_size > 0) { cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + if (cpu_plan->cplan.work_data == NULL) { + free(cpu_plan); + return NULL; + } } cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; @@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = { /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, @@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) { #endif #ifndef GGML_SCHED_MAX_SPLITS -#define GGML_SCHED_MAX_SPLITS 256 +#define GGML_SCHED_MAX_SPLITS 2048 #endif #ifndef 
GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS 16 +#define GGML_SCHED_MAX_SPLIT_INPUTS 4 #endif #ifndef GGML_SCHED_MAX_COPIES @@ -1043,8 +1055,9 @@ struct ggml_backend_sched { struct ggml_cgraph * graph; // graph splits - struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS]; + struct ggml_backend_sched_split * splits; int n_splits; + int splits_capacity; // pipeline parallelism support int n_copies; @@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // TODO: use supports_op to check if the backend supports the op // assign pre-allocated nodes to their backend - // dst - int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor); - if (cur_backend != -1) { + int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor); + if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.dst"); - return cur_backend; + return cur_backend_id; } // view_src if (tensor->view_src != NULL) { - cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); - if (cur_backend != -1) { + cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); + if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.vsrc"); - return cur_backend; + return cur_backend_id; } } - // input + // graph input if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - cur_backend = sched->n_backends - 1; // last backend (assumed CPU) + cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU) SET_CAUSE(tensor, "1.inp"); - return cur_backend; + return cur_backend_id; } // assign nodes that use weights to the backend of the weights + // operations with weights are preferably run on the same backend as the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; if (src == NULL) { continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend = ggml_backend_sched_backend_from_buffer(sched, 
src); - // operations with weights are always run on the same backend as the weights + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src); + // check if a backend with higher prio wants to offload the op + if (src_backend_id == sched->n_backends - 1) { + for (int b = 0; b < src_backend_id; b++) { + if (ggml_backend_offload_op(sched->backends[b], tensor)) { + SET_CAUSE(tensor, "1.off"); + return b; + } + } + } SET_CAUSE(tensor, "1.wgt%d", i); - return src_backend; + return src_backend_id; } } @@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; - if (tensor_backend_id(leaf) != -1) { + int * leaf_backend_id = &tensor_backend_id(leaf); + if (*leaf_backend_id != -1) { // do not overwrite user assignments continue; } - tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf); + *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (tensor_backend_id(node) != -1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { // do not overwrite user assignments continue; } - tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node); + *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); // src for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } - if (tensor_backend_id(src) == -1) { - tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src); + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { + *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); } } } @@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t 
sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - if (tensor_backend_id == sched->n_backends - 1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; } else { - cur_backend_id = tensor_backend_id; + cur_backend_id = *node_backend_id; } } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.2"); } } } - // pass 2.1 expand gpu up { int cur_backend_id = -1; @@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - if (tensor_backend_id == sched->n_backends - 1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; } else { - cur_backend_id = tensor_backend_id; + cur_backend_id = *node_backend_id; } } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.1"); } } } - - // pass 2.4 expand rest down { int cur_backend_id = -1; @@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - cur_backend_id = tensor_backend_id; + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + cur_backend_id = *node_backend_id; } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.4"); } } } - // pass 2.3 expand rest up + // pass 2.3 expand rest up { int cur_backend_id = -1; for (int i 
= graph->n_nodes - 1; i >= 0; i--) { @@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - cur_backend_id = tensor_backend_id; + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + cur_backend_id = *node_backend_id; } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.3"); } } @@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - int cur_backend_id = tensor_backend_id(node); - if (node->view_src != NULL && cur_backend_id == -1) { - cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src); + int * cur_backend_id = &tensor_backend_id(node); + if (node->view_src != NULL && *cur_backend_id == -1) { + *cur_backend_id = tensor_backend_id(node->view_src); SET_CAUSE(node, "3.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src == NULL) { continue; } - int src_backend_id = tensor_backend_id(src); - if (src_backend_id == -1) { + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { if (src->view_src != NULL) { // views are always on the same backend as the source - tensor_backend_id(src) = tensor_backend_id(src->view_src); + *src_backend_id = tensor_backend_id(src->view_src); SET_CAUSE(src, "3.vsrc"); } else { - tensor_backend_id(src) = cur_backend_id; + *src_backend_id = *cur_backend_id; SET_CAUSE(src, "3.cur"); } } @@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 4: split graph, find tensors 
that need to be copied { - int cur_split = 0; + int i_split = 0; + struct ggml_backend_sched_split * split = &sched->splits[0]; // find the backend of the first split, skipping view ops for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { - sched->splits[0].backend_id = tensor_backend_id(node); + split->backend_id = tensor_backend_id(node); break; } } - sched->splits[0].i_start = 0; - sched->splits[0].n_inputs = 0; - memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - int cur_backend_id = sched->splits[0].backend_id; + split->i_start = 0; + split->n_inputs = 0; + memset(split->inputs, 0, sizeof(split->inputs)); //HACK + int cur_backend_id = split->backend_id; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - int tensor_backend_id = tensor_backend_id(node); + const int node_backend_id = tensor_backend_id(node); - GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now + GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now - if (tensor_backend_id != cur_backend_id) { - sched->splits[cur_split].i_end = i; - cur_split++; - GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS); - sched->splits[cur_split].backend_id = tensor_backend_id; - sched->splits[cur_split].i_start = i; - sched->splits[cur_split].n_inputs = 0; - cur_backend_id = tensor_backend_id; + // check if we should start a new split based on the sources of the current node + bool need_new_split = false; + if (node_backend_id == cur_backend_id && split->n_inputs > 0) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + // check if a weight is on a different backend + // by starting a new split, the memory of the previously offloaded weights can be reused 
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + int src_backend_id = tensor_backend_id(src); + if (src_backend_id != -1 && src_backend_id != cur_backend_id) { + need_new_split = true; + break; + } + } + // check if the split has too many inputs + if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { + const size_t id = hash_id(src); + int src_backend_id = sched->tensor_backend_id[id]; + if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) { + //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); + need_new_split = true; + break; + } + } + } + } + + if (node_backend_id != cur_backend_id || need_new_split) { + split->i_end = i; + i_split++; + if (i_split >= sched->splits_capacity) { + sched->splits_capacity *= 2; + sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); + GGML_ASSERT(sched->splits != NULL); + } + GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS); + split = &sched->splits[i_split]; + split->backend_id = node_backend_id; + split->i_start = i; + split->n_inputs = 0; + cur_backend_id = node_backend_id; } // find inputs that are not on the same backend @@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - int src_backend_id = tensor_backend_id(src); + const int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now - if (src->flags & GGML_TENSOR_FLAG_INPUT) { + if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { size_t id = hash_id(src); if (sched->tensor_copies[id][src_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; @@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } 
sched->tensor_copies[id][src_backend_id][c] = tensor_copy; - tensor_backend_id(tensor_copy) = src_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); } int n_graph_inputs = sched->n_graph_inputs++; @@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != tensor_backend_id) { + if (src_backend_id != node_backend_id) { // create a copy of the input in the split's backend - size_t id = hash_id(src); + const size_t id = hash_id(src); if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[cur_backend_id]; for (int c = 0; c < sched->n_copies; c++) { @@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } sched->tensor_copies[id][cur_backend_id][c] = tensor_copy; - tensor_backend_id(tensor_copy) = cur_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); } - int n_inputs = sched->splits[cur_split].n_inputs++; + int n_inputs = split->n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = src; + split->inputs[n_inputs] = src; } node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy]; } } } - sched->splits[cur_split].i_end = graph->n_nodes; - sched->n_splits = cur_split + 1; + split->i_end = graph->n_nodes; + sched->n_splits = i_split + 1; } #ifdef DEBUG_PASS4 fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); #endif -#ifndef NDEBUG - // sanity check: all sources should have the same backend as the node - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - if (tensor_backend == NULL) { - fprintf(stderr, "!!!!!!! 
%s has no backend\n", node->name); - } - if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) { - fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", - node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", - node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ? - ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL"); - } - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - if (src_backend != tensor_backend /* && src_backend != NULL */) { - fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", - node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", - j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL"); - } - if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) { - fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", - src->name, src_backend ? ggml_backend_name(src_backend) : "NULL", - src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ? 
- ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL"); - } - } - } - fflush(stderr); -#endif - // create copies of the graph for each split // TODO: avoid this copy - struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false); + struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { + assert(graph_copy->size > (graph_copy->n_nodes + 1)); + struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy]; + const size_t input_id = hash_id(input); + struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy]; // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); input_dep->src[0] = input; - sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input); + sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id]; graph_copy->nodes[graph_copy->n_nodes++] = input_dep; // add a dependency to the input copy so that it is allocated at the start of the split @@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } for (int j = split->i_start; j < split->i_end; j++) { + assert(graph_copy->size > graph_copy->n_nodes); sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; 
} @@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } ggml_backend_tensor_copy(input, input_cpy); } else { + // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); - ggml_backend_synchronize(input_backend); } - ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); } } @@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS); + sched->hash_set = ggml_hash_set_new(graph_size); sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); - sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size); - sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size); + + const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; + sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size); + sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size); sched->n_backends = n_backends; sched->n_copies = parallel ? 
GGML_SCHED_MAX_COPIES : 1; - GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES); + const int initial_splits_capacity = 16; + sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity); + sched->splits_capacity = initial_splits_capacity; for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; @@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); + free(sched->splits); free(sched->hash_set.keys); free(sched->tensor_backend_id); free(sched->tensor_copies); @@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { } bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes); + ggml_backend_sched_split_graph(sched, measure_graph); // TODO: extract this to a separate function @@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * } bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes); ggml_backend_sched_split_graph(sched, graph); diff --git a/ggml-backend.h b/ggml-backend.h index 099d9c258..422457ab6 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -70,11 +70,11 @@ extern "C" { GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - - GGML_API bool 
ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index db595409a..139025588 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -82,6 +82,10 @@ #define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly +#define cudaHostUnregister hipHostUnregister #define cudaLaunchHostFunc hipLaunchHostFunc #ifdef GGML_HIP_UMA #define cudaMalloc hipMallocManaged @@ -7787,11 +7791,7 @@ struct cuda_pool_alloc { static bool g_cublas_loaded = false; -GGML_CALL bool ggml_cublas_loaded(void) { - return g_cublas_loaded; -} - -GGML_CALL void ggml_init_cublas() { +static void ggml_init_cublas() { static bool initialized = false; if (!initialized) { @@ -7880,7 +7880,7 @@ GGML_CALL void ggml_init_cublas() { } } -GGML_CALL void * ggml_cuda_host_malloc(size_t size) { +static void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; } @@ -7890,7 +7890,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) { if (err != cudaSuccess) { // clear the error cudaGetLastError(); - 
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size/1024.0/1024.0, cudaGetErrorString(err)); return nullptr; } @@ -7898,7 +7898,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) { return ptr; } -GGML_CALL void ggml_cuda_host_free(void * ptr) { +static void ggml_cuda_host_free(void * ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } @@ -9036,21 +9036,13 @@ static void ggml_cuda_op_soft_max( // positions tensor float * src2_dd = nullptr; - cuda_pool_alloc src2_f; ggml_tensor * src2 = dst->src[2]; const bool use_src2 = src2 != nullptr; if (use_src2) { - const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU; - - if (src2_on_device) { - ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; - src2_dd = (float *) src2_extra->data_device[g_main_device]; - } else { - src2_dd = src2_f.alloc(ggml_nelements(src2)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream)); - } + ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; + src2_dd = (float *) src2_extra->data_device[g_main_device]; } soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream); @@ -9107,55 +9099,24 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor_extra_gpu * src1_extra = use_src1 ? 
(ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; - // dd = data device float * src0_ddf = nullptr; float * src1_ddf = nullptr; float * dst_ddf = nullptr; - cuda_pool_alloc src0_f; - cuda_pool_alloc src1_f; - cuda_pool_alloc dst_f; - ggml_cuda_set_device(g_main_device); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - if (src0_on_device) { - src0_ddf = (float *) src0_extra->data_device[g_main_device]; - } else { - src0_ddf = src0_f.alloc(ggml_nelements(src0)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); - } + src0_ddf = (float *) src0_extra->data_device[g_main_device]; if (use_src1) { - if (src1_on_device) { - src1_ddf = (float *) src1_extra->data_device[g_main_device]; - } else { - src1_ddf = src1_f.alloc(ggml_nelements(src1)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); - } - } - if (dst_on_device) { - dst_ddf = (float *) dst_extra->data_device[g_main_device]; - } else { - dst_ddf = dst_f.alloc(ggml_nelements(dst)); + src1_ddf = (float *) src1_extra->data_device[g_main_device]; } + dst_ddf = (float *) dst_extra->data_device[g_main_device]; // do the computation op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); CUDA_CHECK(cudaGetLastError()); - - // copy dst to host if necessary - if (!dst_on_device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); - } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_set_peer_access(const int n_tokens) { @@ -9251,7 +9212,6 @@ static void ggml_cuda_op_mul_mat( 
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); @@ -9322,13 +9282,13 @@ static void ggml_cuda_op_mul_mat( used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool src1_on_device = id == g_main_device; // TODO: check from buffer + const bool dst_on_device = id == g_main_device; ggml_cuda_set_device(id); cudaStream_t stream = g_cudaStreams[id][0]; - if (src0_on_device && src0_is_contiguous) { + if (src0_is_contiguous) { dev[id].src0_dd = (char *) src0_extra->data_device[id]; } else { dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0)); @@ -9374,8 +9334,8 @@ static void ggml_cuda_op_mul_mat( continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool src1_on_device = id == g_main_device; // TODO: check from buffer + const bool dst_on_device = id == g_main_device; const int64_t row_diff = dev[id].row_high - dev[id].row_low; ggml_cuda_set_device(id); @@ -9400,12 +9360,12 @@ static void ggml_cuda_op_mul_mat( // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) { + if (id == g_main_device) { dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { + if 
(src1_is_contiguous) { if (id != g_main_device) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; @@ -9418,19 +9378,19 @@ static void ggml_cuda_op_mul_mat( src1_ncols*ne10*sizeof(float), stream)); } } - } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1_on_device && !src1_is_contiguous) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && !src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } - if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream)); } @@ -9441,17 +9401,7 @@ static void ggml_cuda_op_mul_mat( // copy dst to host or other device if necessary if (!dst_on_device) { - void * dst_off_device; - cudaMemcpyKind kind; - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - dst_off_device = dst->data; - kind = cudaMemcpyDeviceToHost; - } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { - dst_off_device = dst_extra->data_device[g_main_device]; - kind = cudaMemcpyDeviceToDevice; - } else { - GGML_ASSERT(false); - } + void * dst_off_device = dst_extra->data_device[g_main_device]; if (split) { // src0 = weight matrix is saved as a transposed matrix for better memory layout. // dst is NOT transposed. 
@@ -9462,28 +9412,26 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; #if !defined(GGML_USE_HIPBLAS) - if (kind == cudaMemcpyDeviceToDevice) { - // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices - cudaMemcpy3DPeerParms p = {}; - p.dstDevice = g_main_device; - p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); - p.srcDevice = id; - p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); - p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); - CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); - } else + // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices + cudaMemcpy3DPeerParms p = {}; + p.dstDevice = g_main_device; + p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); + p.srcDevice = id; + p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); + p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); + CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); +#else + // HIP does not support cudaMemcpy3DPeerAsync or vmm pools + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), + dst_dd_i, row_diff*sizeof(float), + row_diff*sizeof(float), src1_ncols, + cudaMemcpyDeviceToDevice, stream)); #endif - { - CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), - dst_dd_i, row_diff*sizeof(float), - row_diff*sizeof(float), src1_ncols, - kind, stream)); - } } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0; - CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream)); + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream)); } } @@ -9510,11 +9458,6 @@ static void ggml_cuda_op_mul_mat( } } 
} - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -9599,36 +9542,19 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; - // dd = data device float * src0_ddf = nullptr; float * src1_ddf = nullptr; float * dst_ddf = nullptr; - cuda_pool_alloc dst_f; - ggml_cuda_set_device(g_main_device); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - if (dst_on_device) { - dst_ddf = (float *) dst_extra->data_device[g_main_device]; - } else { - dst_ddf = dst_f.alloc(ggml_nelements(dst)); - } + dst_ddf = (float *) dst_extra->data_device[g_main_device]; // do the computation ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); CUDA_CHECK(cudaGetLastError()); - - // copy dst to host if necessary - if (!dst_on_device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); - } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -9639,21 +9565,6 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } -GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (!g_cublas_loaded) return false; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find 
the optimal values for these - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && - src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); -} - static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); @@ -9891,11 +9802,6 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm } static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const bool all_on_device = - (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_TYPE_GPU) && - ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; @@ -9972,13 +9878,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } 
else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_batched_cublas(src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { @@ -10178,6 +10084,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_cuda_mul_mat_id_cublas(dst); // TODO: mmq/mmv support #endif + cudaStream_t stream = g_cudaStreams[g_main_device][0]; const size_t nb11 = src1->nb[1]; const size_t nb1 = dst->nb[1]; @@ -10187,16 +10094,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const int32_t n_as = ((int32_t *) dst->op_params)[1]; std::vector ids_host(ggml_nbytes(ids)); - - cudaStream_t stream = g_cudaStreams[g_main_device][0]; - - if (ids->backend == GGML_BACKEND_TYPE_GPU) { - const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - } else { - memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); - } + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; const 
ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; @@ -10213,20 +10113,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? - (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; - char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? - (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + char * src1_original = (char *) src1_extra->data_device[g_main_device]; + char * dst_original = (char *) dst_extra->data_device[g_main_device]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - //int32_t row_id; - //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); GGML_ASSERT(row_id >= 0 && row_id < n_as); @@ -10248,11 +10139,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); - const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; - const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ? 
- cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; - for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; @@ -10267,7 +10153,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11, - nb11, src1_kind, stream)); + nb11, cudaMemcpyDeviceToDevice, stream)); num_src1_rows++; } @@ -10299,15 +10185,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1, - nb1, dst_kind, stream)); + nb1, cudaMemcpyDeviceToDevice, stream)); num_src1_rows++; } } } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaStreamSynchronize(stream)); - } } static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -10435,7 +10317,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -GGML_CALL static void ggml_cuda_set_main_device(const int main_device) { +static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. 
Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -10450,18 +10332,9 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) { } } -GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) { if (!g_cublas_loaded) return false; - ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); - - if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { - return false; - } - if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { #ifndef NDEBUG @@ -10471,6 +10344,8 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st } } + ggml_cuda_func_t func; + switch (tensor->op) { case GGML_OP_REPEAT: func = ggml_cuda_repeat; @@ -10548,15 +10423,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { - return false; - } func = ggml_cuda_mul_mat; break; case GGML_OP_MUL_MAT_ID: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) { - return false; - } func = ggml_cuda_mul_mat_id; break; case GGML_OP_SCALE: @@ -10613,17 +10482,11 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); } - if (params->ith != 0) { - return true; - } - if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { - return true; - } func(tensor->src[0], tensor->src[1], tensor); 
return true; } -GGML_CALL int ggml_cuda_get_device_count() { +static int ggml_cuda_get_device_count() { int device_count; if (cudaGetDeviceCount(&device_count) != cudaSuccess) { return 0; @@ -10631,7 +10494,7 @@ GGML_CALL int ggml_cuda_get_device_count() { return device_count; } -GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { +static void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); snprintf(description, description_size, "%s", prop.name); @@ -10736,6 +10599,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); if (padded_size > original_size && tensor->view_src == nullptr) { + ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); } } @@ -10873,6 +10737,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { + ggml_init_cublas(); + // FIXME: this is not thread safe if (device >= ggml_backend_cuda_get_device_count()) { return nullptr; @@ -11157,6 +11023,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { + ggml_init_cublas(); + // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -11348,9 +11216,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ggml_cuda_set_main_device(cuda_ctx->device); - ggml_compute_params params = {}; - params.type = GGML_TASK_TYPE_COMPUTE; - params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -11372,7 +11237,7 @@ 
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t } #endif - bool ok = ggml_cuda_compute_forward(¶ms, node); + bool ok = ggml_cuda_compute_forward(node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -11509,6 +11374,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons UNUSED(backend); } +GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + const int min_batch_size = 32; + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; + + UNUSED(backend); +} + static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; @@ -11571,6 +11444,7 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .offload_op = */ ggml_backend_cuda_offload_op, /* .event_new = */ ggml_backend_cuda_event_new, /* .event_free = */ ggml_backend_cuda_event_free, /* .event_record = */ ggml_backend_cuda_event_record, @@ -11584,7 +11458,7 @@ static ggml_guid_t ggml_backend_cuda_guid() { } GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { - ggml_init_cublas(); // TODO: remove from ggml.c + ggml_init_cublas(); if (device < 0 || device >= ggml_cuda_get_device_count()) { fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); @@ -11627,6 +11501,31 @@ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, si CUDA_CHECK(cudaMemGetInfo(free, total)); } +GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return false; + } + + cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); + 
if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + + fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__, + size/1024.0/1024.0, cudaGetErrorString(err)); + return false; + } + return true; +} + +GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) { + cudaError_t err = cudaHostUnregister(buffer); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + } +} + // backend registry GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); diff --git a/ggml-cuda.h b/ggml-cuda.h index b1ebd61d7..5eb4af40f 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -17,29 +17,17 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 -// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`. -GGML_API GGML_CALL void ggml_init_cublas(void); - -// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. 
-GGML_API GGML_CALL bool ggml_cublas_loaded(void); - -GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size); -GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr); - -GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); - -GGML_API GGML_CALL int ggml_cuda_get_device_count(void); -GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size); - // backend API GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device); GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend); +// device buffer GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); + // split tensor buffer that splits matrices by rows across multiple devices GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); + // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); @@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void); GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); +GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); +GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer); + #ifdef __cplusplus } #endif diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 4caf2c9e7..81dd50678 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1951,6 +1951,7 @@ static struct ggml_backend_i kompute_backend_i = { /* .graph_plan_compute = */ NULL, /* 
.graph_compute = */ ggml_backend_kompute_graph_compute, /* .supports_op = */ ggml_backend_kompute_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-metal.m b/ggml-metal.m index c3451a79b..109e5fe6b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2837,6 +2837,7 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 6dc5eb20c..d51f23b41 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17390,6 +17390,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 698b31496..cbceaa19f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5699,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml.c b/ggml.c index fa23cb3c4..1d5854960 100644 --- a/ggml.c +++ b/ggml.c @@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #else #include #endif -#elif defined(GGML_USE_CUBLAS) -#include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" #elif defined(GGML_USE_VULKAN) @@ -2640,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state 
initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } -#if defined(GGML_USE_CUBLAS) - ggml_init_cublas(); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) ggml_cl_init(); #elif defined(GGML_USE_VULKAN) ggml_vk_init_cpu_assist(); @@ -11105,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_CLBLAST) #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) @@ -11305,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_TYPE_INIT) { @@ -16051,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm return; } -#ifdef GGML_USE_CUBLAS - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return; - } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); -#elif defined(GGML_USE_VULKAN) +#if defined(GGML_USE_VULKAN) const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); #ifdef GGML_VULKAN_CHECK_RESULTS if (skip_cpu) { @@ -16070,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); -#endif // GGML_USE_CUBLAS +#endif // GGML_USE_VULKAN #ifdef GGML_USE_SYCL bool skip_cpu = ggml_sycl_compute_forward(params, tensor); diff --git a/llama.cpp b/llama.cpp index e4db288dd..b8bef6daf 100644 --- a/llama.cpp 
+++ b/llama.cpp @@ -2040,6 +2040,11 @@ struct llama_model { ggml_free(ctx); } for (ggml_backend_buffer_t buf : bufs) { +#ifdef GGML_USE_CUBLAS + if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) { + ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf)); + } +#endif ggml_backend_buffer_free(buf); } } @@ -5033,6 +5038,13 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, ctx); buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); +#ifdef GGML_USE_CUBLAS + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( + ggml_backend_buffer_get_base(buf), + ggml_backend_buffer_get_size(buf)); + } +#endif } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { @@ -8231,7 +8243,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -8601,12 +8612,15 @@ static struct ggml_cgraph * llama_build_graph( } // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends - // to fix this, we assign the norm layer manually to the backend of its layer - if (il != -1 && strcmp(name, "norm") == 0) { - for (auto * backend : lctx.backends) { - if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); - break; + // FIXME: fix in ggml_backend_sched + const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; + if (batch.n_tokens < 32 || full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + for (auto * backend : lctx.backends) { + if 
(ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { + ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + break; + } } } } @@ -13107,27 +13121,25 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(ctx->backend_metal); } #elif defined(GGML_USE_CUBLAS) - if (model->n_gpu_layers > 0) { + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); - } else { - // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU - for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { - ggml_backend_t backend = ggml_backend_cuda_init(device); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } } } #elif defined(GGML_USE_VULKAN) @@ -13285,14 +13297,17 @@ struct llama_context * llama_new_context_with_model( ggml_backend_t 
backend = ctx->backends[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend); - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } } // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits); + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); } } From 4f6d1337ca5a409dc74aca8c479b7c34408a69c0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Mar 2024 13:45:27 +0200 Subject: [PATCH 46/56] ci : temporary disable sanitizer builds (#6128) --- .github/workflows/build.yml | 68 ++++++++++++++++++------------------ .github/workflows/server.yml | 6 ++-- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 945df42f8..992c34a03 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -98,40 +98,40 @@ jobs: cd build ctest -L main --verbose --timeout 900 - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build . 
--config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 +# ubuntu-latest-cmake-sanitizer: +# runs-on: ubuntu-latest +# +# continue-on-error: true +# +# strategy: +# matrix: +# sanitizer: [ADDRESS, THREAD, UNDEFINED] +# build_type: [Debug, Release] +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v3 +# +# - name: Dependencies +# id: depends +# run: | +# sudo apt-get update +# sudo apt-get install build-essential +# +# - name: Build +# id: cmake_build +# run: | +# mkdir build +# cd build +# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} +# cmake --build . --config ${{ matrix.build_type }} -j $(nproc) +# +# - name: Test +# id: cmake_test +# run: | +# cd build +# ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 4ea09115a..65ca7d9ca 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -24,13 +24,13 @@ jobs: strategy: matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] + # TODO: temporary disabled due to linux kernel issues + #sanitizer: [ADDRESS, THREAD, UNDEFINED] + sanitizer: [UNDEFINED] build_type: [Debug] include: - build_type: Release sanitizer: "" - - build_type: Debug - sanitizer: THREAD disabled_on_pr: true fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken From ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Mar 2024 13:45:38 +0200 Subject: [PATCH 47/56] ci : disable stale issue messages (#6126) --- .github/workflows/close-issue.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index 2682f308c..eaffd074d 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -15,7 +15,6 @@ 
jobs: days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" - stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: -1 days-before-pr-close: -1 From 5e1b7f94a03e0b3b8e4578625bbdadc7bbd2b93c Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 18 Mar 2024 16:33:44 +0100 Subject: [PATCH 48/56] backend : set max split inputs to GGML_MAX_SRC (#6137) --- ggml-backend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-backend.c b/ggml-backend.c index 9f0084df7..6026570ae 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1015,7 +1015,7 @@ static bool ggml_is_view_op(enum ggml_op op) { #endif #ifndef GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS 4 +#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC #endif #ifndef GGML_SCHED_MAX_COPIES From 104f5e0fc156d48476258295457cafeec2a2af10 Mon Sep 17 00:00:00 2001 From: Felix Date: Mon, 18 Mar 2024 16:40:22 +0100 Subject: [PATCH 49/56] clip : fix memory leak (#6138) --- examples/llava/clip.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index a0ed82d7e..690bca2eb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -497,7 +497,6 @@ struct clip_ctx { // memory buffers to evaluate the model ggml_backend_buffer_t params_buffer = NULL; - ggml_backend_buffer_t compute_buffer = NULL; ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; @@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx_data); gguf_free(ctx->ctx_gguf); + ggml_backend_buffer_free(ctx->params_buffer); + ggml_backend_free(ctx->backend); + ggml_gallocr_free(ctx->compute_alloc); delete ctx; } From d199ca79f279e84ebe27caafe0aa59c461d88969 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Mon, 18 Mar 2024 
12:49:02 -0400 Subject: [PATCH 50/56] mpt : implement backwards compatiblity with duped output tensor (#6139) --- llama.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index b8bef6daf..1a9fe0c4d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -540,6 +540,7 @@ static const std::map> LLM_TENSOR_NA { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output"}, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, @@ -4300,9 +4301,9 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); - } else { + + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + if (!model.output) { model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU ml.n_created--; // artificial tensor ml.size_data += ggml_nbytes(model.output); @@ -4507,10 +4508,12 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); - // same as tok_embd, duplicated to allow offloading - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), 
{n_embd, n_vocab}, false); + if (!model.output) { + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } } for (int i = 0; i < n_layer; ++i) { From 2d15886bb092c3b780c676b5cc57ff3337af9c83 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 17 Mar 2024 06:37:44 +0000 Subject: [PATCH 51/56] flake.lock: Update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/9df3e30ce24fd28c7b3e2de0d986769db5d6225d' (2024-03-06) → 'github:NixOS/nixpkgs/d691274a972b3165335d261cc4671335f5c67de9' (2024-03-14) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index f9865d5e4..80de76dbf 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1709703039, - "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=", + "lastModified": 1710451336, + "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d", + "rev": "d691274a972b3165335d261cc4671335f5c67de9", "type": "github" }, "original": { From 4c28b8252907561165827125d2d1a4bad6926ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Tue, 19 Mar 2024 01:59:36 -0400 Subject: [PATCH 52/56] common : print usage on '-h' and '--help' (#6145) --- common/common.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 919182862..5f10718ec 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1056,7 +1056,8 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int return true; } if (arg == "-h" || arg == "--help") { - return false; + 
gpt_print_usage(argc, argv, gpt_params()); + exit(0); } if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); From 970a48060ab9a6cc67aa063870323781c2a7bd7d Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 19 Mar 2024 09:06:54 +0100 Subject: [PATCH 53/56] ci : exempt some labels from being tagged as stale (#6140) --- .github/workflows/close-issue.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index eaffd074d..a151c6780 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -12,6 +12,7 @@ jobs: steps: - uses: actions/stale@v5 with: + exempt-issue-labels: "refactor,help wanted,good first issue,research" days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" From b80cf3b2d1dee0ad325f7a794fecc66befce7336 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Mar 2024 10:21:54 +0200 Subject: [PATCH 54/56] common : disable repeat penalties by default (#6127) --- common/sampling.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/sampling.h b/common/sampling.h index 48b2459d1..79a998be8 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -32,13 +32,13 @@ typedef struct llama_sampling_params { float dynatemp_range = 0.00f; // 0.0 = disabled float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_repeat = 1.00f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled float penalty_present = 0.00f; // 0.0 = disabled int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a 
repeatable token + bool penalize_nl = false; // consider newlines as a repeatable token std::vector samplers_sequence = { llama_sampler_type::TOP_K, From d0d5de42e5a65865b5fddb6f5c785083539b74c3 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Tue, 19 Mar 2024 12:05:44 +0100 Subject: [PATCH 55/56] gguf-split: split and merge gguf per batch of tensors (#6135) * gguf-split: split and merge gguf files per tensor * gguf-split: build with make toolchain * gguf-split: rename `--split-tensors-size` to `--split-max-tensors`. Set general.split_count KV to all split * split : minor style + fix compile warnings * gguf-split: remove --upload not implemented --------- Co-authored-by: Georgi Gerganov --- Makefile | 4 + examples/CMakeLists.txt | 1 + examples/gguf-split/CMakeLists.txt | 5 + examples/gguf-split/README.md | 9 + examples/gguf-split/gguf-split.cpp | 491 +++++++++++++++++++++++++++++ 5 files changed, 510 insertions(+) create mode 100644 examples/gguf-split/CMakeLists.txt create mode 100644 examples/gguf-split/README.md create mode 100644 examples/gguf-split/gguf-split.cpp diff --git a/Makefile b/Makefile index 838daf5c0..1daad45ed 100644 --- a/Makefile +++ b/Makefile @@ -753,6 +753,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e762cf8b9..b59cc65bf 100644 --- 
a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,6 +21,7 @@ else() add_subdirectory(embedding) add_subdirectory(finetune) add_subdirectory(gritlm) + add_subdirectory(gguf-split) add_subdirectory(infill) add_subdirectory(llama-bench) add_subdirectory(llava) diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt new file mode 100644 index 000000000..828e62435 --- /dev/null +++ b/examples/gguf-split/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gguf-split) +add_executable(${TARGET} gguf-split.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md new file mode 100644 index 000000000..ddb1f7649 --- /dev/null +++ b/examples/gguf-split/README.md @@ -0,0 +1,9 @@ +## GGUF split Example + +CLI to split / merge GGUF files. + +**Command line options:** + +- `--split`: split GGUF to multiple GGUF, default operation. +- `--split-max-tensors`: maximum tensors in each split: default(128) +- `--merge`: merge multiple GGUF to a single GGUF. 
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp new file mode 100644 index 000000000..5d7040ab5 --- /dev/null +++ b/examples/gguf-split/gguf-split.cpp @@ -0,0 +1,491 @@ +#include "llama.h" +#include "ggml.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +enum split_operation : uint8_t { + SPLIT_OP_SPLIT, + SPLIT_OP_MERGE, +}; + +static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split"; +static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count"; + +static const int SPLIT_FILENAME_MAX = 256; + +static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf"; + +struct split_params { + split_operation operation = SPLIT_OP_SPLIT; + int n_split_tensors = 128; + std::string input; + std::string output; +}; + +static void split_print_usage(const char * executable) { + const split_params default_params; + printf("\n"); + printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); + printf("\n"); + printf("Apply a GGUF operation on IN to OUT."); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --version show version and build info\n"); + printf(" --split split GGUF to multiple GGUF (default)\n"); + printf(" --split-max-tensors max tensors in each split: default(%d)\n", default_params.n_split_tensors); + printf(" --merge merge multiple GGUF to a single GGUF\n"); + printf("\n"); +} + +static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) { + std::string arg; + const std::string arg_prefix = "--"; + bool invalid_param = false; + + int arg_idx = 1; + for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { + arg = argv[arg_idx]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + bool arg_found = false; + if 
(arg == "-h" || arg == "--help") { + split_print_usage(argv[0]); + exit(0); + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + + if (arg == "--merge") { + arg_found = true; + params.operation = SPLIT_OP_MERGE; + } + if (arg == "--split") { + arg_found = true; + params.operation = SPLIT_OP_SPLIT; + } + if (arg == "--split-max-tensors") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + params.n_split_tensors = atoi(argv[arg_idx]); + } + + if (!arg_found) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + } + + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } + + if (argc - arg_idx < 2) { + printf("%s: bad arguments\n", argv[0]); + split_print_usage(argv[0]); + return false; + } + + params.input = argv[arg_idx++]; + params.output = argv[arg_idx++]; + + return true; +} + +static bool split_params_parse(int argc, const char ** argv, split_params & params) { + bool result = true; + try { + if (!split_params_parse_ex(argc, argv, params)) { + split_print_usage(argv[0]); + exit(1); + } + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + split_print_usage(argv[0]); + exit(1); + } + return result; +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +static std::string split_file_name(const std::string & path, int i_split, int n_split) { + char f_split[SPLIT_FILENAME_MAX] = {0}; + snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split); + return std::string(f_split); +} + +struct split_strategy { + const split_params params; + std::ifstream & f_input; + struct gguf_context * ctx_gguf; + struct ggml_context * ctx_meta = NULL; + const int n_tensors; 
+ + const int n_split; + int i_split = 0; + + int i_tensor = 0; + + std::vector read_data; + + struct gguf_context * ctx_out; + std::ofstream fout; + + split_strategy(const split_params & params, + std::ifstream & f_input, + struct gguf_context * ctx_gguf, + struct ggml_context * ctx_meta) : + params(params), + f_input(f_input), + ctx_gguf(ctx_gguf), + ctx_meta(ctx_meta), + n_tensors(gguf_get_n_tensors(ctx_gguf)), + n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) { + } + + bool should_split() const { + return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; + } + + void split_start() { + ctx_out = gguf_init_empty(); + + // Save all metadata in first split only + if (i_split == 0) { + gguf_set_kv(ctx_out, ctx_gguf); + } + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); + + // populate the original tensors, so we get an initial metadata + for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { + struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + gguf_add_tensor(ctx_out, meta); + } + + auto split_name = split_file_name(params.output, i_split, n_split); + + fprintf(stderr, "%s: %s ...", __func__, split_name.c_str()); + fout = std::ofstream(split_name, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + auto meta_size = gguf_get_meta_size(ctx_out); + + // placeholder for the meta data + ::zeros(fout, meta_size); + + i_split++; + } + + void next_tensor() { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char 
*)read_data.data(), n_bytes); + + t->data = read_data.data(); + + // write tensor data + padding + fout.write((const char *)t->data, n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + + i_tensor++; + } + + void split_end() { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + fout.close(); + gguf_free(ctx_out); + + fprintf(stderr, "\033[3Ddone\n"); + } +}; + +static void gguf_split(const split_params & split_params) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + std::ifstream f_input(split_params.input.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + + auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + + split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n", + __func__, split_params.input.c_str(), + split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(), + split_params.n_split_tensors); + + strategy.split_start(); + + while (strategy.i_tensor < strategy.n_tensors) { + strategy.next_tensor(); + if (strategy.should_split()) { + strategy.split_end(); + strategy.split_start(); + } + } + strategy.split_end(); + + gguf_free(ctx_gguf); + f_input.close(); + + fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", + __func__, strategy.n_split, strategy.n_tensors); +} + +static void gguf_merge(const split_params & split_params) { + fprintf(stderr, "%s: %s -> %s\n", + __func__, 
split_params.input.c_str(), + split_params.output.c_str()); + int n_split = 1; + int total_tensors = 0; + + auto * ctx_out = gguf_init_empty(); + std::ofstream fout(split_params.output.c_str(), std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + std::vector read_data; + std::vector ctx_metas; + std::vector ctx_ggufs; + + std::string split_prefix; + + // First pass to find KV and tensors metadata + for (int i_split = 0; i_split < n_split; i_split++) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + auto split_name = split_params.input; + if (i_split > 0) { + split_name = split_file_name(split_prefix, i_split, n_split); + } + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str()); + + auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + ctx_ggufs.push_back(ctx_gguf); + ctx_metas.push_back(ctx_meta); + + if (i_split == 0) { + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT); + if (key_n_split < 0) { + fprintf(stderr, + "\n%s: input file does not contain %s metadata\n", + __func__, + LLM_KV_GENERAL_SPLIT_N_SPLIT); + gguf_free(ctx_gguf); + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + n_split = gguf_get_val_u8(ctx_gguf, key_n_split); + if (n_split < 1) { + fprintf(stderr, + "\n%s: input file does not contain a valid split count %d\n", + __func__, + n_split); + gguf_free(ctx_gguf); + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + // Do not trigger merge if we try to merge again the output + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); + + // Set metadata from the first split + gguf_set_kv(ctx_out, ctx_gguf); + } + + // Verify the file naming + { + int i_split_file = 0; + int n_split_file = 0; + const char * 
i_split_format = "-00000-of-00000.gguf"; + + if (split_name.size() < strlen(i_split_format)) { + fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str()); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format)); + + const char * split_name_c_str = split_name.c_str(); + int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file); + + if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) { + fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d i_split_file=%d" + " n_split=%d n_split_file=%d\n", __func__, + split_params.input.c_str(), + i_split, i_split_file, + n_split, n_split_file); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + } + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + gguf_add_tensor(ctx_out, t); + } + total_tensors += n_tensors; + + fprintf(stderr, "\033[3Ddone\n"); + } + + // placeholder for the meta data + { + auto meta_size = gguf_get_meta_size(ctx_out); + ::zeros(fout, meta_size); + } + + // Write tensors data + for (int i_split = 0; i_split < n_split; i_split++) { + auto split_name = split_file_name(split_prefix, i_split, n_split); + std::ifstream f_input(split_name.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str()); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str()); + + auto * 
ctx_gguf = ctx_ggufs[i_split]; + auto * ctx_meta = ctx_metas[i_split]; + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char *)read_data.data(), n_bytes); + + // write tensor data + padding + fout.write((const char *)read_data.data(), n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + f_input.close(); + fprintf(stderr, "\033[3Ddone\n"); + } + + { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + fout.close(); + gguf_free(ctx_out); + } + + fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n", + __func__, split_params.output.c_str(), n_split, total_tensors); +} + +int main(int argc, const char ** argv) { + if (argc < 3) { + split_print_usage(argv[0]); + } + + split_params params; + split_params_parse(argc, argv, params); + + switch (params.operation) { + case SPLIT_OP_SPLIT: gguf_split(params); + break; + case SPLIT_OP_MERGE: gguf_merge(params); + break; + default:split_print_usage(argv[0]); + exit(1); + } + + return 0; +} From d8b009a9456bf5284376149f3deb09300a37701a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Tue, 19 Mar 2024 12:16:09 -0400 Subject: [PATCH 56/56] Remove undeed header file. 
(#6158) --- examples/gguf-split/gguf-split.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 5d7040ab5..8e12e6493 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -13,9 +13,7 @@ #include #include -#include #include -#include enum split_operation : uint8_t { SPLIT_OP_SPLIT,