From 0bf16de07b0692e7df26b9a633e232bbd66e0360 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 6 Aug 2024 11:48:01 +0300 Subject: [PATCH 01/13] contributing : add note about write access --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b688f78ec..a9e000e52 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,6 +5,7 @@ - Execute [the full CI locally on your machine](ci/README.md) before publishing - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience +- Consider allowing write access to your branch for faster review - If your PR becomes stale, don't hesitate to ping the maintainers in the comments # Pull requests (for collaborators) From efda90c93a62274ec0b0bfa80c4eee4bdb6966d0 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 6 Aug 2024 16:32:03 +0500 Subject: [PATCH 02/13] [Vulkan] Fix compilation of `vulkan-shaders-gen` on w64devkit after `e31a4f6` (#8880) * Fix compilation issue in `vulkan-shaders-gen` https://github.com/ggerganov/llama.cpp/commit/e31a4f679779220312c165b0f5994c680a610e38 broke compilation on w64devkit. Including `algorithm` seems to fix that. * Guard it under `#ifdef _WIN32` --- ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index f6f4f116a..a792e203b 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp @@ -22,6 +22,7 @@ #ifdef _WIN32 #include #include // For _mkdir on Windows + #include // For std::replace on w64devkit #else #include #include From db20f50cf4710c46e3a996919a26858cae8c80ed Mon Sep 17 00:00:00 2001 From: Jaeden Amero Date: Tue, 6 Aug 2024 17:21:47 +0400 Subject: [PATCH 03/13] cmake : Link vulkan-shaders-gen with pthreads (#8835) When using CMake to build with Vulkan support, compiling vulkan-shaders-gen fails due to missing a CMakeLists.txt specification to link vulkan-shaders-gen with the threading library, resulting in the following error. [5/172] Linking CXX executable bin/vulkan-shaders-gen FAILED: bin/vulkan-shaders-gen : && /usr/bin/c++ ggml/src/vulkan-shaders/CMakeFiles/vulkan-shaders-gen.dir/vulkan-shaders-gen.cpp.o -o bin/vulkan-shaders-gen && : ld: error: undefined symbol: pthread_create >>> referenced by vulkan-shaders-gen.cpp >>> ggml/src/vulkan-shaders/CMakeFiles/vulkan-shaders-gen.dir/vulkan-shaders-gen.cpp.o:(std::__1::__libcpp_thread_create[abi:se180100](pthread**, >>> void* (*)(void*), void*)) c++: error: linker command failed with exit code 1 (use -v to see invocation) [6/172] Generating build details from Git -- Found Git: /usr/local/bin/git (found version "2.45.2") ninja: build stopped: subcommand failed. Add the CMakeLists.txt specification to link vulkan-shaders-gen with the threading library and fix the above error. Fixes #8834 --- ggml/src/vulkan-shaders/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/vulkan-shaders/CMakeLists.txt b/ggml/src/vulkan-shaders/CMakeLists.txt index 41551e009..10075db33 100644 --- a/ggml/src/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/vulkan-shaders/CMakeLists.txt @@ -1,5 +1,7 @@ +find_package (Threads REQUIRED) set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) From 5f4dcb1e60bbfe936b45778dad177a5b9a09b066 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 6 Aug 2024 16:44:35 +0200 Subject: [PATCH 04/13] simple : update name of executable to llama-simple (#8885) This commit updates the name of the executable in README.md from `simple` to `llama-simple`. --- examples/simple/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple/README.md b/examples/simple/README.md index 49e24501c..0ff342535 100644 --- a/examples/simple/README.md +++ b/examples/simple/README.md @@ -3,7 +3,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. ```bash -./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" +./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" ... From 641f5dd2a6422941faa9f32ab5cb50fc1b24c1f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 6 Aug 2024 17:13:55 +0200 Subject: [PATCH 05/13] CUDA: fix padding logic for FP16/FP32 (#8884) --- ggml/src/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 68605fff6..654f93e83 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1501,7 +1501,7 @@ static void ggml_cuda_op_mul_mat( } // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared: - if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { + if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00); const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING); CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream)); From 1e6f6554aa11fa10160a5fda689e736c3c34169f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 6 Aug 2024 17:33:39 +0200 Subject: [PATCH 06/13] server : add lora hotswap endpoint (WIP) (#8857) * server : add lora hotswap endpoint * handle lora_no_apply * fix build * updae docs * clean up struct def * fix build * add LoRA test * fix style --- common/common.cpp | 60 +++++--- common/common.h | 19 ++- examples/export-lora/export-lora.cpp | 10 +- examples/server/README.md | 128 +++++++++--------- examples/server/server.cpp | 66 ++++++++- examples/server/tests/features/lora.feature | 36 +++++ examples/server/tests/features/steps/steps.py | 21 +++ examples/server/tests/requirements.txt | 1 + gguf-py/gguf/metadata.py | 2 +- 9 files changed, 251 insertions(+), 92 deletions(-) create mode 100644 examples/server/tests/features/lora.feature diff --git a/common/common.cpp b/common/common.cpp index ee7fbcba3..2e8374d50 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -684,14 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 1.0f); + params.lora_adapters.push_back({ + std::string(argv[i]), + 1.0, + }); return true; } if (arg == "--lora-scaled") { CHECK_ARG - const char* lora_adapter = argv[i]; + std::string lora_adapter = argv[i]; CHECK_ARG - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.lora_adapters.push_back({ + lora_adapter, + std::stof(argv[i]), + }); + return true; + } + if (arg == "--lora-init-without-apply") { + params.lora_init_without_apply = true; return true; } if (arg == "--control-vector") { @@ -1654,6 +1664,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); + options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"}); #ifndef LOG_DISABLE_LOGS options.push_back({ "logging" }); @@ -2091,17 +2102,22 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } } - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); - if (adapter == nullptr) { - fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + // load and optionally apply lora adapters + for (auto & la : params.lora_adapters) { + llama_lora_adapter_container loaded_la; + loaded_la.path = la.path; + loaded_la.scale = la.scale; + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); llama_free_model(model); return iparams; } - llama_lora_adapter_set(lctx, adapter, lora_scale); + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters + } + if (!params.lora_init_without_apply) { + llama_lora_adapters_apply(lctx, iparams.lora_adapters); } if (params.ignore_eos) { @@ -2140,6 +2156,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { return iparams; } +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { + llama_lora_adapter_clear(ctx); + for (auto & la : lora_adapters) { + if (la.scale != 0.0f) { + llama_lora_adapter_set(ctx, la.adapter, la.scale); + } + } +} + struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { auto mparams = llama_model_default_params(); @@ -3162,19 +3187,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) != 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale == 1.0f) { + fprintf(stream, " - %s\n", la.path.c_str()); } - fprintf(stream, " - %s\n", std::get<0>(la).c_str()); } fprintf(stream, "lora_scaled:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) == 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale != 1.0f) { + fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale); } - fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); } + fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); diff --git a/common/common.h b/common/common.h index 51dcc0d39..d88966ece 100644 --- a/common/common.h +++ b/common/common.h @@ -33,6 +33,15 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" +struct llama_lora_adapter_info { + std::string path; + float scale; +}; + +struct llama_lora_adapter_container : llama_lora_adapter_info { + struct llama_lora_adapter * adapter; +}; + // build info extern int LLAMA_BUILD_NUMBER; extern char const * LLAMA_COMMIT; @@ -126,8 +135,8 @@ struct gpt_params { std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector kv_overrides; - // TODO: avoid tuple, use struct - std::vector> lora_adapter; // lora adapter path with user defined scale + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) + std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -309,8 +318,9 @@ std::string fs_get_cache_file(const std::string & filename); // struct llama_init_result { - struct llama_model * model = nullptr; + struct llama_model * model = nullptr; struct llama_context * context = nullptr; + std::vector lora_adapters; }; struct llama_init_result llama_init_from_gpt_params(gpt_params & params); @@ -321,6 +331,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +// clear LoRA adapters from context, then apply new list of adapters +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); + // Batch utils void llama_batch_clear(struct llama_batch & batch); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 150f7e8d5..d228ae66e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -135,7 +135,7 @@ struct lora_merge_ctx { lora_merge_ctx( std::string & base_fname, - std::vector> & lora_files, + std::vector & lora_files, std::string & outfile, int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors @@ -144,9 +144,9 @@ struct lora_merge_ctx { throw std::runtime_error("split model is not yet supported"); } - for (auto lora_inp : lora_files) { - auto fname = std::get<0>(lora_inp); - auto scale = std::get<1>(lora_inp); + for (auto & lora_inp : lora_files) { + auto fname = lora_inp.path; + auto scale = lora_inp.scale; std::unique_ptr adapter(new file_input(fname, scale)); check_metadata_lora(adapter.get()); adapters.push_back(std::move(adapter)); @@ -407,7 +407,7 @@ int main(int argc, char ** argv) { g_verbose = (params.verbosity == 1); try { - lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads); + lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads); ctx.run_merge(); } catch (const std::exception & err) { fprintf(stderr, "%s\n", err.what()); diff --git a/examples/server/README.md b/examples/server/README.md index de83ee7d0..e17595fe8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -207,41 +207,6 @@ model: -hff, --hf-file FILE Hugging Face model file (default: unused) -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) -retrieval: - - --context-file FNAME file to load context from (repeat to specify multiple files) - --chunk-size N minimum length of embedded text chunks (default: 64) - --chunk-separator STRING - separator between chunks (default: ' - ') - -passkey: - - --junk N number of times to repeat the junk text (default: 250) - --pos N position of the passkey in the junk text (default: -1) - -imatrix: - - -o, --output FNAME output file (default: 'imatrix.dat') - --output-frequency N output the imatrix every N iterations (default: 10) - --save-frequency N save an imatrix copy every N iterations (default: 0) - --process-output collect data for the output tensor (default: false) - --no-ppl do not compute perplexity (default: true) - --chunk N start processing the input from chunk N (default: 0) - -bench: - - -pps is the prompt shared across parallel sequences (default: false) - -npp n0,n1,... number of prompt tokens - -ntg n0,n1,... number of text generation tokens - -npl n0,n1,... number of parallel prompts - -embedding: - - --embd-normalize normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) - --embd-output-format empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix - --embd-separator separator of embendings (default \n) for example "<#sep#>" - server: --host HOST ip address to listen (default: 127.0.0.1) @@ -267,7 +232,8 @@ server: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template -sps, --slot-prompt-similarity SIMILARITY how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) - + --lora-init-without-apply + load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) logging: @@ -279,15 +245,6 @@ logging: --log-file FNAME Specify a log filename (without extension) --log-new Create a separate new log file on start. Each log file will have unique name: "..log" --log-append Don't truncate the old log file. - -cvector: - - -o, --output FNAME output file (default: 'control_vector.gguf') - --positive-file FNAME positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt') - --negative-file FNAME negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt') - --pca-batch N batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100) - --pca-iter N number of iterations used for PCA (default: 1000) - --method {pca,mean} dimensionality reduction method to be used (default: pca) ``` @@ -411,7 +368,8 @@ node index.js ## API Endpoints -- **GET** `/health`: Returns the current state of the server: +### GET `/health`: Returns the current state of the server + - 503 -> `{"status": "loading model"}` if the model is still being loaded. - 500 -> `{"status": "error"}` if the model failed to load. - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. @@ -420,7 +378,7 @@ node index.js If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. -- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. +### POST `/completion`: Given a `prompt`, it returns the predicted completion. *Options:* @@ -498,7 +456,7 @@ node index.js `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. -### Result JSON +**Response format** - Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. @@ -537,7 +495,7 @@ Notice that each `probs` is an array of length `n_probs`. - `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) -- **POST** `/tokenize`: Tokenize a given text. +### POST `/tokenize`: Tokenize a given text *Options:* @@ -545,13 +503,15 @@ Notice that each `probs` is an array of length `n_probs`. `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` -- **POST** `/detokenize`: Convert tokens to text. +### POST `/detokenize`: Convert tokens to text *Options:* `tokens`: Set the tokens to detokenize. -- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. +### POST `/embedding`: Generate embedding of a given text + +The same as [the embedding example](../embedding) does. *Options:* @@ -559,7 +519,9 @@ Notice that each `probs` is an array of length `n_probs`. `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. -- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. +### POST `/infill`: For code infilling. + +Takes a prefix and a suffix and returns the predicted completion as stream. *Options:* @@ -571,7 +533,7 @@ Notice that each `probs` is an array of length `n_probs`. - **GET** `/props`: Return current server settings. -### Result JSON +**Response format** ```json { @@ -589,7 +551,9 @@ Notice that each `probs` is an array of length `n_probs`. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `chat_template` - the model's original Jinja2 prompt template -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. +### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API + +Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. *Options:* @@ -641,7 +605,7 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` -- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API. +### POST `/v1/embeddings`: OpenAI-compatible embeddings API *Options:* @@ -675,9 +639,9 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` -- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. +### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. -### Result JSON +**Response format** ```json [ @@ -738,7 +702,7 @@ Notice that each `probs` is an array of length `n_probs`. ] ``` -- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled: +### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled: Available metrics: - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. @@ -750,13 +714,13 @@ Available metrics: - `llamacpp:requests_processing`: Number of requests processing. - `llamacpp:requests_deferred`: Number of requests deferred. -- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. +### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. *Options:* `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter. -### Result JSON +**Response format** ```json { @@ -770,13 +734,13 @@ Available metrics: } ``` -- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. +### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. *Options:* `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter. -### Result JSON +**Response format** ```json { @@ -790,9 +754,9 @@ Available metrics: } ``` -- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. +### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. -### Result JSON +**Response format** ```json { @@ -801,6 +765,42 @@ Available metrics: } ``` +### GET `/lora-adapters`: Get list of all LoRA adapters + +If an adapter is disabled, the scale will be set to 0. + +**Response format** + +```json +[ + { + "id": 0, + "path": "my_adapter_1.gguf", + "scale": 0.0 + }, + { + "id": 1, + "path": "my_adapter_2.gguf", + "scale": 0.0 + } +] +``` + +### POST `/lora-adapters`: Set list of LoRA adapters + +To disable an adapter, either remove it from the list below, or set scale to 0. + +**Request format** + +To know the `id` of the adapter, use GET `/lora-adapters` + +```json +[ + {"id": 0, "scale": 0.2}, + {"id": 1, "scale": 0.8} +] +``` + ## More examples ### Change system prompt on runtime diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d178ca0f7..898c83ea3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -78,6 +78,7 @@ enum server_task_type { SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE, + SERVER_TASK_TYPE_SET_LORA, }; struct server_task { @@ -622,6 +623,7 @@ struct server_response { struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; + std::vector lora_adapters; gpt_params params; @@ -681,6 +683,7 @@ struct server_context { model = llama_init.model; ctx = llama_init.context; + lora_adapters = llama_init.lora_adapters; params.n_parallel -= 1; // but be sneaky about it if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); @@ -1850,6 +1853,14 @@ struct server_context { }; queue_results.send(result); } break; + case SERVER_TASK_TYPE_SET_LORA: + { + llama_lora_adapters_apply(ctx, lora_adapters); + server_task_result result; + result.id = task.id; + result.data = json{{ "success", true }}; + queue_results.send(result); + } break; } } @@ -3328,6 +3339,55 @@ int main(int argc, char ** argv) { return res.set_content(root.dump(), "application/json; charset=utf-8"); }; + const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + json result = json::array(); + for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) { + auto & la = ctx_server.lora_adapters[i]; + result.push_back({ + {"id", i}, + {"path", la.path}, + {"scale", la.scale}, + }); + } + res.set_content(result.dump(), "application/json"); + res.status = 200; // HTTP OK + }; + + const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + + const std::vector body = json::parse(req.body); + int max_idx = ctx_server.lora_adapters.size(); + + // clear existing value + for (auto & la : ctx_server.lora_adapters) { + la.scale = 0.0f; + } + + // set value + for (auto entry : body) { + int id = entry.at("id"); + float scale = entry.at("scale"); + if (0 <= id && id < max_idx) { + ctx_server.lora_adapters[id].scale = scale; + } else { + throw std::runtime_error("invalid adapter id"); + } + } + + server_task task; + task.type = SERVER_TASK_TYPE_SET_LORA; + const int id_task = ctx_server.queue_tasks.post(task); + ctx_server.queue_results.add_waiting_task_id(id_task); + + server_task_result result = ctx_server.queue_results.recv(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); + + res.set_content(result.data.dump(), "application/json"); + res.status = 200; // HTTP OK + }; + auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) { return [content, len, mime_type](const httplib::Request &, httplib::Response & res) { res.set_content(reinterpret_cast(content), len, mime_type); @@ -3366,7 +3426,6 @@ int main(int argc, char ** argv) { // register API routes svr->Get ("/health", handle_health); - svr->Get ("/slots", handle_slots); svr->Get ("/metrics", handle_metrics); svr->Get ("/props", handle_props); svr->Get ("/v1/models", handle_models); @@ -3381,6 +3440,11 @@ int main(int argc, char ** argv) { svr->Post("/v1/embeddings", handle_embeddings); svr->Post("/tokenize", handle_tokenize); svr->Post("/detokenize", handle_detokenize); + // LoRA adapters hotswap + svr->Get ("/lora-adapters", handle_lora_adapters_list); + svr->Post("/lora-adapters", handle_lora_adapters_apply); + // Save & load slots + svr->Get ("/slots", handle_slots); if (!params.slot_save_path.empty()) { // only enable slot endpoints if slot_save_path is set svr->Post("/slots/:id_slot", handle_slots_action); diff --git a/examples/server/tests/features/lora.feature b/examples/server/tests/features/lora.feature new file mode 100644 index 000000000..7b85988ac --- /dev/null +++ b/examples/server/tests/features/lora.feature @@ -0,0 +1,36 @@ +@llama.cpp +@lora +Feature: llama.cpp server + + Background: Server startup + Given a server listening on localhost:8080 + And a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf + And a model file stories15M_MOE-F16.gguf + And a model alias stories15M_MOE + And a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf + And 42 as server seed + And 1024 as batch size + And 1024 as ubatch size + And 2048 KV cache size + And 64 max tokens to predict + And 0.0 temperature + Then the server is starting + Then the server is healthy + + Scenario: Completion LoRA disabled + Given switch off lora adapter 0 + Given a prompt: + """ + Look in thy glass + """ + And a completion request with no api error + Then 64 tokens are predicted matching little|girl|three|years|old + + Scenario: Completion LoRA enabled + Given switch on lora adapter 0 + Given a prompt: + """ + Look in thy glass + """ + And a completion request with no api error + Then 64 tokens are predicted matching eye|love|glass|sun diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index df0814cc9..6705a34fc 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -7,6 +7,7 @@ import subprocess import sys import threading import time +import requests from collections.abc import Sequence from contextlib import closing from re import RegexFlag @@ -70,6 +71,7 @@ def step_server_config(context, server_fqdn: str, server_port: str): context.user_api_key = None context.response_format = None context.temperature = None + context.lora_file = None context.tasks_result = [] context.concurrent_tasks = [] @@ -82,6 +84,12 @@ def step_download_hf_model(context, hf_file: str, hf_repo: str): context.model_hf_file = hf_file context.model_file = os.path.basename(hf_file) +@step('a lora adapter file from {lora_file_url}') +def step_download_lora_file(context, lora_file_url: str): + file_name = lora_file_url.split('/').pop() + context.lora_file = f'../../../{file_name}' + with open(context.lora_file, 'wb') as f: + f.write(requests.get(lora_file_url).content) @step('a model file {model_file}') def step_model_file(context, model_file: str): @@ -849,6 +857,17 @@ async def step_erase_slot(context, slot_id): context.response = response +@step('switch {on_or_off} lora adapter {lora_id:d}') +@async_run_until_complete +async def toggle_lora_adapter(context, on_or_off: str, lora_id: int): + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/lora-adapters', + json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}], + headers={"Content-Type": "application/json"}) as response: + context.response = response + print([{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}]) + + @step('the server responds with status code {status_code:d}') def step_server_responds_with_status_code(context, status_code): assert context.response.status == status_code @@ -1326,6 +1345,8 @@ def start_server_background(context): server_args.extend(['--grp-attn-w', context.n_ga_w]) if context.debug: server_args.append('--verbose') + if context.lora_file: + server_args.extend(['--lora', context.lora_file]) if 'SERVER_LOG_FORMAT_JSON' not in os.environ: server_args.extend(['--log-format', "text"]) diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 2c741ea10..f2d7e5c57 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -4,3 +4,4 @@ huggingface_hub~=0.20.3 numpy~=1.26.4 openai~=1.30.3 prometheus-client~=0.20.0 +requests~=2.32.3 diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index ea4d02705..db318542a 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -174,7 +174,7 @@ class Metadata: org_component, model_full_name_component = None, model_id # Check if we erroneously matched against './' or '../' etc... - if org_component is not None and org_component[0] == '.': + if org_component is not None and len(org_component) > 0 and org_component[0] == '.': org_component = None name_parts: list[str] = model_full_name_component.split('-') From 31958546c3e4695a8a24bb7ba3b79a8f76d05afe Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Wed, 7 Aug 2024 01:41:54 +0200 Subject: [PATCH 07/13] typo correction (#8891) --- include/llama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index f23355a6b..66c266298 100644 --- a/include/llama.h +++ b/include/llama.h @@ -345,7 +345,7 @@ extern "C" { int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() enum llama_ftype ftype; // quantize to this llama_ftype enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // itoken embeddings tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored From 725e3d94379d5b619c027347308bccf2e0ead89f Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 7 Aug 2024 01:43:00 +0200 Subject: [PATCH 08/13] quantize : update usage comment in quantize.cpp (#8889) This commit updates the usage comment in quantize.cpp to reflect the new name of the executable, which is llama-quantize. --- examples/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 8d7647258..7312309ae 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -91,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp } // usage: -// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] +// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // [[noreturn]] static void usage(const char * executable) { From 506122d854c5d05b4a3d45a294f14bd4c02d9868 Mon Sep 17 00:00:00 2001 From: Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> Date: Wed, 7 Aug 2024 09:01:06 +0800 Subject: [PATCH 09/13] llama-bench : add support for getting cpu info on Windows (#8824) * Add support for getting cpu info on Windows for llama_bench * refactor --------- Co-authored-by: slaren --- examples/llama-bench/llama-bench.cpp | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 521fa8880..42918bfc7 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -27,6 +27,14 @@ #include "ggml-cann.h" #endif +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + // utils static uint64_t get_time_ns() { using clock = std::chrono::high_resolution_clock; @@ -96,6 +104,27 @@ static std::string get_cpu_info() { } fclose(f); } +#elif defined(_WIN32) + HKEY hKey; + if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, + TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), + 0, + KEY_READ, + &hKey) != ERROR_SUCCESS) { + // fail to open registry key + return ""; + } + char cpu_brand[256]; + DWORD cpu_brand_size = sizeof(cpu_brand); + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + (LPBYTE)cpu_brand, + &cpu_brand_size) == ERROR_SUCCESS) { + id.assign(cpu_brand, cpu_brand_size); + } + RegCloseKey(hKey); #endif // TODO: other platforms return id; From a8dbc6f753f296108121d50f78f67463403dd6fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 7 Aug 2024 09:07:52 +0200 Subject: [PATCH 10/13] CUDA/HIP: fix tests/test-backend-ops (#8896) --- ggml/src/ggml-cuda.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 654f93e83..a00a7af6c 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2742,11 +2742,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_MUL_MAT_ID: { struct ggml_tensor * a = op->src[0]; - if (op->op == GGML_OP_MUL_MAT) { - struct ggml_tensor * b = op->src[1]; - if (a->ne[3] != b->ne[3]) { - return false; - } + struct ggml_tensor * b = op->src[1]; + if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) { + return false; + } + if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) { + return false; } switch (a->type) { case GGML_TYPE_F32: @@ -2877,7 +2878,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return true; case GGML_OP_FLASH_ATTN_EXT: #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128; + return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128; #else if (op->src[0]->ne[0] == 128) { return true; From 0478174d5959b66096ae6609fcb0df14cab66b51 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Wed, 7 Aug 2024 11:25:36 +0100 Subject: [PATCH 11/13] [SYCL] Updated SYCL device filtering (#8901) * Updated device filter to depend on default_selector (fixes non-intel device issues) * Small related update to example/sycl Readme --- examples/sycl/README.md | 24 +++++++++--------------- ggml/src/ggml-sycl/dpct/helper.hpp | 19 ++++++++++++++++--- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/sycl/README.md b/examples/sycl/README.md index 0e3acd35b..8819d87f5 100644 --- a/examples/sycl/README.md +++ b/examples/sycl/README.md @@ -12,9 +12,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU. List all SYCL devices with ID, compute capability, max work group size, ect. -1. Build the llama.cpp for SYCL for all targets. +1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*. -2. Enable oneAPI running environment +2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)* ``` source /opt/intel/oneapi/setvars.sh @@ -29,19 +29,13 @@ source /opt/intel/oneapi/setvars.sh Check the ID in startup log, like: ``` -found 4 SYCL devices: - Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3, - max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 - Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2, - max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280 - Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0, - max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280 - Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0, - max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 +found 2 SYCL devices: +| | | | |Max | |Max |Global | | +| | | | |compute|Max work|sub |mem | | +|ID| Device Type| Name|Version|units |group |group|size | Driver version| +|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------| +| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics| 1.3| 512| 1024| 32| 16225M| 1.3.29138| +| 1| [level_zero:gpu:1]| Intel UHD Graphics 750| 1.3| 32| 512| 32| 62631M| 1.3.29138| ``` -|Attribute|Note| -|-|-| -|compute capability 1.3|Level-zero running time, recommended | -|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index ef4609e32..fe4a8f744 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -874,7 +874,7 @@ namespace dpct inline std::string get_preferred_gpu_platform_name() { std::string result; - std::string filter = "level-zero"; + std::string filter = ""; char* env = getenv("ONEAPI_DEVICE_SELECTOR"); if (env) { if (std::strstr(env, "level_zero")) { @@ -892,11 +892,24 @@ namespace dpct else { throw std::runtime_error("invalid device filter: " + std::string(env)); } + } else { + auto default_device = sycl::device(sycl::default_selector_v); + auto default_platform_name = default_device.get_platform().get_info(); + + if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) { + filter = "level-zero"; + } + else if (std::strstr(default_platform_name.c_str(), "CUDA")) { + filter = "cuda"; + } + else if (std::strstr(default_platform_name.c_str(), "HIP")) { + filter = "hip"; + } } - auto plaform_list = sycl::platform::get_platforms(); + auto platform_list = sycl::platform::get_platforms(); - for (const auto& platform : plaform_list) { + for (const auto& platform : platform_list) { auto devices = platform.get_devices(); auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) { return d.is_gpu(); From be55695eff44784a141a863f273661a6bce63dfc Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 7 Aug 2024 13:29:02 +0200 Subject: [PATCH 12/13] ggml-backend : fix async copy from CPU (#8897) * ggml-backend : fix async copy from CPU * cuda : more reliable async copy, fix stream used when the devices are the same --- ggml/src/ggml-backend.c | 25 +++++++++++++++---------- ggml/src/ggml-cuda.cu | 30 ++++++++++++++++-------------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 954ab2072..e1651cc64 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b } // an async copy would normally happen after all the queued operations on both backends are completed - // sync src, set_async dst - if (ggml_backend_buffer_is_host(src->buffer)) { - ggml_backend_synchronize(backend_src); - ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src)); - } else { - ggml_backend_synchronize(backend_src); - ggml_backend_tensor_copy(src, dst); - ggml_backend_synchronize(backend_dst); - } + // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy + ggml_backend_synchronize(backend_src); + ggml_backend_synchronize(backend_dst); + ggml_backend_tensor_copy(src, dst); } // events @@ -1782,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); + // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events + // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface + if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { + ggml_backend_synchronize(input_backend); + if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); + } else { + ggml_backend_synchronize(split_backend); + } + ggml_backend_tensor_copy(input, input_cpy); + } } } diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index a00a7af6c..682c30d45 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2358,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, } GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { - GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst)); - ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; - if (!ggml_backend_buffer_is_cuda(src->buffer)) { + if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) { return false; } - if (!ggml_backend_buffer_is_cuda(dst->buffer)) { + if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) { return false; } - // device -> device + // device -> device copy ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context; ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context; + ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; + ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; + + if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { +#ifndef NDEBUG + GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); +#endif + return false; + } + if (backend_src != backend_dst) { - ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; - ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; - - GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device); - GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device); - // copy on src stream if (cuda_ctx_src->device == cuda_ctx_dst->device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } else { #ifdef GGML_CUDA_NO_PEER_COPY return false; @@ -2393,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ #endif } - // record event on src stream + // record event on src stream after the copy if (!cuda_ctx_src->copy_event) { ggml_cuda_set_device(cuda_ctx_src->device); CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming)); @@ -2405,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0)); } else { // src and dst are on the same backend - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } return true; } From 15fa07a5c564d3ed7e7eb64b73272cedb27e73ec Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 7 Aug 2024 18:24:05 +0200 Subject: [PATCH 13/13] make : use C compiler to build metal embed object (#8899) * make : use C compiler to build metal embed object * use rm + rmdir to avoid -r flag in rm --- Makefile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index f4ce4f1fb..d41ebfd42 100644 --- a/Makefile +++ b/Makefile @@ -888,15 +888,16 @@ ggml/src/ggml-metal-embed.o: \ ggml/src/ggml-common.h @echo "Embedding Metal library" @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal - $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) - @$(AS) $(TEMP_ASSEMBLY) -o $@ - @rm -f ${TEMP_ASSEMBLY} + $(eval TEMP_ASSEMBLY=$(shell mktemp -d)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ + @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s + @rmdir ${TEMP_ASSEMBLY} endif endif # GGML_METAL