From c09c574d1326d8d1f2741220d8e1c5be2b7fcb2d Mon Sep 17 00:00:00 2001
From: zhhan
Date: Wed, 10 Jul 2024 11:19:48 -0700
Subject: [PATCH] update multi-adaptation readme

---
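Note for reviewers: a minimal sketch of the call flow this patch enables, assuming the standard llama.cpp C loading API. `llama_ctx_switch_derived_model()` is the new call documented in the README hunk below; the registration of `-mpa` aliases onto the context (done by the example's startup code) is glossed over here, so this is an illustration rather than the example's exact code.

```c
// Sketch only: error handling trimmed; assumes the -mpa aliases
// ("code_writer", "summarize") have already been registered on the
// context by the example's argument parsing.
#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_backend_init();

    // Load the base adaptor split; the matching model-foundation.gguf is
    // resolved from the shared filename prefix at load time.
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(
            "models/Phi-3-mini-4k-instruct-adaptor-base.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    struct llama_context_params cparams = llama_context_default_params();
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // Overlay the task-specific weights registered under the "summarize"
    // alias; with mmap, only the pages of that adaptor actually touched
    // get loaded.
    llama_ctx_switch_derived_model(ctx, "summarize");

    // ... tokenize, llama_decode(), and sample as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```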
 examples/multi-adaptation/README.md | 53 +++++++++++++----------------
 include/llama.h                     |  5 ---
 src/llama.cpp                       |  8 ++---
 3 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/examples/multi-adaptation/README.md b/examples/multi-adaptation/README.md
index 9903ac6fc..acd25a5f3 100644
--- a/examples/multi-adaptation/README.md
+++ b/examples/multi-adaptation/README.md
@@ -1,40 +1,33 @@
-Server multi adaptations for different scenario.
+# Server Multi-Adaptation for Different Scenarios
 
 ## Goal
-Service multi scenarios on memory constrained devices. The offline models are in the same folder. Use the -mpa parameter to pass the alias and model path. Split the gguf model as below:
+Serve multiple scenarios on memory-constrained devices. The GGUF models are stored in the same folder.
+
+## Usage
+Use the `-mpa` parameter to pass the alias and model path.
 
-## Foundation model
+### Flag to Switch Derived Model
+```c
+llama_ctx_switch_derived_model(ctx, "summarize");
+```
 
-The foundation model contains all the weights parameters used by the runtime. It play as shared split and will be referenced by other gguf models.
+### Pass Model Path and Alias for Derived Models
+```sh
+llama_multi-adaptation.exe -m models\Phi-3-mini-4k-instruct-adaptor-base.gguf \
+  -mpa code_writer=models\Phi-3-mini-4k-instruct-adaptor-code_writer.gguf \
+  -mpa summarize=models\Phi-3-mini-4k-instruct-adaptor-summarization.gguf
+```
 
-model-adaptor-taskA.gguf + model-foundation.gguf
-model-adaptor-taskB.gguf + model-foundation.gguf
-model-adaptor-base.gguf + model-foundation.gguf
+## Foundation Model
+The **foundation** GGUF contains the weights shared across all derived models.
+The **adaptor** GGUF contains the task-specific weights.
 
-## Model adaptation
+Here are the combinations for hosting three models:
+- `model-adaptor-base.gguf (0.77GB) + model-foundation.gguf (1.56GB)`
+- `model-adaptor-taskA.gguf + model-foundation.gguf`
+- `model-adaptor-taskB.gguf + model-foundation.gguf`
 
-Contains partial collections of the model weights that are overlaid onto the foundation model. These adaptation weights can be load dynamically and swapped out based on the usage.
+The benefit is that multiple scenarios can be hosted while keeping only one copy of the shared weights in memory. Thanks to `mmap`, a task-specific GGUF is only paged in when the corresponding task is invoked.
 
 ## Example
-Use the gguf splits in this model repo: https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf
-
-Configuration to run multi-adaptation in visual studio:
-
-{
-  "type": "default",
-  "project": "CMakeLists.txt",
-  "projectTarget": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
-  "name": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
-  "args": [
-    "-ngl 32",
-    "-m models\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-base.gguf",
-    "-mpa codewriter=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-code_writer.gguf",
-    "-mpa summarize=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-summarization.gguf",
-    "-p \u0022\u003C|user|\u003EHow to explain Internet for a medieval knight?\u003C|end|\u003E\u003C|assistant|\u003E\u0022",
-    "--color",
-    "-c 4096",
-    "--temp 0.7",
-    "--repeat_penalty 1.1",
-    "-n 256"
-  ]
-}
+Use the GGUF splits in this model repository: [Phi-3-mini-4k-instruct_multi-adaptor_gguf](https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf)
diff --git a/include/llama.h b/include/llama.h
index 0178b1a8e..a4e40bf23 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -9,11 +9,6 @@
 #include
 #include
 
-#ifdef _WIN32
-#else
-#include
-#endif // _WIN32
-
 #ifdef LLAMA_SHARED
 # if defined(_WIN32) && !defined(__MINGW32__)
 #  ifdef LLAMA_BUILD
diff --git a/src/llama.cpp b/src/llama.cpp
index 853da0321..94b95ad49 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3549,10 +3549,10 @@ struct llama_model_loader {
         char split_prefix[PATH_MAX] = {0};
         char foundation_prefix[PATH_MAX] = { 0 };
 
-        // Two split mode:
-        // - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf, prefix is abc, postfix is 00001-of-00002, 00002-of-00002
-        // - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf, prefix is abc, postfix is -foundation, -adaptor-task-x, -adaptor-task-y
+
+        // Foundation mode: model-foundation.gguf, model-adaptor-task-x.gguf, model-adaptor-task-y.gguf
         bool foundation_mode = false;
+
         if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
             foundation_mode = true;
         }
@@ -12594,8 +12594,8 @@ static struct ggml_cgraph * llama_build_graph(
         const llama_batch & batch,
         bool worst_case) {
     const auto& foundation_model = lctx.model;
-    const llama_model* model_ptr = nullptr;
     const char* model_name = lctx.cparams.derived_model_name.c_str();
+    const llama_model* model_ptr = nullptr;
     for (const auto& model : lctx.derived_models) {
         if (model->name == model_name) {
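
For context on the filename convention that the foundation-mode check above keys off (`model-adaptor-<task>.gguf` paired with `model-foundation.gguf`): the sketch below is a hypothetical helper in the same spirit as `llama_foundation_prefix()`, not the patch's actual implementation, showing how an adaptor path maps to its foundation counterpart.

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

// Illustrative only: derive the foundation-model path from an adaptor
// path by replacing the "-adaptor-<task>.gguf" suffix with
// "-foundation.gguf". The real logic lives in llama_foundation_prefix().
static bool foundation_path_from_adaptor(char * dst, size_t n, const char * fname) {
    const char * marker = strstr(fname, "-adaptor-");
    if (marker == NULL) {
        return false; // not an adaptor split, so no foundation counterpart
    }
    size_t prefix_len = (size_t)(marker - fname);
    int written = snprintf(dst, n, "%.*s-foundation.gguf", (int)prefix_len, fname);
    return written > 0 && (size_t)written < n;
}

int main(void) {
    char path[512];
    if (foundation_path_from_adaptor(path, sizeof(path), "models/model-adaptor-taskA.gguf")) {
        printf("%s\n", path); // prints: models/model-foundation.gguf
    }
    return 0;
}
```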