update multi adaptation readme
This commit is contained in:
parent ec9e5c7974
commit c09c574d13
3 changed files with 27 additions and 39 deletions
@@ -1,40 +1,33 @@
# Server Multi Adaptations for Different Scenarios

## Goal
Service multiple scenarios on memory-constrained devices. The GGUF models are stored in the same folder.

## Usage
Use the `-mpa` parameter to pass the alias and model path.
### Flag to Switch Derived Model
```c
llama_ctx_switch_derived_model(ctx, "summarize");
```
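A minimal usage sketch, assuming aliases registered with `-mpa` as in the next section; only `llama_ctx_switch_derived_model` comes from this patch, and the dispatch scaffolding around it is hypothetical:

```c
// Hypothetical per-request dispatch: pick the adaptor for the incoming task.
// The alias ("summarize", "code_writer", ...) must match one passed via -mpa.
#include "llama.h"  // this fork's header, which declares the switch call

static void run_task(struct llama_context * ctx, const char * task_alias) {
    // Overlay the task-specific split; the foundation weights stay resident.
    llama_ctx_switch_derived_model(ctx, task_alias);
    // ... tokenize the prompt and call llama_decode() as usual ...
}
```
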
### Pass Model Path and Alias for Derived Models
```sh
llama_multi-adaptation.exe -m models\Phi-3-mini-4k-instruct-adaptor-base.gguf \
  -mpa code_writer=models\Phi-3-mini-4k-instruct-adaptor-code_writer.gguf \
  -mpa summarize=models\Phi-3-mini-4k-instruct-adaptor-summarization.gguf
```
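Each `-mpa` argument has the form `alias=path`; the alias on the left is the name that `llama_ctx_switch_derived_model` selects at runtime, so the strings must match exactly.
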
## Foundation Model
The **foundation** GGUF contains the weights shared across models; it acts as the shared split that the other GGUF files reference.

The **adaptor** GGUF contains the task-specific weights.

## Model Adaptation
Here are the combinations for hosting three models (a rough memory estimate follows the list):
- `model-adaptor-base.gguf (0.77GB) + model-foundation.gguf (1.56GB)`
- `model-adaptor-taskA.gguf + model-foundation.gguf`
- `model-adaptor-taskB.gguf + model-foundation.gguf`
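As a rough estimate (assuming each task adaptor is about the size of the 0.77GB base adaptor; the taskA and taskB sizes are not listed here): hosting three standalone models would take about 3 × (0.77 + 1.56) ≈ 7.0GB, while sharing the foundation needs at most 1.56 + 3 × 0.77 ≈ 3.9GB, and less in practice because only the active adaptor has to be resident.
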
Each adaptor contains a partial collection of the model weights that is overlaid onto the foundation model. These adaptation weights can be loaded dynamically and swapped out based on usage. The benefit is that this supports hosting multiple scenarios while keeping only one copy of the shared weights in memory; thanks to `mmap`, the task-specific GGUF is only loaded when the corresponding task is called.
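The overlay can be pictured as a name-keyed lookup in which adaptor tensors shadow foundation tensors. The sketch below is illustrative only; it is not the loader code from this patch, and all names in it are made up:

```c
// Illustrative overlay lookup: tensor names are the join key, and an
// adaptor entry with a given name shadows the foundation entry.
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct tensor_entry {
    const char  * name;
    const float * data;
};

static const float * lookup_tensor(const char * name,
                                   const struct tensor_entry * adaptor,    size_t n_adaptor,
                                   const struct tensor_entry * foundation, size_t n_foundation) {
    for (size_t i = 0; i < n_adaptor; i++)        // task-specific weights win
        if (strcmp(adaptor[i].name, name) == 0) return adaptor[i].data;
    for (size_t i = 0; i < n_foundation; i++)     // otherwise use the shared split
        if (strcmp(foundation[i].name, name) == 0) return foundation[i].data;
    return NULL;                                  // unknown tensor name
}

int main(void) {
    const float shared[] = {1.0f};
    const float tuned[]  = {2.0f};
    const struct tensor_entry foundation[] = { {"attn.w", shared}, {"ffn.w", shared} };
    const struct tensor_entry adaptor[]    = { {"attn.w", tuned} };
    // "attn.w" resolves to the adaptor copy; "ffn.w" falls through to the foundation.
    printf("%.1f %.1f\n",
           lookup_tensor("attn.w", adaptor, 1, foundation, 2)[0],
           lookup_tensor("ffn.w",  adaptor, 1, foundation, 2)[0]);  // prints: 2.0 1.0
    return 0;
}
```
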
## Example
Configuration to run multi-adaptation in Visual Studio:
```json
{
  "type": "default",
  "project": "CMakeLists.txt",
  "projectTarget": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
  "name": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
  "args": [
    "-ngl 32",
    "-m models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-base.gguf",
    "-mpa codewriter=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-code_writer.gguf",
    "-mpa summarize=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-summarization.gguf",
    "-p \u0022\u003C|user|\u003EHow to explain Internet for a medieval knight?\u003C|end|\u003E\u003C|assistant|\u003E\u0022",
    "--color",
    "-c 4096",
    "--temp 0.7",
    "--repeat_penalty 1.1",
    "-n 256"
  ]
}
```
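These `args` mirror the command line from the Usage section: `-ngl 32` offloads 32 layers to the GPU, and the escaped `-p` value decodes to the Phi-3 chat prompt `"<|user|>How to explain Internet for a medieval knight?<|end|><|assistant|>"`.
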
Use the GGUF splits in this model repository: [Phi-3-mini-4k-instruct_multi-adaptor_gguf](https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf)
@@ -9,11 +9,6 @@
```c
#include <stdio.h>
#include <stdbool.h>

#ifdef _WIN32
#else
#include <sys/stat.h>
#endif // _WIN32

#ifdef LLAMA_SHARED
#  if defined(_WIN32) && !defined(__MINGW32__)
#    ifdef LLAMA_BUILD
```
@@ -3549,10 +3549,10 @@ struct llama_model_loader {
```cpp
char split_prefix[PATH_MAX]      = {0};
char foundation_prefix[PATH_MAX] = {0};

// Two split modes:
// - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf: the prefix is abc, the postfixes are 00001-of-00002 and 00002-of-00002
// - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf: the prefix is abc, the postfixes are -foundation, -adaptor-task-x and -adaptor-task-y
bool foundation_mode = false;

if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
    foundation_mode = true;
}
```
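`llama_foundation_prefix` itself is not shown in this hunk; a minimal sketch of the naming check the comments describe, with all details assumed, might look like this:

```c
// Assumed behavior: strip a "-foundation" / "-adaptor-*" postfix from the
// file name and report the shared family prefix. Not the patch's real code.
#include <stdio.h>
#include <string.h>

static int foundation_prefix(char * prefix, size_t max_len, const char * fname) {
    const char * postfix = strstr(fname, "-foundation.gguf");
    if (postfix == NULL) postfix = strstr(fname, "-adaptor-");
    if (postfix == NULL) return 0;                // not a foundation-style name
    size_t n = (size_t)(postfix - fname);
    if (n + 1 > max_len) return 0;                // prefix buffer too small
    memcpy(prefix, fname, n);
    prefix[n] = '\0';                             // e.g. "models/abc"
    return 1;
}

int main(void) {
    char prefix[260];
    if (foundation_prefix(prefix, sizeof(prefix), "models/abc-adaptor-task-x.gguf")) {
        printf("family prefix: %s\n", prefix);    // -> models/abc
    }
    return 0;
}
```
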
@@ -12594,8 +12594,8 @@ static struct ggml_cgraph * llama_build_graph(
```cpp
        const llama_batch & batch,
                  bool worst_case) {
    const auto & foundation_model = lctx.model;
    const char * model_name = lctx.cparams.derived_model_name.c_str();
    const llama_model * model_ptr = nullptr;

    for (const auto & model : lctx.derived_models) {
        if (model->name == model_name) {
```
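The hunk is cut off here, but the loop appears to select the derived model whose name matches `cparams.derived_model_name`. A standalone sketch of that selection, with the fallback to the foundation model assumed rather than shown:

```c
// Assumed selection logic: match a derived model by name, otherwise build
// the graph against the foundation model. Types here are stand-ins.
#include <stdio.h>
#include <string.h>

struct toy_model { const char * name; };

static const struct toy_model * pick_model(const struct toy_model *  foundation,
                                           const struct toy_model ** derived, size_t n,
                                           const char * wanted) {
    for (size_t i = 0; i < n; i++)
        if (strcmp(derived[i]->name, wanted) == 0)
            return derived[i];                    // alias matched
    return foundation;                            // assumed fallback
}

int main(void) {
    struct toy_model base = {"base"}, summarize = {"summarize"};
    const struct toy_model * derived[] = {&summarize};
    printf("%s\n", pick_model(&base, derived, 1, "summarize")->name); // summarize
    printf("%s\n", pick_model(&base, derived, 1, "translate")->name); // base
    return 0;
}
```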