update multi adaptation readme

zhhan 2024-07-10 11:19:48 -07:00
parent ec9e5c7974
commit c09c574d13
3 changed files with 27 additions and 39 deletions

View file

@@ -1,40 +1,33 @@
-Server multi adaptations for different scenario.
+# Server Multi Adaptations for Different Scenarios
 ## Goal
-Service multi scenarios on memory constrained devices. The offline models are in the same folder. Use the -mpa parameter to pass the alias and model path. Split the gguf model as below:
+Serve multiple scenarios on memory-constrained devices. The GGUF models are stored in the same folder.
+## Usage
+Use the `-mpa` parameter to pass the alias and model path.
-## Foundation model
+### Flag to Switch Derived Model
+```c
+llama_ctx_switch_derived_model(ctx, "summarize");
+```
-The foundation model contains all the weights parameters used by the runtime. It play as shared split and will be referenced by other gguf models.
+### Pass Model Path and Alias for Derived Models
+```sh
+llama_multi-adaptation.exe -m models\Phi-3-mini-4k-instruct-adaptor-base.gguf \
+  -mpa code_writer=models\Phi-3-mini-4k-instruct-adaptor-code_writer.gguf \
+  -mpa summarize=models\Phi-3-mini-4k-instruct-adaptor-summarization.gguf
+```
-model-adaptor-taskA.gguf + model-foundation.gguf
-model-adaptor-taskB.gguf + model-foundation.gguf
-model-adaptor-base.gguf + model-foundation.gguf
-## Model adaptation
-Contains partial collections of the model weights that are overlaid onto the foundation model. These adaptation weights can be load dynamically and swapped out based on the usage. The benefit is that it supports hosting multiple scenarios while keeping only one copy of the shared weights in memory.
+## Foundation Model
+The **foundation** GGUF contains the weights shared across models.
+The **adaptor** GGUF contains the task-specific weights.
+Here are the combinations for hosting three models:
+- `model-adaptor-base.gguf (0.77GB) + model-foundation.gguf (1.56GB)`
+- `model-adaptor-taskA.gguf + model-foundation.gguf`
+- `model-adaptor-taskB.gguf + model-foundation.gguf`
+With the benefit of `mmap`, the task-specific GGUF is only loaded when the corresponding task is called.
 ## Example
-Use the gguf splits in this model repo: https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf
+Use the GGUF splits in this model repository: [Phi-3-mini-4k-instruct_multi-adaptor_gguf](https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf)
+Configuration to run multi-adaptation in Visual Studio:
+{
+  "type": "default",
+  "project": "CMakeLists.txt",
+  "projectTarget": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
+  "name": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
+  "args": [
+    "-ngl 32",
+    "-m models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-base.gguf",
+    "-mpa codewriter=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-code_writer.gguf",
+    "-mpa summarize=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-summarization.gguf",
+    "-p \u0022\u003C|user|\u003EHow to explain Internet for a medieval knight?\u003C|end|\u003E\u003C|assistant|\u003E\u0022",
+    "--color",
+    "-c 4096",
+    "--temp 0.7",
+    "--repeat_penalty 1.1",
+    "-n 256"
+  ]
+}

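For a rough sense of the memory saving, using the sizes listed in the README and assuming each task adaptor is about the size of the 0.77 GB base adaptor: three independent full models would each carry their own copy of the shared weights, roughly 3 x (1.56 + 0.77) = 7.0 GB, while the split layout keeps one 1.56 GB foundation plus one resident 0.77 GB adaptor at a time, about 2.3 GB.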
View file

@@ -9,11 +9,6 @@
 #include <stdio.h>
 #include <stdbool.h>
-#ifdef _WIN32
-#else
-#include <sys/stat.h>
-#endif // _WIN32
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
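For orientation, the surviving `#ifdef LLAMA_SHARED` lines are the top of llama.cpp's standard symbol-export block; abridged from the upstream header, it continues roughly as:

```cpp
#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define LLAMA_API __declspec(dllexport)  // building the DLL itself
#        else
#            define LLAMA_API __declspec(dllimport)  // consuming the DLL
#        endif
#    else
#        define LLAMA_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define LLAMA_API  // static build: no decoration needed
#endif
```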

View file

@@ -3549,10 +3549,10 @@ struct llama_model_loader {
     char split_prefix[PATH_MAX] = {0};
     char foundation_prefix[PATH_MAX] = { 0 };
-    // Two split mode:
-    // - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf, prefix is abc, postfix is 00001-of-00002, 00002-of-00002
-    // - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf, prefix is abc, postfix is -foundation, -adaptor-task-x, -adaptor-task-y
+    // model-foundation.gguf, model-adaptor-task-x.gguf, model-adaptor-task-y.gguf
     bool foundation_mode = false;
     if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
         foundation_mode = true;
     }
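The rewritten comment documents the naming convention, but `llama_foundation_prefix()` itself is outside this hunk. Purely as a hypothetical sketch (the helper name is real, its body below is guessed from the convention), prefix recovery could look like:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>

// Hypothetical sketch only: recover the shared prefix from an adaptor path
// by locating the "-adaptor-" marker. The real llama_foundation_prefix()
// implementation is not shown in this diff.
static bool foundation_prefix_sketch(char * dest, size_t n, const char * path) {
    const char * marker = strstr(path, "-adaptor-");
    if (marker == nullptr) {
        return false;      // not an adaptor split
    }
    const size_t len = (size_t)(marker - path);
    if (len + 1 > n) {
        return false;      // destination buffer too small
    }
    memcpy(dest, path, len);
    dest[len] = '\0';      // e.g. "models/model"
    return true;
}

int main() {
    char prefix[260];
    if (foundation_prefix_sketch(prefix, sizeof(prefix), "models/model-adaptor-task-x.gguf")) {
        // prints "models/model-foundation.gguf"
        printf("%s-foundation.gguf\n", prefix);
    }
    return 0;
}
```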
@@ -12594,8 +12594,8 @@ static struct ggml_cgraph * llama_build_graph(
     const llama_batch & batch,
     bool worst_case) {
     const auto& foundation_model = lctx.model;
-    const llama_model* model_ptr = nullptr;
     const char* model_name = lctx.cparams.derived_model_name.c_str();
+    const llama_model* model_ptr = nullptr;
     for (const auto& model : lctx.derived_models) {
         if (model->name == model_name) {
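The reorder above moves `model_ptr` next to the loop that assigns it. The selection logic it feeds is simple: match the context's configured alias against the loaded derived models, otherwise fall back to the foundation weights. A minimal self-contained sketch of that pattern, with stand-in types rather than llama.cpp's real ones:

```cpp
#include <string>
#include <vector>

// Stand-in type for illustration only; the real code uses llama_model.
struct toy_model {
    std::string name;
};

// Return the derived model matching `wanted`, or the foundation model
// if no derived model with that alias has been loaded.
static const toy_model * pick_model(const toy_model & foundation,
                                    const std::vector<const toy_model *> & derived,
                                    const std::string & wanted) {
    for (const toy_model * m : derived) {
        if (m->name == wanted) {
            return m;       // alias matched, e.g. "summarize"
        }
    }
    return &foundation;     // default: shared weights only
}

int main() {
    toy_model foundation{"base"};
    toy_model summarizer{"summarize"};
    std::vector<const toy_model *> derived = { &summarizer };
    const toy_model * active = pick_model(foundation, derived, "summarize");
    return active == &summarizer ? 0 : 1;  // exercises the alias match
}
```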