From c09c574d1326d8d1f2741220d8e1c5be2b7fcb2d Mon Sep 17 00:00:00 2001
From: zhhan
Date: Wed, 10 Jul 2024 11:19:48 -0700
Subject: [PATCH] update multi-adaptation readme

---
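Note for reviewers: a minimal sketch of the call flow this patch enables, assuming the standard llama.cpp C loading API. `llama_ctx_switch_derived_model()` is the new call documented in the README hunk below; the registration of `-mpa` aliases onto the context (done by the example's startup code) is glossed over here, so this is an illustration rather than the example's exact code.

```c
// Sketch only: error handling trimmed; assumes the -mpa aliases
// ("code_writer", "summarize") have already been registered on the
// context by the example's argument parsing.
#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_backend_init();

    // Load the base adaptor split; the matching model-foundation.gguf is
    // resolved from the shared filename prefix at load time.
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(
            "models/Phi-3-mini-4k-instruct-adaptor-base.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    struct llama_context_params cparams = llama_context_default_params();
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // Overlay the task-specific weights registered under the "summarize"
    // alias; with mmap, only the pages of that adaptor actually touched
    // get loaded.
    llama_ctx_switch_derived_model(ctx, "summarize");

    // ... tokenize, llama_decode(), and sample as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```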
 examples/multi-adaptation/README.md | 53 +++++++++++++----------------
 include/llama.h                     |  5 ---
 src/llama.cpp                       |  8 ++---
 3 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/examples/multi-adaptation/README.md b/examples/multi-adaptation/README.md
index 9903ac6fc..acd25a5f3 100644
--- a/examples/multi-adaptation/README.md
+++ b/examples/multi-adaptation/README.md
@@ -1,40 +1,33 @@
-Server multi adaptations for different scenario.
+# Server Multi-Adaptation for Different Scenarios
 
 ## Goal
-Service multi scenarios on memory constrained devices. The offline models are in the same folder. Use the -mpa parameter to pass the alias and model path. Split the gguf model as below:
+Serve multiple scenarios on memory-constrained devices. The GGUF models are stored in the same folder.
+
+## Usage
+Use the `-mpa` parameter to pass the alias and model path.
 
-## Foundation model
+### Flag to Switch Derived Model
+```c
+llama_ctx_switch_derived_model(ctx, "summarize");
+```
 
-The foundation model contains all the weights parameters used by the runtime. It play as shared split and will be referenced by other gguf models.
+### Pass Model Path and Alias for Derived Models
+```sh
+llama_multi-adaptation.exe -m models\Phi-3-mini-4k-instruct-adaptor-base.gguf \
+  -mpa code_writer=models\Phi-3-mini-4k-instruct-adaptor-code_writer.gguf \
+  -mpa summarize=models\Phi-3-mini-4k-instruct-adaptor-summarization.gguf
+```
 
-model-adaptor-taskA.gguf + model-foundation.gguf
-model-adaptor-taskB.gguf + model-foundation.gguf
-model-adaptor-base.gguf + model-foundation.gguf
+## Foundation Model
+The **foundation** GGUF contains the weights shared across all derived models.
+The **adaptor** GGUF contains the task-specific weights.
 
-## Model adaptation
+Here are the combinations for hosting three models:
+- `model-adaptor-base.gguf (0.77GB) + model-foundation.gguf (1.56GB)`
+- `model-adaptor-taskA.gguf + model-foundation.gguf`
+- `model-adaptor-taskB.gguf + model-foundation.gguf`
 
-Contains partial collections of the model weights that are overlaid onto the foundation model. These adaptation weights can be load dynamically and swapped out based on the usage.
+The benefit is that multiple scenarios can be hosted while keeping only one copy of the shared weights in memory. Thanks to `mmap`, a task-specific GGUF is only paged in when the corresponding task is invoked.
 
 ## Example
-Use the gguf splits in this model repo: https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf
-
-Configuration to run multi-adaptation in visual studio:
-
-{
-  "type": "default",
-  "project": "CMakeLists.txt",
-  "projectTarget": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
-  "name": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
-  "args": [
-    "-ngl 32",
-    "-m models\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-base.gguf",
-    "-mpa codewriter=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-code_writer.gguf",
-    "-mpa summarize=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-summarization.gguf",
-    "-p \u0022\u003C|user|\u003EHow to explain Internet for a medieval knight?\u003C|end|\u003E\u003C|assistant|\u003E\u0022",
-    "--color",
-    "-c 4096",
-    "--temp 0.7",
-    "--repeat_penalty 1.1",
-    "-n 256"
-  ]
-}
+Use the GGUF splits in this model repository: [Phi-3-mini-4k-instruct_multi-adaptor_gguf](https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf)
diff --git a/include/llama.h b/include/llama.h
index 0178b1a8e..a4e40bf23 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -9,11 +9,6 @@
 #include
 #include
 
-#ifdef _WIN32
-#else
-#include
-#endif // _WIN32
-
 #ifdef LLAMA_SHARED
 # if defined(_WIN32) && !defined(__MINGW32__)
 #  ifdef LLAMA_BUILD
diff --git a/src/llama.cpp b/src/llama.cpp
index 853da0321..94b95ad49 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3549,10 +3549,10 @@ struct llama_model_loader {
         char split_prefix[PATH_MAX] = {0};
         char foundation_prefix[PATH_MAX] = { 0 };
 
-        // Two split mode:
-        // - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf, prefix is abc, postfix is 00001-of-00002, 00002-of-00002
-        // - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf, prefix is abc, postfix is -foundation, -adaptor-task-x, -adaptor-task-y
+
+        // Foundation mode: model-foundation.gguf, model-adaptor-task-x.gguf, model-adaptor-task-y.gguf
         bool foundation_mode = false;
+
         if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
             foundation_mode = true;
         }
@@ -12594,8 +12594,8 @@ static struct ggml_cgraph * llama_build_graph(
         const llama_batch & batch,
         bool worst_case) {
     const auto& foundation_model = lctx.model;
-    const llama_model* model_ptr = nullptr;
     const char* model_name = lctx.cparams.derived_model_name.c_str();
+    const llama_model* model_ptr = nullptr;
     for (const auto& model : lctx.derived_models) {
         if (model->name == model_name) {
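
For context on the filename convention that the foundation-mode check above keys off (`model-adaptor-<task>.gguf` paired with `model-foundation.gguf`): the sketch below is a hypothetical helper in the same spirit as `llama_foundation_prefix()`, not the patch's actual implementation, showing how an adaptor path maps to its foundation counterpart.

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

// Illustrative only: derive the foundation-model path from an adaptor
// path by replacing the "-adaptor-<task>.gguf" suffix with
// "-foundation.gguf". The real logic lives in llama_foundation_prefix().
static bool foundation_path_from_adaptor(char * dst, size_t n, const char * fname) {
    const char * marker = strstr(fname, "-adaptor-");
    if (marker == NULL) {
        return false; // not an adaptor split, so no foundation counterpart
    }
    size_t prefix_len = (size_t)(marker - fname);
    int written = snprintf(dst, n, "%.*s-foundation.gguf", (int)prefix_len, fname);
    return written > 0 && (size_t)written < n;
}

int main(void) {
    char path[512];
    if (foundation_path_from_adaptor(path, sizeof(path), "models/model-adaptor-taskA.gguf")) {
        printf("%s\n", path); // prints: models/model-foundation.gguf
    }
    return 0;
}
```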