update multi adaptation readme
This commit is contained in:
parent ec9e5c7974
commit c09c574d13
3 changed files with 27 additions and 39 deletions
@@ -1,40 +1,33 @@
# Server Multi Adaptations for Different Scenarios

## Goal
Service multiple scenarios on memory-constrained devices. The GGUF models are stored in the same folder.

## Usage
Use the `-mpa` parameter to pass the alias and model path.
### Flag to Switch Derived Model
```c
llama_ctx_switch_derived_model(ctx, "summarize");
```
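A minimal usage sketch, assuming aliases registered with `-mpa` as in the next section; only `llama_ctx_switch_derived_model` comes from this patch, and the dispatch scaffolding around it is hypothetical:

```c
// Hypothetical per-request dispatch: pick the adaptor for the incoming task.
// The alias ("summarize", "code_writer", ...) must match one passed via -mpa.
#include "llama.h"  // this fork's header, which declares the switch call

static void run_task(struct llama_context * ctx, const char * task_alias) {
    // Overlay the task-specific split; the foundation weights stay resident.
    llama_ctx_switch_derived_model(ctx, task_alias);
    // ... tokenize the prompt and call llama_decode() as usual ...
}
```
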
### Pass Model Path and Alias for Derived Models
```sh
llama_multi-adaptation.exe -m models\Phi-3-mini-4k-instruct-adaptor-base.gguf \
  -mpa code_writer=models\Phi-3-mini-4k-instruct-adaptor-code_writer.gguf \
  -mpa summarize=models\Phi-3-mini-4k-instruct-adaptor-summarization.gguf
```
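Each `-mpa` argument has the form `alias=path`; the alias on the left is the name that `llama_ctx_switch_derived_model` selects at runtime, so the strings must match exactly.
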
## Foundation Model
The **foundation** GGUF contains the weights shared across models; it acts as the shared split that the other GGUF files reference.

The **adaptor** GGUF contains the task-specific weights.

## Model Adaptation
Here are the combinations for hosting three models (a rough memory estimate follows the list):
- `model-adaptor-base.gguf (0.77GB) + model-foundation.gguf (1.56GB)`
- `model-adaptor-taskA.gguf + model-foundation.gguf`
- `model-adaptor-taskB.gguf + model-foundation.gguf`
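As a rough estimate (assuming each task adaptor is about the size of the 0.77GB base adaptor; the taskA and taskB sizes are not listed here): hosting three standalone models would take about 3 × (0.77 + 1.56) ≈ 7.0GB, while sharing the foundation needs at most 1.56 + 3 × 0.77 ≈ 3.9GB, and less in practice because only the active adaptor has to be resident.
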
Each adaptor contains a partial collection of the model weights that is overlaid onto the foundation model. These adaptation weights can be loaded dynamically and swapped out based on usage. The benefit is that this supports hosting multiple scenarios while keeping only one copy of the shared weights in memory; thanks to `mmap`, the task-specific GGUF is only loaded when the corresponding task is called.
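The overlay can be pictured as a name-keyed lookup in which adaptor tensors shadow foundation tensors. The sketch below is illustrative only; it is not the loader code from this patch, and all names in it are made up:

```c
// Illustrative overlay lookup: tensor names are the join key, and an
// adaptor entry with a given name shadows the foundation entry.
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct tensor_entry {
    const char  * name;
    const float * data;
};

static const float * lookup_tensor(const char * name,
                                   const struct tensor_entry * adaptor,    size_t n_adaptor,
                                   const struct tensor_entry * foundation, size_t n_foundation) {
    for (size_t i = 0; i < n_adaptor; i++)        // task-specific weights win
        if (strcmp(adaptor[i].name, name) == 0) return adaptor[i].data;
    for (size_t i = 0; i < n_foundation; i++)     // otherwise use the shared split
        if (strcmp(foundation[i].name, name) == 0) return foundation[i].data;
    return NULL;                                  // unknown tensor name
}

int main(void) {
    const float shared[] = {1.0f};
    const float tuned[]  = {2.0f};
    const struct tensor_entry foundation[] = { {"attn.w", shared}, {"ffn.w", shared} };
    const struct tensor_entry adaptor[]    = { {"attn.w", tuned} };
    // "attn.w" resolves to the adaptor copy; "ffn.w" falls through to the foundation.
    printf("%.1f %.1f\n",
           lookup_tensor("attn.w", adaptor, 1, foundation, 2)[0],
           lookup_tensor("ffn.w",  adaptor, 1, foundation, 2)[0]);  // prints: 2.0 1.0
    return 0;
}
```
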
## Example
Configuration to run multi-adaptation in Visual Studio:
```json
{
  "type": "default",
  "project": "CMakeLists.txt",
  "projectTarget": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
  "name": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
  "args": [
    "-ngl 32",
    "-m models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-base.gguf",
    "-mpa codewriter=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-code_writer.gguf",
    "-mpa summarize=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-summarization.gguf",
    "-p \u0022\u003C|user|\u003EHow to explain Internet for a medieval knight?\u003C|end|\u003E\u003C|assistant|\u003E\u0022",
    "--color",
    "-c 4096",
    "--temp 0.7",
    "--repeat_penalty 1.1",
    "-n 256"
  ]
}
```
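These `args` mirror the command line from the Usage section: `-ngl 32` offloads 32 layers to the GPU, and the escaped `-p` value decodes to the Phi-3 chat prompt `"<|user|>How to explain Internet for a medieval knight?<|end|><|assistant|>"`.
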
Use the GGUF splits in this model repository: [Phi-3-mini-4k-instruct_multi-adaptor_gguf](https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf)
@@ -9,11 +9,6 @@
```c
#include <stdio.h>
#include <stdbool.h>

#ifdef _WIN32
#else
#include <sys/stat.h>
#endif // _WIN32

#ifdef LLAMA_SHARED
#  if defined(_WIN32) && !defined(__MINGW32__)
#    ifdef LLAMA_BUILD
```
@@ -3549,10 +3549,10 @@ struct llama_model_loader {
```cpp
char split_prefix[PATH_MAX]      = {0};
char foundation_prefix[PATH_MAX] = {0};

// Two split modes:
// - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf: the prefix is abc, the postfixes are 00001-of-00002 and 00002-of-00002
// - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf: the prefix is abc, the postfixes are -foundation, -adaptor-task-x and -adaptor-task-y
bool foundation_mode = false;

if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
    foundation_mode = true;
}
```
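`llama_foundation_prefix` itself is not shown in this hunk; a minimal sketch of the naming check the comments describe, with all details assumed, might look like this:

```c
// Assumed behavior: strip a "-foundation" / "-adaptor-*" postfix from the
// file name and report the shared family prefix. Not the patch's real code.
#include <stdio.h>
#include <string.h>

static int foundation_prefix(char * prefix, size_t max_len, const char * fname) {
    const char * postfix = strstr(fname, "-foundation.gguf");
    if (postfix == NULL) postfix = strstr(fname, "-adaptor-");
    if (postfix == NULL) return 0;                // not a foundation-style name
    size_t n = (size_t)(postfix - fname);
    if (n + 1 > max_len) return 0;                // prefix buffer too small
    memcpy(prefix, fname, n);
    prefix[n] = '\0';                             // e.g. "models/abc"
    return 1;
}

int main(void) {
    char prefix[260];
    if (foundation_prefix(prefix, sizeof(prefix), "models/abc-adaptor-task-x.gguf")) {
        printf("family prefix: %s\n", prefix);    // -> models/abc
    }
    return 0;
}
```
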
@@ -12594,8 +12594,8 @@ static struct ggml_cgraph * llama_build_graph(
```cpp
        const llama_batch & batch,
                  bool worst_case) {
    const auto & foundation_model = lctx.model;
    const char * model_name = lctx.cparams.derived_model_name.c_str();
    const llama_model * model_ptr = nullptr;

    for (const auto & model : lctx.derived_models) {
        if (model->name == model_name) {
```
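The hunk is cut off here, but the loop appears to select the derived model whose name matches `cparams.derived_model_name`. A standalone sketch of that selection, with the fallback to the foundation model assumed rather than shown:

```c
// Assumed selection logic: match a derived model by name, otherwise build
// the graph against the foundation model. Types here are stand-ins.
#include <stdio.h>
#include <string.h>

struct toy_model { const char * name; };

static const struct toy_model * pick_model(const struct toy_model *  foundation,
                                           const struct toy_model ** derived, size_t n,
                                           const char * wanted) {
    for (size_t i = 0; i < n; i++)
        if (strcmp(derived[i]->name, wanted) == 0)
            return derived[i];                    // alias matched
    return foundation;                            // assumed fallback
}

int main(void) {
    struct toy_model base = {"base"}, summarize = {"summarize"};
    const struct toy_model * derived[] = {&summarize};
    printf("%s\n", pick_model(&base, derived, 1, "summarize")->name); // summarize
    printf("%s\n", pick_model(&base, derived, 1, "translate")->name); // base
    return 0;
}
```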