update multi-adaptation readme
parent ec9e5c7974
commit c09c574d13
3 changed files with 27 additions and 39 deletions

# Server Multi Adaptations for Different Scenarios

## Goal

Serve multiple scenarios on memory-constrained devices. The GGUF models are stored in the same folder.

## Usage

Use the `-mpa` parameter to pass the alias and model path.
### Flag to Switch Derived Model

```c
llama_ctx_switch_derived_model(ctx, "summarize");
```
### Pass Model Path and Alias for Derived Models

```sh
llama_multi-adaptation.exe -m models\Phi-3-mini-4k-instruct-adaptor-base.gguf \
    -mpa code_writer=models\Phi-3-mini-4k-instruct-adaptor-code_writer.gguf \
    -mpa summarize=models\Phi-3-mini-4k-instruct-adaptor-summarization.gguf
```
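
Putting the two pieces together: once the aliases are registered with `-mpa`, a server can switch adaptors per request. The following is a minimal illustrative sketch, not code from this repository; the request-handling wrapper and the elided decode loop are assumptions around the `llama_ctx_switch_derived_model` call shown above.

```cpp
#include <string>
#include "llama.h"

// Hypothetical request handler: pick the adaptor for the incoming task.
// Assumes "summarize" and "code_writer" were registered via -mpa and that
// ctx was created from the adaptor-base model.
void handle_request(llama_context * ctx, const std::string & task) {
    // Only the small task-specific weights are swapped in; the shared
    // foundation weights stay resident in memory.
    llama_ctx_switch_derived_model(ctx, task.c_str());

    // ... tokenize the prompt and run the usual decode loop ...
}
```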
## Foundation Model
The **foundation** GGUF contains the weights shared across models; it acts as a shared split that the adaptor GGUFs reference.
The **adaptor** GGUF contains the task-specific weights, which are overlaid onto the foundation model and can be loaded and swapped out dynamically.
Here are the combinations for hosting three models:

- `model-adaptor-base.gguf (0.77GB) + model-foundation.gguf (1.56GB)`
- `model-adaptor-taskA.gguf + model-foundation.gguf`
- `model-adaptor-taskB.gguf + model-foundation.gguf`

This supports hosting multiple scenarios while keeping only one copy of the shared weights in memory. Thanks to `mmap`, a task-specific GGUF is only loaded when the corresponding task is called.
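
As a rough illustration, assuming each task adaptor is comparable in size to the 0.77 GB base adaptor: hosting three standalone models would take about 3 × (1.56 GB + 0.77 GB) ≈ 7.0 GB, whereas sharing the foundation needs only about 1.56 GB + 3 × 0.77 GB ≈ 3.9 GB, and with `mmap` even the adaptors are paged in lazily.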
## Example
Use the GGUF splits in this model repository: [Phi-3-mini-4k-instruct_multi-adaptor_gguf](https://huggingface.co/zhhan/Phi-3-mini-4k-instruct_multi-adaptor_gguf)

Configuration to run multi-adaptation in Visual Studio:

```json
{
    "type": "default",
    "project": "CMakeLists.txt",
    "projectTarget": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
    "name": "llama_multi-adaptation.exe (bin\\llama_multi-adaptation.exe)",
    "args": [
        "-ngl 32",
        "-m models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-base.gguf",
        "-mpa codewriter=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-code_writer.gguf",
        "-mpa summarize=models\\phi3_adaptors\\Phi-3-mini-4k-instruct-ft-q4_att-adaptor-summarization.gguf",
        "-p \"<|user|>How to explain Internet for a medieval knight?<|end|><|assistant|>\"",
        "--color",
        "-c 4096",
        "--temp 0.7",
        "--repeat_penalty 1.1",
        "-n 256"
    ]
}
```
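
The remaining arguments are the usual llama.cpp CLI flags: `-ngl 32` offloads 32 layers to the GPU, `-c 4096` sets the context size, `-n 256` limits the number of generated tokens, and `--temp` / `--repeat_penalty` control sampling. `-m` and `-mpa` select the base adaptor and register the task aliases as described above.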

The accompanying header change removes a platform-specific `<sys/stat.h>` include block:

```diff
@@ -9,11 +9,6 @@
 #include <stdio.h>
 #include <stdbool.h>
 
-#ifdef _WIN32
-#else
-#include <sys/stat.h>
-#endif // _WIN32
-
 #ifdef LLAMA_SHARED
 # if defined(_WIN32) && !defined(__MINGW32__)
 # ifdef LLAMA_BUILD
```

In `llama_model_loader`, the comment describing the two split modes is condensed:

```diff
@@ -3549,10 +3549,10 @@ struct llama_model_loader {
     char split_prefix[PATH_MAX] = {0};
     char foundation_prefix[PATH_MAX] = { 0 };
-    // Two split mode:
-    // - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf, prefix is abc, postfix is 00001-of-00002, 00002-of-00002
-    // - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf, prefix is abc, postfix is -foundation, -adaptor-task-x, -adaptor-task-y
+    // model-foundation.gguf, model-adaptor-task-x.gguf, model-adaptor-task-y.gguf
     bool foundation_mode = false;
 
     if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
         foundation_mode = true;
     }
```
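
For orientation, the second naming scheme is what `llama_foundation_prefix` detects. The helper below is not the fork's actual implementation, only a plausible sketch of such a filename check with the same argument shape as the call above, assuming the `-foundation` / `-adaptor-*` postfix convention from the README:

```cpp
#include <cstring>

// Hypothetical sketch: for "<prefix>-adaptor-<task>.gguf" or
// "<prefix>-foundation.gguf", copy "<prefix>" into dest and report success.
static bool foundation_prefix_sketch(char * dest, size_t maxlen, const char * fname) {
    const char * marker = strstr(fname, "-adaptor-");
    if (!marker) marker = strstr(fname, "-foundation.gguf");
    if (!marker) return false;

    const size_t prefix_len = (size_t)(marker - fname);
    if (prefix_len + 1 > maxlen) return false; // prefix + terminator must fit

    memcpy(dest, fname, prefix_len);
    dest[prefix_len] = '\0';
    return true;
}
```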

In `llama_build_graph`, the derived-model pointer is now declared after the requested model name is read:

```diff
@@ -12594,8 +12594,8 @@ static struct ggml_cgraph * llama_build_graph(
         const llama_batch & batch,
         bool worst_case) {
     const auto& foundation_model = lctx.model;
-    const llama_model* model_ptr = nullptr;
     const char* model_name = lctx.cparams.derived_model_name.c_str();
+    const llama_model* model_ptr = nullptr;
 
     for (const auto& model : lctx.derived_models) {
         if (model->name == model_name) {
```
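
The hunk cuts off inside the loop. Presumably the lookup resolves the alias set by `llama_ctx_switch_derived_model` and falls back to the foundation weights when no derived model matches; a sketch of that assumed selection logic:

```cpp
// Assumed continuation (not from the diff): resolve the requested name,
// defaulting to the shared foundation model when no derived model matches.
for (const auto & model : lctx.derived_models) {
    if (model->name == model_name) {
        model_ptr = model; // task-specific derived model
        break;
    }
}
if (model_ptr == nullptr) {
    model_ptr = &foundation_model; // shared foundation weights
}
```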