add gpu index opts and update doc commands (#2)
This commit is contained in:
parent
fe3bc49e81
commit
66a1bb4602
7 changed files with 41 additions and 39 deletions
35
README.md
35
README.md
|
@ -50,7 +50,6 @@ The SparseLLM Team is currently converting the Mistral-7B model to a sparser ver
|
||||||
|
|
||||||
- [Installation](##setup--installation)
|
- [Installation](##setup--installation)
|
||||||
- [Model Weights](##model-weights)
|
- [Model Weights](##model-weights)
|
||||||
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
|
|
||||||
|
|
||||||
## Setup & Installation
|
## Setup & Installation
|
||||||
### Get the Code
|
### Get the Code
|
||||||
|
@ -60,27 +59,23 @@ git clone https://github.com/hodlen/PowerInfer
|
||||||
cd PowerInfer
|
cd PowerInfer
|
||||||
```
|
```
|
||||||
### Build
|
### Build
|
||||||
In order to build PowerInfer you have two different options.
|
In order to build PowerInfer you have two different options. These commands are supposed to be run from the root directory of the project.
|
||||||
|
|
||||||
- Using `make`:
|
Using `make` on Linux or MacOS:
|
||||||
- On Linux or MacOS:
|
|
||||||
```bash
|
```bash
|
||||||
make
|
make
|
||||||
```
|
```
|
||||||
- Using `CMake`:
|
|
||||||
- If you have one GPU:
|
Using `CMake`:
|
||||||
|
* If you have one GPU:
|
||||||
```bash
|
```bash
|
||||||
mkdir build
|
cmake -S . -B build -DLLAMA_CUBLAS=ON
|
||||||
cd build
|
cmake --build build --config Release
|
||||||
cmake .. -DLLAMA_CUBLAS=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
```
|
||||||
- If you just CPU:
|
* If you just have a CPU:
|
||||||
```bash
|
```bash
|
||||||
mkdir build
|
cmake -S . -B build
|
||||||
cd build
|
cmake --build build --config Release
|
||||||
cmake ..
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Model Weights
|
## Model Weights
|
||||||
|
@ -96,11 +91,19 @@ In order to build PowerInfer you have two different options.
|
||||||
```bash
|
```bash
|
||||||
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
|
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
|
||||||
```
|
```
|
||||||
- If you have CPU with one consumer grade GPU:
|
- If you have CPU with one GPU:
|
||||||
```bash
|
```bash
|
||||||
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
|
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
As for now, it requires an offline-generated "GPU index" file to split FFNs on GPU. If you want to try it, please use the following instruction to generate the GPU index file:
|
||||||
|
```bash
|
||||||
|
python scripts/export-gpu-split.py $(activation_count_path) $(output_idx_path) solver
|
||||||
|
```
|
||||||
|
Then, you can use the following instruction to run PowerInfer with GPU index:
|
||||||
|
```bash
|
||||||
|
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt) --gpu-index $(split_path)
|
||||||
|
```
|
||||||
|
|
||||||
## Evaluation
|
## Evaluation
|
||||||
|
|
||||||
|
|
|
@ -471,12 +471,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.lora_base = argv[i];
|
params.lora_base = argv[i];
|
||||||
} else if (arg == "--mlp-adapter") {
|
} else if (arg == "--gpu-index") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.mlp_adapter = argv[i];
|
params.gpu_index = argv[i];
|
||||||
} else if (arg == "--mmproj") {
|
} else if (arg == "--mmproj") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -970,9 +970,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||||
|
|
||||||
if (llama_use_sparse_inference(model)) {
|
if (llama_use_sparse_inference(model)) {
|
||||||
fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
|
fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
|
||||||
if (!params.mlp_adapter.empty()) {
|
if (!params.gpu_index.empty()) {
|
||||||
fprintf(stderr, "%s: warning: --mlp-adapter is deprecated and has no effect\n", __func__);
|
int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
|
||||||
int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
|
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
|
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
@ -1358,7 +1357,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
|
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
|
||||||
}
|
}
|
||||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||||
fprintf(stream, "mlp_adapter: %s\n", params.mlp_adapter.c_str());
|
fprintf(stream, "gpu_index: %s\n", params.gpu_index.c_str());
|
||||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
||||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||||
|
|
|
@ -91,7 +91,7 @@ struct gpt_params {
|
||||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||||
std::string lora_base = ""; // base model path for the lora adapter
|
std::string lora_base = ""; // base model path for the lora adapter
|
||||||
|
|
||||||
std::string mlp_adapter = ""; // sparse activation mlp adapter path
|
std::string gpu_index = ""; // sparse activation mlp adapter path
|
||||||
|
|
||||||
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||||
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||||
|
|
|
@ -49,11 +49,11 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argc >= 8) {
|
if (argc >= 8) {
|
||||||
params.mlp_adapter = argv[7];
|
params.gpu_index = argv[7];
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, mlp_adapter = %s\n",
|
printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, gpu_index = %s\n",
|
||||||
params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.mlp_adapter.c_str());
|
params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.gpu_index.c_str());
|
||||||
|
|
||||||
if (params.prompt.empty()) {
|
if (params.prompt.empty()) {
|
||||||
params.prompt = "Hello my name is";
|
params.prompt = "Hello my name is";
|
||||||
|
@ -76,8 +76,8 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.mlp_adapter.empty()) {
|
if (!params.gpu_index.empty()) {
|
||||||
int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
|
int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
|
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
|
@ -9660,7 +9660,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int llama_model_apply_mlp_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
|
int llama_model_apply_gpu_idx_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
|
||||||
llama_mlp_model_loader * mlp_ml = new llama_mlp_model_loader(path_mlp, use_mmap);
|
llama_mlp_model_loader * mlp_ml = new llama_mlp_model_loader(path_mlp, use_mmap);
|
||||||
if (mlp_ml -> apply_tensors_to_base_model(model) > 0) {
|
if (mlp_ml -> apply_tensors_to_base_model(model) > 0) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to apply mlp adapter\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to apply mlp adapter\n", __func__);
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -342,7 +342,7 @@ extern "C" {
|
||||||
const char * path_base_model,
|
const char * path_base_model,
|
||||||
int n_threads);
|
int n_threads);
|
||||||
|
|
||||||
LLAMA_API int llama_model_apply_mlp_from_file(
|
LLAMA_API int llama_model_apply_gpu_idx_from_file(
|
||||||
struct llama_model * model,
|
struct llama_model * model,
|
||||||
const char * path_mlp,
|
const char * path_mlp,
|
||||||
bool use_mmap);
|
bool use_mmap);
|
||||||
|
|
|
@ -134,7 +134,7 @@ if __name__ == "__main__":
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"output_path",
|
"output_path",
|
||||||
help="path to the output GGML adapter",
|
help="path to the output GGML adapter",
|
||||||
default="./ggml-mlp-adapters.bin",
|
default="./gpu-index.bin",
|
||||||
)
|
)
|
||||||
parser.add_argument("solver", help="path to the solver")
|
parser.add_argument("solver", help="path to the solver")
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue