add gpu index opts and update doc commands (#2)

Holden X 2023-12-16 00:42:08 +08:00 committed by GitHub
parent fe3bc49e81
commit 66a1bb4602
7 changed files with 41 additions and 39 deletions


@@ -50,7 +50,6 @@ The SparseLLM Team is currently converting the Mistral-7B model to a sparser ver
- [Installation](#setup--installation)
- [Model Weights](#model-weights)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
## Setup & Installation
### Get the Code
@@ -60,28 +59,24 @@ git clone https://github.com/hodlen/PowerInfer
cd PowerInfer
```
### Build
In order to build PowerInfer you have two different options.
In order to build PowerInfer you have two different options. These commands should be run from the root directory of the project.
- Using `make`:
- On Linux or MacOS:
```bash
make
```
- Using `CMake`:
- If you have one GPU:
```bash
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release
```
- If you have just a CPU:
```bash
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
Using `make` on Linux or MacOS:
```bash
make
```
Using `CMake`:
* If you have one GPU:
```bash
cmake -S . -B build -DLLAMA_CUBLAS=ON
cmake --build build --config Release
```
* If you have just a CPU:
```bash
cmake -S . -B build
cmake --build build --config Release
```
## Model Weights
@@ -96,11 +91,19 @@ In order to build PowerInfer you have two different options.
```bash
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
```
- If you have a CPU with one consumer-grade GPU:
- If you have a CPU with one GPU:
```bash
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
```
For now, it requires an offline-generated "GPU index" file to split FFNs onto the GPU. If you want to try it, use the following command to generate the GPU index file:
```bash
python scripts/export-gpu-split.py $(activation_count_path) $(output_idx_path) solver
```
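For illustration, here is a filled-in version of that command with hypothetical paths; the activation-count directory, the output location, and the solver path are placeholders you would replace with your own:
```bash
# Hypothetical paths for illustration only; substitute your own.
python scripts/export-gpu-split.py ./activation-counts ./gpu-index.bin ./solver
```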
Then, you can use the following command to run PowerInfer with the GPU index:
```bash
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt) --gpu-index $(split_path)
```
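As a filled-in (hypothetical) example, with placeholder model and index paths and example generation settings:
```bash
# Hypothetical paths and values for illustration only.
./build/bin/main -m ./models/model.powerinfer.gguf -n 128 -t 8 -p "Once upon a time" --gpu-index ./gpu-index.bin
```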
## Evaluation


@@ -471,12 +471,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.lora_base = argv[i];
} else if (arg == "--mlp-adapter") {
} else if (arg == "--gpu-index") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mlp_adapter = argv[i];
params.gpu_index = argv[i];
} else if (arg == "--mmproj") {
if (++i >= argc) {
invalid_param = true;
@@ -970,9 +970,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (llama_use_sparse_inference(model)) {
fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
if (!params.mlp_adapter.empty()) {
fprintf(stderr, "%s: warning: --mlp-adapter is deprecated and has no effect\n", __func__);
int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
if (!params.gpu_index.empty()) {
int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
llama_free_model(model);
@@ -1358,7 +1357,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "mlp_adapter: %s\n", params.mlp_adapter.c_str());
fprintf(stream, "gpu_index: %s\n", params.gpu_index.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);


@@ -91,7 +91,7 @@ struct gpt_params {
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
std::string mlp_adapter = ""; // sparse activation mlp adapter path
std::string gpu_index = ""; // sparse activation GPU index path
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line


@@ -49,11 +49,11 @@ int main(int argc, char ** argv) {
}
if (argc >= 8) {
params.mlp_adapter = argv[7];
params.gpu_index = argv[7];
}
printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, mlp_adapter = %s\n",
params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.mlp_adapter.c_str());
printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, gpu_index = %s\n",
params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.gpu_index.c_str());
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
@@ -76,8 +76,8 @@ int main(int argc, char ** argv) {
return 1;
}
if (!params.mlp_adapter.empty()) {
int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
if (!params.gpu_index.empty()) {
int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
llama_free_model(model);


@@ -9660,7 +9660,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
}
}
int llama_model_apply_mlp_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
int llama_model_apply_gpu_idx_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
llama_mlp_model_loader * mlp_ml = new llama_mlp_model_loader(path_mlp, use_mmap);
if (mlp_ml -> apply_tensors_to_base_model(model) > 0) {
LLAMA_LOG_ERROR("%s: failed to apply mlp adapter\n", __func__);


@@ -342,7 +342,7 @@ extern "C" {
const char * path_base_model,
int n_threads);
LLAMA_API int llama_model_apply_mlp_from_file(
LLAMA_API int llama_model_apply_gpu_idx_from_file(
struct llama_model * model,
const char * path_mlp,
bool use_mmap);


@@ -134,7 +134,7 @@ if __name__ == "__main__":
parser.add_argument(
"output_path",
help="path to the output GGML adapter",
default="./ggml-mlp-adapters.bin",
default="./gpu-index.bin",
)
parser.add_argument("solver", help="path to the solver")