Only show -ngl option when relevant + add warning for --memory-f32 option
This commit is contained in:
parent
a6704643b6
commit
764a21ce0f
3 changed files with 13 additions and 2 deletions
|
@ -278,12 +278,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
params.use_color = true;
|
||||
} else if (arg == "--mlock") {
|
||||
params.use_mlock = true;
|
||||
#ifdef LLAMA_CAN_GPU_OFFLOAD
|
||||
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#endif
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--mtest") {
|
||||
|
@ -410,7 +412,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value\n");
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||
|
@ -421,8 +424,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
if (llama_mmap_supported()) {
|
||||
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
|
||||
fprintf(stderr, " number of layers to store in VRAM\n");
|
||||
#endif
|
||||
fprintf(stderr, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
|
|
|
@ -264,7 +264,7 @@ These options help improve the performance and memory usage of the LLaMA models.
|
|||
|
||||
### Memory Float 32
|
||||
|
||||
- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
|
||||
- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
|
||||
|
||||
### Batch Size
|
||||
|
||||
|
@ -285,5 +285,6 @@ These options provide extra functionality and customization when running the LLa
|
|||
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
|
||||
- `--verbose-prompt`: Print the prompt before generating text.
|
||||
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
|
||||
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently clBLAST or cuBLAST), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
|
5
llama.h
5
llama.h
|
@ -31,6 +31,11 @@
|
|||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 1
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue