fix
This commit is contained in:
parent
42c297b926
commit
4a98a0f21a
2 changed files with 28 additions and 18 deletions
|
@ -54,9 +54,8 @@ int32_t get_num_physical_cores() {
|
||||||
return num_physical_cores;
|
return num_physical_cores;
|
||||||
}
|
}
|
||||||
#elif defined(_WIN32)
|
#elif defined(_WIN32)
|
||||||
SYSTEM_INFO sysinfo;
|
std::cerr << "WARNING: automatic calibration not supported on Windows. Defaulting to 4 threads.\n" << std::endl;
|
||||||
GetNativeSystemInfo(&sysinfo);
|
return 4;
|
||||||
return static_cast<int32_t>(sysinfo.dwNumberOfProcessors);
|
|
||||||
#endif
|
#endif
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -237,13 +236,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
|
|
||||||
// Clip if not a valid number of threads
|
// Clip if not a valid number of threads
|
||||||
if (params.n_threads <= 0) {
|
if (params.n_threads <= 0) {
|
||||||
|
std::cerr << "\nWARNING: Using number of physical cores as the default number of threads.\n\
|
||||||
|
If your chipset has efficient/performance cores, use the number of performance cores instead.\n" << std::endl;
|
||||||
int32_t physical_cores = get_num_physical_cores();
|
int32_t physical_cores = get_num_physical_cores();
|
||||||
if (physical_cores > 4) {
|
params.n_threads = std::max(1, physical_cores);
|
||||||
std::cerr << "\nWARNING: Defaulting to 4 threads. "
|
|
||||||
<< "(detected " << physical_cores << " physical cores)" << std::endl
|
|
||||||
<< "Adjust --threads based on your observed inference speed in ms/token." << std::endl << std::endl;
|
|
||||||
}
|
|
||||||
params.n_threads = std::max(1, std::min(4, physical_cores));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -4,14 +4,28 @@ This example program allows you to use various LLaMA language models in an easy
|
||||||
|
|
||||||
## Table of Contents
|
## Table of Contents
|
||||||
|
|
||||||
1. [Quick Start](#quick-start)
|
- [llama.cpp/example/main](#llamacppexamplemain)
|
||||||
2. [Common Options](#common-options)
|
- [Table of Contents](#table-of-contents)
|
||||||
3. [Input Prompts](#input-prompts)
|
- [Quick Start](#quick-start)
|
||||||
4. [Interaction](#interaction)
|
- [Common Options](#common-options)
|
||||||
5. [Context Management](#context-management)
|
- [Input Prompts](#input-prompts)
|
||||||
6. [Generation Flags](#generation-flags)
|
- [Interaction](#interaction)
|
||||||
7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
|
- [Interaction Options](#interaction-options)
|
||||||
8. [Additional Options](#additional-options)
|
- [Reverse Prompts](#reverse-prompts)
|
||||||
|
- [In-Prefix](#in-prefix)
|
||||||
|
- [Instruction Mode](#instruction-mode)
|
||||||
|
- [Context Management](#context-management)
|
||||||
|
- [Context Size](#context-size)
|
||||||
|
- [Keep Prompt](#keep-prompt)
|
||||||
|
- [Generation Flags](#generation-flags)
|
||||||
|
- [Number of Tokens to Predict](#number-of-tokens-to-predict)
|
||||||
|
- [RNG Seed](#rng-seed)
|
||||||
|
- [Temperature](#temperature)
|
||||||
|
- [Repeat Penalty](#repeat-penalty)
|
||||||
|
- [Top-K Sampling](#top-k-sampling)
|
||||||
|
- [Top-P Sampling](#top-p-sampling)
|
||||||
|
- [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
|
||||||
|
- [Additional Options](#additional-options)
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
|
@ -170,7 +184,7 @@ By adjusting these options, you can control the diversity, quality, and creativi
|
||||||
|
|
||||||
These options help improve the performance and memory usage of the LLaMA models:
|
These options help improve the performance and memory usage of the LLaMA models:
|
||||||
|
|
||||||
- `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of CPU cores.
|
- `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of physical CPU cores, or the number of performance cores in a chipset with efficiency/performance (E/P) cores.
|
||||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when mmapped. This can improve performance.
|
- `--mlock`: Lock the model in memory, preventing it from being swapped out when mmapped. This can improve performance.
|
||||||
- `--no-mmap`: Do not memory-map the model. This results in a slower load time but may reduce pageouts if you're not using `mlock`.
|
- `--no-mmap`: Do not memory-map the model. This results in a slower load time but may reduce pageouts if you're not using `mlock`.
|
||||||
- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of memory.
|
- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of memory.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue