fix

2023-04-26 22:37:52 +08:00 · 2023-04-26 22:37:52 +08:00 · 4a98a0f21a
commit 4a98a0f21a
parent 42c297b926
2 changed files with 28 additions and 18 deletions
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -54,9 +54,8 @@ int32_t get_num_physical_cores() {
        return num_physical_cores;
    }
 #elif defined(_WIN32)
-    SYSTEM_INFO sysinfo;
-    GetNativeSystemInfo(&sysinfo);
-    return static_cast<int32_t>(sysinfo.dwNumberOfProcessors);
+    std::cerr << "WARNING: automatic calibration not supported on Windows. Defaulting to 4 threads.\n" << std::endl;
+    return 4;
 #endif
    return -1;
 }
@@ -237,13 +236,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

    // Clip if not a valid number of threads
    if (params.n_threads <= 0) {
+        std::cerr << "\nWARNING: Using number of physical cores as the default number of threads.\n\
+If your chipset has efficient/performance cores, use the number of performance cores instead.\n" << std::endl;
        int32_t physical_cores = get_num_physical_cores();
-        if (physical_cores > 4) {
-            std::cerr << "\nWARNING: Defaulting to 4 threads. "
-                << "(detected " << physical_cores << " physical cores)" << std::endl
-                << "Adjust --threads based on your observed inference speed in ms/token." << std::endl << std::endl;
-        }
-        params.n_threads = std::max(1, std::min(4, physical_cores));
+        params.n_threads = std::max(1, physical_cores);
    }

    return true;
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -4,14 +4,28 @@ This example program allows you to use various LLaMA language models in an easy

 ## Table of Contents

-1. [Quick Start](#quick-start)
-2. [Common Options](#common-options)
-3. [Input Prompts](#input-prompts)
-4. [Interaction](#interaction)
-5. [Context Management](#context-management)
-6. [Generation Flags](#generation-flags)
-7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
-8. [Additional Options](#additional-options)
+- [llama.cpp/example/main](#llamacppexamplemain)
+  - [Table of Contents](#table-of-contents)
+  - [Quick Start](#quick-start)
+  - [Common Options](#common-options)
+  - [Input Prompts](#input-prompts)
+  - [Interaction](#interaction)
+    - [Interaction Options](#interaction-options)
+    - [Reverse Prompts](#reverse-prompts)
+    - [In-Prefix](#in-prefix)
+    - [Instruction Mode](#instruction-mode)
+  - [Context Management](#context-management)
+    - [Context Size](#context-size)
+    - [Keep Prompt](#keep-prompt)
+  - [Generation Flags](#generation-flags)
+    - [Number of Tokens to Predict](#number-of-tokens-to-predict)
+    - [RNG Seed](#rng-seed)
+    - [Temperature](#temperature)
+    - [Repeat Penalty](#repeat-penalty)
+    - [Top-K Sampling](#top-k-sampling)
+    - [Top-P Sampling](#top-p-sampling)
+  - [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
+  - [Additional Options](#additional-options)

 ## Quick Start

@@ -170,7 +184,7 @@ By adjusting these options, you can control the diversity, quality, and creativi

 These options help improve the performance and memory usage of the LLaMA models:

-   `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of CPU cores.
+-   `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of physical CPU cores, or the number of performance cores in a chipset with efficiency/performance (E/P) cores.
 -   `--mlock`: Lock the model in memory, preventing it from being swapped out when mmaped. This can improve performance.
 -   `--no-mmap`: Do not memory-map the model. This results in a slower load time but may reduce pageouts if you're not using `mlock`.
 -   `--memory_f32`: Use 32 bit floats instead of 16 bit floats for memory key+value, allowing higher quality inference at the cost of memory.