From 79eac2727ab6e4e470cc84ad8cbbeb509bddca1b Mon Sep 17 00:00:00 2001
From: savesanketsw
Date: Tue, 21 Jan 2025 01:10:24 -0800
Subject: [PATCH] cpu_pnp_strategy changes

---
 common/arg.cpp               |  20 ++++
 common/common.cpp            | 126 ++++++++++++++++++++++++++++-------
 common/common.h              |  12 ++++
 ggml/include/ggml.h          |   5 ++
 ggml/src/ggml-cpu/ggml-cpu.c |  28 +++++++-
 5 files changed, 166 insertions(+), 25 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 126970950..9eda648f2 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -564,6 +564,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cpuparams.priority = (enum ggml_sched_priority) prio;
         }
     ));
+    add_opt(common_arg(
+        {"-CPnP", "--cpu-pnp-strategy"}, "N",
+        string_format("set CPU PnP strategy : 0-disabled, 1-efficiency (default: %d)\n", params.cpuparams.cpu_pnp_strategy),
+        [](common_params & params, int strategy) {
+            if (strategy < 0 || strategy > 1) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.cpuparams.cpu_pnp_strategy = (enum ggml_cpu_pnp_strategy) strategy;
+        }
+    ));
+    add_opt(common_arg(
+        {"-CPnPb", "--cpu-pnp-strategy-batch"}, "N",
+        string_format("set CPU PnP strategy for batch processing : 0-disabled, 1-efficiency (default: %d)\n", params.cpuparams_batch.cpu_pnp_strategy),
+        [](common_params & params, int strategy) {
+            if (strategy < 0 || strategy > 1) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.cpuparams_batch.cpu_pnp_strategy = (enum ggml_cpu_pnp_strategy) strategy;
+        }
+    ));
     add_opt(common_arg(
         {"--poll"}, "<0...100>",
         string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
diff --git a/common/common.cpp b/common/common.cpp
index 451826d5d..0ff0b243a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -97,6 +97,77 @@ using json = nlohmann::ordered_json;
 // CPU utils
 //
 
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+
+// Print CPU Information
+void print_cpu_info(const cpu_info& info) {
+    LOG_INF("CPU Information:\n");
+    LOG_INF("----------------\n");
+    LOG_INF("Is Hybrid Architecture: %s\n", info.is_hybrid ? "Yes" : "No");
"Yes" : "No"); + LOG_INF("Number of Logical Cores: %d\n", info.num_logical_cores); + LOG_INF("Number of Physical Cores: %d\n", info.num_physical_cores); + LOG_INF("Number of Performance Cores (P-Cores): %d\n", info.num_p_cores); + LOG_INF("Number of Efficient Cores (E-Cores): %d\n", info.num_e_cores); + LOG_INF("\nE-Core Affinity Mask:\n"); + LOG_INF("%s\n", info.e_core_affinity_mask.to_string().c_str()); + LOG_INF("\nP-Core Affinity Mask:\n"); + LOG_INF("%s\n", info.p_core_affinity_mask.to_string().c_str()); +} + +// Populate CPU Information +int get_cpu_info(cpu_info& c_info) { + DWORD buffer_size = 0; + + if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) { + if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return 0; + } + } + + std::vector buffer(buffer_size); + if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast(buffer.data()), &buffer_size)) { + return 0; + } + + c_info.num_physical_cores = 0; + c_info.num_logical_cores = 0; + c_info.num_e_cores = 0; + c_info.num_p_cores = 0; + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast(buffer.data()); + while (buffer_size > 0) { + if (info->Relationship == RelationProcessorCore) { + c_info.num_physical_cores++; + for (int i = 0; i < info->Processor.GroupCount; ++i) { + GROUP_AFFINITY *groupAffinity = &info->Processor.GroupMask[i]; + WORD groupNumber = groupAffinity->Group; + KAFFINITY mask = groupAffinity->Mask; + int baseIndex = groupNumber * 64; + c_info.num_logical_cores += __popcnt64(mask); + if (info->Processor.EfficiencyClass < 1) { + c_info.e_core_affinity_mask |= (std::bitset(mask) << baseIndex); + c_info.num_e_cores += __popcnt64(mask); + } else { + c_info.p_core_affinity_mask |= (std::bitset(mask) << baseIndex); + c_info.num_p_cores += __popcnt64(mask); + } + } + } + + buffer_size -= info->Size; + info = reinterpret_cast(reinterpret_cast(info) + info->Size); + } + + if (c_info.num_p_cores > 0 && c_info.num_e_cores > 0) + c_info.is_hybrid = true; + + return 1; +} + +#endif + + + int32_t cpu_get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -131,29 +202,12 @@ int32_t cpu_get_num_physical_cores() { unsigned int n_threads_win = std::thread::hardware_concurrency(); unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4; - DWORD buffer_size = 0; - if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) { - if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) { - return default_threads; - } - } - - std::vector buffer(buffer_size); - if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast(buffer.data()), &buffer_size)) { + cpu_info info; + if(!get_cpu_info(info)) return default_threads; - } + else + return info.num_physical_cores > 0 ? info.num_physical_cores : default_threads; - int32_t num_physical_cores = 0; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast(buffer.data()); - while (buffer_size > 0) { - if (info->Relationship == RelationProcessorCore) { - num_physical_cores += info->Processor.GroupCount; - } - buffer_size -= info->Size; - info = reinterpret_cast(reinterpret_cast(info) + info->Size); - } - - return num_physical_cores > 0 ? num_physical_cores : default_threads; #endif unsigned int n_threads = std::thread::hardware_concurrency(); return n_threads > 0 ? (n_threads <= 4 ? 
 }
 
@@ -291,12 +345,36 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
 void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
     int32_t n_set = 0;
 
-    if (cpuparams.n_threads < 0) {
+    LOG_INF("n_threads: %d, cpu pnp strategy: %d\n", cpuparams.n_threads, cpuparams.cpu_pnp_strategy);
+
+    if (cpuparams.n_threads < 0 || cpuparams.cpu_pnp_strategy > 0) {
         // Assuming everything about cpuparams is invalid
-        if (role_model != nullptr) {
+        if (role_model != nullptr && cpuparams.cpu_pnp_strategy == 0) {
             cpuparams = *role_model;
         } else {
-            cpuparams.n_threads = cpu_get_num_math();
+            if (cpuparams.n_threads < 0)
+                cpuparams.n_threads = cpu_get_num_math();
+
+            #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+            if (cpuparams.cpu_pnp_strategy == GGML_CPU_PNP_STRATEGY_EFFICIENCY) {
+                cpu_info info;
+                if (get_cpu_info(info)) {
+                    print_cpu_info(info);
+                    if (info.is_hybrid) {
+                        LOG_INF("hybrid platform detected: applying strategy\n");
+                        if (cpuparams.n_threads > info.num_e_cores) {
+                            LOG_INF("overriding num threads: %d to num efficient cores: %d\n", cpuparams.n_threads, info.num_e_cores);
+                            cpuparams.n_threads = info.num_e_cores;
+                        }
+                        for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+                            cpuparams.cpumask[i] = info.e_core_affinity_mask[i];
+                        }
+                        cpuparams.mask_valid = true;
+                    }
+                }
+            }
+            #endif
         }
     }
diff --git a/common/common.h b/common/common.h
index b2709c044..eb9daaa3c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -7,6 +7,7 @@
 #include <string>
 #include <vector>
 #include <sstream>
+#include <bitset>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -45,11 +46,22 @@ struct common_control_vector_load_info;
 // CPU utils
 //
 
+struct cpu_info {
+    bool is_hybrid = false;
+    int  num_logical_cores  = 0;
+    int  num_physical_cores = 0;
+    int  num_p_cores = 0;
+    int  num_e_cores = 0;
+    std::bitset<GGML_MAX_N_THREADS> e_core_affinity_mask;
+    std::bitset<GGML_MAX_N_THREADS> p_core_affinity_mask;
+};
+
 struct cpu_params {
     int      n_threads = -1;
     bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
     bool     mask_valid = false; // Default: any CPU
     enum ggml_sched_priority  priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    enum ggml_cpu_pnp_strategy cpu_pnp_strategy = GGML_CPU_PNP_STRATEGY_DISABLED; // CPU power and performance strategy
     bool     strict_cpu = false; // Use strict CPU placement
     uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1198dc1fd..1bd35c76a 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2169,6 +2169,11 @@ extern "C" {
         GGML_SCHED_PRIO_REALTIME
     };
 
+    enum ggml_cpu_pnp_strategy {
+        GGML_CPU_PNP_STRATEGY_DISABLED,
+        GGML_CPU_PNP_STRATEGY_EFFICIENCY
+    };
+
     // threadpool params
     // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
     struct ggml_threadpool_params {
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 0ed92b3ff..1e7e501ce 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1318,10 +1318,11 @@ struct ggml_threadpool {
 struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
     int  last_graph;
     bool pending;
 #endif
+    bool cpumask[GGML_MAX_N_THREADS];
+    bool mask_valid;
     struct ggml_threadpool * threadpool;
     int ith;
 };
@@ -14044,10 +14045,20 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
     struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
 
+    // Check if cpu mask is valid
+    bool cpumask_valid = false;
+    for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (tpp->cpumask[i]) {
+            cpumask_valid = true;
+            break;
+        }
+    }
+
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {
         workers[j].threadpool = threadpool;
         workers[j].ith        = j;
+        workers[j].mask_valid = cpumask_valid; // set mask_valid so the worker threads apply the affinity mask
     }
 
     threadpool->workers = workers;
@@ -14079,6 +14090,12 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     }
 #endif // GGML_USE_OPENMP
 
+    int32_t cpumask_iter = 0;
+    for (int j = 1; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+    ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
+
     return threadpool;
 }
 
@@ -14125,10 +14142,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
+            // If the mask is valid for this worker thread, apply affinity
+            if (threadpool->workers[omp_get_thread_num()].mask_valid)
+                ggml_thread_apply_affinity(threadpool->workers[omp_get_thread_num()].cpumask);
+
             ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+
+        // If the mask is valid for the main thread, apply affinity
+        if (threadpool->workers[0].mask_valid)
+            ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
+
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else