CpuSet support for Windows

2024-04-22 20:08:51 +02:00 · 2024-04-22 20:08:51 +02:00 · b188c9c983
commit b188c9c983
parent 3b8f1ec4b1
3 changed files with 550 additions and 8 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -32,8 +32,13 @@
 #endif
 #include <locale>
 #include <windows.h>
 #include <stdio.h>
 #include <wincred.h>
 #include <fcntl.h>
 #include <io.h>
 #include <bitset>
 #include <tlhelp32.h>
 #include <tchar.h>
 #else
 #include <sys/ioctl.h>
 #include <sys/stat.h>
@ -72,8 +77,96 @@
 using json = nlohmann::ordered_json;
 #if defined(_WIN32)
 std::vector<CPU_SET_INFORMATION> cpuset;
 std::vector<CPU_SET_INFORMATION> cpuset_best;
 std::vector<CPU_SET_INFORMATION> cpuset_worst;
 bool cpuset_enable = false;
 bool cpuset_smt = false;
 int32_t numPhysicalCores = -1;
 int32_t PhysicalCores = std::thread::hardware_concurrency();
 //
 // CPUSET logging
 //
 #define CPUSET_DEBUG 1
 #if (CPUSET_DEBUG >= 1)
 #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__)
 #else
 #define CPUSET_PRINT_DEBUG(...)
 #endif
 int32_t get_pos_procMask(ULONG_PTR procMask) {
            std::bitset<64> bMask = procMask;
            int32_t thisPos = 0;
            for (int32_t i = 0; i < 64; ++i) {
                if (bMask[i] == 1) {
                    return i;
                    break;
                }
            }
            return thisPos;
 }
 int32_t get_count_procMask(ULONG_PTR procMask) {
            std::bitset<64> bMask = procMask;
            return bMask.count();
 }
 bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
        return lhs.SchedulingClass > rhs.SchedulingClass;
 }
 bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
        return lhs.SchedulingClass < rhs.SchedulingClass;
 }
 ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) {
    std::bitset<64> bMask;
    std::vector<CPU_SET_INFORMATION> _cpuset;
    int32_t bVal = 0;
    int32_t assigned_t = 0;
    int32_t llcache = -1;
    if (direction == BEST_CORES) {
        _cpuset = cpuset_best;
    } else {
        _cpuset = cpuset_worst;
    }
    CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
    for (auto index : _cpuset) {
        bVal = 0;
        if (index.LogicalProcessorIndex != 0 &&
            ((cpuset_smt && index.Threads > 1) || !cpuset_smt) &&
            index.EfficiencyClass == 0 &&
            ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1)
            ) {
            if (lltraversal == 0) {
                CPUSET_PRINT_DEBUG("cache for lltraversal %d pre llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
                llcache = index.LastLevelCacheIndex;
                CPUSET_PRINT_DEBUG("cache for lltraversal %d pos llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
            } 
            bVal = 1;
            assigned_t++;
            CPUSET_PRINT_DEBUG("Assigned LogicalCoreIndex: %d lltraversal %d llcache %d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
        }
        bMask[index.LogicalProcessorIndex] = bVal;
        CPUSET_PRINT_DEBUG("Index: %d b:%d smt=%d thrds=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads);
        if (req_threads > 0) {
            if (assigned_t >= req_threads) {
                break;
            }
        }
    }
    return bMask.to_ullong();
 }
 #endif
 int32_t get_num_physical_cores() {
-#ifdef __linux__
+#ifdef __linux__ // __x86_64__ && __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@ -90,7 +183,7 @@ int32_t get_num_physical_cores() {
    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
-#elif defined(__APPLE__) && defined(__MACH__)
+#elif defined(__APPLE__) && defined(__MACH__) // __APPLE__ && __MACH__
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
@ -101,12 +194,148 @@ int32_t get_num_physical_cores() {
    if (result == 0) {
        return num_physical_cores;
    }
-#elif defined(_WIN32)
+#elif defined(_WIN32) // _WIN32
-    //TODO: Implement
+    if (numPhysicalCores > 0) {
        return numPhysicalCores;    
    }
    unsigned int d_threads = std::thread::hardware_concurrency();
    HMODULE h = GetModuleHandleW(L"kernel32.dll");
    if (NULL != h) {
        if (NULL != GetProcAddress(h, "GetSystemCpuSetInformation")){
            CPUSET_PRINT_DEBUG("Windows SystemCpuSetInformation is available\n");
            cpuset_enable = true;
        }
    }
    numPhysicalCores = d_threads > 0 ? (d_threads <= 4 ? d_threads : d_threads / 2) : 4;
    if (d_threads < 4 || d_threads > 64 || !cpuset_enable) {
        return numPhysicalCores;
    }
    ULONG bufferSize;
    ULONG bufferSizeLogical;
    HANDLE curProc = GetCurrentProcess();
    GetSystemCpuSetInformation(nullptr, 0, &bufferSize, curProc, 0);
    GetLogicalProcessorInformation(nullptr, &bufferSizeLogical);
    auto buffer = std::make_unique<uint8_t[]>(bufferSize);
    auto bufferLogical = std::make_unique<uint8_t[]>(bufferSizeLogical);
    if(!GetSystemCpuSetInformation(reinterpret_cast<PSYSTEM_CPU_SET_INFORMATION>(buffer.get()), bufferSize, &bufferSize, curProc, 0))
    {
        CPUSET_PRINT_DEBUG("Failure GetSystemCpuSetInformation, fallback\n");
        cpuset_enable = false;
        return numPhysicalCores;
    }
    uint8_t* cpuSetPtr = buffer.get();
    GetLogicalProcessorInformation(reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION>(bufferLogical.get()), &bufferSizeLogical);
    uint8_t* logicalPtr = bufferLogical.get();
    uint32_t numLogicalCores = 0;
    CPUSET_PRINT_DEBUG("\nCPUSET GetSystemCpuSetInformation:\n");
    for (ULONG cpuSetSize = 0; cpuSetSize < bufferSize; )
    {
        auto nextCPUSet = reinterpret_cast<PSYSTEM_CPU_SET_INFORMATION>(cpuSetPtr);
        if (nextCPUSet->Type == CPU_SET_INFORMATION_TYPE::CpuSetInformation)
        {
                CPU_SET_INFORMATION _cpuset;
                _cpuset.LogicalProcessorIndex = nextCPUSet->CpuSet.LogicalProcessorIndex;
                _cpuset.CoreIndex = nextCPUSet->CpuSet.CoreIndex;
                _cpuset.Id = nextCPUSet->CpuSet.Id;
                _cpuset.Group = nextCPUSet->CpuSet.Group;
                _cpuset.LastLevelCacheIndex = nextCPUSet->CpuSet.LastLevelCacheIndex;
                _cpuset.NumaNodeIndex = nextCPUSet->CpuSet.NumaNodeIndex;
                _cpuset.EfficiencyClass = nextCPUSet->CpuSet.EfficiencyClass;
                _cpuset.SchedulingClass = nextCPUSet->CpuSet.SchedulingClass;
                cpuset.push_back(_cpuset);
                numLogicalCores++;
        }
        // Should not happen but it's a fail safe
        if (numLogicalCores > d_threads) continue;
        cpuSetPtr += nextCPUSet->Size;
        cpuSetSize += nextCPUSet->Size;
    }    
    int32_t physicalCount = 0;
    int32_t thisLogical = 0;
    int32_t coreThreadsNum = 1;
    for (ULONG logicalSize = 0; logicalSize < bufferSizeLogical; )
    {
        auto nextLogical = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION>(logicalPtr);
        if (nextLogical->ProcessorCore.Flags == 1 && nextLogical->Cache.Associativity <= 2) {
            switch (nextLogical->Relationship) {
                case LOGICAL_PROCESSOR_RELATIONSHIP::RelationProcessorCore:                    
                    CPUSET_PRINT_DEBUG("Physical Count: %u\n", physicalCount);        
                    CPUSET_PRINT_DEBUG("Cache.Associativity: %d\n", nextLogical->Cache.Associativity);        
                    CPUSET_PRINT_DEBUG("Cache.Level: %d\n", nextLogical->Cache.Level);        
                    CPUSET_PRINT_DEBUG("Cache.Type: %d\n", nextLogical->Cache.Type);        
                    CPUSET_PRINT_DEBUG("Core Flags: %d\n", nextLogical->ProcessorCore.Flags);
                    coreThreadsNum = get_count_procMask(nextLogical->ProcessorMask);
                    CPUSET_PRINT_DEBUG("LogicalCore: %d is Physical with %d [%d]thread(s)\n", get_pos_procMask(nextLogical->ProcessorMask), get_count_procMask(nextLogical->ProcessorMask), coreThreadsNum);
                    if (coreThreadsNum > 1) cpuset_smt = true;
                    cpuset[get_pos_procMask(nextLogical->ProcessorMask)].Threads = coreThreadsNum;
                    for (int32_t thread = 1; thread < coreThreadsNum;) {
                        CPUSET_PRINT_DEBUG("LogicalCore: %u is a thread\n", get_pos_procMask(nextLogical->ProcessorMask)+thread);
                        cpuset[get_pos_procMask(nextLogical->ProcessorMask)+thread].Threads = 1;
                        thread++;
                    }
                    break;
            }
        }
        logicalSize += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); 
        logicalPtr += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
    }   
    cpuset_best = cpuset;
    cpuset_worst = cpuset;
    std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best);
    std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst);
    physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1));
    CPUSET_PRINT_DEBUG("\n\nLPhysicalCount: %d\n\n", physicalCount);
    physicalCount = physicalCount <= 0 ? numLogicalCores : physicalCount;
    CPUSET_PRINT_DEBUG("\n\nLPhysicalCount2: %d\n\n", physicalCount);
    CPUSET_PRINT_DEBUG("\n\nLogical Processors Summary\n\n");
    for (uint32_t _logicalCore = 0; _logicalCore < numLogicalCores;)
    {
            CPUSET_PRINT_DEBUG("\nLogical: %u\n", _logicalCore);
            CPUSET_PRINT_DEBUG("Threads: %u\n", cpuset[_logicalCore].Threads);
            CPUSET_PRINT_DEBUG("Id: %u\n", cpuset[_logicalCore].Id);
            CPUSET_PRINT_DEBUG("Group: %u\n", cpuset[numLogicalCores].Group);
            CPUSET_PRINT_DEBUG("LastLevelCacheIndex: %u\n", cpuset[_logicalCore].LastLevelCacheIndex);
            CPUSET_PRINT_DEBUG("NumaNodeIndex: %u\n", cpuset[_logicalCore].NumaNodeIndex);
            CPUSET_PRINT_DEBUG("LogicalProcessorIndex: %u\n", cpuset[_logicalCore].LogicalProcessorIndex);
            CPUSET_PRINT_DEBUG("EfficiencyClass: %u\n", cpuset[_logicalCore].EfficiencyClass);
            CPUSET_PRINT_DEBUG("SchedulingClass: %u\n", cpuset[_logicalCore].SchedulingClass);
            _logicalCore++;
    }
    CPUSET_PRINT_DEBUG("\n\n<Grand total> \n\n");    
    CPUSET_PRINT_DEBUG("Total Physical: %u\n", physicalCount);    
    CPUSET_PRINT_DEBUG("Total Logical: %u\n", numLogicalCores);    
    return physicalCount;
 #endif
    unsigned int n_threads = std::thread::hardware_concurrency();
-    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
+    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;}
-}
+
 #if defined(__x86_64__) && defined(__linux__)
 #include <pthread.h>
@ -156,7 +385,168 @@ static int count_math_cpus(int cpu_count) {
    return result;
 }
-#endif // __x86_64__ && __linux__
+#elif defined(_WIN32)
 #define STATUS_ACCESS_DENIED ((NTSTATUS)0xC0000022L)
 #define STATUS_SUCCESS		 ((NTSTATUS)0)
 typedef enum _SYSTEM_INFORMATION_CLASS {
 	SystemAllowedCpuSetsInformation = 168,
 	SystemCpuSetInformation = 175,
 	SystemCpuSetTagInformation = 176,
 } SYSTEM_INFORMATION_CLASS;
 typedef enum _PROCESSINFOCLASS {
 	ProcessDefaultCpuSetsInformation = 66,
 	ProcessAllowedCpuSetsInformation = 67,
 } PROCESSINFOCLASS;
 extern "C"
 NTSTATUS
 NTAPI
 NtQuerySystemInformationEx(
 	_In_ SYSTEM_INFORMATION_CLASS SystemInformationClass,
 	_In_reads_bytes_(InputBufferLength) PVOID InputBuffer,
 	_In_ ULONG InputBufferLength,
 	_Out_writes_bytes_opt_(SystemInformationLength) PVOID SystemInformation,
 	_In_ ULONG SystemInformationLength,
 	_Out_opt_ PULONG ReturnLength
 );
 extern "C"
 NTSTATUS
 NTAPI
 NtQueryInformationProcess(
 	_In_ HANDLE ProcessHandle,
 	_In_ PROCESSINFOCLASS ProcessInformationClass,
 	_Out_writes_bytes_opt_(ProcessInformationLength) PVOID ProcessInformation,
 	_In_ ULONG ProcessInformationLength,
 	_Out_opt_ PULONG ReturnLength
 );
 int32_t setCpuAffinity(std::bitset<64> cpuMask) {
    DWORD_PTR processAffinityMask;
    DWORD_PTR systemAffinityMask;
    int32_t coreSelected = get_count_procMask(cpuMask.to_ullong());
    HANDLE hToken = nullptr;
    BOOL bToken = ::OpenProcessToken(::GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken);
    if (!bToken) {
        CPUSET_PRINT_DEBUG("Could not access process main ALL\n");
    }
    HANDLE hProcess = ::OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId());
    if (!hProcess) {
        CPUSET_PRINT_DEBUG("Could not access process for Affinity\n");
    }
    if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
        CPUSET_PRINT_DEBUG("Could not get affinity for Process\n");
    }
    std::bitset<64> processMask = processAffinityMask;
    CPUSET_PRINT_DEBUG("Process Mask:   %s\n", processMask.to_string().c_str());
    std::bitset<64> systemMask = systemAffinityMask;
    CPUSET_PRINT_DEBUG("System Mask:    %s\n", systemMask.to_string().c_str());
    std::bitset<64> reqMask = cpuMask;
    CPUSET_PRINT_DEBUG("Requested Mask: %s\n", reqMask.to_string().c_str());
    // Set process affinity
    if (!SetProcessAffinityMask(hProcess, cpuMask.to_ullong() & systemAffinityMask)) {
        CPUSET_PRINT_DEBUG("Could not set affinity for Process\n");
    } else {
        coreSelected = get_count_procMask(cpuMask.to_ullong() & systemAffinityMask);
        CPUSET_PRINT_DEBUG("Affinity SET for Process\n");
    }
    if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
        CPUSET_PRINT_DEBUG("Could not get affinity for Process\n");
    }
    std::bitset<64> newprocessMask = processAffinityMask;
    CPUSET_PRINT_DEBUG("New Proc Mask:  %s\n", newprocessMask.to_string().c_str());
    HANDLE hThread = GetCurrentThread();
    // Get the thread ID of this thread
    DWORD tid = (DWORD)GetThreadId(hThread);
    // Enumerate all threads in the process
    THREADENTRY32 te;
    HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
    if (hSnapshot != INVALID_HANDLE_VALUE) {
        te.dwSize = sizeof(THREADENTRY32);
        Thread32First(hSnapshot, &te);
        if (Thread32Next(hSnapshot, &te)) {
            do {
                // Check if the thread is part of this process
                if (te.th32OwnerProcessID == GetProcessId(hProcess)) {
                    // Set thread affinity
                    if (!SetThreadAffinityMask(hThread, cpuMask.to_ullong() & systemAffinityMask)) {
                        CPUSET_PRINT_DEBUG("Could not set affinity for Main Process Thread\n");
                    }
                }
            } while( Thread32Next(hSnapshot, &te ) );
        } 
        CloseHandle(hSnapshot);
    }
 	if (hProcess)
 		::CloseHandle(hProcess);
 	if (hThread)
 		::CloseHandle(hThread);
    HANDLE hProcess2 = ::OpenProcess(PROCESS_ALL_ACCESS, FALSE, GetCurrentProcessId());
    if (hProcess2) {
        PROCESS_POWER_THROTTLING_STATE PowerThrottling;
        RtlZeroMemory(&PowerThrottling, sizeof(PowerThrottling));
        PowerThrottling.Version = PROCESS_POWER_THROTTLING_CURRENT_VERSION;
        PowerThrottling.ControlMask = PROCESS_POWER_THROTTLING_IGNORE_TIMER_RESOLUTION;
        PowerThrottling.StateMask = 0;
        PowerThrottling.StateMask = PROCESS_POWER_THROTTLING_IGNORE_TIMER_RESOLUTION;
        SetProcessInformation(hProcess2,
            ProcessPowerThrottling,
            &PowerThrottling,
            sizeof(PowerThrottling));
        RtlZeroMemory(&PowerThrottling, sizeof(PowerThrottling));
        PowerThrottling.ControlMask = PROCESS_POWER_THROTTLING_EXECUTION_SPEED;
        PowerThrottling.StateMask = 0;
        PowerThrottling.StateMask = PROCESS_POWER_THROTTLING_EXECUTION_SPEED;
        SetProcessInformation(hProcess2,
            ProcessPowerThrottling,
            &PowerThrottling,
            sizeof(PowerThrottling));
        MEMORY_PRIORITY_INFORMATION MemPrio;
        ZeroMemory(&MemPrio, sizeof(MemPrio));
        MemPrio.MemoryPriority = MEMORY_PRIORITY_NORMAL;
        SetProcessInformation(hProcess2,
            ProcessMemoryPriority,
            &MemPrio,
            sizeof(MemPrio));
 		::CloseHandle(hProcess2);
    }
    return coreSelected;
 }
 ULONG set_procMask(int direction = 0 , int32_t req_threads = 0, int lltraversal = 0 ) {
    std::bitset<64> bMask;
    bMask = generate_Mask(direction, req_threads, lltraversal);
    numPhysicalCores = get_count_procMask(bMask.to_ullong());
    CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str());
    return bMask.to_ullong();
 }
 #endif // _WIN32
 /**
 * Returns number of CPUs on system that are useful for math.
@ -177,10 +567,28 @@ int get_math_cpu_count() {
            }
        }
    }
 #elif defined(_WIN32)
    int32_t _numPhysical = get_num_physical_cores();
    if (cpuset_enable) {
        // Initial Affinity set
        setCpuAffinity(set_procMask(WORST_CORES, 0, 1));
    }
    return _numPhysical;
 #endif
    return get_num_physical_cores();
 }
 #if defined(_WIN32)
 int get_math_cpu_count(int32_t req_threads, int cpuset_order, int lltraversal) {
    int32_t _numPhysical = get_num_physical_cores();
    if (cpuset_enable) {
        _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal));
    }
    return _numPhysical;
 }
 #endif
 void process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;
@ -245,15 +653,48 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.seed = std::stoul(argv[i]);
        return true;
    }
    if (arg == "-llct") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
 #if defined(_WIN32)
        std::string value(argv[i]);
        if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_lltraversal = 1; }
        else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_lltraversal = 0; }
        else { invalid_param = true; }
 #endif
        return true;
    }
    if (arg == "-bco") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
 #if defined(_WIN32)
        std::string value(argv[i]);
        if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_order = BEST_CORES; }
        else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_order = WORST_CORES; }
        else { invalid_param = true; }
 #endif
        return true;
    }
    if (arg == "-t" || arg == "--threads") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
 #if defined(_WIN32)
        params.n_threads = std::stoi(argv[i]);
        if (params.n_threads <= 0) {
            params.n_threads = numPhysicalCores;
        }
 #else            
        params.n_threads = std::stoi(argv[i]);
        if (params.n_threads <= 0) {
            params.n_threads = std::thread::hardware_concurrency();
        }
 #endif
        return true;
    }
    if (arg == "-tb" || arg == "--threads-batch") {
@ -262,8 +703,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            return true;
        }
        params.n_threads_batch = std::stoi(argv[i]);
 #if defined(_WIN32)
        if (params.n_threads_batch <= 0 || params.n_threads_batch > numPhysicalCores) {
            params.n_threads_batch = numPhysicalCores;
 #else            
        if (params.n_threads_batch <= 0) {
            params.n_threads_batch = std::thread::hardware_concurrency();
 #endif
        }
        return true;
    }
@ -273,8 +719,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            return true;
        }
        params.n_threads_draft = std::stoi(argv[i]);
 #if defined(_WIN32)
        if (params.n_threads_draft <= 0 || params.n_threads_draft > numPhysicalCores) {
            params.n_threads_draft = numPhysicalCores;
 #else            
        if (params.n_threads_draft <= 0) {
            params.n_threads_draft = std::thread::hardware_concurrency();
 #endif
        }
        return true;
    }
@ -284,8 +735,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            return true;
        }
        params.n_threads_batch_draft = std::stoi(argv[i]);
 #if defined(_WIN32)
        if (params.n_threads_batch_draft <= 0 || params.n_threads_batch_draft > numPhysicalCores) {
            params.n_threads_batch_draft = numPhysicalCores;
 #else            
        if (params.n_threads_batch_draft <= 0) {
            params.n_threads_batch_draft = std::thread::hardware_concurrency();
 #endif
        }
        return true;
    }
@ -1281,6 +1737,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.kv_overrides.push_back(kvo);
        return true;
    }
 #ifndef LOG_DISABLE_LOGS
    // Parse args for logging parameters
    if (log_param_single_parse(argv[i])) {
@ -1325,6 +1782,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        }
    }
 #if defined(_WIN32)
    params.n_threads = get_math_cpu_count(params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
    CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
 #endif
    if (invalid_param) {
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }
@ -1486,6 +1948,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    if (llama_supports_mmap()) {
        printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
 #if defined(_WIN32)
        printf("  -bco                  change the order of the selected cores from the best to worst (default: worst to best)\n");
        printf("  -llct                 allow the core selection to traverse the last level cache (default: disabled)\n");
 #endif
    printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
    printf("                          - distribute: spread execution evenly over all nodes\n");
    printf("                          - isolate: only spawn threads on CPUs on the node that execution started on\n");
@ -2679,7 +3145,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
 #if defined(_WIN32)
    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count());
    fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order);
    fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal);
 #else
    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
 #endif
    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
--- a/common/common.h
+++ b/common/common.h
@ -39,6 +39,26 @@ extern char const *LLAMA_BUILD_TARGET;
 struct llama_control_vector_load_info;
 #ifdef _WIN32
 struct CPU_SET_INFORMATION
 {
    int32_t LogicalProcessorIndex;
    int32_t Id;
    int32_t Group;
    int32_t CoreIndex;
    int32_t LastLevelCacheIndex;
    int32_t NumaNodeIndex;
    int32_t EfficiencyClass;
    int32_t SchedulingClass;
    int32_t Priority;
    int32_t Threads;
 };
 #endif
 static const int BEST_CORES            = 0;
 static const int WORST_CORES           = 1;
 int get_math_cpu_count();
 int32_t get_num_physical_cores();
@ -53,6 +73,8 @@ struct gpt_params {
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
    int32_t cpuset_lltraversal    = 0;
    int32_t cpuset_order          = WORST_CORES;
    int32_t n_predict             = -1;    // new tokens to predict
    int32_t n_ctx                 = 512;   // context size
    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -321,4 +343,4 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 //
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
--- a/ggml.c
+++ b/ggml.c
@ -77,12 +77,59 @@ typedef DWORD thread_ret_t;
 static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
    (void) unused;
    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
 #if defined(_WIN32)
    HANDLE hToken;
    DWORD_PTR processAffinityMask;
    DWORD_PTR systemAffinityMask;
    BOOL bToken = OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken);
    if (bToken) {
        HANDLE hProcess = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId());
        if (hProcess) {
            if (GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
                SetThreadAffinityMask(handle, processAffinityMask);
            }
        }
        if (hProcess)
            CloseHandle(hProcess);
    }
    HANDLE hProcess2 = OpenProcess(PROCESS_ALL_ACCESS, FALSE, GetCurrentProcessId());
    PROCESS_POWER_THROTTLING_STATE PowerThrottling;
    RtlZeroMemory(&PowerThrottling, sizeof(PowerThrottling));
    PowerThrottling.Version = PROCESS_POWER_THROTTLING_CURRENT_VERSION;
    PowerThrottling.ControlMask = PROCESS_POWER_THROTTLING_IGNORE_TIMER_RESOLUTION;
    PowerThrottling.StateMask = 0;
 	PowerThrottling.StateMask = PROCESS_POWER_THROTTLING_IGNORE_TIMER_RESOLUTION;
    SetProcessInformation(hProcess2,
        ProcessPowerThrottling,
        &PowerThrottling,
        sizeof(PowerThrottling));
    RtlZeroMemory(&PowerThrottling, sizeof(PowerThrottling));
    PowerThrottling.ControlMask = PROCESS_POWER_THROTTLING_EXECUTION_SPEED;
    PowerThrottling.StateMask = 0;
    PowerThrottling.StateMask = PROCESS_POWER_THROTTLING_EXECUTION_SPEED;
    SetProcessInformation(hProcess2,
        ProcessPowerThrottling,
        &PowerThrottling,
        sizeof(PowerThrottling));
 	if (hProcess2)
 		CloseHandle(hProcess2);
 #endif
    if (handle == NULL)
    {
        return EAGAIN;
    }
    *out = handle;
    return 0;
 }