From f9b42b8cd8a35411174c2952c2beb665f3f34a68 Mon Sep 17 00:00:00 2001
From: mann1x <20623405+mann1x@users.noreply.github.com>
Date: Wed, 24 Apr 2024 21:50:01 +0200
Subject: [PATCH] Added new options and some fixes

---
 common/common.cpp | 150 ++++++++++++++++++++++++++++++++++++++--------
 common/common.h   |   8 ++-
 ggml.c            |   4 +-
 3 files changed, 131 insertions(+), 31 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index f447aa9ad..a62d67cb0 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency();
 // CPUSET logging
 //
 
-#define CPUSET_DEBUG 0
+#define CPUSET_DEBUG 1
 #if (CPUSET_DEBUG >= 1)
 #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__)
 #else
@@ -124,13 +124,52 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con
     return lhs.SchedulingClass < rhs.SchedulingClass;
 }
 
-ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) {
+ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
 
     std::bitset<64> bMask;
     std::vector<CPU_SET_INFORMATION> _cpuset;
     int32_t bVal = 0;
     int32_t assigned_t = 0;
     int32_t llcache = -1;
+    DWORD_PTR processAffinityMask;
+    DWORD_PTR systemAffinityMask;
+    HANDLE hToken = nullptr;
+    bool gotsystemMask = true;
+
+    BOOL bToken = ::OpenProcessToken(::GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken);
+    if (!bToken) {
+        CPUSET_PRINT_DEBUG("Could not access OpenProcessToken from generate_Mask\n");
+    }
+
+    HANDLE hProcess = ::OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId());
+    if (!hProcess) {
+        CPUSET_PRINT_DEBUG("Could not access OpenProcess for Affinity\n");
+        gotsystemMask = false;
+    }
+
+    if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
+        CPUSET_PRINT_DEBUG("Could not get GetProcessAffinityMask for Process\n");
+        gotsystemMask = false;
+    }
+
+    if (hProcess)
+        ::CloseHandle(hProcess);
+
+    if (cpuMask != 0) {
+        std::bitset<64> reqMask = cpuMask;
+        CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str());
+        if (gotsystemMask) {
+            std::bitset<64> systemMask = systemAffinityMask;
+            CPUSET_PRINT_DEBUG("System Mask: %s\n", systemMask.to_string().c_str());
+            std::bitset<64> newprocessMask = reqMask & systemMask;
+            CPUSET_PRINT_DEBUG("New Proc Mask: %s\n", newprocessMask.to_string().c_str());
+            bMask = reqMask & systemMask;
+        } else{
+            bMask = cpuMask;
+        }
+        return bMask.to_ullong();
+    }
+
     if (direction == BEST_CORES) {
         _cpuset = cpuset_best;
     } else {
@@ -139,27 +178,25 @@ ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) {
     CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
     for (auto index : _cpuset) {
         bVal = 0;
-        if (index.LogicalProcessorIndex != 0 &&
-            ((cpuset_smt && index.Threads > 1) || !cpuset_smt) &&
+        if ((index.LogicalProcessorIndex != 0 || allowcz) &&
+            ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) &&
             index.EfficiencyClass == 0 &&
-            ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1)
+            ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1)
            ) {
                if (lltraversal == 0) {
-                    CPUSET_PRINT_DEBUG("cache for lltraversal %d pre llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+                    CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
                    llcache = index.LastLevelCacheIndex;
-                    CPUSET_PRINT_DEBUG("cache for lltraversal %d pos llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+                    CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
                }
                bVal = 1;
+        }
+        if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;}
+        if(bVal == 1) {
            assigned_t++;
-            CPUSET_PRINT_DEBUG("Assigned LogicalCoreIndex: %d lltraversal %d llcache %d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
+            CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
        }
        bMask[index.LogicalProcessorIndex] = bVal;
-        CPUSET_PRINT_DEBUG("Index: %d b:%d smt=%d thrds=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads);
-        if (req_threads > 0) {
-            if (assigned_t >= req_threads) {
-                break;
-            }
-        }
+        CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc);
    }
    return bMask.to_ullong();
 }
@@ -262,7 +299,6 @@ int32_t get_num_physical_cores() {
        cpuSetSize += nextCPUSet->Size;
    }
 
-    int32_t physicalCount = 0;
    int32_t thisLogical = 0;
    int32_t coreThreadsNum = 1;
 
@@ -274,7 +310,6 @@ int32_t get_num_physical_cores() {
        if (nextLogical->ProcessorCore.Flags == 1 && nextLogical->Cache.Associativity <= 2) {
            switch (nextLogical->Relationship) {
                case LOGICAL_PROCESSOR_RELATIONSHIP::RelationProcessorCore:
-                    CPUSET_PRINT_DEBUG("Physical Count: %u\n", physicalCount);
                    CPUSET_PRINT_DEBUG("Cache.Associativity: %d\n", nextLogical->Cache.Associativity);
                    CPUSET_PRINT_DEBUG("Cache.Level: %d\n", nextLogical->Cache.Level);
                    CPUSET_PRINT_DEBUG("Cache.Type: %d\n", nextLogical->Cache.Type);
@@ -303,15 +338,16 @@ int32_t get_num_physical_cores() {
    std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best);
    std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst);
 
-    physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1));
+    int32_t physicalCount = 0;
+    physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0));
 
-    CPUSET_PRINT_DEBUG("\n\nLPhysicalCount: %d\n\n", physicalCount);
+    CPUSET_PRINT_DEBUG("\n\n1st PhysicalCount: %d\n\n", physicalCount);
 
    physicalCount = physicalCount <= 0 ? numLogicalCores : physicalCount;
 
-    CPUSET_PRINT_DEBUG("\n\nLPhysicalCount2: %d\n\n", physicalCount);
+    CPUSET_PRINT_DEBUG("\n\n2nd PhysicalCount2: %d\n\n", physicalCount);
 
-    CPUSET_PRINT_DEBUG("\n\nLogical Processors Summary\n\n");
+    CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n");
 
    for (uint32_t _logicalCore = 0; _logicalCore < numLogicalCores;) {
 
@@ -535,10 +571,10 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) {
    return coreSelected;
 }
 
-ULONG set_procMask(int direction = 0 , int32_t req_threads = 0, int lltraversal = 0 ) {
+ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) {
    std::bitset<64> bMask;
 
-    bMask = generate_Mask(direction, req_threads, lltraversal);
+    bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask);
 
    numPhysicalCores = get_count_procMask(bMask.to_ullong());
 
@@ -580,10 +616,10 @@ int get_math_cpu_count() {
 }
 
 #if defined(_WIN32)
-int get_math_cpu_count(int32_t req_threads, int cpuset_order, int lltraversal) {
+int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
    int32_t _numPhysical = get_num_physical_cores();
    if (cpuset_enable) {
-        _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal));
+        _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal, allowtc, allowcz, cpuMask));
    }
    return _numPhysical;
 }
@@ -653,6 +689,61 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.seed = std::stoul(argv[i]);
        return true;
    }
+    if (arg == "-acz") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+#if defined(_WIN32)
+        std::string value(argv[i]);
+        if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowzero = 1; }
+        else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowzero = 0; }
+        else { invalid_param = true; }
+#endif
+        return true;
+    }
+    if (arg == "-atc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+#if defined(_WIN32)
+        std::string value(argv[i]);
+        if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowthreads = 1; }
+        else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowthreads = 0; }
+        else { invalid_param = true; }
+#endif
+        return true;
+    }
+    if (arg == "-ccm") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+#if defined(_WIN32)
+        std::string value(argv[i]);
+        std::size_t pos{};
+        int64_t cpuMask = 0;
+        bool valid_bitmask = false;
+        try
+        {
+            const int64_t ll{std::stoll(value, &pos, 16)};
+            cpuMask = ll;
+            valid_bitmask = true;
+        }
+        catch (std::invalid_argument const& ex)
+        {
+            fprintf(stderr, "%s\n", ex.what());
+        }
+        catch (std::out_of_range const& ex)
+        {
+            fprintf(stderr, "%s\n", ex.what());
+        }
+        if (valid_bitmask && cpuMask != 0) { params.cpuset_cpumask = cpuMask; }
+        else { invalid_param = true; }
+#endif
+        return true;
+    }
    if (arg == "-llct") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
@@ -695,6 +786,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            params.n_threads = std::thread::hardware_concurrency();
        }
 #endif
+        params.n_threads_auto = false;
        return true;
    }
    if (arg == "-tb" || arg == "--threads-batch") {
@@ -1783,8 +1875,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
    }
 
 #if defined(_WIN32)
-    params.n_threads = get_math_cpu_count(params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
-    CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
+    params.n_threads = get_math_cpu_count(params.n_threads_auto ? 0 : params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask);
+    CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d atc=%d acz=%d cpumask=%lli\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask);
 #endif
 
    if (invalid_param) {
@@ -1951,6 +2043,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #if defined(_WIN32)
    printf("  -bco                  change the order of the selected cores from the best to worst (default: worst to best)\n");
    printf("  -llct                 allow the core selection to traverse the last level cache (default: disabled)\n");
+    printf("  -acz                  allow the core selection to pick core 0 as well (default: disabled)\n");
+    printf("  -atc                  allow the core selection to pick non-physical, threaded cores (default: disabled)\n");
+    printf("  -ccm                  specify a custom CPU affinity bitmask in hex for the core selection (default: disabled)\n");
 #endif
    printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
    printf("                          - distribute: spread execution evenly over all nodes\n");
@@ -3149,6 +3244,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count());
    fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order);
    fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal);
+    fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero);
+    fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads);
+    fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask);
 #else
    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
 
diff --git a/common/common.h b/common/common.h
index 773913c73..5ba823ace 100644
--- a/common/common.h
+++ b/common/common.h
@@ -56,8 +56,8 @@ struct CPU_SET_INFORMATION
 
 #endif
 
-static const int BEST_CORES = 0;
-static const int WORST_CORES = 1;
+static const int32_t BEST_CORES = 0;
+static const int32_t WORST_CORES = 1;
 
 int get_math_cpu_count();
 int32_t get_num_physical_cores();
@@ -73,8 +73,12 @@ struct gpt_params {
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1; // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
+    bool    n_threads_auto        = true;
    int32_t cpuset_lltraversal    = 0;
    int32_t cpuset_order          = WORST_CORES;
+    int64_t cpuset_cpumask        = 0;
+    int32_t cpuset_allowzero      = 0;
+    int32_t cpuset_allowthreads   = 0;
    int32_t n_predict             = -1; // new tokens to predict
    int32_t n_ctx                 = 512; // context size
    int32_t n_batch               = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/ggml.c b/ggml.c
index 90584e18b..b5b11ca16 100644
--- a/ggml.c
+++ b/ggml.c
@@ -78,11 +78,10 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo
    (void) unused;
    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
 
-#if defined(_WIN32)
    HANDLE hToken;
    DWORD_PTR processAffinityMask;
    DWORD_PTR systemAffinityMask;
-
+
    BOOL bToken = OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken);
 
    if (bToken) {
@@ -122,7 +121,6 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo
        if (hProcess2)
            CloseHandle(hProcess2);
 
-#endif
 
    if (handle == NULL) {
        return EAGAIN;
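
Usage sketch (not part of the patch): assuming a Windows build with this change applied, the new switches could be exercised roughly as below; the binary name, model path and thread count are placeholders taken for illustration only.

  main.exe -m model.gguf -t 8 -acz on -atc on
  main.exe -m model.gguf -ccm 0xF0

The -acz and -atc switches accept 1/on/true or 0/off/false and let the core selection also use core 0 and SMT sibling threads; -ccm takes a hex affinity bitmask as described in the help text (0xF0 selects logical processors 4-7), which generate_Mask() intersects with the mask reported by GetProcessAffinityMask() before it is applied.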