Added new CPU-set options (-acz, -atc, -ccm) and some fixes

2024-04-24 21:50:01 +02:00 · 2024-04-24 21:50:01 +02:00 · f9b42b8cd8
commit f9b42b8cd8
parent ca37f7d2c5
3 changed files with 131 additions and 31 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency();
 // CPUSET logging
 //

-#define CPUSET_DEBUG 0
+#define CPUSET_DEBUG 1
 #if (CPUSET_DEBUG >= 1)
 #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__)
 #else
@ -124,13 +124,52 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con
        return lhs.SchedulingClass < rhs.SchedulingClass;
 }

-ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) {
+ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
    std::bitset<64> bMask;
    std::vector<CPU_SET_INFORMATION> _cpuset;
    int32_t bVal = 0;
    int32_t assigned_t = 0;
    int32_t llcache = -1;

+    DWORD_PTR processAffinityMask;
+    DWORD_PTR systemAffinityMask;
+    HANDLE hToken = nullptr;
+    bool gotsystemMask = true;
+
+    BOOL bToken = ::OpenProcessToken(::GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken);
+    if (!bToken) {
+        CPUSET_PRINT_DEBUG("Could not access OpenProcessToken from generate_Mask\n");
+    }
+
+    HANDLE hProcess = ::OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId());
+    if (!hProcess) {
+        CPUSET_PRINT_DEBUG("Could not access OpenProcess for Affinity\n");
+        gotsystemMask = false;
+    }
+
+    if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
+        CPUSET_PRINT_DEBUG("Could not get GetProcessAffinityMask for Process\n");
+        gotsystemMask = false;
+    }
+    
+	if (hProcess)
+		::CloseHandle(hProcess);
+
+    if (cpuMask != 0) {
+        std::bitset<64> reqMask = cpuMask;
+        CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str());
+        if (gotsystemMask) {
+            std::bitset<64> systemMask = systemAffinityMask;
+            CPUSET_PRINT_DEBUG("System Mask:    %s\n", systemMask.to_string().c_str());
+            std::bitset<64> newprocessMask = reqMask & systemMask;
+            CPUSET_PRINT_DEBUG("New Proc Mask:  %s\n", newprocessMask.to_string().c_str());
+            bMask = reqMask & systemMask;
+        } else{
+            bMask = cpuMask;
+        }
+        return bMask.to_ullong();
+    }
+
    if (direction == BEST_CORES) {
        _cpuset = cpuset_best;
    } else {
@ -139,27 +178,25 @@ ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) {
    CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
    for (auto index : _cpuset) {
        bVal = 0;
-        if (index.LogicalProcessorIndex != 0 &&
-            ((cpuset_smt && index.Threads > 1) || !cpuset_smt) &&
+        if ((index.LogicalProcessorIndex != 0 || allowcz) &&
+            ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) &&
            index.EfficiencyClass == 0 &&
-            ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1)
+            ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1)
            ) {
            if (lltraversal == 0) {
-                CPUSET_PRINT_DEBUG("cache for lltraversal %d pre llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+                CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
                llcache = index.LastLevelCacheIndex;
-                CPUSET_PRINT_DEBUG("cache for lltraversal %d pos llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+                CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
            } 
            bVal = 1;
+        }
+        if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;}
+        if(bVal == 1) {
            assigned_t++;
-            CPUSET_PRINT_DEBUG("Assigned LogicalCoreIndex: %d lltraversal %d llcache %d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
+            CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
        }
        bMask[index.LogicalProcessorIndex] = bVal;
-        CPUSET_PRINT_DEBUG("Index: %d b:%d smt=%d thrds=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads);
-        if (req_threads > 0) {
-            if (assigned_t >= req_threads) {
-                break;
-            }
-        }
+        CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc);
    }
    return bMask.to_ullong();
 }
@ -262,7 +299,6 @@ int32_t get_num_physical_cores() {
        cpuSetSize += nextCPUSet->Size;
    }    
    
-    int32_t physicalCount = 0;
    int32_t thisLogical = 0;
    int32_t coreThreadsNum = 1;

@ -274,7 +310,6 @@ int32_t get_num_physical_cores() {
        if (nextLogical->ProcessorCore.Flags == 1 && nextLogical->Cache.Associativity <= 2) {
            switch (nextLogical->Relationship) {
                case LOGICAL_PROCESSOR_RELATIONSHIP::RelationProcessorCore:                    
-                    CPUSET_PRINT_DEBUG("Physical Count: %u\n", physicalCount);        
                    CPUSET_PRINT_DEBUG("Cache.Associativity: %d\n", nextLogical->Cache.Associativity);        
                    CPUSET_PRINT_DEBUG("Cache.Level: %d\n", nextLogical->Cache.Level);        
                    CPUSET_PRINT_DEBUG("Cache.Type: %d\n", nextLogical->Cache.Type);        
@ -303,15 +338,16 @@ int32_t get_num_physical_cores() {
    std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best);
    std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst);

-    physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1));
+    int32_t physicalCount = 0;
+    physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0));

-    CPUSET_PRINT_DEBUG("\n\nLPhysicalCount: %d\n\n", physicalCount);
+    CPUSET_PRINT_DEBUG("\n\n1st PhysicalCount: %d\n\n", physicalCount);

    physicalCount = physicalCount <= 0 ? numLogicalCores : physicalCount;

-    CPUSET_PRINT_DEBUG("\n\nLPhysicalCount2: %d\n\n", physicalCount);
+    CPUSET_PRINT_DEBUG("\n\n2nd PhysicalCount2: %d\n\n", physicalCount);

-    CPUSET_PRINT_DEBUG("\n\nLogical Processors Summary\n\n");
+    CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n");

    for (uint32_t _logicalCore = 0; _logicalCore < numLogicalCores;)
    {
@ -535,10 +571,10 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) {
    return coreSelected;
 }

-ULONG set_procMask(int direction = 0 , int32_t req_threads = 0, int lltraversal = 0 ) {
+ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) {
    std::bitset<64> bMask;

-    bMask = generate_Mask(direction, req_threads, lltraversal);
+    bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask);

    numPhysicalCores = get_count_procMask(bMask.to_ullong());

@ -580,10 +616,10 @@ int get_math_cpu_count() {
 }

 #if defined(_WIN32)
-int get_math_cpu_count(int32_t req_threads, int cpuset_order, int lltraversal) {
+int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
    int32_t _numPhysical = get_num_physical_cores();
    if (cpuset_enable) {
-        _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal));
+        _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal, allowtc, allowcz, cpuMask));
    }
    return _numPhysical;
 }
@ -653,6 +689,61 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.seed = std::stoul(argv[i]);
        return true;
    }
+    if (arg == "-acz") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+#if defined(_WIN32)
+        std::string value(argv[i]);
+        if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowzero = 1; }
+        else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowzero = 0; }
+        else { invalid_param = true; }
+#endif
+        return true;
+    }
+    if (arg == "-atc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+#if defined(_WIN32)
+        std::string value(argv[i]);
+        if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowthreads = 1; }
+        else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowthreads = 0; }
+        else { invalid_param = true; }
+#endif
+        return true;
+    }
+    if (arg == "-ccm") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+#if defined(_WIN32)
+        std::string value(argv[i]);
+        std::size_t pos{};
+        int64_t cpuMask = 0;
+        bool valid_bitmask = false;
+        try
+        {
+            const int64_t ll{std::stoll(value, &pos)};
+            cpuMask = ll;
+            valid_bitmask = true;
+        }
+        catch (std::invalid_argument const& ex)
+        {
+            fprintf(stderr, "%s\n", ex.what());
+        }
+        catch (std::out_of_range const& ex)
+        {
+            fprintf(stderr, "%s\n", ex.what());
+        }
+        if (valid_bitmask && cpuMask != 0) { params.cpuset_cpumask = cpuMask; }
+        else { invalid_param = true; }
+#endif
+        return true;
+    }
    if (arg == "-llct") {
        if (++i >= argc) {
            invalid_param = true;
@ -695,6 +786,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            params.n_threads = std::thread::hardware_concurrency();
        }
 #endif
+        params.n_threads_auto = false;
        return true;
    }
    if (arg == "-tb" || arg == "--threads-batch") {
@ -1783,8 +1875,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
    }

 #if defined(_WIN32)
-    params.n_threads = get_math_cpu_count(params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
-    CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
+    params.n_threads = get_math_cpu_count(params.n_threads_auto ? 0 : params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask);
+    CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%lli\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask);
 #endif

    if (invalid_param) {
@ -1951,6 +2043,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #if defined(_WIN32)
        printf("  -bco                  change the order of the selected cores from the best to worst (default: worst to best)\n");
        printf("  -llct                 allow the core selection to traverse the last level cache (default: disabled)\n");
+        printf("  -acz                  allow the core selection to pick the core 0 as well (default: disabled)\n");
+        printf("  -atc                  allow the core selection to pick non physical, threaded, cores (default: disabled)\n");
+        printf("  -ccm                  specify a custom CPU Affinity bitmask in hex for the core selection (default: disabled)\n");
 #endif
    printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
    printf("                          - distribute: spread execution evenly over all nodes\n");
@ -3149,6 +3244,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count());
    fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order);
    fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal);
+    fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero);
+    fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads);
+    fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask);
 #else

    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
--- a/common/common.h
+++ b/common/common.h
@ -56,8 +56,8 @@ struct CPU_SET_INFORMATION

 #endif

-static const int BEST_CORES            = 0;
-static const int WORST_CORES           = 1;
+static const int32_t BEST_CORES            = 0;
+static const int32_t WORST_CORES           = 1;

 int get_math_cpu_count();
 int32_t get_num_physical_cores();
@ -73,8 +73,12 @@ struct gpt_params {
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
+    bool    n_threads_auto        = true;
    int32_t cpuset_lltraversal    = 0;
    int32_t cpuset_order          = WORST_CORES;
+    int64_t cpuset_cpumask        = 0;
+    int32_t cpuset_allowzero      = 0;
+    int32_t cpuset_allowthreads   = 0;
    int32_t n_predict             = -1;    // new tokens to predict
    int32_t n_ctx                 = 512;   // context size
    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
--- a/ggml.c
+++ b/ggml.c
@ -78,7 +78,6 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo
    (void) unused;
    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);

-#if defined(_WIN32)
    HANDLE hToken;
    DWORD_PTR processAffinityMask;
    DWORD_PTR systemAffinityMask;
@ -122,7 +121,6 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo

 	if (hProcess2)
 		CloseHandle(hProcess2);
-#endif
    if (handle == NULL)
    {
        return EAGAIN;