Fixes, Linux support for more than 64 CPUs, Core 0 enabled at 6 cores and below

mann1x 2024-04-28 22:46:12 +02:00
parent e5672d33cb
commit 063e201b02
2 changed files with 86 additions and 57 deletions
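
The "more than 64 CPUs" part of this commit comes from switching the Linux affinity path from 64-bit masks (std::bitset<64> / uint64_t) to cpu_set_t, as the diff below shows. A minimal standalone sketch of why cpu_set_t lifts the 64-CPU ceiling (illustration only, not code from this commit; CPU 100 is an arbitrary example index):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#include <cerrno>
#include <cstdio>
#include <cstring>

int main() {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(0, &mask);    // a CPU that always exists
    CPU_SET(100, &mask);  // index > 63: representable here, impossible in a 64-bit mask
    printf("CPUs in mask: %d (capacity %d)\n", CPU_COUNT(&mask), CPU_SETSIZE);
    // Apply the mask; set members that are not present on the system are simply ignored.
    if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
        printf("sched_setaffinity: %s\n", strerror(errno));
    }
    return 0;
}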

View file

@@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency();
// CPUSET logging
//
#define CPUSET_DEBUG 0
#define CPUSET_DEBUG 1
#if (CPUSET_DEBUG >= 1)
#define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
@@ -127,9 +127,7 @@ int32_t get_count_procMask(ULONG_PTR procMask) {
std::bitset<64> bMask = procMask;
return bMask.count();
}
#endif
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
std::bitset<64> bMask;
std::vector<CPU_SET_INFORMATION> _cpuset;
@@ -137,7 +135,6 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers
int32_t assigned_t = 0;
int32_t llcache = -1;
#if defined(_WIN32)
ULONG_PTR processAffinityMask;
ULONG_PTR systemAffinityMask;
HANDLE hToken = nullptr;
@@ -176,13 +173,6 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers
}
return bMask.to_ullong();
}
#else
if (cpuMask != 0) {
std::bitset<64> reqMask = cpuMask;
CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str());
return reqMask.to_ullong();
}
#endif
if (direction == BEST_CORES) {
_cpuset = cpuset_best;
@@ -192,7 +182,7 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers
CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
for (auto index : _cpuset) {
bVal = 0;
if ((index.LogicalProcessorIndex != 0 || allowcz) &&
if ((index.LogicalProcessorIndex != 0 || allowcz == 1) &&
((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) &&
index.EfficiencyClass == 0 &&
((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1)
@@ -214,33 +204,73 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers
}
return bMask.to_ullong();
}
#endif
#if defined(__x86_64__) && defined(__linux__)
#elif defined(__x86_64__) && defined(__linux__)
#include <pthread.h>
int32_t setCpuAffinity(std::bitset<64> cpuMask) {
int32_t coreSelected = cpuMask.count();
cpu_set_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
cpu_set_t bMask;
CPU_ZERO(&bMask);
std::vector<CPU_SET_INFORMATION> _cpuset;
int32_t bVal = 0;
int32_t assigned_t = 0;
int32_t llcache = -1;
std::bitset<64> reqMask = cpuMask;
cpu_set_t mask;
CPU_ZERO(&mask);
for (int32_t i = 0; i < 64; ++i) {
if (cpuMask[i] == 1) {
CPUSET_PRINT_DEBUG("Setting CPU %d\n", i);
CPU_SET(i, &mask);
} else {
CPU_CLR(i, &mask);
}
if (cpuMask != 0) {
CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str());
}
if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
if (direction == BEST_CORES) {
_cpuset = cpuset_best;
} else {
_cpuset = cpuset_worst;
}
CPUSET_PRINT_DEBUG("\ngenerate_Mask: dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
for (auto index : _cpuset) {
bVal = 0;
if ((index.LogicalProcessorIndex != 0 || allowcz == 1) &&
((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) &&
index.EfficiencyClass == 0 &&
((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1)
) {
if (lltraversal == 0) {
CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
llcache = index.LastLevelCacheIndex;
CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
}
bVal = 1;
}
if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;}
if (cpuMask != 0) {
bVal = 1;
if (reqMask[index.LogicalProcessorIndex] == 0) {
bVal = 0;
}
}
if(bVal == 1) {
assigned_t++;
CPU_SET(index.LogicalProcessorIndex, &bMask);
CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
} else {
CPU_CLR(index.LogicalProcessorIndex, &bMask);
}
CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc);
}
return bMask;
}
int32_t setCpuAffinity(cpu_set_t bMask) {
const cpu_set_t cpuMask = bMask;
int32_t coreSelected = CPU_COUNT(&cpuMask);
if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuMask) == -1) {
CPUSET_PRINT_DEBUG("setCpuAffinity sched_setaffinity error\n");
}
if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) {
if (pthread_setaffinity_np(pthread_self(), sizeof(cpuMask), &cpuMask) == -1) {
CPUSET_PRINT_DEBUG("setCpuAffinity pthread_setaffinity_np error\n");
}
}
return coreSelected;
}
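
Note that the custom mask fed in as int64_t cpuMask (the -ccm option) is still read through a std::bitset<64>, so it can only address the first 64 logical CPUs even though the resulting cpu_set_t can describe more. An illustrative expansion of such a 64-bit mask into a cpu_set_t (simplified; not the patch's logic, which filters the mask through the cpuset tables):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#include <bitset>
#include <cstdint>
#include <cstdio>

static cpu_set_t mask_from_bits(int64_t cpuMask) {
    std::bitset<64> bits = static_cast<uint64_t>(cpuMask);
    cpu_set_t set;
    CPU_ZERO(&set);
    for (int i = 0; i < 64; ++i) {
        if (bits[i]) CPU_SET(i, &set);  // map bit i to logical CPU i
    }
    return set;
}

int main() {
    cpu_set_t set = mask_from_bits(0x00F0);  // example: logical CPUs 4-7
    printf("CPUs selected: %d\n", CPU_COUNT(&set));
    return 0;
}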
@@ -289,15 +319,16 @@ static int count_math_cpus(int cpu_count) {
return result;
}
uint64_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) {
std::bitset<64> bMask;
cpu_set_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) {
cpu_set_t bMask;
CPU_ZERO(&bMask);
bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask);
numPhysicalCores = bMask.count();
numPhysicalCores = CPU_COUNT(&bMask);
CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str());
return bMask.to_ullong();
CPUSET_PRINT_DEBUG("Generated Mask Count CPU: %d\n", numPhysicalCores);
return bMask;
}
#endif
@@ -318,7 +349,7 @@ int32_t get_num_physical_cores() {
std::vector<CPU_SET_INFORMATION> _cpuset;
int32_t numLogicalCores = 0;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
for (uint32_t cpu=0; cpu < 1024; ++cpu) {
CPUSET_PRINT_DEBUG("Check for Logical CPU: %d\n", cpu);
std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
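
The hunk above bounds the sysfs probe at 1024 entries instead of UINT32_MAX. A simplified sketch of that probe (not the patch's exact loop body), counting the logical CPUs that expose topology data:

#include <fstream>
#include <string>
#include <cstdio>

int main() {
    int logical = 0;
    for (int cpu = 0; cpu < 1024; ++cpu) {  // same fixed bound the hunk switches to
        std::ifstream siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu)
                               + "/topology/thread_siblings");
        if (!siblings.is_open()) {
            continue;  // CPU numbering can be sparse (offline or hot-removed CPUs)
        }
        ++logical;
    }
    printf("logical CPUs with topology info: %d\n", logical);
    return 0;
}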
@@ -390,9 +421,8 @@ int32_t get_num_physical_cores() {
std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst);
int32_t physicalCount = 0;
//int32_t physicalCount = static_cast<int32_t>(siblings.size());
std::bitset<64> bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0);
physicalCount = bMask.count();
cpu_set_t bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0);
physicalCount = CPU_COUNT(&bMask);
CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n");
@@ -554,7 +584,7 @@ int32_t get_num_physical_cores() {
std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst);
int32_t physicalCount = 0;
physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0));
physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 2, 0));
CPUSET_PRINT_DEBUG("\n\n1st PhysicalCount: %d\n\n", physicalCount);
@@ -731,7 +761,14 @@ ULONG set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltra
* Returns number of CPUs on system that are useful for math.
*/
int get_math_cpu_count() {
#if defined(__x86_64__) && defined(__linux__)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
int32_t _numPhysical = get_num_physical_cores();
if (cpuset_enable) {
// Initial Affinity set
setCpuAffinity(set_procMask(WORST_CORES, 0, 1, 0, 0));
}
return _numPhysical;
#elif defined(__linux__)
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
if (cpu_count < 1) {
return get_num_physical_cores();
@@ -746,14 +783,6 @@ int get_math_cpu_count() {
}
}
}
#elif defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
int32_t _numPhysical = get_num_physical_cores();
if (cpuset_enable) {
// Initial Affinity set
setCpuAffinity(set_procMask(WORST_CORES, 0, 1));
}
return _numPhysical;
#endif
return get_num_physical_cores();
}
@@ -762,6 +791,7 @@ int get_math_cpu_count() {
int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
int32_t _numPhysical = get_num_physical_cores();
if (cpuset_enable) {
if (_numPhysical < 7 && allowcz == 2) allowcz = 1;
_numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal, allowtc, allowcz, cpuMask));
}
return _numPhysical;
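
The added check above implements the "Core 0 enabled at 6 cores and below" part of the commit: the header diff further down changes the default cpuset_allowzero to 2, which acts as "auto". A hypothetical helper (not present in the patch) spelling out the same rule:

#include <cstdint>
#include <cstdio>

// allowcz: 0 = never use core 0, 1 = always allow it, 2 = "auto" (the new default).
static int32_t resolve_allowcz(int32_t num_physical, int32_t allowcz) {
    if (num_physical < 7 && allowcz == 2) {
        return 1;   // small CPU (6 cores or fewer): include core 0 in the selection
    }
    return allowcz; // otherwise keep the requested value; 2 is treated as disabled downstream
}

int main() {
    printf("6 cores, auto -> %d\n", resolve_allowcz(6, 2));  // 1: core 0 allowed
    printf("8 cores, auto -> %d\n", resolve_allowcz(8, 2));  // 2: core 0 still excluded
    return 0;
}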
@@ -2191,7 +2221,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
printf(" -bco change the order of the selected cores from the best to worst (default: worst to best)\n");
printf(" -llct allow the core selection to traverse the last level cache (default: disabled)\n");
printf(" -acz allow the core selection to pick the core 0 as well (default: disabled)\n");
printf(" -acz allow the core selection to pick the core 0 as well (default: disabled for more than 6 cores)\n");
printf(" -atc allow the core selection to pick non physical, threaded, cores (default: disabled)\n");
printf(" -ccm specify a custom CPU Affinity bitmask in hex for the core selection (default: disabled)\n");
#endif
@@ -3392,7 +3422,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count());
fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order);
fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal);
fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero);
fprintf(stream, "acz: %d # default: auto\n", params.cpuset_allowzero);
fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads);
#if defined(_WIN32)
fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask);

View file

@@ -61,10 +61,9 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con
int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
#endif
#if defined(__x86_64__) && defined(__linux__)
#include <bitset>
int32_t setCpuAffinity(std::bitset<64> cpuMask);
uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
uint64_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
int32_t setCpuAffinity(cpu_set_t cpuMask);
cpu_set_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
cpu_set_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
#endif
static const int32_t BEST_CORES = 0;
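
Taken together, the Linux declarations above replace the uint64_t-based API with cpu_set_t. A hedged usage sketch, assuming this header and mirroring the call pattern of get_math_cpu_count() in the first file (not code from this commit):

#include "common.h"  // set_procMask(), setCpuAffinity(), WORST_CORES, get_num_physical_cores()
#include <sched.h>
#include <cstdio>

int main() {
    // get_num_physical_cores() builds the internal cpuset tables that generate_Mask() reads.
    int32_t physical = get_num_physical_cores();
    // Pick the "worst" cores, allow last-level-cache traversal, no threaded cores, no core 0,
    // no custom mask, then pin the current process/thread to the result.
    cpu_set_t mask = set_procMask(WORST_CORES, /*req_threads=*/0, /*lltraversal=*/1,
                                  /*allowtc=*/0, /*allowcz=*/0, /*cpuMask=*/0);
    int32_t pinned = setCpuAffinity(mask);
    printf("physical cores: %d, pinned to %d CPUs\n", physical, pinned);
    return 0;
}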
@@ -88,7 +87,7 @@ struct gpt_params {
int32_t cpuset_lltraversal = 0;
int32_t cpuset_order = WORST_CORES;
int64_t cpuset_cpumask = 0;
int32_t cpuset_allowzero = 0;
int32_t cpuset_allowzero = 2;
int32_t cpuset_allowthreads = 0;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size