Initial support for Linux

mann1x 2024-04-25 22:27:50 +02:00
parent f9b42b8cd8
commit 63cd3dc251
2 changed files with 289 additions and 104 deletions

View file

@@ -77,7 +77,7 @@
using json = nlohmann::ordered_json;
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__linux__) && defined(__x86_64__))
std::vector<CPU_SET_INFORMATION> cpuset;
std::vector<CPU_SET_INFORMATION> cpuset_best;
std::vector<CPU_SET_INFORMATION> cpuset_worst;
@@ -92,13 +92,25 @@ int32_t numPhysicalCores = std::thread::hardware_concurrency();
// CPUSET logging
//
#define CPUSET_DEBUG 1
#define CPUSET_DEBUG 0
#if (CPUSET_DEBUG >= 1)
#define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define CPUSET_PRINT_DEBUG(...)
#endif
bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
return lhs.SchedulingClass > rhs.SchedulingClass;
}
bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
return lhs.SchedulingClass < rhs.SchedulingClass;
}
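// Ordering note: SchedulingClass ranks core quality, so "best" sorts
// descending (highest-performance cores first) and "worst" ascending. On
// the Linux path below, SchedulingClass comes from acpi_cppc/highest_perf
// when available, falling back to CPU-index order otherwise.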
#endif
#if defined(_WIN32)
int32_t get_pos_procMask(ULONG_PTR procMask) {
std::bitset<64> bMask = procMask;
int32_t thisPos = 0;
@@ -116,14 +128,6 @@ int32_t get_count_procMask(ULONG_PTR procMask) {
return bMask.count();
}
bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
return lhs.SchedulingClass > rhs.SchedulingClass;
}
bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
return lhs.SchedulingClass < rhs.SchedulingClass;
}
ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
std::bitset<64> bMask;
std::vector<CPU_SET_INFORMATION> _cpuset;
@@ -202,12 +206,250 @@ ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal,
}
#endif
#if defined(__x86_64__) && defined(__linux__)
#include <pthread.h>
int32_t setCpuAffinity(std::bitset<64> cpuMask) {
int32_t coreSelected = cpuMask.count();
cpu_set_t mask;
CPU_ZERO(&mask);
for (int32_t i = 0; i < 64; ++i) {
if (cpuMask[i] == 1) {
CPUSET_PRINT_DEBUG("Setting CPU %d\n", i);
CPU_SET(i, &mask);
}
}
if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
CPUSET_PRINT_DEBUG("setCpuAffinity sched_setaffinity error\n");
}
if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) != 0) {
CPUSET_PRINT_DEBUG("setCpuAffinity pthread_setaffinity_np error\n");
}
return coreSelected;
}
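// Usage sketch: each set bit N pins execution to logical CPU N, for both
// the whole process (sched_setaffinity) and the calling thread
// (pthread_setaffinity_np); the return value is the number of CPUs kept:
//
//     std::bitset<64> mask(0b1111);     // logical CPUs 0-3
//     int32_t n = setCpuAffinity(mask); // n == 4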
uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
std::bitset<64> bMask;
std::vector<CPU_SET_INFORMATION> _cpuset;
int32_t bVal = 0;
int32_t assigned_t = 0;
int32_t llcache = -1;
if (cpuMask != 0) {
std::bitset<64> reqMask = cpuMask;
CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str());
bMask = cpuMask;
return bMask.to_ullong();
}
if (direction == BEST_CORES) {
_cpuset = cpuset_best;
} else {
_cpuset = cpuset_worst;
}
CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
for (auto index : _cpuset) {
bVal = 0;
if ((index.LogicalProcessorIndex != 0 || allowcz) &&
((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) &&
index.EfficiencyClass == 0 &&
((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1)
) {
if (lltraversal == 0) {
CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
llcache = index.LastLevelCacheIndex;
CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
}
bVal = 1;
}
if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;}
if (bVal == 1) {
assigned_t++;
CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
}
bMask[index.LogicalProcessorIndex] = bVal;
CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc);
}
return bMask.to_ullong();
}
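// Illustrative call, assuming cpuset_best has been populated by
// get_num_physical_cores(): pick up to 4 threads from the best cores,
// skipping core 0 and SMT siblings, without crossing the first
// last-level cache encountered:
//
//     uint64_t mask = generate_Mask(BEST_CORES, 4, /*lltraversal=*/0,
//                                   /*allowtc=*/0, /*allowcz=*/0, /*cpuMask=*/0);
//
// One bit is set per selected logical CPU; a non-zero cpuMask bypasses
// the selection and is returned as-is.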
static void cpuid(unsigned leaf, unsigned subleaf,
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
__asm__("movq\t%%rbx,%%rsi\n\t"
"cpuid\n\t"
"xchgq\t%%rbx,%%rsi"
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
: "0"(leaf), "2"(subleaf));
}
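// The movq/xchgq pair preserves %rbx across CPUID (which clobbers it, and
// which may be reserved as the PIC register), so the EBX result is
// returned through %rsi via the "=S" output constraint.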
static int pin_cpu(int cpu) {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(cpu, &mask);
return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}
static bool is_hybrid_cpu(void) {
unsigned eax, ebx, ecx, edx;
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
return !!(edx & (1u << 15));
}
static bool is_running_on_efficiency_core(void) {
unsigned eax, ebx, ecx, edx;
cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
int intel_atom = 0x20;
int core_type = (eax & 0xff000000u) >> 24;
return core_type == intel_atom;
}
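// Per Intel's documentation, CPUID leaf 0x1A reports the core type in
// EAX[31:24]: 0x20 is Intel Atom (E-core), 0x40 is Intel Core (P-core).
// The leaf is only meaningful when is_hybrid_cpu() returns true.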
static int count_math_cpus(int cpu_count) {
int result = 0;
for (int cpu = 0; cpu < cpu_count; ++cpu) {
if (pin_cpu(cpu)) {
return -1;
}
if (is_running_on_efficiency_core()) {
continue; // efficiency cores harm lockstep threading
}
++cpu; // hyperthreading isn't useful for linear algebra
++result;
}
return result;
}
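// Sketch of the intent: on a hypothetical 8P+8E hybrid CPU enumerated as
// P-core SMT pairs first (CPUs 0-15) followed by E-cores (16-23), the loop
// pins itself to each CPU in turn, counts each P-core once, skips the
// sibling via the extra ++cpu, and passes over E-cores, returning 8. The
// extra increment assumes SMT siblings are numbered consecutively, which
// holds on many systems but is not guaranteed by the kernel.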
uint64_t set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) {
std::bitset<64> bMask;
bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask);
numPhysicalCores = bMask.count();
CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str());
return bMask.to_ullong();
}
#endif
int32_t get_num_physical_cores() {
#ifdef __linux__ // __x86_64__ && __linux__
#if defined(__linux__) && defined(__x86_64__) // __x86_64__ && __linux__
if (numPhysicalCores > 0) {
return numPhysicalCores;
}
// enumerate the set of thread siblings, num entries is num cores
fprintf(stderr, "physical cpus count\n");
std::unordered_set<std::string> siblings;
int32_t cursize = 0;
cpu_set_t mask;
CPU_ZERO(&mask);
bool is_hybrid = is_hybrid_cpu();
bool is_hybrid_core = false;
int32_t numLogicalCores = 0;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
fprintf(stderr, "physical cpu check %d\n", cpu);
std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
}
is_hybrid_core = false;
if (is_hybrid) {
if (pin_cpu(cpu) == 0) {
if (is_running_on_efficiency_core()) is_hybrid_core = true;
}
}
numLogicalCores++;
CPU_SET_INFORMATION _cpuset;
_cpuset.LogicalProcessorIndex = cpu;
_cpuset.CoreIndex = cpu;
_cpuset.Id = cpu;
_cpuset.Group = 0;
_cpuset.LastLevelCacheIndex = 0;
_cpuset.NumaNodeIndex = 0;
_cpuset.EfficiencyClass = is_hybrid_core ? 1 : 0;
_cpuset.Threads = 1;
std::ifstream cppc_tag("/sys/devices/system/cpu/cpu"
+ std::to_string(cpu) + "/acpi_cppc/highest_perf");
if (!cppc_tag.is_open()) {
// no CPPC data: fall back to a descending rank by CPU index
_cpuset.SchedulingClass = 256 - cpu;
} else {
std::string line;
if (std::getline(cppc_tag, line)) {
int32_t _thistag = std::stoi(line);
_cpuset.SchedulingClass = _thistag;
}
}
if (is_hybrid_core) continue;
std::string line;
if (std::getline(thread_siblings, line)) {
cursize = static_cast<int32_t>(siblings.size());
siblings.insert(line);
if (static_cast<int32_t>(siblings.size()) > cursize ) {
_cpuset.Threads = 2;
CPU_SET(cpu, &mask);
fprintf(stderr, "physical cpu %u: %s\n", cpu, line.c_str());
} else {
cpuset_smt = true;
}
}
cpuset.push_back(_cpuset);
}
if (!siblings.empty()) {
cpuset_enable = true;
if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
fprintf(stdout, "sched_setaffinity error\n");
}
if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) != 0) {
fprintf(stderr, "pthread_setaffinity_np error\n");
}
fprintf(stderr, "physical cpus %li\n", siblings.size());
cpuset_best = cpuset;
cpuset_worst = cpuset;
std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best);
std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst);
int32_t physicalCount = static_cast<int32_t>(siblings.size());
CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n");
// walk the entries actually stored (hybrid E-cores were skipped above)
for (int32_t _logicalCore = 0; _logicalCore < static_cast<int32_t>(cpuset.size()); ++_logicalCore)
{
CPUSET_PRINT_DEBUG("\nLogical: %u\n", _logicalCore);
CPUSET_PRINT_DEBUG("Threads: %u\n", cpuset[_logicalCore].Threads);
CPUSET_PRINT_DEBUG("Id: %u\n", cpuset[_logicalCore].Id);
CPUSET_PRINT_DEBUG("Group: %u\n", cpuset[_logicalCore].Group);
CPUSET_PRINT_DEBUG("LastLevelCacheIndex: %u\n", cpuset[_logicalCore].LastLevelCacheIndex);
CPUSET_PRINT_DEBUG("NumaNodeIndex: %u\n", cpuset[_logicalCore].NumaNodeIndex);
CPUSET_PRINT_DEBUG("LogicalProcessorIndex: %u\n", cpuset[_logicalCore].LogicalProcessorIndex);
CPUSET_PRINT_DEBUG("EfficiencyClass: %u\n", cpuset[_logicalCore].EfficiencyClass);
CPUSET_PRINT_DEBUG("SchedulingClass: %u\n", cpuset[_logicalCore].SchedulingClass);
}
CPUSET_PRINT_DEBUG("\n\n<Grand total> \n\n");
CPUSET_PRINT_DEBUG("Total Physical: %d\n", physicalCount);
CPUSET_PRINT_DEBUG("Total Logical: %u\n", numLogicalCores);
numPhysicalCores = physicalCount;
return physicalCount;
}
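// Topology notes: each /sys/devices/system/cpu/cpuN/topology/thread_siblings
// file holds a kernel cpumask string (e.g. "00000003" when CPUs 0 and 1 are
// SMT siblings) that is identical for all siblings of a core, so the number
// of distinct strings equals the physical core count. The
// acpi_cppc/highest_perf value read above is the CPU's maximum CPPC
// performance level (the basis of "preferred core" ranking), reused here as
// SchedulingClass.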
#elif defined(__linux__) // __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu"
std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
@@ -289,6 +531,7 @@ int32_t get_num_physical_cores() {
_cpuset.NumaNodeIndex = nextCPUSet->CpuSet.NumaNodeIndex;
_cpuset.EfficiencyClass = nextCPUSet->CpuSet.EfficiencyClass;
_cpuset.SchedulingClass = nextCPUSet->CpuSet.SchedulingClass;
_cpuset.Threads = 1;
cpuset.push_back(_cpuset);
numLogicalCores++;
}
@@ -370,58 +613,10 @@ int32_t get_num_physical_cores() {
return physicalCount;
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
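// Fallback heuristic: with 16 logical CPUs this picks 8 threads (half, to
// avoid SMT oversubscription); machines with 4 or fewer logical CPUs use
// them all; 4 is the guess when hardware_concurrency() reports 0 (unknown).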
#if defined(__x86_64__) && defined(__linux__)
#include <pthread.h>
static void cpuid(unsigned leaf, unsigned subleaf,
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
__asm__("movq\t%%rbx,%%rsi\n\t"
"cpuid\n\t"
"xchgq\t%%rbx,%%rsi"
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
: "0"(leaf), "2"(subleaf));
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
static int pin_cpu(int cpu) {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(cpu, &mask);
return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}
static bool is_hybrid_cpu(void) {
unsigned eax, ebx, ecx, edx;
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
return !!(edx & (1u << 15));
}
static bool is_running_on_efficiency_core(void) {
unsigned eax, ebx, ecx, edx;
cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
int intel_atom = 0x20;
int core_type = (eax & 0xff000000u) >> 24;
return core_type == intel_atom;
}
static int count_math_cpus(int cpu_count) {
int result = 0;
for (int cpu = 0; cpu < cpu_count; ++cpu) {
if (pin_cpu(cpu)) {
return -1;
}
if (is_running_on_efficiency_core()) {
continue; // efficiency cores harm lockstep threading
}
++cpu; // hyperthreading isn't useful for linear algebra
++result;
}
return result;
}
#elif defined(_WIN32)
#if defined(_WIN32)
#define STATUS_ACCESS_DENIED ((NTSTATUS)0xC0000022L)
#define STATUS_SUCCESS ((NTSTATUS)0)
@@ -437,30 +632,6 @@ typedef enum _PROCESSINFOCLASS {
ProcessAllowedCpuSetsInformation = 67,
} PROCESSINFOCLASS;
extern "C"
NTSTATUS
NTAPI
NtQuerySystemInformationEx(
_In_ SYSTEM_INFORMATION_CLASS SystemInformationClass,
_In_reads_bytes_(InputBufferLength) PVOID InputBuffer,
_In_ ULONG InputBufferLength,
_Out_writes_bytes_opt_(SystemInformationLength) PVOID SystemInformation,
_In_ ULONG SystemInformationLength,
_Out_opt_ PULONG ReturnLength
);
extern "C"
NTSTATUS
NTAPI
NtQueryInformationProcess(
_In_ HANDLE ProcessHandle,
_In_ PROCESSINFOCLASS ProcessInformationClass,
_Out_writes_bytes_opt_(ProcessInformationLength) PVOID ProcessInformation,
_In_ ULONG ProcessInformationLength,
_Out_opt_ PULONG ReturnLength
);
int32_t setCpuAffinity(std::bitset<64> cpuMask) {
DWORD_PTR processAffinityMask;
DWORD_PTR systemAffinityMask;
@@ -588,7 +759,7 @@ ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltr
* Returns number of CPUs on system that are useful for math.
*/
int get_math_cpu_count() {
#if defined(__x86_64__) && defined(__linux__)
#if defined(__x86_164__) && defined(__linux__) // note: __x86_164__ is never defined, which disables this branch in favour of the combined _WIN32/Linux branch below
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
if (cpu_count < 1) {
return get_num_physical_cores();
@@ -604,7 +775,7 @@ int get_math_cpu_count() {
}
}
#elif defined(_WIN32)
#elif defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
int32_t _numPhysical = get_num_physical_cores();
if (cpuset_enable) {
// Initial Affinity set
@@ -615,7 +786,7 @@ int get_math_cpu_count() {
return get_num_physical_cores();
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
int32_t _numPhysical = get_num_physical_cores();
if (cpuset_enable) {
@@ -694,7 +865,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
std::string value(argv[i]);
if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowzero = 1; }
else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowzero = 0; }
@@ -707,7 +878,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
std::string value(argv[i]);
if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowthreads = 1; }
else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowthreads = 0; }
@@ -720,7 +891,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
std::string value(argv[i]);
std::size_t pos{};
int64_t cpuMask = 0;
@@ -749,7 +920,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
std::string value(argv[i]);
if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_lltraversal = 1; }
else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_lltraversal = 0; }
@@ -762,7 +933,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
std::string value(argv[i]);
if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_order = BEST_CORES; }
else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_order = WORST_CORES; }
@@ -775,7 +946,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = numPhysicalCores;
@@ -795,7 +966,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
params.n_threads_batch = std::stoi(argv[i]);
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
if (params.n_threads_batch <= 0 || params.n_threads_batch > numPhysicalCores) {
params.n_threads_batch = numPhysicalCores;
#else
@@ -811,7 +982,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
params.n_threads_draft = std::stoi(argv[i]);
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
if (params.n_threads_draft <= 0 || params.n_threads_draft > numPhysicalCores) {
params.n_threads_draft = numPhysicalCores;
#else
@@ -827,7 +998,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
params.n_threads_batch_draft = std::stoi(argv[i]);
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
if (params.n_threads_batch_draft <= 0 || params.n_threads_batch_draft > numPhysicalCores) {
params.n_threads_batch_draft = numPhysicalCores;
#else
@@ -1874,10 +2045,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
params.n_threads = get_math_cpu_count(params.n_threads_auto ? 0 : params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask);
#endif
#if defined(_WIN32)
CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%lli\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask);
#endif
#if defined(__x86_64__) && defined(__linux__)
CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%li\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask);
#endif
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
@@ -2040,7 +2216,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
if (llama_supports_mmap()) {
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
printf(" -bco change the order of the selected cores from the best to worst (default: worst to best)\n");
printf(" -llct allow the core selection to traverse the last level cache (default: disabled)\n");
printf(" -acz allow the core selection to pick the core 0 as well (default: disabled)\n");
@@ -3240,13 +3416,18 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
#if defined(_WIN32)
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count());
fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order);
fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal);
fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero);
fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads);
#if defined(_WIN32)
fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask);
#endif
#if defined(__x86_64__) && defined(__linux__)
fprintf(stream, "ccm: %li # default: none\n", params.cpuset_cpumask);
#endif
#else
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());

View file

@@ -39,7 +39,7 @@ extern char const *LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
#ifdef _WIN32
#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
struct CPU_SET_INFORMATION
{
int32_t LogicalProcessorIndex;
@@ -54,8 +54,12 @@ struct CPU_SET_INFORMATION
int32_t Threads;
};
#endif
bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs);
bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs);
int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
#endif
static const int32_t BEST_CORES = 0;
static const int32_t WORST_CORES = 1;
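// Call sketch against this interface: ask for six threads on the best
// cores, with no LLC traversal, no SMT siblings, no core 0, and no
// custom mask:
//
//     int n = get_math_cpu_count(6, BEST_CORES, /*lltraversal=*/0,
//                                /*allowtc=*/0, /*allowcz=*/0, /*cpuMask=*/0);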