diff --git a/common/common.cpp b/common/common.cpp
index 5a2bcbb03..759adad97 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency();
 // CPUSET logging
 //
 
-#define CPUSET_DEBUG 0
+#define CPUSET_DEBUG 1
 #if (CPUSET_DEBUG >= 1)
 #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__)
 #else
@@ -127,9 +127,7 @@ int32_t get_count_procMask(ULONG_PTR procMask) {
             std::bitset<64> bMask = procMask;
             return bMask.count();
 }
-#endif
 
-#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
 uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
     std::bitset<64> bMask;
     std::vector<CPU_SET_INFORMATION> _cpuset;
@@ -137,7 +135,6 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers
     int32_t assigned_t = 0;
     int32_t llcache = -1;
 
-#if defined(_WIN32)
     ULONG_PTR processAffinityMask;
     ULONG_PTR systemAffinityMask;
     HANDLE hToken = nullptr;
@@ -176,13 +173,6 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers
         }
         return bMask.to_ullong();
     }
-#else
-    if (cpuMask != 0) {
-        std::bitset<64> reqMask = cpuMask;
-        CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str());
-        return reqMask.to_ullong();
-    }
-#endif
 
     if (direction == BEST_CORES) {
         _cpuset = cpuset_best;
@@ -192,7 +182,7 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers
     CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
     for (auto index : _cpuset) {
         bVal = 0;
-        if ((index.LogicalProcessorIndex != 0 || allowcz) &&
+        if ((index.LogicalProcessorIndex != 0 || allowcz == 1) &&
             ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) &&
             index.EfficiencyClass == 0 &&
             ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1)
@@ -214,33 +204,73 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers
     }
     return bMask.to_ullong();
 }
-#endif
 
-#if defined(__x86_64__) && defined(__linux__)
+#elif defined(__x86_64__) && defined(__linux__)
 #include <pthread.h>
 
-int32_t setCpuAffinity(std::bitset<64> cpuMask) {
-    int32_t coreSelected = cpuMask.count();
+cpu_set_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
+    cpu_set_t bMask;
+    CPU_ZERO(&bMask);
+    std::vector<CPU_SET_INFORMATION> _cpuset;
+    int32_t bVal = 0;
+    int32_t assigned_t = 0;
+    int32_t llcache = -1;
+    std::bitset<64> reqMask = cpuMask;
 
-    cpu_set_t mask;
-    CPU_ZERO(&mask);
-
-    for (int32_t i = 0; i < 64; ++i) {
-        if (cpuMask[i] == 1) {
-            CPUSET_PRINT_DEBUG("Setting CPU %d\n", i);
-            CPU_SET(i, &mask);
-        } else {
-            CPU_CLR(i, &mask);
-        }
+    if (cpuMask != 0) {
+        CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str());
     }
 
-    if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
+    if (direction == BEST_CORES) {
+        _cpuset = cpuset_best;
+    } else {
+        _cpuset = cpuset_worst;
+    }
+    CPUSET_PRINT_DEBUG("\ngenerate_Mask: dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
+    for (auto index : _cpuset) {
+        bVal = 0;
+        if ((index.LogicalProcessorIndex != 0 || allowcz == 1) &&
+            ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) &&
+            index.EfficiencyClass == 0 &&
+            ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1)
+            ) {
+            if (lltraversal == 0) {
+                CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+                llcache = index.LastLevelCacheIndex;
+                CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+            } 
+            bVal = 1;
+        }
+        if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;}
+        if (cpuMask != 0) {
+            bVal = 1;
+            if (reqMask[index.LogicalProcessorIndex] == 0) {
+                bVal = 0;
+            }
+        }
+        if(bVal == 1) {
+            assigned_t++;
+            CPU_SET(index.LogicalProcessorIndex, &bMask);
+            CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
+        } else {
+            CPU_CLR(index.LogicalProcessorIndex, &bMask);
+        }
+        CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc);
+    }
+    return bMask;
+}
+
+int32_t setCpuAffinity(cpu_set_t bMask) {
+    const cpu_set_t cpuMask = bMask;
+    int32_t coreSelected = CPU_COUNT(&cpuMask);
+
+    if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuMask) == -1) {
             CPUSET_PRINT_DEBUG("setCpuAffinity sched_setaffinity error\n");
     }
-    if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) {
+    if (pthread_setaffinity_np(pthread_self(), sizeof(cpuMask), &cpuMask) == -1) {
             CPUSET_PRINT_DEBUG("setCpuAffinity pthread_setaffinity_np error\n");
-    }
-     
+    }     
+    
     return coreSelected;
 }
 
@@ -289,15 +319,16 @@ static int count_math_cpus(int cpu_count) {
     return result;
 }
 
-uint64_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) {
-    std::bitset<64> bMask;
-
+cpu_set_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) {
+    cpu_set_t bMask;
+    CPU_ZERO(&bMask);
     bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask);
 
-    numPhysicalCores = bMask.count();
+    numPhysicalCores = CPU_COUNT(&bMask);
 
-    CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str());
-    return bMask.to_ullong();
+    CPUSET_PRINT_DEBUG("Generated Mask Count CPU: %d\n", numPhysicalCores);
+    
+    return bMask;
 }
 
 #endif
@@ -318,7 +349,7 @@ int32_t get_num_physical_cores() {
     std::vector<CPU_SET_INFORMATION> _cpuset;
     int32_t numLogicalCores = 0;
 
-    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
+    for (uint32_t cpu=0; cpu < 1024; ++cpu) {
         CPUSET_PRINT_DEBUG("Check for Logical CPU: %d\n", cpu);
         std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
             + std::to_string(cpu) + "/topology/thread_siblings");
@@ -390,9 +421,8 @@ int32_t get_num_physical_cores() {
         std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst);
 
         int32_t physicalCount = 0;
-        //int32_t physicalCount = static_cast<int32_t>(siblings.size());
-        std::bitset<64> bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0);
-        physicalCount = bMask.count();
+        cpu_set_t bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0);
+        physicalCount = CPU_COUNT(&bMask);
 
         CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n");
 
@@ -554,7 +584,7 @@ int32_t get_num_physical_cores() {
     std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst);
 
     int32_t physicalCount = 0;
-    physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0));
+    physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 2, 0));
 
     CPUSET_PRINT_DEBUG("\n\n1st PhysicalCount: %d\n\n", physicalCount);
 
@@ -731,7 +761,14 @@ ULONG set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltra
  * Returns number of CPUs on system that are useful for math.
  */
 int get_math_cpu_count() {
-#if defined(__x86_64__) && defined(__linux__)
+#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
+    int32_t _numPhysical = get_num_physical_cores();
+    if (cpuset_enable) {
+        // Initial Affinity set
+        setCpuAffinity(set_procMask(WORST_CORES, 0, 1, 0, 0));
+    }
+    return _numPhysical;
+#elif defined(__linux__)
     int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
     if (cpu_count < 1) {
         return get_num_physical_cores();
@@ -746,14 +783,6 @@ int get_math_cpu_count() {
             }
         }
     }
-
-#elif defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
-    int32_t _numPhysical = get_num_physical_cores();
-    if (cpuset_enable) {
-        // Initial Affinity set
-        setCpuAffinity(set_procMask(WORST_CORES, 0, 1));
-    }
-    return _numPhysical;
 #endif
     return get_num_physical_cores();
 }
@@ -762,6 +791,7 @@ int get_math_cpu_count() {
 int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) {
     int32_t _numPhysical = get_num_physical_cores();
     if (cpuset_enable) {
+        if (_numPhysical < 7 && allowcz == 2) allowcz = 1;
         _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal, allowtc, allowcz, cpuMask));
     }
     return _numPhysical;
@@ -2191,7 +2221,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__))
         printf("  -bco                  change the order of the selected cores from the best to worst (default: worst to best)\n");
         printf("  -llct                 allow the core selection to traverse the last level cache (default: disabled)\n");
-        printf("  -acz                  allow the core selection to pick the core 0 as well (default: disabled)\n");
+        printf("  -acz                  allow the core selection to pick the core 0 as well (default: disabled for more than 6 cores)\n");
         printf("  -atc                  allow the core selection to pick non physical, threaded, cores (default: disabled)\n");
         printf("  -ccm                  specify a custom CPU Affinity bitmask in hex for the core selection (default: disabled)\n");
 #endif
@@ -3392,7 +3422,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count());
     fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order);
     fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal);
-    fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero);
+    fprintf(stream, "acz: %d # default: auto\n", params.cpuset_allowzero);
     fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads);
 #if defined(_WIN32)
     fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask);
diff --git a/common/common.h b/common/common.h
index ca53a5750..dc7fcffb5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -61,10 +61,9 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con
 int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
 #endif
 #if defined(__x86_64__) && defined(__linux__)
-#include <bitset>
-int32_t setCpuAffinity(std::bitset<64> cpuMask);
-uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
-uint64_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
+int32_t setCpuAffinity(cpu_set_t cpuMask);
+cpu_set_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
+cpu_set_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask);
 #endif
 
 static const int32_t BEST_CORES            = 0;
@@ -88,7 +87,7 @@ struct gpt_params {
     int32_t cpuset_lltraversal    = 0;
     int32_t cpuset_order          = WORST_CORES;
     int64_t cpuset_cpumask        = 0;
-    int32_t cpuset_allowzero      = 0;
+    int32_t cpuset_allowzero      = 2;
     int32_t cpuset_allowthreads   = 0;
     int32_t n_predict             = -1;    // new tokens to predict
     int32_t n_ctx                 = 512;   // context size