From 57f8355b29a8c7dfcd1fb6094758ad85644f8535 Mon Sep 17 00:00:00 2001
From: Romain Biessy
Date: Fri, 15 Nov 2024 12:10:45 +0100
Subject: [PATCH 1/9] sycl: Update Intel docker images to use DPC++ 2025.0 (#10305)

---
 .devops/llama-cli-intel.Dockerfile    | 2 +-
 .devops/llama-server-intel.Dockerfile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 79dba06a7..1f4ce0730 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
index 9c355b664..773f030a7 100644
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
 

From f0204a0ec70d50ca60e07bc0096ec1d6508ab0c7 Mon Sep 17 00:00:00 2001
From: R0CKSTAR
Date: Fri, 15 Nov 2024 19:47:25 +0800
Subject: [PATCH 2/9] ci: build test musa with cmake (#10298)

Signed-off-by: Xiaodong Ye
---
 .github/workflows/build.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c770bbd15..6ef0770f3 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -414,6 +414,27 @@ jobs:
           cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
           cmake --build build2 --config Release -j $(nproc)
 
+  ubuntu-22-cmake-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
 

From 18429220bdb344da1bc7df9bc580c7b41b3cd57b Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Fri, 15 Nov 2024 11:47:58 +0000
Subject: [PATCH 3/9] AVX BF16 and single scale quant optimizations (#10212)

* use 128 bit loads (i've tried 256->128 to death and it's slower)

* double accumulator

* avx bf16 vec dot

* +3% q4_0 inference

* +7% tg +5% pp compared to master

* slower f16c version, kept for reference
* 256b version, also slow. i tried :)

* revert f16

* faster with madd

* split to functions

* Q8_0 and IQ4_NL, 5-7% faster

* fix potential overflow (performance reduced)

* 16 bit add for q4_0 only

* merge
---
 ggml/src/ggml-cpu/ggml-cpu-quants.c | 128 +++++++++++++++++-----------
 ggml/src/ggml-cpu/ggml-cpu.c        |   6 +-
 2 files changed, 82 insertions(+), 52 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
index 7fa2897c2..f0e276b69 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
 #endif
 }
 #elif defined(__AVX__)
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
+
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
+
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     return sum_i16_pairs_float(doth, dotl);
 }
 
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+    const __m128i mone = _mm_set1_epi16(1);
 
-    return _mm_packus_epi16( bytes1, bytes2);
+    const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+    const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+    const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+    const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+    const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+    const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+    const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+    const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+    const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
+    const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
 }
 
-static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
-    const __m128i ax = _mm_sign_epi8(x, x);
-    const __m128i sy = _mm_sign_epi8(y, x);
-    return _mm_maddubs_epi16(ax, sy);
+// quad fp16 delta calculation
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+    // GGML_FP16_TO_FP32 is faster than Intel F16C
+    return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
 }
 #endif
 #elif defined(__SSSE3__)
@@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
     sumf = hsum_float_8(acc);
 #elif defined(__AVX__)
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
         const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
         const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+
         const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
         const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
         const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
         const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+        const __m256 p = sum_i16_pairs_float(p_2, p_1);
+
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 #elif defined(__SSSE3__)
     // set constants
     const __m128i lowMask = _mm_set1_epi8(0xF);
@@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
+#elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
-#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
     }
 
     sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+        const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
+        const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
+        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
+        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
 #elif defined(__riscv_v_intrinsic)
     size_t vl = __riscv_vsetvl_e8m1(qk);
 
@@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 #elif defined __AVX__
     const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
     const __m128i m4b  = _mm_set1_epi8(0x0f);
-    const __m128i mone = _mm_set1_epi16(1);
 
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
         const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
         const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
+        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 30b1bf895..61f53cd01 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1469,8 +1469,12 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
         sumf += (ggml_float)_mm512_reduce_add_ps(c2);
 
 #undef LOAD
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
+#if defined(__AVX2__)
 #define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
 
     __m256 c1 = _mm256_setzero_ps();
     __m256 c2 = _mm256_setzero_ps();
     __m256 c3 = _mm256_setzero_ps();
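Note on the patch above: the core trick is mul_add_epi8_sse. SSE has no signed-byte multiply-add, and _mm_maddubs_epi16 requires its first operand to be unsigned, so the helper moves the sign of x onto y with _mm_sign_epi8 before multiplying. The following is a minimal standalone check of that idiom against a scalar reference; it is illustrative only (a hypothetical file, not part of the patch) and assumes an SSSE3-capable build, e.g. cc -mssse3 demo_mul_add_epi8.c:

// demo_mul_add_epi8.c -- hypothetical standalone check of the idiom above
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// same shape as mul_add_epi8_sse in the patch: 16 i8 products, summed
// pairwise into 8 i16 lanes
static __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
    const __m128i ax = _mm_sign_epi8(x, x); // |x|, the unsigned operand
    const __m128i sy = _mm_sign_epi8(y, x); // y with the sign of x folded in
    return _mm_maddubs_epi16(ax, sy);
}

int main(void) {
    int8_t x[16], y[16];
    for (int i = 0; i < 16; i++) { x[i] = (int8_t)(i - 8); y[i] = (int8_t)(3 - i); }

    const __m128i p16 = mul_add_epi8_sse(_mm_loadu_si128((const __m128i *)x),
                                         _mm_loadu_si128((const __m128i *)y));
    // widen the i16 pairs to i32 -- the _mm_madd_epi16(p16, mone) step above
    const __m128i p32 = _mm_madd_epi16(p16, _mm_set1_epi16(1));

    int32_t lanes[4];
    _mm_storeu_si128((__m128i *)lanes, p32);
    const int simd = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    int ref = 0;
    for (int i = 0; i < 16; i++) ref += x[i] * y[i];

    printf("simd=%d ref=%d\n", simd, ref); // both should print the same value
    return simd != ref;
}

The madd-by-one widening is exactly what the new mul_sum_i8_quad_float centralizes, which is why the per-function mone constants disappear in the q4_0, q8_0, and iq4_nl paths; q4_0 can additionally stay in 16 bits for one extra add, per the "16 bit add for q4_0 only" commit note.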
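The new AVX LOAD macro in ggml_vec_dot_bf16 relies on bf16 being just the high 16 bits of an IEEE-754 f32, so widening is a 16-bit left shift rather than an F16C conversion. A scalar sketch of the same conversion, lane for lane (hypothetical helper, for illustration only):

// bf16_demo.c -- scalar version of what LOAD does per lane (hypothetical)
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float bf16_to_f32(uint16_t h) {
    const uint32_t bits = (uint32_t)h << 16; // payload goes in the top half
    float f;
    memcpy(&f, &bits, sizeof(f));            // bit-exact reinterpretation
    return f;
}

int main(void) {
    printf("%f\n", bf16_to_f32(0x3F80));     // 0x3F80 is 1.0f truncated to bf16
    return 0;
}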
From cbf5541a82952bcd7c4fceb55f5e332cafbf1720 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 15 Nov 2024 15:31:16 +0200
Subject: [PATCH 4/9] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 199237a21..6ddb71ab1 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-8a3d799484d861748f86eb87c8314fa2dbccc254
+9d0708e863f3aa2fc1eb0b75d433303c30bd0dbc
From 3225008973579cc6a784890c237e1bfc9de41819 Mon Sep 17 00:00:00 2001
From: thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
Date: Fri, 15 Nov 2024 15:33:53 +0200
Subject: [PATCH 5/9] ggml : vulkan logs (whisper/2547)

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c02c35665..04d671aeb 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1763,7 +1763,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;
 
     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    GGML_LOG_DEBUG("ggml_vulkan: %d = %s (%s) | uma: %d | fp16: %d | warp size: %d\n",
+                   idx, device_name.c_str(), driver_props.driverName, uma, fp16, subgroup_size);
 
     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1821,8 +1822,7 @@ void ggml_vk_instance_init() {
         };
         validation_features.setPNext(nullptr);
         instance_create_info.setPNext(&validation_features);
-
-        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+        GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
     }
     vk_instance.instance = vk::createInstance(instance_create_info);
 
@@ -1936,8 +1936,8 @@ void ggml_vk_instance_init() {
             vk_instance.device_indices.push_back(0);
         }
     }
+    GGML_LOG_DEBUG("ggml_vulkan: Found %d Vulkan devices:\n", vk_instance.device_indices.size());
 
-    std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
     for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
         ggml_vk_print_gpu_info(i);
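These conversions from std::cerr streaming to GGML_LOG_DEBUG assume a printf-style, runtime-filterable logger. A minimal sketch of that shape follows; log_debug and log_enabled are hypothetical stand-ins for illustration, not ggml's actual logging internals:

// log_demo.c -- hypothetical sketch of a printf-style debug logger
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

static bool log_enabled = true; // a real build would gate this on a log level

static void log_debug(const char * fmt, ...) {
    if (!log_enabled) {
        return; // unlike unconditional std::cerr output, this can be silenced
    }
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args); // one whole line formatted atomically
    va_end(args);
}

int main(void) {
    log_debug("ggml_vulkan: Found %d Vulkan devices:\n", 2);
    return 0;
}

The practical win over operator<< chains is that each message is a single formatted call the logging backend can filter, redirect, or drop as a unit.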
From 09ecbcb596ed8fa97d503d7440f0b3eff872e8f1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 15 Nov 2024 15:35:22 +0200
Subject: [PATCH 6/9] cmake : fix ppc64 check (whisper/0)

ggml-ci
---
 ggml/src/ggml-cpu/CMakeLists.txt | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 8b0d60d4e..30de6c99a 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -211,10 +211,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
-    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
-                    OUTPUT_VARIABLE POWER10_M)
-    string(FIND ${POWER10_M} "POWER10" substring_index)
-    if(${substring_index} GREATER_EQUAL 0)
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+    string(FIND "${POWER10_M}" "POWER10" substring_index)
+    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+        set(substring_index -1)
+    endif()
+
+    if (${substring_index} GREATER_EQUAL 0)
         list(APPEND ARCH_FLAGS -mcpu=power10)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
         list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
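The fix above boils down to quoting the variable so an empty grep result stays a single (empty) argument, and giving substring_index a sane default of -1. For comparison, the equivalent probe in plain C has no such edge case, since strstr on a line that lacks the needle simply returns NULL; this is a hypothetical sketch, not part of the patch:

// power10_probe.c -- hypothetical C analogue of the CMake POWER10 check
#include <stdio.h>
#include <string.h>

// returns 1 if /proc/cpuinfo mentions POWER10, 0 on empty or missing input
static int is_power10(void) {
    FILE * f = fopen("/proc/cpuinfo", "r");
    if (!f) {
        return 0;
    }
    char line[512];
    int found = 0;
    while (fgets(line, sizeof(line), f)) {
        if (strstr(line, "POWER10")) { found = 1; break; }
    }
    fclose(f);
    return found;
}

int main(void) {
    printf("POWER10: %d\n", is_power10());
    return 0;
}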
From 883d206fbd2c5b2b9b589a9328503b9005e146c9 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 15 Nov 2024 20:20:54 +0100
Subject: [PATCH 7/9] ggml : fix some build issues

---
 Makefile        | 8 ++++----
 ggml/src/ggml.c | 8 ++++++++
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 87fe795aa..804c1bbec 100644
--- a/Makefile
+++ b/Makefile
@@ -359,6 +359,10 @@ ifdef LLAMA_SERVER_SSL
 	MK_LDFLAGS += -lssl -lcrypto
 endif
 
+ifndef GGML_NO_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
 # warnings
 WARN_FLAGS = \
 	-Wall \
@@ -940,10 +944,6 @@ ggml/src/ggml-cuda/%.o: \
 	$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
 endif # GGML_MUSA
 
-ifndef GGML_NO_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
-endif
-
 ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4a97bfc32..5cdf59f25 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -49,6 +49,14 @@
 
 #define UNUSED GGML_UNUSED
 
+#if defined(_MSC_VER)
+#define m512bh(p) p
+#define m512i(p) p
+#else
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+#endif
+
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
From 4047be74da398acb8717a4d21b77b929ad7ed4f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Fri, 15 Nov 2024 21:19:03 +0100
Subject: [PATCH 8/9] scripts: update compare-llama-bench.py (#10319)

---
 scripts/compare-llama-bench.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 4ac6b5fc0..6125a080a 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -19,22 +19,22 @@ logger = logging.getLogger("compare-llama-bench")
 
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
-    "blas", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "n_threads",
-    "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
+    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "model_filename", "model_type", "n_batch", "n_ubatch",
+    "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload",
+    "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]
 
 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
+BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
 
 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
-    "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
-    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
-    "main_gpu": "Main GPU", "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split",
-    "use_mmap": "Use mmap", "embeddings": "Embeddings",
+    "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
+    "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
+    "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
+    "embeddings": "Embeddings", "cpu_mask": "CPU mask", "cpu_strict": "CPU strict", "poll": "Poll",
+    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "split_mode": "Split mode", "main_gpu": "Main GPU",
+    "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split", "use_mmap": "Use mmap",
 }
 
 DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
From 74d73dc85cc2057446bf63cc37ff649ae7cebd80 Mon Sep 17 00:00:00 2001
From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
Date: Sat, 16 Nov 2024 02:57:00 +0530
Subject: [PATCH 9/9] Make updates to fix issues with clang-cl builds while using AVX512 flags (#10314)

---
 ggml/src/ggml-cpu/CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 30de6c99a..cef41a074 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -143,14 +143,23 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         if (GGML_AVX512_VBMI)
             add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
             add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                list(APPEND ARCH_FLAGS -mavx512vbmi)
+            endif()
         endif()
         if (GGML_AVX512_VNNI)
             add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
             add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                list(APPEND ARCH_FLAGS -mavx512vnni)
+            endif()
        endif()
        if (GGML_AVX512_BF16)
            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
            add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+            if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                list(APPEND ARCH_FLAGS -mavx512bf16)
+            endif()
        endif()
        if (GGML_AMX_TILE)
            add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
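Context for the ARCH_FLAGS additions above: clang-cl accepts MSVC-style command lines, but unlike MSVC it only exposes an AVX512 intrinsic when the matching target feature is enabled, so defining __AVX512VNNI__ and friends alone still fails to compile. A hedged standalone sketch of one such gated intrinsic (hypothetical test file; build with clang -O2 -mavx512vnni and run only on VNNI-capable hardware):

// vnni_demo.c -- requires AVX512-VNNI at both compile time and run time
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    const __m512i a   = _mm512_set1_epi8(2); // treated as unsigned bytes
    const __m512i b   = _mm512_set1_epi8(3); // treated as signed bytes
    __m512i       acc = _mm512_setzero_si512();

    // fused u8 x i8 dot product: each i32 lane accumulates 4 byte products,
    // so every lane ends up as 4 * (2 * 3) = 24
    acc = _mm512_dpbusd_epi32(acc, a, b);

    int out[16];
    _mm512_storeu_si512(out, acc);
    printf("%d\n", out[0]); // prints 24
    return 0;
}

Without -mavx512vnni, clang rejects the _mm512_dpbusd_epi32 call outright, which is the failure mode the CMake change works around for clang-cl builds.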