From a8936f49022d585a3857e9606fce0799926169bb Mon Sep 17 00:00:00 2001
From: jianyuzh
Date: Mon, 15 Jan 2024 14:33:52 +0800
Subject: [PATCH] set nthread=1 when sycl, increase performance

---
 ggml.h    | 1 +
 llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml.h b/ggml.h
index 533f40c9f..5a173d362 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2267,6 +2267,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3     (void);
+    GGML_API int ggml_cpu_has_sycl      (void);
     GGML_API int ggml_cpu_has_vsx       (void);
 
 //
diff --git a/llama.cpp b/llama.cpp
index 61bafc9d2..eb9426f44 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6703,7 +6703,7 @@ static int llama_decode_internal(
     }
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
+    if ((ggml_cpu_has_cublas() || ggml_cpu_has_sycl()) && fully_offloaded) {
         n_threads = 1;
     }
 