From a8936f49022d585a3857e9606fce0799926169bb Mon Sep 17 00:00:00 2001
From: jianyuzh
Date: Mon, 15 Jan 2024 14:33:52 +0800
Subject: [PATCH] set nthread=1 when sycl, increase performance

---
 ggml.h    | 1 +
 llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml.h b/ggml.h
index 533f40c9f..5a173d362 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2267,6 +2267,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3     (void);
+    GGML_API int ggml_cpu_has_sycl      (void);
     GGML_API int ggml_cpu_has_vsx       (void);
 
 //
diff --git a/llama.cpp b/llama.cpp
index 61bafc9d2..eb9426f44 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6703,7 +6703,7 @@ static int llama_decode_internal(
     }
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
+    if ((ggml_cpu_has_cublas() || ggml_cpu_has_sycl()) && fully_offloaded) {
         n_threads = 1;
     }
 