diff --git a/CMakeLists.txt b/CMakeLists.txt
index a827eda96..4c585c2d7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,8 +96,8 @@ option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM"
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
-option(LLAMA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
-option(LLAMA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
+option(LLAMA_FORCE_DMMV "llama: use dmmv instead of mmvq kernels on GPU" OFF)
+option(LLAMA_FORCE_MMQ "llama: use mmq kernels instead of Math Lib" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -628,10 +628,10 @@ if (LLAMA_SYCL)
         add_compile_definitions(GGML_SYCL_F16)
     endif()

-    if (LLAMA_SYCL_FORCE_DMMV)
+    if (LLAMA_FORCE_DMMV)
         add_compile_definitions(GGML_SYCL_FORCE_DMMV)
     endif()

-    if (LLAMA_SYCL_FORCE_MMQ)
+    if (LLAMA_FORCE_MMQ)
         add_compile_definitions(GGML_SYCL_FORCE_MMQ)
     endif()

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 623202d9a..47dbfcde3 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -2978,10 +2978,9 @@ static int g_work_group_size = 0;

 #define GGML_SYCL_MAX_NODES 8192 //TODO: adapt to hardwares

-
-//define for XMX in Intel GPU
-//TODO: currently, it's not used for XMX really.
-#define SYCL_USE_XMX
+#if !defined(GGML_SYCL_FORCE_MMQ)
+    #define SYCL_USE_XMX
+#endif

 // max batch size to use MMQ kernels when tensor cores are available
 #define MMQ_MAX_BATCH_SIZE 32
@@ -15228,10 +15227,6 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }

-#if !defined(GGML_SYCL_FORCE_MMQ)
-    #define SYCL_USE_XMX
-#endif
-
 #ifdef SYCL_USE_XMX
     bool use_xmx = true;
 #else
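
For reviewers, a minimal usage sketch of the patched build (the oneAPI environment setup and the icx/icpx compiler selection are assumptions of this example, not part of the patch): the SYCL path now keys GGML_SYCL_FORCE_DMMV / GGML_SYCL_FORCE_MMQ off the shared LLAMA_FORCE_DMMV / LLAMA_FORCE_MMQ options instead of separate LLAMA_SYCL_FORCE_* spellings, so forcing MMQ on a SYCL build looks like:

    # configure a SYCL build that forces the MMQ kernels
    cmake -B build -DLLAMA_SYCL=ON -DLLAMA_FORCE_MMQ=ON \
        -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
    cmake --build build --config Release

Note the behavioral fix this enables: before the patch, SYCL_USE_XMX was defined unconditionally near the top of ggml-sycl.cpp, so the later #if !defined(GGML_SYCL_FORCE_MMQ) guard inside ggml_sycl_mul_mat was a no-op and use_xmx was always true. With the guard moved to the single definition site, defining GGML_SYCL_FORCE_MMQ suppresses SYCL_USE_XMX entirely and the MMQ kernels are actually selected.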