diff --git a/CMakeLists.txt b/CMakeLists.txt index 035c66c08..6fe3488d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,6 +105,7 @@ option(LLAMA_SYCL "llama: use SYCL" option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_SYCL "llama: use SYCL" OFF) +option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -455,8 +456,9 @@ if (LLAMA_SYCL) #todo: AOT find_package(IntelSYCL REQUIRED) - - #add_compile_definitions(GGML_SYCL_F16) + if (LLAMA_SYCL_F16) + add_compile_definitions(GGML_SYCL_F16) + endif() add_compile_definitions(GGML_USE_SYCL) add_compile_options(-I./) #include DPCT diff --git a/README_sycl.md b/README_sycl.md index 8b70823b3..993155071 100644 --- a/README_sycl.md +++ b/README_sycl.md @@ -120,7 +120,7 @@ cd build source /opt/intel/oneapi/setvars.sh #for FP16 -#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON +cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx @@ -232,4 +232,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device - Support to build in Windows. -- Support multiple cards. \ No newline at end of file +- Support multiple cards. diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 3e7d9e3a1..9253c9534 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -9297,9 +9297,17 @@ inline void ggml_sycl_op_mul_mat_sycl( int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; const int compute_capability = g_device_caps[id].cc; - - // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { - if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { +#ifdef GGML_SYCL_F16 + bool use_fp16 = true; // TODO(Yu) SYCL capability check +#else + bool use_fp16 = false; +#endif + // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || + // ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == + // src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { + if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && + dst->op_params[0] == GGML_PREC_DEFAULT) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");