diff --git a/CMakeLists.txt b/CMakeLists.txt
index 035c66c08..6fe3488d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,6 +105,7 @@ option(LLAMA_SYCL                            "llama: use SYCL"
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
+option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
@@ -455,8 +456,9 @@ if (LLAMA_SYCL)
     #todo: AOT
 
     find_package(IntelSYCL REQUIRED)
-
-    #add_compile_definitions(GGML_SYCL_F16)
+    if (LLAMA_SYCL_F16)
+        add_compile_definitions(GGML_SYCL_F16)
+    endif()
     add_compile_definitions(GGML_USE_SYCL)
 
     add_compile_options(-I./) #include DPCT 
diff --git a/README_sycl.md b/README_sycl.md
index 8b70823b3..993155071 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -120,7 +120,7 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 
 #for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
 
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
@@ -232,4 +232,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
 - Support to build in Windows.
 
-- Support multiple cards.
\ No newline at end of file
+- Support multiple cards.
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 3e7d9e3a1..9253c9534 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -9297,9 +9297,17 @@ inline void ggml_sycl_op_mul_mat_sycl(
     int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
 
     const int compute_capability = g_device_caps[id].cc;
-
-    // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+#ifdef GGML_SYCL_F16
+    bool use_fp16 = true;  // TODO(Yu): add a SYCL device capability check before enabling fp16
+#else
+    bool use_fp16 = false;
+#endif
+    // Previous condition, which also required compute_capability >= CC_VOLTA (kept for reference):
+    // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+    //     ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+    if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
+        dst->op_params[0] == GGML_PREC_DEFAULT) {
 
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");