enable SYCL_F16 support

2024-01-22 13:41:09 +08:00 · 2024-01-22 13:41:09 +08:00 · f008cc7b68
commit f008cc7b68
parent f396a3b65e
3 changed files with 17 additions and 7 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -105,6 +105,7 @@ option(LLAMA_SYCL                            "llama: use SYCL"
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
+option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)

 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
@ -455,8 +456,9 @@ if (LLAMA_SYCL)
    #todo: AOT

    find_package(IntelSYCL REQUIRED)
-
-    #add_compile_definitions(GGML_SYCL_F16)
+    if (LLAMA_SYCL_F16)
+        add_compile_definitions(GGML_SYCL_F16)
+    endif()
    add_compile_definitions(GGML_USE_SYCL)

    add_compile_options(-I./) #include DPCT 
--- a/README_sycl.md
+++ b/README_sycl.md
@ -120,7 +120,7 @@ cd build
 source /opt/intel/oneapi/setvars.sh

 #for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference

 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -9297,9 +9297,17 @@ inline void ggml_sycl_op_mul_mat_sycl(
    int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;

    const int compute_capability = g_device_caps[id].cc;
-
-    // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+#ifdef GGML_SYCL_F16
+    bool use_fp16 = true;  // TODO(Yu) SYCL capability check
+#else
+    bool use_fp16 = false;
+#endif
+    // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 ||
+    // ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff ==
+    // src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+    if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
+        dst->op_params[0] == GGML_PREC_DEFAULT) {

        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");