enable SYCL_F16 support

This commit is contained in:
luoyu-intel 2024-01-22 13:41:09 +08:00 committed by Meng, Hengyu
parent f396a3b65e
commit f008cc7b68
3 changed files with 17 additions and 7 deletions

View file

@ -105,6 +105,7 @@ option(LLAMA_SYCL "llama: use SYCL"
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@ -455,8 +456,9 @@ if (LLAMA_SYCL)
#todo: AOT
find_package(IntelSYCL REQUIRED)
#add_compile_definitions(GGML_SYCL_F16)
if (LLAMA_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
endif()
add_compile_definitions(GGML_USE_SYCL)
add_compile_options(-I./) #include DPCT

View file

@ -120,7 +120,7 @@ cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

View file

@ -9297,9 +9297,17 @@ inline void ggml_sycl_op_mul_mat_sycl(
int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
const int compute_capability = g_device_caps[id].cc;
// if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
#ifdef GGML_SYCL_F16
bool use_fp16 = true; // TODO(Yu) SYCL capability check
#else
bool use_fp16 = false;
#endif
// if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 ||
// ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff ==
// src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
dst->op_params[0] == GGML_PREC_DEFAULT) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");