diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 97ab2003c..0738486bb 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -11,6 +11,7 @@ // #include "common.hpp" +#include "ggml-impl.h" int get_current_device_id() { return dpct::dev_mgr::instance().current_device_id(); @@ -28,11 +29,7 @@ void* ggml_sycl_host_malloc(size_t size) try { if (err != 0) { // clear the error - fprintf( - stderr, - "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", - size / 1024.0 / 1024.0, - "syclGetErrorString is not supported"); + GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0, "syclGetErrorString is not supported"); return nullptr; } @@ -66,17 +63,21 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const ggml_sycl_op_flatten_t op) try { - const int64_t nrows0 = ggml_nrows(src0); + + // TODO: What's the use of these? + // const int64_t nrows0 = ggml_nrows(src0); + // const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; const bool use_src1 = src1 != nullptr; - const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + // TODO: What are these uses of these? + + // ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + // ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; + // ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; // dd = data device float * src0_ddf = (float *) src0->data; diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index 5fd15e6cd..05b01db2d 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -424,7 +424,7 @@ static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2); // make each work-item deal with more elements since sycl global range can not exceed max int - const src_t * x = (src_t *) vx; + const src_t * x = (const src_t *) vx; for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) { y[i] = x[i]; } diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index c5a035bad..4e07b723f 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -237,7 +237,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01, int i02 = i12 / sf2; int i03 = i13 / sf3; - dst[index] = *(float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); + dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); } void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02, @@ -251,8 +251,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const i // operation int offset_dst = nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - if (nidx < ne00 && item_ct1.get_group(1) < ne01 && - item_ct1.get_group(0) < ne02) { + if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) { int offset_src = nidx + item_ct1.get_group(1) * ne00 + item_ct1.get_group(0) * ne00 * ne01; dst[offset_dst] = x[offset_src]; diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index 2ad9b36f4..df6579f72 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -23,6 +23,10 @@ #include "dnnl.hpp" #include "dnnl_sycl.hpp" + +// TODO: Remove this when needed +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-qual" class DnnlGemmWrapper { public: using dt = dnnl::memory::data_type; @@ -96,6 +100,7 @@ public: } }; +# pragma clang diagnostic pop #endif #endif // GGML_SYCL_GEMM_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index b33be7a64..fd4d904f7 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.device_count = dpct::dev_mgr::instance().device_count(); if (info.device_count == 0) { - GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); + GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__); return info; } @@ -64,7 +64,7 @@ static ggml_sycl_device_info ggml_sycl_init() { #else GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__); #endif - GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); + GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME); for (int i = 0; i < info.device_count; ++i) { info.devices[i].vmm = 0; @@ -137,7 +137,8 @@ void ggml_backend_sycl_print_sycl_devices() { for (int id = 0; id < device_count; ++id) { sycl::device device = dpct::dev_mgr::instance().get_device(id); - sycl::backend backend = device.get_backend(); + // TODO: backend variable is unused here! + // sycl::backend backend = device.get_backend(); std::string backend_type = get_device_backend_and_type(device); int type_id = DeviceNums[backend_type]++; std::stringstream device_type; @@ -420,13 +421,12 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, return true; } return false; + // TODO: Buffer is unused + (void) buffer; +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) try { @@ -1092,10 +1092,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {}; size_t pool_size = 0; - explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : - qptr(qptr_), - device(device_) { - } + explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {} ~ggml_sycl_pool_leg() { for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { @@ -1238,7 +1235,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, zeros[i] = 0.f; qzeros[i] = 0; } - const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros; + const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros; float sum = xi[0]; float amax = sycl::fabs(xi[0]); #pragma unroll @@ -1799,6 +1796,8 @@ static void pool2d_nchw_kernel( switch (op) { case GGML_OP_POOL_AVG: res = 0; break; case GGML_OP_POOL_MAX: res = -FLT_MAX; break; + default: + break; // TODO: handle this properly } for (int i = bh; i < eh; i += 1) { @@ -1817,6 +1816,8 @@ static void pool2d_nchw_kernel( switch (op) { case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break; case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break; + default: + break; // TODO: handle this properly } } } @@ -1856,6 +1857,7 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr }); (void) dst; + (void) ctx; } template @@ -1894,9 +1896,9 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens } (void) dst; + (void) ctx; } - static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, const int ky, const int kx_padded, queue_ptr stream) { @@ -2484,17 +2486,19 @@ inline void ggml_sycl_op_mul_mat_sycl( const int64_t ne00 = src0->ne[0]; const int64_t ne10 = src1->ne[0]; - const int64_t ne0 = dst->ne[0]; + const int64_t row_diff = row_high - row_low; int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); - +#if !GGML_SYCL_DNNL + const int64_t ne0 = dst->ne[0]; // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into int ldc = id == ctx.device ? ne0 : row_diff; +#endif #ifdef GGML_SYCL_F16 bool use_fp16 = true; // TODO(Yu) SYCL capability check @@ -2531,9 +2535,9 @@ inline void ggml_sycl_op_mul_mat_sycl( : src1_as_f16.get(); ggml_sycl_pool_alloc dst_f16(ctx.pool(), row_diff * src1_ncols); - const sycl::half alpha_f16 = 1.0f; - const sycl::half beta_f16 = 0.0f; #if !GGML_SYCL_DNNL + const sycl::half alpha_f16 = 1.0f; + const sycl::half beta_f16 = 0.0f; SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, @@ -2570,9 +2574,9 @@ inline void ggml_sycl_op_mul_mat_sycl( const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get(); - const float alpha = 1.0f; - const float beta = 0.0f; #if !GGML_SYCL_DNNL + const float alpha = 1.0f; + const float beta = 0.0f; # ifdef GGML_SYCL_NVIDIA SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( oneapi::mkl::backend_selector{ *stream }, oneapi::mkl::transpose::trans, @@ -2870,7 +2874,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + // TODO: What's the use of this? + // ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); @@ -3296,8 +3301,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_TENSOR_BINARY_OP_LOCALS - - const int64_t ne_dst = ggml_nelements(dst); + // TODO: What's the use of this? + //const int64_t ne_dst = ggml_nelements(dst); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); queue_ptr main_stream = ctx.stream();; @@ -3405,6 +3410,7 @@ catch (sycl::exception const &exc) { inline bool ggml_sycl_supports_mmq(enum ggml_type type) { // TODO: accuracy issues in MMQ + (void) type; return false; } @@ -3836,13 +3842,17 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor } static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // TODO: Why this function even exists? (void) src0; (void) src1; (void) dst; + (void) ctx; } void ggml_sycl_set_main_device(const int main_device) try { - if (dpct::get_current_device_id() == main_device) return; + if (dpct::get_current_device_id() == static_cast (main_device)) { + return; + } check_allow_gpu_index(main_device); dpct::select_device(main_device); @@ -4210,6 +4220,7 @@ try { ggml_backend_sycl_context *sycl_ctx = (ggml_backend_sycl_context *)backend->context; + sycl::event *sycl_event = static_cast(event->context); const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0); @@ -4224,7 +4235,8 @@ catch (sycl::exception const &exc) } static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try { - ggml_backend_sycl_context* sycl_ctx = static_cast(backend->context); + // TODO: sycl_ctx is unused here + // ggml_backend_sycl_context* sycl_ctx = static_cast(backend->context); sycl::event* sycl_event = static_cast(event->context); if (ggml_backend_is_sycl(backend)) { @@ -4632,6 +4644,8 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons // SYCL doesn't support registering host memory, left here for reference // "ggml_backend_register_host_buffer" // "ggml_backend_unregister_host_buffer" + // doing this to make the compiler happy + (void) name; return nullptr; } diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index e952533d3..2d18369e6 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -13,6 +13,11 @@ #include "mmq.hpp" #include "vecdotq.hpp" +// Just to make the compiler happy +// TODO: Remove it when needed +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdivision-by-zero" + typedef void (*allocate_tiles_sycl_t)( int** x_ql, sycl::half2** x_dm, @@ -3029,3 +3034,4 @@ catch (sycl::exception const &exc) { << ", line:" << __LINE__ << std::endl; std::exit(1); } +#pragma clang diagnostic pop \ No newline at end of file diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 5986b4c64..dfc821ab9 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1031,4 +1031,5 @@ void ggml_sycl_op_mul_mat_vec_q( (void) src1; (void) dst; (void) src1_ddf_i; + (void) ctx; } diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 03395c718..4ab24815e 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -32,7 +32,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep item_ct1.barrier(sycl::access::fence_space::local_space); mean_var = 0.f; int nreduce = nwarps / WARP_SIZE; - for (size_t i = 0; i < nreduce; i += 1) + for (size_t i = 0; i < (size_t) nreduce; i += 1) { mean_var += s_sum[lane_id + i * WARP_SIZE]; } @@ -86,7 +86,7 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con */ item_ct1.barrier(); tmp = 0.f; - for (size_t i = 0; i < nreduce; i += 1) + for (size_t i = 0; i < (size_t) nreduce; i += 1) { tmp += s_sum[lane_id + i * WARP_SIZE]; } @@ -121,7 +121,7 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con */ item_ct1.barrier(); tmp = 0.f; - for (size_t i = 0; i < nreduce; i += 1) + for (size_t i = 0; i < (size_t) nreduce; i += 1) { tmp += s_sum[lane_id + i * WARP_SIZE]; } diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index d5c227cd1..0e9b0cb68 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -68,4 +68,5 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml const int max_period = dst->op_params[1]; timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); + (void) src1; }